In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder

# 1️⃣ Load Dataset
df = pd.read_csv("abcd.csv")  # <-- replace with your generated dataset

# 2️⃣ Clean & Prepare Numeric Columns
numeric_cols = ["N", "P", "K", "temperature", "humidity", "ph", "rainfall"]

# Convert to numeric safely
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop invalid rows
df = df.dropna(subset=numeric_cols)

# 3️⃣ Prepare Features & Labels
X = df[numeric_cols]
y = df["crop"]   # <-- 'crop' is your target column

# 4️⃣ Encode Crop Labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 5️⃣ Save Label Encoder for Later Use
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# 6️⃣ Print Numeric Label → Crop Mapping
print("✅ Label Mapping (Number → Crop Name):")
for i, crop in enumerate(le.classes_):
    print(f"{i} → {crop}")

# 7️⃣ Create Mapping Dictionary for Reference
label_mapping = {i: crop for i, crop in enumerate(le.classes_)}

print("\n# Use this dictionary in other scripts:")
print("label_mapping = {")
for num, crop in label_mapping.items():
    print(f"    {num}: '{crop}',")
print("}")

# ✅ Optional: Show summary
print("\nDataset Summary:")
print(df.head(10))
print(f"\nTotal rows: {len(df)}")
print(f"Unique crops: {df['crop'].nunique()}")
print(f"States covered: {df['state'].nunique()}")


✅ Label Mapping (Number → Crop Name):
0 → bajra
1 → coffee
2 → cotton
3 → groundnut
4 → jowar
5 → maize
6 → onion
7 → potato
8 → pulses
9 → rice
10 → soybean
11 → sugarcane
12 → tea
13 → tomato
14 → wheat

# Use this dictionary in other scripts:
label_mapping = {
    0: 'bajra',
    1: 'coffee',
    2: 'cotton',
    3: 'groundnut',
    4: 'jowar',
    5: 'maize',
    6: 'onion',
    7: 'potato',
    8: 'pulses',
    9: 'rice',
    10: 'soybean',
    11: 'sugarcane',
    12: 'tea',
    13: 'tomato',
    14: 'wheat',
}

Dataset Summary:
            state       N      P       K  temperature  humidity    ph  \
0       Jharkhand   51.49  25.10   42.18        31.35      48.6  6.77   
1         Tripura   67.81  36.38   47.18        23.57      56.8  6.48   
2      Tamil Nadu   35.51  28.81   29.58        23.00      63.4  6.61   
3           Bihar   69.83  33.48   39.43        26.26      67.9  6.32   
4     Maharashtra  116.75  53.15  122.32        26.73      74.2  6.88   
5  Andhra Pradesh   3

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)
print(X_train.head(5))
print(X_train.shape)
print(" ")


print(X_test.head(5))
print(X_test.shape)
print(" ")


           N      P      K  temperature  humidity    ph  rainfall
9254   74.89  25.94  44.63        31.59      77.2  6.16     11.84
1561   75.32  38.44  49.81        27.29      80.0  6.93     11.51
1670   35.44  30.54  43.09        26.83      78.4  6.66      4.70
6087  100.69  44.61  55.28        16.75      66.5  7.80      2.40
6669   76.44  43.42  75.74        22.72      81.9  4.61      8.47
(8000, 7)
 
           N      P      K  temperature  humidity    ph  rainfall
6252   79.53  49.35  54.01        17.81      73.0  7.16      7.31
4684  101.40  52.71  44.95        20.24      40.0  7.80      0.80
1731   73.57  44.08  45.66        26.77      52.8  6.54      0.34
4742   30.55  25.53  36.73        19.75      55.4  6.66      0.85
4521   98.67  59.65  82.26        31.51      81.7  7.23      0.13
(2000, 7)
 


In [4]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}


In [5]:
results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    precision = np.mean([v["precision"] for k, v in report.items() if isinstance(v, dict)])
    recall = np.mean([v["recall"] for k, v in report.items() if isinstance(v, dict)])
    f1 = np.mean([v["f1-score"] for k, v in report.items() if isinstance(v, dict)])

    results.append({
        "Model": name,
        "Accuracy": round(acc, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1 Score": round(f1, 4)
    })



Training Decision Tree...

Training Random Forest...

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000858 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1771
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 7
[LightGBM] [Info] Start training from score -2.706801
[LightGBM] [Info] Start training from score -2.746921
[LightGBM] [Info] Start training from score -2.754749
[LightGBM] [Info] Start training from score -2.714320
[LightGBM] [Info] Start training from score -2.782639
[LightGBM] [Info] Start training from score -2.593606
[LightGBM] [Info] Start training from score -2.659260
[LightGBM] [Info] Start training from score -2.752786
[LightGBM] [Info] Start training from score -2.750827
[LightGBM] [Info] Start training from score -2.756715
[LightGBM] [Info] Start training from score -2.639808
[LightGBM] [Info] Start training from score -2.703063
[LightGBM] [Info] Start training from score

In [6]:
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)
print("\n         ================================")
print("           Model Performance Comparison")
print("         ================================")
print(results_df.to_string(index=False))
import warnings
warnings.filterwarnings("ignore")


           Model Performance Comparison
        Model  Accuracy  Precision  Recall  F1 Score
      XGBoost     0.816     0.8188  0.8185    0.8178
Random Forest     0.812     0.8121  0.8144    0.8121
     LightGBM     0.811     0.8131  0.8142    0.8127
Decision Tree     0.727     0.7324  0.7303    0.7304


In [7]:
import pickle
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Example: encode target labels if needed
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Train XGBoost model
xgb_model = XGBClassifier(
    random_state=42,
    n_estimators=300,     # More trees for better performance
    learning_rate=0.1,    # Controls speed/accuracy tradeoff
    max_depth=6,          # Controls model complexity
    subsample=0.8,        # Helps prevent overfitting
    colsample_bytree=0.8, # Random feature sampling per tree
    eval_metric="mlogloss" # Recommended for classification
)

xgb_model.fit(X_train, y_train_encoded)

# Save both model and label encoder
with open("xgboost_model.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

print("✅ XGBoost model saved successfully!")


✅ XGBoost model saved successfully!
