In [20]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import top_k_accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib

# Load dataset
df = pd.read_csv("data_core.csv")

SEED = 42
np.random.seed(SEED)

# Label Encoding
le_soil = LabelEncoder()
le_crop = LabelEncoder()
le_fert = LabelEncoder()

df['Soil Type Encoded'] = le_soil.fit_transform(df['Soil Type'])
df['Crop Type Encoded'] = le_crop.fit_transform(df['Crop Type'])
df['Fertilizer Encoded'] = le_fert.fit_transform(df['Fertilizer Name'])

# Save encoders
joblib.dump(le_soil, "le_soil.pkl")
joblib.dump(le_crop, "le_crop.pkl")
joblib.dump(le_fert, "le_fert.pkl")

# Scale numerical features
numerical = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
scaler = StandardScaler()
df[numerical] = scaler.fit_transform(df[numerical])

# Clustering
cat_features = df[['Soil Type Encoded', 'Crop Type Encoded', 'Fertilizer Encoded']]
kmeans_cat = KMeans(n_clusters=3, random_state=SEED)
df['CatCluster'] = kmeans_cat.fit_predict(cat_features)

num_cluster_features = df[['Nitrogen', 'Phosphorous', 'Fertilizer Encoded']]
kmeans_num = KMeans(n_clusters=3, random_state=SEED)
df['NumCluster'] = kmeans_num.fit_predict(num_cluster_features)

# Interaction terms
df['Temp_Humidity'] = df['Temparature'] * df['Humidity']
df['Moisture_Nitrogen'] = df['Moisture'] * df['Nitrogen']

# Final features and target
features = numerical + ['Soil Type Encoded', 'Crop Type Encoded', 'CatCluster', 'NumCluster',
                        'Temp_Humidity', 'Moisture_Nitrogen']
X = df[features]
y = df['Fertilizer Encoded']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED, stratify=y)

# SMOTE for class balancing
smote = SMOTE(random_state=SEED)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# XGBoost training
xgb = XGBClassifier(
    random_state=SEED,
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8
)
xgb.fit(X_train_res, y_train_res)

# Evaluate Top-3 accuracy
y_proba = xgb.predict_proba(X_test)
top3_acc = top_k_accuracy_score(y_test, y_proba, k=3)
print(f"\n✅ XGBoost Top-3 Accuracy: {top3_acc:.4f}")

# Save models
joblib.dump(xgb, "fertilizer_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(kmeans_cat, "cluster_model_cat.pkl")
joblib.dump(kmeans_num, "cluster_model_num.pkl")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ XGBoost Top-3 Accuracy: 0.9406


['cluster_model_num.pkl']