# Smart Agriculture Fertilizer Prediction

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    accuracy_score
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_STATE = 154
np.random.seed(RANDOM_STATE)


# Importing Data

In [3]:
df = pd.read_csv("../Crop_Yield_Fertilizer.csv") 

# Data Distribution

In [4]:
print(df.shape)
df.head()

(25000, 10)


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label,yield,fertilizer
0,90.0,42.0,43.0,20.879744,82.002744,6.502985,202.935536,rice,71.199428,DAP
1,85.0,58.0,41.0,21.770462,80.319644,7.038096,226.655537,rice,81.620199,DAP
2,60.0,55.0,44.0,23.004459,82.320763,7.840207,263.964248,rice,80.47313,Gypsum
3,74.0,35.0,40.0,26.491096,80.158363,6.980401,242.864034,rice,75.178196,DAP
4,78.0,42.0,42.0,20.130175,81.604873,7.628473,262.71734,rice,75.485563,Gypsum


# Sanity Checks

In [17]:
# Missing values
print(df.isnull().sum())

# Target distribution 1 
print(df['fertilizer'].value_counts())

# Target distribution 2
print(df['label'].value_counts())


N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
yield          0
fertilizer     0
dtype: int64
fertilizer
Urea                 14683
MOP                   5071
SSP                   2770
Lime                   957
DAP                    685
Rock Phosphate         368
Gypsum                 288
Rhizobium              148
Potassium Nitrate       30
Name: count, dtype: int64
label
orange         1196
kidneybeans    1180
papaya         1174
maize          1168
mungbean       1164
banana         1164
chickpea       1159
rice           1149
mothbeans      1149
coconut        1145
blackgram      1145
coffee         1140
watermelon     1133
jute           1129
pigeonpeas     1122
grapes         1111
apple          1110
cotton         1108
muskmelon      1098
pomegranate    1092
lentil         1085
mango          1079
Name: count, dtype: int64


# Data Splitting

In [None]:
X = df.drop(columns=['fertilizer','label'])
y = df['fertilizer']


# Encoding Target Variable

In [8]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

num_classes = len(label_encoder.classes_)
print("Number of fertilizer classes:", num_classes)


Number of fertilizer classes: 9


# Feature Engineering Setup

In [9]:
numerical_features = X.columns.tolist()

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ]
)


# Model Initializing

In [10]:
model = XGBClassifier(
    n_estimators=600,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softprob',
    num_class=num_classes,
    eval_metric='mlogloss',
    random_state=RANDOM_STATE
    # n_jobs=-1
)


# Pipeline Structured

In [11]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])


# Train Test Split Data

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.18,
    stratify=y_encoded,
    random_state=48945
)


In [15]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(20500, 9) (4500, 9) (20500,) (4500,)


# Model Training

In [16]:
pipeline.fit(X_train, y_train)


ValueError: could not convert string to float: 'mango'