In [95]:
#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [96]:
# Load the soil-sample / fertilizer CSV into the `data_set` DataFrame.
# The path is relative, so the file is resolved against the current working directory.
data_set = pd.read_csv('Soil_samples_check_against_fertilizer_2018-2020.csv')

In [97]:
# Count missing values per column; a non-zero count would mean that column
# needs cleaning before it is used for modelling.
data_set.isnull().sum()

POINTID                   0
Depth                     0
EC                        0
N                         0
P                         0
K                         0
OC                        0
Ca                        0
S                         0
pH_H2O                    0
Fertilizer(N:P:K)         0
Fertilizer Quantity       0
N-After Fertilization     0
P-After Fertilization     0
K-After Fertilization     0
Ca-After Fertilization    0
S-After Fertilization     0
Elevation                 0
dtype: int64

In [98]:
# Derive delta_N / delta_P / delta_K: the post-fertilization reading minus the
# original reading of the same nutrient.
for nutrient in ('N', 'P', 'K'):
    data_set['delta_' + nutrient] = (
        data_set[nutrient + '-After Fertilization'] - data_set[nutrient]
    )

In [99]:
# Inspect column dtypes and non-null counts now that the delta_* columns exist
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1325 entries, 0 to 1324
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   POINTID                 1325 non-null   int64  
 1   Depth                   1325 non-null   object 
 2   EC                      1325 non-null   float64
 3   N                       1325 non-null   float64
 4   P                       1325 non-null   float64
 5   K                       1325 non-null   float64
 6   OC                      1325 non-null   object 
 7   Ca                      1325 non-null   float64
 8   S                       1325 non-null   float64
 9   pH_H2O                  1325 non-null   float64
 10  Fertilizer(N:P:K)       1325 non-null   object 
 11  Fertilizer Quantity     1325 non-null   int64  
 12  N-After Fertilization   1325 non-null   float64
 13  P-After Fertilization   1325 non-null   float64
 14  K-After Fertilization   1325 non-null   

In [100]:
# Preview the first 20 rows, including the new delta_* columns
data_set.head(20)

Unnamed: 0,POINTID,Depth,EC,N,P,K,OC,Ca,S,pH_H2O,...,Fertilizer Quantity,N-After Fertilization,P-After Fertilization,K-After Fertilization,Ca-After Fertilization,S-After Fertilization,Elevation,delta_N,delta_P,delta_K
0,47862690,0-20 cm,8.73,74.4,106.2,246.7,12.4,1591.9,12.8,4.81,...,290,74.4,106.2,329.0,1591.9,12.8,158,0.0,0.0,82.3
1,47882704,0-20 cm,5.06,56.9,132.3,195.2,16.7,919.8,28.5,4.93,...,40,113.9,132.3,195.2,919.8,38.1,500,57.0,0.0,0.0
2,47982688,0-20 cm,12.53,43.9,29.5,99.1,47.5,1539.7,15.4,4.85,...,210,121.9,29.5,99.1,1539.7,29.9,404,78.0,0.0,0.0
3,48022702,0-20 cm,21.1,20.2,88.8,225.2,28.1,1441.0,25.4,5.8,...,40,37.6,88.8,240.0,1441.0,25.4,364,17.4,0.0,14.8
4,48062708,0-20 cm,10.89,12.5,88.6,205.9,19.4,1466.7,26.2,6.48,...,150,38.6,88.6,223.9,1466.7,26.2,315,26.1,0.0,18.0
5,48122730,0-20 cm,14.46,79.1,108.6,75.0,18.0,1158.7,25.7,6.81,...,150,89.3,108.6,75.0,1221.0,25.7,137,10.2,0.0,0.0
6,48202738,0-20 cm,17.08,40.3,137.7,165.8,12.5,1022.7,29.0,6.37,...,90,68.9,137.7,165.8,1022.7,29.0,131,28.6,0.0,0.0
7,48222780,0-20 cm,23.55,78.1,31.9,262.8,22.9,1128.4,22.8,7.98,...,40,99.3,31.9,300.6,1128.4,22.8,137,21.2,0.0,37.8
8,48342774,0-20 cm,22.61,18.0,50.2,138.3,29.1,1589.9,18.7,8.09,...,40,33.3,50.2,138.3,1589.9,18.7,514,15.3,0.0,0.0
9,48442784,0-20 cm,14.94,52.6,121.9,109.5,14.4,1261.1,21.5,7.84,...,40,69.6,121.9,109.5,1261.1,21.5,232,17.0,0.0,0.0


In [101]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
data_set.describe()

Unnamed: 0,POINTID,EC,N,P,K,Ca,S,pH_H2O,Fertilizer Quantity,N-After Fertilization,P-After Fertilization,K-After Fertilization,Ca-After Fertilization,S-After Fertilization,Elevation,delta_N,delta_P,delta_K
count,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0,1325.0
mean,50411280.0,18.292143,48.714642,82.494113,163.867245,1203.790491,20.349887,6.310921,135.101887,73.798943,97.325811,185.792075,1216.589585,22.368906,786.878491,25.084302,14.831698,21.92483
std,6698252.0,17.963201,23.575075,36.41855,61.060386,276.158962,5.182507,1.181918,92.512395,32.351661,45.678811,67.620452,278.626014,7.120122,1408.13416,22.458478,25.389528,28.588397
min,27182010.0,2.89,8.4,19.5,59.2,721.6,11.0,3.46,10.0,8.4,19.5,59.2,721.6,11.0,-30.0,0.0,0.0,0.0
25%,46182650.0,8.59,28.0,49.9,109.9,966.7,16.1,5.47,50.0,49.6,59.2,132.2,975.9,16.8,209.0,0.0,0.0,0.0
50%,48002870.0,13.67,48.4,83.4,166.6,1214.6,20.6,6.38,120.0,72.6,99.0,184.6,1231.3,22.3,490.0,21.6,0.0,0.0
75%,55622350.0,21.9,69.3,113.9,218.2,1427.3,24.7,7.24,200.0,96.9,129.5,239.3,1443.9,26.7,1058.0,38.8,27.3,47.7
max,64981670.0,332.6,89.5,145.9,269.0,1688.7,29.2,8.72,350.0,163.7,225.0,348.7,1808.9,47.4,11424.0,80.5,87.3,89.9


# Preprocess the dataset

Feature selection

In [102]:
# Columns that are not used as inputs for the fertilizer-quantity model
unused_columns = [
    'POINTID', 'Depth', 'EC', 'OC', 'Ca', 'S', 'pH_H2O',
    'Ca-After Fertilization', 'S-After Fertilization', 'Elevation',
]
data_set = data_set.drop(columns=unused_columns)

In [103]:
# Preview the first 25 rows of the reduced dataset (kept columns plus the delta_* features)
data_set.head(25)

Unnamed: 0,N,P,K,Fertilizer(N:P:K),Fertilizer Quantity,N-After Fertilization,P-After Fertilization,K-After Fertilization,delta_N,delta_P,delta_K
0,74.4,106.2,246.7,MOP(0:0:60),290,74.4,106.2,329.0,0.0,0.0,82.3
1,56.9,132.3,195.2,Ammonium Sulphate(AS),40,113.9,132.3,195.2,57.0,0.0,0.0
2,43.9,29.5,99.1,Ammonium Sulphate(AS),210,121.9,29.5,99.1,78.0,0.0,0.0
3,20.2,88.8,225.2,YaraMila NK(1:0:1),40,37.6,88.8,240.0,17.4,0.0,14.8
4,12.5,88.6,205.9,YaraMila NK(1:0:1),150,38.6,88.6,223.9,26.1,0.0,18.0
5,79.1,108.6,75.0,Calcium Ammonium Nitrate(CAN),150,89.3,108.6,75.0,10.2,0.0,0.0
6,40.3,137.7,165.8,Urea(46:0:0),90,68.9,137.7,165.8,28.6,0.0,0.0
7,78.1,31.9,262.8,YaraMila NK(1:0:1),40,99.3,31.9,300.6,21.2,0.0,37.8
8,18.0,50.2,138.3,Urea(46:0:0),40,33.3,50.2,138.3,15.3,0.0,0.0
9,52.6,121.9,109.5,Urea(46:0:0),40,69.6,121.9,109.5,17.0,0.0,0.0


In [104]:
# List the distinct fertilizer labels present in the data
data_set['Fertilizer(N:P:K)'].unique()

array(['MOP(0:0:60)', 'Ammonium Sulphate(AS)', 'YaraMila NK(1:0:1)',
       'Calcium Ammonium Nitrate(CAN)', 'Urea(46:0:0)', 'TSP(0:46:0)',
       'YaraMila NPKS(1:1:1)'], dtype=object)

In [105]:
# Keep only the rows whose fertilizer is one of the five products under consideration
considered_fertilizers = [
    'TSP(0:46:0)',
    'MOP(0:0:60)',
    'YaraMila NK(1:0:1)',
    'YaraMila NPKS(1:1:1)',
    'Urea(46:0:0)',
]
data_set = data_set[data_set['Fertilizer(N:P:K)'].isin(considered_fertilizers)]

In [106]:
# Row count per remaining fertilizer, to see how balanced the classes are after filtering
data_set['Fertilizer(N:P:K)'].value_counts()

Fertilizer(N:P:K)
YaraMila NPKS(1:1:1)    203
Urea(46:0:0)            193
MOP(0:0:60)             189
TSP(0:46:0)             185
YaraMila NK(1:0:1)      182
Name: count, dtype: int64

Encoding the categorical variable

In [107]:
# Start from the float columns (nutrient readings and the delta_* features).
# .copy() gives an independent frame, so adding a column below does not touch `data_set`.
encoded_data_set = data_set.select_dtypes(include=['float']).copy()

# 'Fertilizer Quantity' is an integer column, so the float selection skipped it;
# add the target back explicitly.
encoded_data_set['Fertilizer Quantity'] = data_set['Fertilizer Quantity']

# One-hot encode the fertilizer label: one indicator column per fertilizer
encoded_fertilizer = pd.get_dummies(data_set['Fertilizer(N:P:K)'])

# Join the numeric columns and the indicator columns side by side
# (both frames share the filtered row index, so rows line up).
data_set = pd.concat([encoded_data_set, encoded_fertilizer], axis=1)

In [108]:
# Preview the encoded dataset: numeric columns followed by one indicator column per fertilizer
data_set.head(25)

Unnamed: 0,N,P,K,N-After Fertilization,P-After Fertilization,K-After Fertilization,delta_N,delta_P,delta_K,Fertilizer Quantity,MOP(0:0:60),TSP(0:46:0),Urea(46:0:0),YaraMila NK(1:0:1),YaraMila NPKS(1:1:1)
0,74.4,106.2,246.7,74.4,106.2,329.0,0.0,0.0,82.3,290,True,False,False,False,False
3,20.2,88.8,225.2,37.6,88.8,240.0,17.4,0.0,14.8,40,False,False,False,True,False
4,12.5,88.6,205.9,38.6,88.6,223.9,26.1,0.0,18.0,150,False,False,False,True,False
6,40.3,137.7,165.8,68.9,137.7,165.8,28.6,0.0,0.0,90,False,False,True,False,False
7,78.1,31.9,262.8,99.3,31.9,300.6,21.2,0.0,37.8,40,False,False,False,True,False
8,18.0,50.2,138.3,33.3,50.2,138.3,15.3,0.0,0.0,40,False,False,True,False,False
9,52.6,121.9,109.5,69.6,121.9,109.5,17.0,0.0,0.0,40,False,False,True,False,False
11,37.3,66.4,69.5,60.9,66.4,69.5,23.6,0.0,0.0,60,False,False,True,False,False
12,58.2,116.5,203.8,58.2,177.5,203.8,0.0,61.0,0.0,210,False,True,False,False,False
15,30.0,91.6,223.9,45.9,91.6,223.9,15.9,0.0,0.0,40,False,False,True,False,False


In [109]:
# Confirm the dtypes and row count of the encoded dataset
data_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 952 entries, 0 to 1324
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   N                      952 non-null    float64
 1   P                      952 non-null    float64
 2   K                      952 non-null    float64
 3   N-After Fertilization  952 non-null    float64
 4   P-After Fertilization  952 non-null    float64
 5   K-After Fertilization  952 non-null    float64
 6   delta_N                952 non-null    float64
 7   delta_P                952 non-null    float64
 8   delta_K                952 non-null    float64
 9   Fertilizer Quantity    952 non-null    int64  
 10  MOP(0:0:60)            952 non-null    bool   
 11  TSP(0:46:0)            952 non-null    bool   
 12  Urea(46:0:0)           952 non-null    bool   
 13  YaraMila NK(1:0:1)     952 non-null    bool   
 14  YaraMila NPKS(1:1:1)   952 non-null    bool   
dtypes: bool(5)

Train-Test Split

In [110]:
# Target is the fertilizer quantity; every other column is an input feature
y = data_set['Fertilizer Quantity']
X = data_set.drop(columns='Fertilizer Quantity')

In [111]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=120,
)
x_train.shape, x_test.shape

((714, 14), (238, 14))

# Model Building

In [112]:
# Function to check the model accuracy

def model_acc(model):
    """Fit ``model`` on the training split and print its test-set score.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    The model is fitted in place; nothing is returned.

    The printed "Accuracy" is whatever ``model.score`` returns. For the Random
    Forest in this notebook that value equals the R-squared reported by
    ``r2_score``, so it is a goodness-of-fit score rather than a
    classification accuracy.
    """
    model.fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    print('Accuracy of {}---->{}'.format(str(model), str(test_score)))

Decision Tree Regressor

In [113]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so the reported score is reproducible from run to run
# (same seed value as the Random Forest cells in this notebook).
dt = DecisionTreeRegressor(random_state=43)
model_acc(dt)

Accuracy of DecisionTreeRegressor()---->0.8831057317140187


Random Forest Regression

In [114]:
from sklearn.ensemble import RandomForestRegressor
# 100 trees with a fixed random_state so the forest and its score are repeatable
rf_regressor=RandomForestRegressor(n_estimators=100, random_state=43)
model_acc(rf_regressor)

Accuracy of RandomForestRegressor(random_state=43)---->0.8935494539278452


In [115]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fit a fresh Random Forest and predict the held-out rows
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=43)
rf_regressor.fit(x_train, y_train)
rf_y_pred = rf_regressor.predict(x_test)

# Evaluate Random Forest Regressor: each metric compares the true test targets with the predictions
rf_mse, rf_mae, rf_r2 = (
    metric(y_test, rf_y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Random Forest Regressor:")
for label, value in (
    ("Mean Squared Error:", rf_mse),
    ("Mean Absolute Error:", rf_mae),
    ("R-squared:", rf_r2),
):
    print(label, value)
print()

Random Forest Regressor:
Mean Squared Error: 1094.3185294117648
Mean Absolute Error: 18.251680672268908
R-squared: 0.8935494539278452



Gradient Boosting Regressor

In [116]:
from sklearn.ensemble import GradientBoostingRegressor

# Seed the booster so the reported score is reproducible from run to run
# (same seed value as the Random Forest cells in this notebook).
gbr = GradientBoostingRegressor(random_state=43)
model_acc(gbr)

Accuracy of GradientBoostingRegressor()---->0.8821425709355797


In [117]:
# Correlation of every column with the target, 'Fertilizer Quantity'
data_set.corr()['Fertilizer Quantity']

N                       -0.020302
P                        0.021954
K                       -0.023157
N-After Fertilization   -0.236181
P-After Fertilization    0.220436
K-After Fertilization    0.181630
delta_N                 -0.320732
delta_P                  0.346432
delta_K                  0.465862
Fertilizer Quantity      1.000000
MOP(0:0:60)              0.537289
TSP(0:46:0)              0.244739
Urea(46:0:0)            -0.512882
YaraMila NK(1:0:1)      -0.232658
YaraMila NPKS(1:1:1)    -0.032902
Name: Fertilizer Quantity, dtype: float64

In [118]:
# Preview the first 20 rows of the encoded dataset used for modelling
data_set.head(20)

Unnamed: 0,N,P,K,N-After Fertilization,P-After Fertilization,K-After Fertilization,delta_N,delta_P,delta_K,Fertilizer Quantity,MOP(0:0:60),TSP(0:46:0),Urea(46:0:0),YaraMila NK(1:0:1),YaraMila NPKS(1:1:1)
0,74.4,106.2,246.7,74.4,106.2,329.0,0.0,0.0,82.3,290,True,False,False,False,False
3,20.2,88.8,225.2,37.6,88.8,240.0,17.4,0.0,14.8,40,False,False,False,True,False
4,12.5,88.6,205.9,38.6,88.6,223.9,26.1,0.0,18.0,150,False,False,False,True,False
6,40.3,137.7,165.8,68.9,137.7,165.8,28.6,0.0,0.0,90,False,False,True,False,False
7,78.1,31.9,262.8,99.3,31.9,300.6,21.2,0.0,37.8,40,False,False,False,True,False
8,18.0,50.2,138.3,33.3,50.2,138.3,15.3,0.0,0.0,40,False,False,True,False,False
9,52.6,121.9,109.5,69.6,121.9,109.5,17.0,0.0,0.0,40,False,False,True,False,False
11,37.3,66.4,69.5,60.9,66.4,69.5,23.6,0.0,0.0,60,False,False,True,False,False
12,58.2,116.5,203.8,58.2,177.5,203.8,0.0,61.0,0.0,210,False,True,False,False,False
15,30.0,91.6,223.9,45.9,91.6,223.9,15.9,0.0,0.0,40,False,False,True,False,False


Hyperparameter Tuning (not yet performed: the Random Forest below is refit with the same settings as above)

In [119]:
# Final model: a Random Forest with 100 trees and a fixed seed
best_model= RandomForestRegressor(n_estimators=100, random_state=43)

In [120]:
# Train the final model on the training split
best_model.fit(x_train,y_train)

In [121]:
# Score the final model on the held-out test split
best_model.score(x_test,y_test)

0.8935494539278452

In [123]:
# Serialize the fitted model to disk with pickle so it can be reloaded without retraining.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute arbitrary code.
import pickle
with open('Fertilizer_Quantity_Predictor.pickle','wb') as file:
    pickle.dump(best_model,file)


In [122]:
# Build the sample as a one-row DataFrame that carries the training column names.
# The model was fitted on a DataFrame, so passing a bare nested list relies on the
# values happening to be in the right positional order and makes scikit-learn emit
# a feature-name warning (hidden here by the global warnings filter).
# Value order: N, P, K, the three *-After Fertilization readings, delta_N/P/K,
# then the five one-hot fertilizer indicators (here: MOP(0:0:60) = 1).
sample = pd.DataFrame(
    [[12.0, 40.2, 148.3, 12.0, 40.2, 181.3, 0.0, 0.0, 33.0, 1, 0, 0, 0, 0]],
    columns=x_train.columns,
)
best_model.predict(sample)

array([149.2])