## Introduction to the data

### Crop recommendation data:

## First step: Read data sets, check duplicates, nulls and gather statsistical information about each feature

In [173]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.pyplot as plt
import warnings

In [174]:
df=pd.read_csv(r'crop_recommend.csv')
df

Unnamed: 0,Nitrogen,phosphorus,potassium,temperature,humidity,ph,rainfall,label
0,25.0,129.0,195,"17,98667801°C","81,17712085%",5777271492,7237127689,Grapes
1,106.0,20.0,51,"29,73019662°C","90,97015715%",6342573112,2049035619,muskmelon
2,33.0,59.0,22,"22,64236876°C","21,59396123%",5946999529,1223886015,kidneybeans
3,89.0,9.0,47,"29,47156259°C","90,77069618%",6668382766,2875226067,muskmelon
4,62.0,49.0,37,"24,21744605°C","82,85284045%",7479248124,1661365886,jute
...,...,...,...,...,...,...,...,...
2195,1.0,27.0,36,"23,98598756°C","93,34236582%",5684995235,104991282,pomegranate
2196,31.0,25.0,38,"24,96273236°C","92,40501423%",6497366677,1094169192,pomegranate
2197,92.0,,52,"28,0106804°C","76,52808057%",5891413895,1037040783,banana
2198,28.0,58.0,81,"17,47500984°C","16,54314829%",618042747,9335034262,chickpea


In [175]:
df.dtypes

Nitrogen       float64
phosphorus     float64
potassium        int64
temperature     object
humidity        object
ph              object
rainfall        object
label           object
dtype: object

#### Observations: 
For the temperature feature, humidity, ph and rainfall, the comma to be replaced with '.'  and for the temperature feature the celsius to be removed and for humidity the percentage to be removed.                  

In [176]:
##Check duplicates
df.duplicated().sum()## no duplicates is included in the data

0

In [177]:
##Check nulls
df.isnull().sum()

Nitrogen       19
phosphorus     15
potassium       0
temperature     0
humidity        0
ph              0
rainfall        0
label           3
dtype: int64

In [178]:
df.isnull().mean()*100

Nitrogen       0.863636
phosphorus     0.681818
potassium      0.000000
temperature    0.000000
humidity       0.000000
ph             0.000000
rainfall       0.000000
label          0.136364
dtype: float64

#### Observations: 
There are null values to be filled for features except for the target the nulls will be dropped

In [179]:
##Check the data type in each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Nitrogen     2181 non-null   float64
 1   phosphorus   2185 non-null   float64
 2   potassium    2200 non-null   int64  
 3   temperature  2200 non-null   object 
 4   humidity     2200 non-null   object 
 5   ph           2200 non-null   object 
 6   rainfall     2200 non-null   object 
 7   label        2197 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 137.6+ KB


#### Observations: 
The humidity, temperature, ph and rainfall features data type shall be float


In [180]:
##Gather brief statistical information for numerical columns
df.select_dtypes('number').describe()

Unnamed: 0,Nitrogen,phosphorus,potassium
count,2181.0,2185.0,2200.0
mean,50.649243,53.26087,48.149091
std,36.907563,32.983813,50.647931
min,0.0,5.0,5.0
25%,21.0,28.0,20.0
50%,37.0,51.0,32.0
75%,85.0,68.0,49.0
max,140.0,145.0,205.0


#### Observations: 
* Other features to be analyzed after converting the object into integer
* It is expected to find outliers in both nitrogen and potassium columns as the value of median is much different than mean.

## Second step: Univariate analysis and feature engineering

In [181]:
##for the target columns
df['label'].value_counts()

chickpea       100
cotton         100
maize          100
apple          100
lentil         100
coconut        100
muskmelon      100
mothbeans      100
mango          100
coffee         100
blackgram      100
mungbean       100
pigeonpeas     100
orange         100
watermelon     100
jute           100
kidneybeans    100
papaya         100
banana          99
rice            99
pomegranate     70
grapes          69
Pomegranate     30
Grapes          30
Name: label, dtype: int64

#### Observations: 
Target classes are balanced, therefore no need for undersampling or oversampling, but there are some value classes written in 
different letter, therefore it shall be modified

In [182]:
df['label'].unique()

array(['Grapes', 'muskmelon', 'kidneybeans', 'jute', 'watermelon',
       'orange', 'pigeonpeas', 'pomegranate', 'rice', 'coffee', 'mango',
       'mothbeans', 'banana', 'coconut', 'lentil', 'apple', 'Pomegranate',
       'papaya', 'cotton', 'maize', 'blackgram', 'mungbean', 'grapes',
       'chickpea', nan], dtype=object)

In [183]:
def func(x):
    """
    This function is used to unify the values of target that have same meaning but different letters(eg. wether small or capital)
    x: The class of the target
    """
    if x=='Grapes':
        return 'grapes'
    elif x=='Pomegranate':
        return 'pomegranate'
    else:
        return x

In [184]:
df['label']=df['label'].apply(func)  

In [185]:
df['label'].value_counts()

mothbeans      100
muskmelon      100
mungbean       100
blackgram      100
maize          100
cotton         100
papaya         100
apple          100
lentil         100
coconut        100
chickpea       100
mango          100
coffee         100
pomegranate    100
pigeonpeas     100
orange         100
watermelon     100
jute           100
kidneybeans    100
banana          99
rice            99
grapes          99
Name: label, dtype: int64

In [186]:
##second step drop the rows that have null value for target

In [187]:
x=list(df[df['label'].isnull()== True].index)  ##the index of the rows that have null values for the target
x

[234, 450, 802]

In [188]:
df.drop(x,axis=0,inplace=True)

In [189]:
df.reset_index(drop=True,inplace=True)

In [190]:
df

Unnamed: 0,Nitrogen,phosphorus,potassium,temperature,humidity,ph,rainfall,label
0,25.0,129.0,195,"17,98667801°C","81,17712085%",5777271492,7237127689,grapes
1,106.0,20.0,51,"29,73019662°C","90,97015715%",6342573112,2049035619,muskmelon
2,33.0,59.0,22,"22,64236876°C","21,59396123%",5946999529,1223886015,kidneybeans
3,89.0,9.0,47,"29,47156259°C","90,77069618%",6668382766,2875226067,muskmelon
4,62.0,49.0,37,"24,21744605°C","82,85284045%",7479248124,1661365886,jute
...,...,...,...,...,...,...,...,...
2192,1.0,27.0,36,"23,98598756°C","93,34236582%",5684995235,104991282,pomegranate
2193,31.0,25.0,38,"24,96273236°C","92,40501423%",6497366677,1094169192,pomegranate
2194,92.0,,52,"28,0106804°C","76,52808057%",5891413895,1037040783,banana
2195,28.0,58.0,81,"17,47500984°C","16,54314829%",618042747,9335034262,chickpea


In [191]:
## plot pie chart to show the distribution between the target
px.pie(df,'label',title='Distribution of crops in data')

In [192]:
##Temperature column

In [193]:
##First remove the celsius sign as well as replace the ',' to with '.' 

In [194]:
def func2(x):
    """
    The function is used to replace the comma by dot and remove the celsius, finally converting the data type to float
    """
    return float(x.replace(',','.').strip()[0:-2])

In [195]:
df['temperature']=df['temperature'].apply(func2)

In [196]:
##plot to detect any outliers in temperature

In [197]:
px.histogram(df,x='temperature',marginal='box')

#### Observations:
Temperatures below zero celsius and above 45 degree celsius are considered as outliers and will be put as null values and 
Then in the preprocessing phase to be dealt with 

In [198]:
df['temperature'].describe()

count    2197.000000
mean       26.424644
std        19.633511
min      -200.000000
25%        22.778565
50%        25.627355
75%        28.609011
max       607.000000
Name: temperature, dtype: float64

In [199]:
def func3(x):
    """
    This function is considered as a filter such that any value for temperature greater than 45 or less than zero will 
    be placed as null value
    """
    if x<0 or x>45:
        return np.nan
    else:
        return x

In [200]:
df['temperature']=df['temperature'].apply(func3)

In [201]:
px.histogram(df,x='temperature',marginal='box')  

#### Observation:
Temperatures represented as outliers in the graph are real and true values, as for example rice is planted in high temperature about 37 degree celsius

In [202]:
px.histogram(x=np.log(df['temperature']),marginal='box')  ##No difference in outliers on graph when converting the feature to log scale

In [203]:
## Humidity column

In [204]:
##First remove the percentage sign as well as replace the ',' to with '.' 

In [205]:
def func4(x):
    """
    The function is used to replace the comma by dot and remove the percentage, finally converting the data type to float
    """
    return float(x.replace(',','.').strip()[0:-1])

In [206]:
df['humidity']=df['humidity'].apply(func2)

In [207]:
##plot to detect any outliers in the humidity

In [208]:
px.histogram(df,x='humidity',marginal='box', title='Humidity in percentage')  

#### Observation:
Humidity values are real and true as low values of humidity are indeication for dry areas, although some of them are represented as outliers on the graph.

In [209]:
df['humidity'].describe()

count    2197.000000
mean       71.468070
std        22.275839
min        14.258040
25%        60.242186
50%        80.471527
75%        89.984052
max        99.981876
Name: humidity, dtype: float64

In [210]:
px.histogram(x=np.log(df['humidity']),marginal='box') ##No difference in outliers on graph when converting the feature to log scale

In [211]:
##PH column

In [212]:
##First replace the ',' to with '.' 

In [213]:
df['ph']=df['ph'].str.replace(',','.')

In [214]:
df['ph']=df['ph'].astype('float')

In [215]:
df['ph'].dtype

dtype('float64')

In [216]:
px.histogram(df,x='ph',color_discrete_sequence=['red'],marginal='box', title='ph values ')  

#### Observation:
PH values are real as the PH value can range from 0 to 14

In [217]:
##rain fall column

In [218]:
##First replace the ',' to with '.' 

In [219]:
df['rainfall']=df['rainfall'].str.replace(',','.')

In [220]:
df['rainfall']=df['rainfall'].astype('float')

In [221]:
df['rainfall'].dtype

dtype('float64')

In [222]:
px.histogram(df,x='rainfall',color_discrete_sequence=['green'],marginal='violin', title='rainfall values ')  

#### Observation:
In reality the values presented on the graph as outliers are indication for rainy regions, therefore the outliers will be remained as they are and in case the accuracy of the model is not satisfactory the values will be dealth with

In [223]:
##Nitrogen column

In [224]:
df['Nitrogen'].isnull().sum()  ##there are null values in the nitrogen and will be dealt with in preprocessing phase

19

In [225]:
px.histogram(df,x='Nitrogen',color_discrete_sequence=['purple'],marginal='box', title='nitrogen values ')  

#### Observation:
No outliers in the nitrogen column

In [226]:
##phosphorus column

In [227]:
df['phosphorus'].isnull().sum()   ##there are null values and will be dealt with in the preprocessing phase

15

In [228]:
px.histogram(df,x='phosphorus',color_discrete_sequence=['black'],marginal='box', title='phosphorous values ')  

In [229]:
##potassium column

In [230]:
df['potassium'].isnull().sum()  ##no null values

0

In [231]:
px.histogram(df,x='potassium',color_discrete_sequence=['red'],marginal='box', title='potassium values ')  

## Second step: Bivariate analysis

In [232]:
##Check the relation between the target and the features 

In [233]:
for i in df.columns:
    figure=px.scatter(data_frame=df,x=df[i],y=df['label'])
    figure.show()

In [234]:
px.scatter(df,'Nitrogen','temperature',color='label')

In [235]:
px.scatter(df,'Nitrogen',color='label')

In [236]:
##check correlation between features and each other 
px.imshow(df.corr(),text_auto=True)





#### Observations:
Potassium is strongly correlated with phosphorous

## Third step: preprocessing and modelling

In [237]:
from sklearn.preprocessing import LabelEncoder, RobustScaler, MinMaxScaler, StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from imblearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay, recall_score
from sklearn.model_selection import cross_validate , train_test_split , StratifiedKFold , GridSearchCV , RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score, confusion_matrix, classification_report
from xgboost import XGBClassifier



In [238]:
le=LabelEncoder()

In [239]:
z1=list(df['label'].unique())  ##list containing unique values of target
z1

['grapes',
 'muskmelon',
 'kidneybeans',
 'jute',
 'watermelon',
 'orange',
 'pigeonpeas',
 'pomegranate',
 'rice',
 'coffee',
 'mango',
 'mothbeans',
 'banana',
 'coconut',
 'lentil',
 'apple',
 'papaya',
 'cotton',
 'maize',
 'blackgram',
 'mungbean',
 'chickpea']

In [240]:
z2=list(range(1,23))
z2

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

In [241]:
z3=dict()
z2=list(range(0,22))
for o in z2:
    z3[z1[o]]=o
z3
    


{'grapes': 0,
 'muskmelon': 1,
 'kidneybeans': 2,
 'jute': 3,
 'watermelon': 4,
 'orange': 5,
 'pigeonpeas': 6,
 'pomegranate': 7,
 'rice': 8,
 'coffee': 9,
 'mango': 10,
 'mothbeans': 11,
 'banana': 12,
 'coconut': 13,
 'lentil': 14,
 'apple': 15,
 'papaya': 16,
 'cotton': 17,
 'maize': 18,
 'blackgram': 19,
 'mungbean': 20,
 'chickpea': 21}

In [242]:
df['label']=df['label'].map(z3)

In [243]:
df['label']

0        0
1        1
2        2
3        1
4        3
        ..
2192     7
2193     7
2194    12
2195    21
2196    12
Name: label, Length: 2197, dtype: int64

In [244]:
y= df['label']   ##target will be encoded

In [245]:
x=df.drop(['label'],axis=1)
x

Unnamed: 0,Nitrogen,phosphorus,potassium,temperature,humidity,ph,rainfall
0,25.0,129.0,195,17.986678,81.177121,5.777271,72.371277
1,106.0,20.0,51,29.730197,90.970157,6.342573,20.490356
2,33.0,59.0,22,22.642369,21.593961,5.947000,122.388601
3,89.0,9.0,47,29.471563,90.770696,6.668383,28.752261
4,62.0,49.0,37,24.217446,82.852840,7.479248,166.136589
...,...,...,...,...,...,...,...
2192,1.0,27.0,36,23.985988,93.342366,5.684995,104.991282
2193,31.0,25.0,38,24.962732,92.405014,6.497367,109.416919
2194,92.0,,52,28.010680,76.528081,5.891414,103.704078
2195,28.0,58.0,81,17.475010,16.543148,6.180427,93.350343


In [246]:
y

0        0
1        1
2        2
3        1
4        3
        ..
2192     7
2193     7
2194    12
2195    21
2196    12
Name: label, Length: 2197, dtype: int64

In [247]:
models={'logistic regression':LogisticRegression(multi_class='ovr'),'KNeighbors':KNeighborsClassifier(),'Random Forest':RandomForestClassifier(),
     'Decision tree':DecisionTreeClassifier()}

In [248]:
for i,j in models.items():
    pl=make_pipeline(KNNImputer(n_neighbors=5),RobustScaler(), j)
    scores = cross_validate(estimator = pl , X = x , y = y , cv = StratifiedKFold(n_splits=5) ,
                        scoring='accuracy' , return_train_score=True )
    print(i)
    print(scores['train_score'].mean())
    print(scores['test_score'].mean())
    
    print('-------------------------------------')

logistic regression
0.9642695591759406
0.9608562849451232
-------------------------------------
KNeighbors
0.9879378633685638
0.9804276247670325
-------------------------------------
Random Forest
1.0
0.9963594947194035
-------------------------------------
Decision tree
1.0
0.9890764133360944
-------------------------------------


#### Observations:
The best model is KNN imputer as it has the least overfitting

Hyperparameter tuning will take place to improve the random forest, decision tree and Kneighbors

For null values, the method used for filling the nulls is KNN IMPUTER

### Hypyer parameter tuning for each model to obtain best one using gridsearch

In [249]:
##KNneighbors classifier

In [250]:
pl1=make_pipeline(KNNImputer(),RobustScaler(), KNeighborsClassifier())
pl1.steps

[('knnimputer', KNNImputer()),
 ('robustscaler', RobustScaler()),
 ('kneighborsclassifier', KNeighborsClassifier())]

In [251]:
params=[{'knnimputer__n_neighbors':list(range(1,25)),
    'kneighborsclassifier__n_neighbors':list(range(1,25,2))
}]
grid_search=GridSearchCV(estimator=pl1,param_grid=params,cv=StratifiedKFold(n_splits=5),scoring='accuracy')

In [252]:
grid_search.fit(x,y)

In [253]:
model1=grid_search.best_estimator_
model1

In [254]:
scores = cross_validate(estimator = model1 , X = x , y = y , cv = StratifiedKFold(n_splits=5) ,
                scoring='accuracy' , return_train_score=True )
print(scores['train_score'].mean())
print(scores['test_score'].mean())

0.9899864219377973
0.9827044936839926


In [255]:
x.columns

Index(['Nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph',
       'rainfall'],
      dtype='object')

In [256]:
##Decision tree

In [257]:
pl2=make_pipeline(KNNImputer(),RobustScaler(), DecisionTreeClassifier())


In [258]:
pl2.steps

[('knnimputer', KNNImputer()),
 ('robustscaler', RobustScaler()),
 ('decisiontreeclassifier', DecisionTreeClassifier())]

In [259]:
params=[{'knnimputer__n_neighbors':[1,2,3,4,5,6,7,8,9,10],
    'decisiontreeclassifier__criterion':['gini','entropy'], 'decisiontreeclassifier__max_depth':list(range(1,20))
}]
grid_search=GridSearchCV(estimator=pl2,param_grid=params,cv=StratifiedKFold(n_splits=5),scoring='accuracy')

In [260]:
grid_search.fit(x,y)

In [261]:
grid_search.best_params_

{'decisiontreeclassifier__criterion': 'gini',
 'decisiontreeclassifier__max_depth': 18,
 'knnimputer__n_neighbors': 6}

In [262]:
model2=grid_search.best_estimator_
model2

In [263]:
pl2=make_pipeline(KNNImputer(n_neighbors=1),RobustScaler(), DecisionTreeClassifier(max_depth=18,criterion='gini'))
scores = cross_validate(estimator = pl2 , X = x , y = y , cv = StratifiedKFold(n_splits=5) ,
                scoring='accuracy' , return_train_score=True )
print(scores['train_score'].mean())
print(scores['test_score'].mean())

1.0
0.9908966659763927


In [264]:
##random forest 

In [265]:
pl3=make_pipeline(KNNImputer(),RobustScaler(), RandomForestClassifier())
pl3


In [266]:
pl3.steps

[('knnimputer', KNNImputer()),
 ('robustscaler', RobustScaler()),
 ('randomforestclassifier', RandomForestClassifier())]

In [267]:
params=[{'randomforestclassifier__n_estimators':list(range(100,105)),
     'randomforestclassifier__max_depth':list(range(1,15))}]
grid_search=GridSearchCV(estimator=pl3,param_grid=params,cv=StratifiedKFold(n_splits=5),scoring='accuracy')
                                    

In [268]:
grid_search.fit(x,y)

In [269]:
model3=grid_search.best_estimator_
model3

In [270]:
scores = cross_validate(estimator = model3 , X = x , y = y , cv = StratifiedKFold(n_splits=5) ,
                scoring='accuracy' , return_train_score=True )
print(scores['train_score'].mean())
print(scores['test_score'].mean())

0.9998862343572241
0.9959039138538


In [271]:
##Obviously random forest classifier has achieved highest accuracy score 

In [272]:
import joblib

In [273]:
joblib.dump(model3, "model.pkl")
joblib.dump(x.columns, "Inputs.pkl")

['Inputs.pkl']

### Test new data 

In [275]:
x_sample=pd.DataFrame(x.iloc[2]).T
x_sample

Unnamed: 0,Nitrogen,phosphorus,potassium,temperature,humidity,ph,rainfall
2,33.0,59.0,22.0,22.642369,21.593961,5.947,122.388601


In [276]:
y_pred=model3.predict(x_sample)
y_pred[0]

2

In [277]:
for i in z3.keys():
    if z3[i]==y_pred[0]:
        print(i)
        break 

kidneybeans


In [150]:
y[2]  ##The actual value of this row

9

In [278]:
%%writefile final.py
import streamlit as st
import pandas as pd
import joblib

Inputs = joblib.load("Inputs.pkl")
model = joblib.load("model.pkl")

z3={'grapes': 0,
 'muskmelon': 1,
 'kidneybeans': 2,
 'jute': 3,
 'watermelon': 4,
 'orange': 5,
 'pigeonpeas': 6,
 'pomegranate': 7,
 'rice': 8,
 'coffee': 9,
 'mango': 10,
 'mothbeans': 11,
 'banana': 12,
 'coconut': 13,
 'lentil': 14,
 'apple': 15,
 'papaya': 16,
 'cotton': 17,
 'maize': 18,
 'blackgram': 19,
 'mungbean': 20,
 'chickpea': 21}

def prediction(Nitrogen,phosphorus,potassium,temperature,humidity,ph,rainfall):
    test_df=pd.DataFrame(columns=Inputs)
    test_df.at[0,'Nitrogen']=Nitrogen
    test_df.at[0,'phosphorus']=phosphorus
    test_df.at[0,'potassium']=potassium
    test_df.at[0,'temperature']=temperature
    test_df.at[0,'humidity']=humidity
    test_df.at[0,'ph']=ph
    test_df.at[0,'rainfall']=rainfall
    result=model.predict(test_df)[0]
    for i in z3.keys():
        if z3[i]==result:
            result_sh=i
            break 
    return result_sh

def main():
    st.title('Crop recommendation prediction')
    Nitrogen=float(st.number_input('Insert the amount of nitrogen in (in kg/ha) in soil'))
    phosphorus=float(st.number_input('Insert the amount for phosphorus in (in kg/ha) in soil'))
    potassium=float(st.number_input('Insert the amount of potassium in (in kg/ha) in soil'))
    temperature=float(st.number_input('Insert the average temperature in celsius'))
    humidity=float(st.number_input('Insert the average relative humidity in percent'))
    ph=float(st.number_input('Insert PH value of soil'))
    rainfall=float(st.number_input('Insert amount for rainfall in mm'))

    if st.button("predict"):
        result=prediction(Nitrogen,phosphorus,potassium,temperature,humidity,ph,rainfall)
        st.text(f'The crop will be {result}')


if __name__ == '__main__':
    main()
    

Overwriting final.py


In [104]:
!pip install pipreqs



In [105]:
!pipreqs ./

