In [1]:

import pandas as pd
import xgboost as xg 
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

In [2]:
file_path = 'Crop_recommendation.csv'

data = pd.read_csv(file_path)


data['temperature'] = data['temperature'].astype(int)
data['humidity'] = data['humidity'].astype(int)
data['ph'] = data['ph'].astype(int)
data['rainfall'] = data['rainfall'].astype(int)

data

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20,82,6,202,rice
1,85,58,41,21,80,7,226,rice
2,60,55,44,23,82,7,263,rice
3,74,35,40,26,80,6,242,rice
4,78,42,42,20,81,7,262,rice
...,...,...,...,...,...,...,...,...
1692,117,86,48,28,82,6,116,banana
1693,114,94,53,26,76,6,118,banana
1694,110,78,50,25,78,5,98,banana
1695,94,70,48,25,84,6,91,banana


In [3]:
data.info()

print('Coolumns')
print(data.columns)

print('')
print(data['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697 entries, 0 to 1696
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   N            1697 non-null   int64 
 1   P            1697 non-null   int64 
 2   K            1697 non-null   int64 
 3   temperature  1697 non-null   int64 
 4   humidity     1697 non-null   int64 
 5   ph           1697 non-null   int64 
 6   rainfall     1697 non-null   int64 
 7   label        1697 non-null   object
dtypes: int64(7), object(1)
memory usage: 106.2+ KB
Coolumns
Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')

label
rice          139
Soyabeans     130
banana        130
beans         125
cowpeas       122
orange        122
maize         119
coffee        110
peas          100
groundnuts    100
mango         100
grapes        100
watermelon    100
apple         100
cotton        100
Name: count, dtype: int64


In [4]:
x = data.drop(columns=['label'])
y_prime = data['label']

le = LabelEncoder()
y = le.fit_transform(y_prime)

print(x.shape)
# print(y.shape)

(1697, 7)


In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=17)


crop_model = xg.XGBClassifier(
    objective='multi:softprob',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=10,
    booster='gbtree',
    nthread=4
)

In [6]:
# class CustomLabelEncoder(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         self.encoder = LabelEncoder()
        
#     def fit_transform(self, x, y):
#         self.encoder.fit(y)
#         return self.encoder.transform(x)

In [7]:
steps=[
     ('model', crop_model)
]

my_pipeline = Pipeline(steps)

In [8]:
my_pipeline.fit(x_train, y_train)

crop_type = my_pipeline.predict(x_test)
model_accuracy = 100 * my_pipeline.score(x_test, y_test)

print(f'The model has an accuracy of {model_accuracy:.2f}%')

The model has an accuracy of 99.71%


In [9]:
model_report = classification_report(y_test, crop_type)

print(model_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00        27
           4       1.00      1.00      1.00        24
           5       0.97      1.00      0.98        29
           6       1.00      1.00      1.00        28
           7       1.00      1.00      1.00        30
           8       1.00      1.00      1.00        16
           9       1.00      0.97      0.98        29
          10       1.00      1.00      1.00        18
          11       1.00      1.00      1.00        21
          12       1.00      1.00      1.00        25
          13       1.00      1.00      1.00        27
          14       1.00      1.00      1.00        14

    accuracy                           1.00       340
   macro avg       1.00      1.00      1.00       340
weighted avg       1.00   

In [10]:
cross_validation_scores = cross_val_score(my_pipeline, x, y, cv=5, scoring='accuracy')

print('Cross validation scores')
print(cross_validation_scores)

Cross validation scores
[0.99117647 0.99705882 1.         1.         0.99410029]


In [11]:
# my_pipeline.save_model('model.json')
print(crop_model.feature_importances_.shape[0])

import joblib

joblib.dump(crop_model, 'model.pkl')

7


['model.pkl']

In [12]:
crops = {
    0: 'rice',
    1: 'Soyabeans',
    2: 'banana',
    3: 'beans',
    4: 'cowpeas',
    5: 'cowpeas',
    6: 'maize',
    7: 'coffee',
    8: 'peas',
    9: 'groundnuts',
    10: 'mango',
    11: 'grapes',
    12: 'watermelon',
    13: 'apple',
    14: 'cotton'
}

In [16]:
import numpy as np

vals = [[150, 52, 53, 43, 12, 6, 50]]
# vals = [[80	,71	,47	,27	,80, 6,	105]]
predictions = my_pipeline.predict_proba(vals)

pred_class = np.argmax(predictions)
certinty = 100 * np.max(predictions)

print(f'Class: {crops[pred_class]}')
print(f'{certinty:.2f}%')

Class: rice
80.26%


In [14]:
# second_class = 
model = xg.Booster().load_model('model.json')

model

In [15]:
rf_model = RandomForestClassifier()

rf_model.fit(x_train, y_train)
model_accuracy = 100 * rf_model.score(x_test, y_test)

print(f'The model has an accuracy of {model_accuracy:.2f}%')

The model has an accuracy of 100.00%
