In [7]:
# https://iarjset.com/wp-content/uploads/2022/02/IARJSET.2022.9166.pdf

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Datasets

In [8]:
# Load the crop-recommendation CSV from the Kaggle input mount and preview
# its first rows.
csv_path = "/kaggle/input/soil-test-report-with-weather-for-crop-prediction/Crop_recommendation.csv"
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [9]:
# Show the column names of the loaded frame.
dataset.columns

Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')

In [10]:
# Summarise dtypes and non-null counts per column to spot missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [11]:
# Drop incomplete rows, but only when the frame actually contains a missing
# value; otherwise `dataset` is left untouched.
contains_nulls = dataset.isnull().any().any()
if contains_nulls:
    dataset = dataset.dropna()

In [12]:
# Collect the distinct crop names found in the target column.
crop_list = dataset["label"].unique()
crop_list

array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

# Split dataset

The split is not stratified because the dataset is not imbalanced.

In [13]:
# Separate the feature columns from the target and hold out 20% of the rows
# for evaluation. The split is seeded so the held-out rows, and therefore
# every score computed from them, are the same on each run.
X = dataset.drop(columns="label")
y = dataset["label"]
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [14]:
# Preview the feature matrix (the frame without the "label" column).
X.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
0,90,42,43,20.879744,82.002744,6.502985,202.935536
1,85,58,41,21.770462,80.319644,7.038096,226.655537
2,60,55,44,23.004459,82.320763,7.840207,263.964248
3,74,35,40,26.491096,80.158363,6.980401,242.864034
4,78,42,42,20.130175,81.604873,7.628473,262.71734


In [15]:
# Preview the target series (the "label" column).
y.head()

0    rice
1    rice
2    rice
3    rice
4    rice
Name: label, dtype: object

# init model

In [16]:
# Random forest with 100 trees; the seed is fixed so repeated fits on the
# same data build the same forest.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_model

# train model

In [17]:
# Fit the forest on the training portion of the split.
rf_model.fit(x_train, y_train)

# predict results

In [18]:
# Predict crop labels for the held-out rows.
y_predict = rf_model.predict(x_test)

# model accuracy

In [19]:
# Fraction of held-out rows whose predicted crop matches the true label.
accuracy = accuracy_score(y_test, y_predict)
accuracy

0.9977272727272727

# check overfitting

In [20]:
# Compare accuracy on seen vs. held-out data. Overfitting shows up as the
# training score exceeding the test score, so the gap is signed: a test score
# that happens to beat the training score must not be reported as overfitting.
overfit_gap_threshold = 0.2  # heuristic cut-off for "train much better than test"
train_set_score = rf_model.score(x_train, y_train)
test_set_score = rf_model.score(x_test, y_test)
print('Training set score: {:.4f}'.format(train_set_score))
print('Test set score: {:.4f}'.format(test_set_score))
if train_set_score - test_set_score >= overfit_gap_threshold:
    print("model is overfitting")
else:
    print("model is not overfitting")

Training set score: 1.0000
Test set score: 0.9977
model is not overfitting


# Classification Metrics

In [21]:
# Per-crop precision, recall, F1 and support on the held-out rows.
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

       apple       1.00      1.00      1.00        20
      banana       1.00      1.00      1.00        24
   blackgram       1.00      1.00      1.00        24
    chickpea       1.00      1.00      1.00        23
     coconut       1.00      1.00      1.00        16
      coffee       1.00      1.00      1.00        19
      cotton       1.00      1.00      1.00        17
      grapes       1.00      1.00      1.00        21
        jute       0.95      1.00      0.97        19
 kidneybeans       1.00      1.00      1.00        15
      lentil       1.00      1.00      1.00        21
       maize       1.00      1.00      1.00        23
       mango       1.00      1.00      1.00        24
   mothbeans       1.00      1.00      1.00        13
    mungbean       1.00      1.00      1.00        24
   muskmelon       1.00      1.00      1.00        29
      orange       1.00      1.00      1.00        18
      papaya       1.00    

# save model

In [22]:
# Persist the fitted forest to the Kaggle working directory so it can be reloaded later.
joblib.dump(rf_model, "/kaggle/working/crop_recommendation_random_forest.joblib")

['/kaggle/working/crop_recommendation_random_forest.joblib']

# test predict

In [44]:
import json

# Sample request payload, keyed by the field names create_dataframe() reads.
data = {
    "Nitrogen": 90,
    "Phosphorous": 42,
    "Potassium": 43,
    "Temperature": 20,
    "Humidity": 82,
    "Ph": 6,
    "Rainfall": 202,
}
# Serialise the payload to a JSON string, the input format create_dataframe() parses.
json_data = json.dumps(data)

In [None]:
def create_dataframe(data):
    """Build the single-row feature frame for the crop model from a request payload.

    Parameters
    ----------
    data : str, bytes, bytearray or mapping
        JSON object text, or an already-decoded mapping, providing the fields
        "Nitrogen", "Phosphorous", "Potassium", "Temperature", "Humidity",
        "Ph" and "Rainfall". Each value must be convertible with ``float()``.

    Returns
    -------
    pandas.DataFrame
        One row with float columns ``N, P, K, temperature, humidity, ph,
        rainfall``, in that order.

    Raises
    ------
    TypeError
        If a required field is absent or null. This is the exception type the
        bare ``float(None)`` conversion would raise; the message now names the
        offending fields.
    ValueError
        If the JSON text is malformed or a value cannot be parsed as a float.
    """
    # Payload field -> feature column, in the column order of the training frame.
    field_to_column = {
        'Nitrogen': 'N',
        'Phosphorous': 'P',
        'Potassium': 'K',
        'Temperature': 'temperature',
        'Humidity': 'humidity',
        'Ph': 'ph',
        'Rainfall': 'rainfall',
    }

    # Decode only when given JSON text; a mapping is used as-is.
    if isinstance(data, (str, bytes, bytearray)):
        data = json.loads(data)

    # Fail early with the field names rather than an opaque float(None) error.
    missing = [field for field in field_to_column if data.get(field) is None]
    if missing:
        raise TypeError(
            "missing or null field(s) in payload: " + ", ".join(missing)
        )

    return pd.DataFrame({
        column: [float(data[field])]
        for field, column in field_to_column.items()
    })

In [45]:
# Reload the persisted model and build the single-row feature frame from the JSON payload.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code - only load artifacts you trust.
model = joblib.load("/kaggle/working/crop_recommendation_random_forest.joblib")
# `data` is rebound here from the payload dict to the resulting DataFrame.
data = create_dataframe(json_data)
data

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
0,90.0,42.0,43.0,20.0,82.0,6.0,202.0


In [46]:
# Predict the crop for the sample row; predict() returns one label per input row, so show the first.
result = model.predict(data)
result[0]

'rice'