In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Fetching and preprocessing data
* Getting dataset
* Information
* Describing data

In [None]:
# Read the crop recommendation CSV into a DataFrame and preview the first rows.
csv_path = "/kaggle/input/crop-recommendation-dataset/Crop_recommendation.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print the frame's column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Distinct values of the 'label' column, i.e. the classes that get label-encoded further down.
df['label'].unique()

## EDA
* N = Nitrogen level
* P = Phosphorus Level
* K = Potassium Level

In [None]:
# possible combinations for relational study
import matplotlib.pyplot as plt

def fact(num):
    """Return the factorial of ``num``.

    Multiplies ``num * (num - 1) * ...`` for as long as the running value is
    at least 2, so any ``num`` below 2 (including 0 and negatives) yields 1.
    A loop is used instead of recursion so that large arguments do not hit
    Python's recursion limit.
    """
    product = 1
    while num >= 2:
        product *= num
        num -= 1
    return product

# C(7, 2) = 7! / (2! * 5!): the number of unordered pairs that can be drawn from 7 items
# (presumably the 7 feature columns -- confirm against df). Floor division keeps the
# count an exact int; true division would produce the float 21.0.
combinations = fact(7)//(fact(2)*fact(5))
combinations

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of every column except the text 'label' column.
feature_frame = df.drop(columns='label')
scatter_matrix(
    feature_frame,
    figsize=(16, 16),
    marker='.',
    alpha=0.4,
    c='red',
)
plt.show()

## Preparing data for model building
* label encoding

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace the text 'label' column with an integer-encoded 'label_en' column and build
# an {encoded id: original label} lookup.
encoder = LabelEncoder()
df['label_en'] = encoder.fit_transform(df['label'])

# encoder.classes_ is the encoder's own ordered class list (position == encoded id), so the
# lookup is read from it rather than re-derived with a separate sort that merely happens
# to produce the same order.
labels_lis = encoder.classes_.tolist()
labels_dic = dict(enumerate(labels_lis))

# The text column is dropped once encoded. NOTE: this cell consumes 'label', so running it
# a second time requires reloading df first.
df = df.drop(columns='label')
labels_dic

In [None]:
# Separate the encoded target column from the remaining feature columns.
y = df['label_en']
X = df.drop(columns='label_en')

## Building model
* grid search cv
* cross validation 
* splitting data into training and testing

In [None]:
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning raised from here on
# (including scikit-learn convergence and deprecation warnings) is hidden from the output.
warnings.filterwarnings('ignore')

#### ---> Grid search cv (svm)

In [None]:
def best_params(cv_df):
    """Pick the parameter set(s) with the best cross-validation score.

    ``cv_df`` is a DataFrame with ``mean_test_score``, ``mean_train_score`` and
    ``params`` columns. Rows with the highest mean test score are kept; among
    those, rows with the highest mean train score win. The ``params`` values of
    the winning rows are returned as an array (more than one entry on a full tie).
    """
    top_test = cv_df['mean_test_score'].max()
    candidates = cv_df.loc[cv_df['mean_test_score'] == top_test]

    top_train = candidates['mean_train_score'].max()
    winners = candidates.loc[candidates['mean_train_score'] == top_train]

    return winners['params'].values

In [None]:
# Grid search over the SVC regularisation strength and kernel. Train scores are requested
# because best_params reads the mean_train_score column.
svc_search_space = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'poly'],
}
svm_cv = GridSearchCV(SVC(), param_grid=svc_search_space, return_train_score=True)
svm_cv.fit(X, y)

# Tabulate the per-candidate results and pick the winning parameter set(s).
svc_grid = pd.DataFrame(svm_cv.cv_results_)
svc_paras = best_params(svc_grid)
svc_paras

#### ---> Grid search cv (random forest classifier)

In [None]:
# Grid search over the random forest's depth and split/leaf size limits.
# random_state is fixed because a random forest is stochastic: without a seed the
# selected parameters can change from one run of the notebook to the next.
rf_cv = GridSearchCV(RandomForestClassifier(random_state=42),param_grid={
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
},return_train_score=True)
rf_cv.fit(X,y)

# Tabulate the per-candidate results and pick the winning parameter set(s).
rf_grid = pd.DataFrame(rf_cv.cv_results_)
rf_paras = best_params(rf_grid)
rf_paras

#### ---> Grid search cv (Decision tree)

In [None]:
# Grid search over the decision tree's split criterion, depth and split/leaf size limits.
# random_state is fixed because scikit-learn permutes the features at every split, so
# equally good splits can otherwise be resolved differently between runs.
dt_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),param_grid={
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
},return_train_score=True)
dt_cv.fit(X,y)

# Tabulate the per-candidate results and pick the winning parameter set(s).
dt_grid = pd.DataFrame(dt_cv.cv_results_)
dt_paras = best_params(dt_grid)
dt_paras

#### ---> Grid search cv (Logistic regression)

In [None]:
# Grid search over the logistic regression penalty, regularisation strength and solver.
# random_state is fixed because both solvers in the grid ('liblinear', 'saga') shuffle the
# data, so an unseeded fit is not reproducible.
lr_cv = GridSearchCV(LogisticRegression(random_state=42),param_grid={
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'saga']
},return_train_score=True)
lr_cv.fit(X,y)

# Tabulate the per-candidate results and pick the winning parameter set(s).
lr_grid = pd.DataFrame(lr_cv.cv_results_)
lr_paras = best_params(lr_grid)
lr_paras


#### ---> Grid search cv (bagging classifier)

In [None]:
# Bagging over decision trees.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was removed in 1.4).
# The prefix for the nested max_depth parameter is looked up from the estimator itself so
# the grid works on either side of the rename.
# random_state is fixed so the bootstrap samples, and therefore the search result, are
# reproducible.
bagging_clf = BaggingClassifier(DecisionTreeClassifier(), random_state=42)
base_param = 'estimator' if 'estimator' in bagging_clf.get_params(deep=False) else 'base_estimator'

bg_cv = GridSearchCV(bagging_clf,param_grid={
    'n_estimators': [50],
    'max_samples': [0.5, 0.7, 1.0],
    'max_features': [0.5, 0.7, 1.0],
    base_param + '__max_depth': [None, 5, 10]
},return_train_score=True)
bg_cv.fit(X,y)

# Tabulate the per-candidate results and pick the winning parameter set(s).
bg_grid = pd.DataFrame(bg_cv.cv_results_)
bg_paras = best_params(bg_grid)
bg_paras

#### ---> Cross validation

In [None]:
def score(model,X,y):
    """Cross-validate ``model`` on ``X``/``y``.

    Returns a tuple of (per-fold scores from cross_val_score, their arithmetic mean).
    """
    fold_scores = cross_val_score(model, X, y)
    mean_score = sum(fold_scores) / len(fold_scores)
    return fold_scores, mean_score
# Candidate models with hard-coded hyper-parameters (presumably copied from the grid-search
# outputs above -- confirm they still match after re-running the searches).
# The stochastic estimators get a fixed random_state so the comparison is reproducible;
# SVC is left unseeded because it only uses randomness for probability estimates, which
# are not enabled here.
models = [
    RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=2, random_state=42),
    BaggingClassifier(max_features=0.7, max_samples=1.0, n_estimators=50, random_state=42),
    SVC(C=10, kernel='poly'),
    LogisticRegression(C=10,penalty='l1',solver='liblinear', random_state=42),
    DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_leaf=1, min_samples_split=5, random_state=42),
]
# Print each model next to its per-fold scores and mean score.
for model in models:
    print(model,score(model,X,y))

In [None]:
# Stratified hold-out split (default 75% / 25%), so every class keeps its share in both parts.
# random_state is fixed so the same rows land in train and test on every run; without it
# the scores reported below change each time the notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,random_state=42)
X_train.shape,y_test.shape

In [None]:
# Final model: a random forest with hard-coded hyper-parameters, fitted on the training part.
# random_state is fixed so the fitted forest, and the accuracy it reports, can be reproduced.
model = RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=2, random_state=42)
model.fit(X_train,y_train)
# (train accuracy, test accuracy)
model.score(X_train,y_train),model.score(X_test,y_test)

#### ---> Saving model

In [None]:
import joblib
# Serialise the fitted model to a file in the current working directory.
model_path = "crop recommendation"
joblib.dump(model, model_path)

The RandomForestClassifier achieves an accuracy of 99.4512%.