In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
%matplotlib notebook

# Loading DataFrame

In [3]:
df = pd.read_csv (r'C:\Users\dp\Desktop\_\Coursera\CropRecommendation\Crop_recommendation.csv')
# pd.set_option("display.max_rows", None, "display.max_columns", None)

labels_list = list(set([i for i in df['label']]))
df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


# Visualization of Data

In [3]:
fig, ax = plt.subplots(1,2)
temp_ph = ax[0].scatter(df['ph'], df['temperature'], alpha = 0.6, c = 'orange')
rain_ph = ax[1].scatter(df['ph'], df['rainfall'], alpha = 0.6)
ax[0].set_xlabel('PH')
ax[1].set_xlabel('PH')
ax[0].set_ylabel('Temperature(C)')
ax[1].set_ylabel('Rainfall(mm)')

plt.tight_layout()

<IPython.core.display.Javascript object>

In [4]:
df1 = df.groupby('label').mean()

In [5]:
fig, ax = plt.subplots(1,2)
df2 = df1.reset_index()
temp_labels = ax[0].bar(df2['label'], df2['temperature'], color = 'orange')
rainfall_labels = ax[1].bar(df2['label'], df2['rainfall'], alpha = 0.8)
ax[0].set_xlabel('Crops')
ax[1].set_xlabel('Crops')
ax[0].set_ylabel('Temperature(C)')
ax[1].set_ylabel('Rainfall(mm)')
ax[0].set_xticks(ticks = np.arange(0,len(labels_list),3))
ax[0].set_xticklabels(labels = labels_list[::3], rotation = 45, horizontalalignment = 'right', rotation_mode = 'anchor')
ax[1].set_xticks(ticks = np.arange(0,len(labels_list),3))
ax[1].set_xticklabels(labels = labels_list[::3], rotation = 45, horizontalalignment = 'right', rotation_mode = 'anchor')

plt.tight_layout()

<IPython.core.display.Javascript object>

# Classification Techniques

In [4]:
# Importing various tools

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score

In [5]:
# Importing various Classifiers

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.neighbors import KNeighborsClassifier

# Data Pre-processing

In [57]:
X = df[df.columns[0:7]]
y = df['label']

# Using Label Encoder to convert string labels into numericals values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

#Using scaler to scale values
sc = MinMaxScaler()

In [58]:

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 0)

X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.fit_transform(X_test) 

K-Nearest Neighbors

In [59]:
knn = KNeighborsClassifier()
grid_params = {'n_neighbors': [1,3,5,7,9] }
knn_grid = GridSearchCV(estimator = knn, param_grid = grid_params)
knn_grid.fit(X,y)
knn_clf = KNeighborsClassifier(n_neighbors = knn_grid.best_estimator_.n_neighbors).fit(X_train_scaled,y_train)
knn_score = knn_clf.score(X_test_scaled,y_test)
knn_score
#add example

0.9872727272727273

Gaussian Naive-Bayes

In [60]:
GNB_clf = GaussianNB().fit(X_train_scaled, y_train)
GNB_pred = GNB_clf.predict(X_test_scaled)
GNB_acc_score = accuracy_score(y_test, GNB_pred)
GNB_acc_score

0.9945454545454545

Logistic Regression

In [61]:
LR_clf = LogisticRegression(C = 100, solver = 'lbfgs', max_iter = 400).fit(X_train, y_train)
LR_pred = LR_clf.predict(X_test)
LR_acc_score = accuracy_score(y_test, LR_pred)
LR_acc_score

0.9690909090909091

Decision Tree

In [62]:
dtree = DecisionTreeClassifier()
grid_params = {'max_depth':[1,2,3,4,5,6,7,8,9],'max_leaf_nodes':[2,3,4,5,6,7,8,9]}
grid_tree = GridSearchCV(estimator = dtree, param_grid = grid_params)
grid_tree.fit(X_train_scaled,y_train)
dtree_clf = DecisionTreeClassifier().fit(X_train_scaled,y_train)
dtree_score = dtree_clf.score(X_test_scaled,y_test)
dtree_score

0.9745454545454545

In [63]:
# Another important feature of decision tree is that we can find feature importance for the data.

importance = permutation_importance(dtree_clf, X_test_scaled, y_test)
feature_importance = pd.DataFrame(importance.importances_mean, index = X.columns, columns = {'Importance'})
feature_importance.sort_values(by = 'Importance', ascending = False)


# Here we can see that the feature humidity plays a crucial role followed by rainfall and potassium content

Unnamed: 0,Importance
humidity,0.519273
rainfall,0.471636
K,0.323636
P,0.281091
N,0.178545
ph,0.047273
temperature,0.034182


Support Vector Machines

In [64]:
svc = SVC()
grid_params = {'kernel' : ['linear','rbf','poly'], 'degree': [1,2,3,4,5]}
grid_svc = GridSearchCV(estimator = svc, param_grid = grid_params)
grid_svc.fit(X_train_scaled,y_train)
k, d = grid_svc.best_estimator_.kernel, grid_svc.best_estimator_.degree
svc_clf = SVC(kernel = k, degree = d).fit(X_train_scaled,y_train)
svc_score = svc_clf.score(X_test_scaled,y_test)
svc_score

0.9836363636363636

In [65]:
data = { 'Classifiers' : ['KNN', 'Naive-Bayes', 'LogisticRegression', 'SVC', 'Decision Tree'],
         'Scores' : [knn_score, GNB_acc_score, LR_acc_score, svc_score, dtree_score] 
       }
scores = pd.DataFrame(data, columns = {'Classifiers', 'Scores'})
scores

Unnamed: 0,Classifiers,Scores
0,KNN,0.987273
1,Naive-Bayes,0.994545
2,LogisticRegression,0.969091
3,SVC,0.983636
4,Decision Tree,0.974545


In [66]:
max_score = scores.max()
max_score

# Here we have used different scoring methods for different classifiers and hence this result is ambiguous.
# Hence, lets use a single scoring method to remove any ambiguity.

Classifiers         SVC
Scores         0.994545
dtype: object

# Finding Roc_Auc_Score

In [67]:
# We will find Roc_Auc Scores so that we can compare the scores of all the classifiers over a single metric.
Roc_scores = []

#For KNN classifier:

knn_roc_ovr = roc_auc_score(y_test, knn_clf.predict_proba(X_test_scaled), multi_class = 'ovr')
knn_roc_ovo = roc_auc_score(y_test, knn_clf.predict_proba(X_test_scaled), multi_class = 'ovo')
Roc_scores.append(knn_roc_ovr)

knn_roc_ovr, knn_roc_ovo

(0.998167089701862, 0.9981889992062164)

In [68]:
#For Support Vector Machine:

# Here we explicitly need to set probability parameter to True for roc auc score.
svc_clf1 = SVC(kernel = k, degree = d, probability = True) .fit(X_train_scaled, y_train)

#Using label encoder to convert labels into numeric data
y_pred_svc = label_encoder.fit_transform(svc_clf.predict(X_test_scaled))

svm_roc_ovr = roc_auc_score(y_test, svc_clf1.predict_proba(X_test_scaled), multi_class='ovr')
svm_roc_ovo = roc_auc_score(y_test, svc_clf1.predict_proba(X_test_scaled), multi_class='ovo')
Roc_scores.append(svm_roc_ovr)

svm_roc_ovo, svm_roc_ovo

(0.999780381591766, 0.999780381591766)

In [69]:
# For Decision trees:

dtree_roc_ovr = roc_auc_score(y_test, dtree_clf.predict_proba(X_test_scaled), multi_class='ovr')
dtree_roc_ovo = roc_auc_score(y_test, dtree_clf.predict_proba(X_test_scaled), multi_class='ovo')
Roc_scores.append(dtree_roc_ovr)

dtree_roc_ovr, dtree_roc_ovo

(0.9874326932596357, 0.987473085656137)

In [70]:
# For Logistic Regression: 

LR_roc_ovr = roc_auc_score(y_test, LR_clf.predict_proba(X_test_scaled), multi_class='ovr')
LR_roc_ovo = roc_auc_score(y_test, LR_clf.predict_proba(X_test_scaled), multi_class='ovo')
Roc_scores.append(LR_roc_ovr)

LR_roc_ovr, LR_roc_ovo

(0.924993233050179, 0.9250855271787077)

In [71]:
# For Gaussian Naive-Bayes:

GNB_roc_ovr = roc_auc_score(y_test, GNB_clf.predict_proba(X_test_scaled), multi_class = 'ovr')
GNB_roc_ovo = roc_auc_score(y_test, GNB_clf.predict_proba(X_test_scaled), multi_class = 'ovo')
Roc_scores.append(GNB_roc_ovr)

GNB_roc_ovr, GNB_roc_ovo

(0.9999571714898672, 0.9999626809971638)

In [72]:
data = {'Classifiers' : ['KNN', 'SVM', 'Decision Tree', 'Logistic Regression', 'Gaussian Naive Bayes'],
        'Roc_Auc_Score': Roc_scores
       }
Roc_Auc_Scores = pd.DataFrame(data, columns = {'Classifiers', 'Roc_Auc_Score'})
Roc_Auc_Scores.sort_values(by = 'Roc_Auc_Score', ascending = False, ignore_index = True)


Unnamed: 0,Classifiers,Roc_Auc_Score
0,Gaussian Naive Bayes,0.999957
1,SVM,0.99975
2,KNN,0.998167
3,Decision Tree,0.987433
4,Logistic Regression,0.924993


Since Gaussian Naive Bayes Classifier has the highest Roc_Auc_Score we will choose that classifier.

In [77]:
# Lets predict the type of crop for a particular instance from the dataset.

# instance_index = np.random.randint(0, len(df['K']))
instance = X.iloc[500]
features = [i for i in instance]
pred = np.array(features).reshape(1, -1)
pred1 = sc.transform(pred)
ans = (GNB_clf.predict(pred1), y.iloc[500])    
ans

(array([13], dtype=int64), 13)

In [87]:
# As we can see that the labels returned are the same, in this case 13, we can conclude that the actual 
# value and the prediction are the same.