# **Import the Libraries**
The following Python libraries are required to run the project code:

1. Numpy - required to perform matrix and array operations
1. Pandas - required for data manipulation and analysis
1. Sklearn (Scikit-learn) - used for performing KNN classification
1. Matplotlib - used for adding charts and graphs
1. Pylab - used for optimising and visually customising charts and graphs

In [3]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt
from scipy.interpolate import make_interp_spline
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# **Load the dataset and split**

In [4]:
# Path to the crop recommendation CSV (Kaggle input layout).
loc = '../input/crop-recommendation-dataset/Crop_recommendation.csv'

data = pd.read_csv(loc)
col = list(data.columns)
# Unique crop names, in order of first appearance in the file.
classes = data["label"].unique()

# The first seven columns are used as input features and the eighth
# column (index 7) as the target read below.
xdata = data.iloc[:, 0:7].values
ydata = data.iloc[:, 7].values

# Show how many samples each crop has. The count must be taken over the full
# `label` column: counting the array of unique names would give every crop a
# bar of height 1 and say nothing about class balance.
plt.figure(figsize = (16, 9))
sns.countplot(x = 'label', data = data, palette = 'rocket')
plt.xlabel('label', fontsize = 12)
plt.ylabel('count', fontsize = 12)
plt.title('Samples per Crop')
plt.xticks(rotation=90)
plt.show()

# **Plotting the various values of the input parameter for a particular output value.**

In [5]:
# Every column except the last one is plotted against the crop label.
all_col = data.columns[:-1]

# One bar chart per column, each drawn on its own explicitly created axes.
for col in all_col:
  fig, ax = plt.subplots(figsize = (16, 9))
  sns.barplot(data = data, x = 'label', y = col, palette = 'rocket', ax = ax)
  ax.set_xlabel('label', fontsize = 12)
  ax.set_ylabel(col, fontsize = 12)
  ax.set_title(f'{col} vs Crop')
  # Crop names are long; rotate them so they do not overlap.
  plt.xticks(rotation=90)
  plt.show()

In [6]:
# Pairwise scatter plots of all numeric columns, coloured by crop.
# pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(...) call would only leave an empty extra figure behind.
sns.pairplot(data, hue = 'label', palette = 'rocket')
plt.show()

# **Split & scale the data into train and test dataset**

The dataset is split into training and test sets using an 80/20 split, and the features are standardised to improve training performance.

In [7]:
# 80/20 train/test split with a fixed seed so the split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(xdata, ydata, test_size=0.2, random_state=884)

# Fit the scaler on the training data only, then apply the *same* mean and
# standard deviation to the test data. Re-fitting on the test set would scale
# the two sets differently and leak test-set statistics into the evaluation.
x_st = StandardScaler()
xtrain = x_st.fit_transform(xtrain)
xtest = x_st.transform(xtest)

# ***Optimising the training model***
**Required only for optimising the performance of the KNN algorithm**

The model is tuned by trial and error to determine the best `random_state` for the `train_test_split` function and the best number of nearest neighbours (K) for the KNN algorithm.

In [8]:
# Sweep K = 1..50, recording test accuracy (%) and error rate for each K.
# NOTE(review): K is selected on the same test set that is later used to
# report accuracy, so the reported figure is optimistic; consider a
# validation split or cross-validation.
acc_list = []
err_rate = []

neighbors = np.arange(1, 51)

for K in neighbors:
  classifier = KNeighborsClassifier(n_neighbors = K)
  classifier.fit(xtrain, ytrain)
  y_pred = classifier.predict(xtest)

  accuracy = round(acc(ytest, y_pred)*100, 3)

  acc_list.append(accuracy)
  err_rate.append(np.mean(y_pred != ytest))

# Smooth both curves with an interpolating spline, for display only.
xy = make_interp_spline(neighbors, acc_list)
xz = make_interp_spline(neighbors, err_rate)
x = np.linspace(1, 50, 1000)
y = xy(x)
z = xz(x)

plt.figure(figsize = (13, 7))
plt.subplot(2, 1, 1)
# x/y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments.
sns.lineplot(x = x, y = y, linewidth = 2, color = '#5C284F')
plt.xlabel('K value')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy vs K', fontweight = 'bold')
plt.xlim(min(neighbors), max(neighbors))

plt.subplot(2, 1, 2)
sns.lineplot(x = x, y = z, linewidth = 2, color = '#D96856')
plt.xlabel('K value')
plt.ylabel('Loss')
plt.title('Loss vs K', fontweight = 'bold')
plt.xlim(min(neighbors), max(neighbors))

plt.tight_layout()
plt.show()

# K_opt is the 0-based position of the best accuracy in acc_list (first
# occurrence on ties), NOT the K value itself; the matching K is
# neighbors[K_opt], i.e. K_opt + 1. Report the real K to the reader.
K_opt = int(np.argmax(acc_list))
print('\nOptimal value of K = ', neighbors[K_opt])

# **K Nearest Neighbor Algorithm**

Classification uses the k-nearest-neighbours algorithm: the Euclidean distance from each test sample to every training sample is computed, and the sample is assigned the most common class among its k closest training samples.

In [9]:
# K_opt is a 0-based position in the list of tested K values (which start
# at 1), so the neighbour count to use is K_opt + 1.
classifier = KNeighborsClassifier(n_neighbors=K_opt+1).fit(xtrain, ytrain)

# Predict the held-out samples and report the share classified correctly.
y_pred = classifier.predict(xtest)
accuracy = 100 * acc(ytest, y_pred)

print('Accuracy of the training Model : ', round(accuracy, 3), '%')

# **Display the Performance of the trained model**

A confusion matrix is used to assess the performance of a trained model by tabulating, for each actual class, which classes the test samples were predicted as. Ideally it should be a diagonal matrix, but due to training uncertainty the matrix obtained is not perfectly diagonal.

In [10]:
# Fix the class order explicitly and use the same array for the matrix and
# for the tick labels. confusion_matrix orders classes sorted, whereas
# data["label"].unique() is in order of appearance, so labelling the axes
# with the latter can attach crop names to the wrong rows/columns.
cm_labels = np.unique(np.concatenate((ytest, y_pred)))

# normalize='pred' scales each column to sum to 1: every cell is the share of
# the predictions for that crop that belong to the actual crop of the row.
cm = confusion_matrix(ytest, y_pred, labels = cm_labels, normalize = 'pred')

fig, ax = plt.subplots(figsize=(20,13))
sns.heatmap(cm, annot = True, xticklabels = cm_labels, yticklabels = cm_labels, ax = ax)
ax.set_xlabel('Predicted Crop', fontsize = 12)
ax.set_ylabel('Actual Crop', fontsize = 12)
ax.set_title('Confusion Matrix', fontweight = 'bold', fontsize = 15)

# Vertical crop names on the x axis, horizontal on the y axis.
ax.tick_params(axis = 'x', labelrotation = 90)
ax.tick_params(axis = 'y', labelrotation = 0)
plt.show()