**Importing the libraries**

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from ipywidgets import interact


**Uploading the dataset**

In [None]:
# Opens a file-upload prompt and keeps whatever `files.upload()` returns in `uploaded`.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime - confirm
# before running this notebook elsewhere.
# The reading step that follows loads 'data.csv', so the uploaded file needs that name.
from google.colab import files
uploaded=files.upload()   #to upload a file

TypeError: ignored

**Reading the dataset**

In [None]:
# Load the CSV from the working directory into the notebook-wide `data` frame
data = pd.read_csv("data.csv")

In [None]:
# Report (rows, columns) of the loaded frame
print("Shape of the dataset : {0}".format(data.shape))

In [None]:
# Preview the first five rows of the data set
data.head(n=5)

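**Description for each of the columns in the Dataset**

* `N`, `P`, `K` - ratio of Nitrogen, Phosphorous and Potassium in the soil
* `temperature` - temperature in degrees Celsius
* `humidity` - relative humidity in %
* `ph` - pH value of the soil
* `rainfall` - rainfall in mm
* `label` - the crop associated with these conditions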

In [None]:
# Count the missing values in each column (`isna` is the pandas alias of `isnull`)
data.isna().sum()



* The `fillna` function is used to replace missing values with a statistical value such as the mean, median or mode.
* NA means "not available".
* pandas has functions such as `fillna` and `dropna` to treat missing values.


In [None]:
# Number of rows recorded for each crop label
data.loc[:, 'label'].value_counts()

**Descriptive Statistics**

In [None]:
#Let's check the summary for all the crops

# (printed description, column in `data`) for every overall average, in display order
overall_averages = [
    ("Average Ratio of Nitrogen in the soil", 'N'),
    ("Average Ratio of Phosphorous in the soil", 'P'),
    ("Average Ratio of Potassium in the soil", 'K'),
    ("Average Temperature in Celcius", 'temperature'),
    ("Average Relative Humidity in %", 'humidity'),
    ("Average PH Value of the soil", 'ph'),
    ("Average Rainfall in mm", 'rainfall'),
]

for description, column in overall_averages:
  # "{: .2f}" keeps the space-for-sign flag, so the printed text stays the same
  print("{0} : {1: .2f}".format(description, data[column].mean()))

In [None]:
#Lets check the summary statistics for each of the crops

@interact
def summary(crops=list(data['label'].value_counts().index)):
  """Print the minimum, average and maximum of every feature for one crop.

  Parameters
  ----------
  crops : str
      Crop label to summarise. The default is the list of all labels found in
      `data`; ipywidgets' `interact` turns a list default into a selection
      widget and passes the chosen label to this function.

  Every statistic is printed with two decimals. A label that matches no rows
  prints `nan` for each statistic.
  """
  # (name used in the printed text, column in `data`), in display order
  features = [
      ('Nitrogen', 'N'),
      ('Phosphorous', 'P'),
      ('Potassium', 'K'),
      ('Temperature', 'temperature'),
      ('Humidity', 'humidity'),
      ('PH', 'ph'),
      ('Rainfall', 'rainfall'),
  ]

  x=data[data['label']==crops]

  for title, column in features:
    # float() so integer-valued columns format with ".2f" like the others
    lowest = float(x[column].min())
    average = float(x[column].mean())
    highest = float(x[column].max())
    print("---------------------------------------------------------")
    print("Statistics for {0}".format(title))
    print("Minimum {0} Required : {1:.2f}".format(title, lowest))
    print("Average {0} Required : {1:.2f}".format(title, average))
    print("Maximum {0} Required : {1:.2f}".format(title, highest))

In [None]:
#let's compare the Average requirement for each crop with average condition
@interact
def compare(conditions=['N','P','K','temperature','ph','humidity','rainfall']):
  """Print the overall average of one condition, then its average per crop.

  Parameters
  ----------
  conditions : str
      Column of `data` to compare. The default is the list of selectable
      columns; ipywidgets' `interact` turns a list default into a selection
      widget and passes the chosen column name to this function.

  Crops are taken from the labels actually present in `data`, so a label that
  is spelled differently from the names below is still reported (under its raw
  label) instead of printing `nan`, and no crop is silently left out.
  """
  # Display names for known labels; dict order is the print order.
  # Both spellings of pigeon peas are listed because the raw label spelling is
  # not visible here - NOTE(review): confirm against data['label'].unique().
  display_names = {
      'rice': 'Rice',
      'blackgram': 'Black Grams',
      'banana': 'Banana',
      'jute': 'Jute',
      'coconut': 'Coconut',
      'apple': 'Apple',
      'papaya': 'Papaya',
      'muskmelon': 'Muskmelon',
      'grapes': 'Grapes',
      'watermelon': 'Watermelon',
      'kidneybeans': 'Kidney Beans',
      'mungbean': 'Mung Beans',
      'orange': 'Oranges',
      'chickpea': 'Chick Peas',
      'lentil': 'Lentils',
      'cotton': 'Cotton',
      'maize': 'Maize',
      'mothbeans': 'Moth Beans',
      'pegionpeas': 'Pegion Peas',
      'pigeonpeas': 'Pigeon Peas',
      'mango': 'Mango',
      'pomegranate': 'Pomegranate',
      'coffee': 'Coffee',
  }

  print("Average value for",conditions,"is {0:.2f}".format(data[conditions].mean()))
  print("---------------------------------------------------------------------")

  # One pass over the data instead of one boolean filter per crop
  crop_means = data.groupby('label')[conditions].mean()

  known = [label for label in display_names if label in crop_means.index]
  unlisted = [label for label in crop_means.index if label not in display_names]
  for label in known + unlisted:
    shown_as = display_names.get(label, label)
    print("{0} : {1:.2f}".format(shown_as, crop_means.loc[label]))

In [None]:
#let's make this function more clear and easy
@interact
def compare(conditions = ['N','P','K','temperature','ph','humidity','rainfall']):
  """List the crops with rows above, and at or below, the overall average.

  `conditions` is the column of `data` to compare. Every crop that has at
  least one row on a given side of the average is listed on that side, so a
  crop can appear in both lists.
  Note: this definition reuses the name `compare` already used by the
  per-crop averages function earlier in the notebook.
  """
  overall_mean = data[conditions].mean()
  above_average = data[conditions] > overall_mean
  at_or_below_average = data[conditions] <= overall_mean

  print("Crops which require greater than average",conditions,'\n')
  print(data.loc[above_average, 'label'].unique())
  print("-----------------------------------------------------------")
  print("Crops which require less than average",conditions,'\n')
  print(data.loc[at_or_below_average, 'label'].unique())

In [None]:
# Distribution of every soil / climate feature, one panel per feature.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat='density')`
# is the documented replacement (histogram plus density curve on a density axis).
plt.rcParams['figure.figsize'] = (15, 7)

# (column in `data`, x-axis label, colour), in panel order.
# Only N, P and K are ratios; the other labels match the bar-plot section.
distribution_panels = [
    ('N', 'Ratio of Nitrogen', 'pink'),
    ('P', 'Ratio of Phosphorous', 'yellow'),
    ('K', 'Ratio of Potassium', 'darkblue'),
    ('temperature', 'Temperature', 'green'),
    ('rainfall', 'Rainfall', 'grey'),
    ('humidity', 'Humidity', 'lightgreen'),
    ('ph', 'pH of Soil', 'black'),
]

for position, (column, axis_label, colour) in enumerate(distribution_panels, start=1):
  plt.subplot(2,4,position)
  sns.histplot(x=data[column],kde=True,stat='density',color=colour)
  plt.xlabel(axis_label,fontsize=10)
  plt.grid()

plt.suptitle("Distribution for Agricultural Conditions",fontsize=20)
plt.show()

In [None]:
# (headline, row filter) for every extreme condition, in print order
extreme_conditions = [
    ("Crops which requires very High Ratio of Nitrogen Content in Soil:", data['N'] > 120),
    ("Crops which requires very High Ratio of Phosphorous Content in Soil:", data['P'] > 100),
    ("Crops which requires very High Ratio of Potassium Content in Soil:", data['K'] > 200),
    ("Crops which requires very High Rainfall:", data['rainfall'] > 200),
    ("Crops which requires very Low Temperature:", data['temperature'] < 10),
    ("Crops which requires very High Temperature:", data['temperature'] > 40),
    ("Crops which requires very Low Humidity:", data['humidity'] < 20),
    ("Crops which requires very Low PH:", data['ph'] < 4),
    ("Crops which requires very High PH:", data['ph'] > 9),
]

print("Some interesting patterns")
print("----------------------------------------")
for headline, row_filter in extreme_conditions:
  # Labels having at least one row that satisfies the filter
  print(headline, data.loc[row_filter, 'label'].unique())

In [None]:
#Let's understand which crops can be only grown in summer , winter and rainy season

# Humidity floor shared by the winter and rainy filters
humid_enough = data['humidity'] > 30

# Season heading -> row filter, in print order
season_filters = [
    ("SUMMER CROPS", (data['temperature'] > 30) & (data['humidity'] > 50)),
    ("WINTER CROPS", (data['temperature'] < 20) & humid_enough),
    ("RAINY CROPS", (data['rainfall'] > 200) & humid_enough),
]

for heading, row_filter in season_filters:
  print(heading)
  print(data.loc[row_filter, 'label'].unique())

**Clustering**

In [None]:
from sklearn.cluster import KMeans

# Feature matrix for clustering: every column except the crop label, as a NumPy array
x = data.drop(columns=['label']).to_numpy()

# (rows, features) of the matrix handed to KMeans
print(x.shape)

In [None]:
#Let's determine the Optimum Number of Clusters within the Dataset

plt.rcParams['figure.figsize']=(10,4)

# Candidate cluster counts 1..10; `wcss` collects the inertia of each fit
cluster_counts = range(1, 11)
wcss = []
for k in cluster_counts:
  km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0).fit(x)
  wcss.append(km.inertia_)

# Inertia against the number of clusters
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method ', fontsize=20)
plt.xlabel('No.of Clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
#Let's implement the K Means algorithm to perform Clustering analysis
km = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)

# Cluster id per row, paired column-wise with the crop label of the same row
y_means = pd.DataFrame(km.fit_predict(x))
a = data['label']
z = pd.concat([y_means, a], axis=1).rename(columns={0: 'cluster'})

#Let's check the clusters of each crops
print("Lets check the Results After Applying the K Means Clustering Analysis \n")
for cluster_id, ordinal in enumerate(['First', 'Second', 'Third', 'Fourth']):
  if cluster_id > 0:
    # Separator goes between clusters only, not before the first one
    print("---------------------------------------------------------------------")
  print("Crops in {0} Cluster:".format(ordinal), z.loc[z['cluster'] == cluster_id, 'label'].unique())

**Visualizing the Hidden Patterns**

In [None]:
### Data Visualizations

# One horizontal bar plot per feature: crop label on the y-axis, feature value on the x-axis.
# `x=` / `y=` are passed by keyword because seaborn 0.12+ rejects them as positional arguments.
plt.rcParams['figure.figsize'] = (15, 8)

# (column in `data`, x-axis label), in panel order
bar_panels = [
    ('N', 'Ratio of Nitrogen'),
    ('P', 'Ratio of Phosphorous'),
    ('K', 'Ratio of Potassium'),
    ('temperature', 'Temperature'),
    ('humidity', 'Humidity'),
    ('ph', 'pH of Soil'),
    ('rainfall', 'Rainfall'),
]

for position, (column, axis_label) in enumerate(bar_panels, start=1):
  plt.subplot(2, 4, position)
  sns.barplot(x=data[column], y=data['label'])
  plt.ylabel(' ')
  plt.xlabel(axis_label, fontsize = 10)
  plt.yticks(fontsize = 10)

plt.suptitle('Visualizing the Impact of Different Conditions on Crops', fontsize = 15)
plt.show()

**Predictive Modelling**

In [None]:
#lets split the Dataset for Predictive Modelling

# Target is the crop label; features are every remaining column
y = data['label']
x = data.drop(columns=['label'])

print("Shape of x: {0}".format(x.shape))
print("Shape of y: {0}".format(y.shape))

In [None]:
#lets create training and testing sets for validation of results
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for testing; random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

for part_name, part in [("x train", x_train), ("x test", x_test),
                        ("y train", y_train), ("y test", y_test)]:
  print("The Shape of", part_name, ":", part.shape)

**Logistic Regression (Classification)**

In [None]:
#lets create a predictive model

from sklearn.linear_model import LogisticRegression

# The features are used unscaled, so the solver may need more than the default
# 100 iterations to converge; a higher cap avoids stopping on a half-fitted model.
# NOTE(review): scaling the features first would be the more thorough fix.
model = LogisticRegression(max_iter=1000)
model.fit(x_train,y_train)

# Predicted crop label for every held-out row
y_pred=model.predict(x_test)

In [None]:
#lets evaluate the model performance
from  sklearn.metrics import confusion_matrix

#lets print the confusion matrix first
plt.rcParams['figure.figsize']=(10,10)

# Fix the row/column order to the classes the model was trained on, so the
# matrix can be labelled with crop names instead of bare indices.
class_names = model.classes_
cm=confusion_matrix(y_test,y_pred,labels=class_names)

# fmt='d' prints the cell counts as plain integers
sns.heatmap(cm,annot=True,fmt='d',cmap='Wistia',
            xticklabels=class_names,yticklabels=class_names)
plt.xlabel('Predicted crop')
plt.ylabel('Actual crop')
plt.title('Confusion Matrix for Logistic Regression',fontsize=15)
plt.show()

In [None]:
#lets print the classification report also
from sklearn.metrics import classification_report

# Text report comparing the held-out labels with the model's predictions
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

In [None]:
# Show the first five rows again as a reference for the prediction input
data.head(n=5)

In [None]:
# Predict the crop for one set of conditions.
# The model was fitted on a DataFrame, so the sample is passed as a one-row
# DataFrame with the same columns; a bare array would drop the feature names
# and trigger scikit-learn's feature-name warning.
# Values are positional and must follow the training column order -
# presumably N, P, K, temperature, humidity, ph, rainfall; confirm against data.head().
sample = pd.DataFrame([[90,
                        40,
                        40,
                        20,
                        80,
                        7,
                        200]],
                      columns=x_train.columns)
prediction = model.predict(sample)
print("The Suggested Crop for Given Climatc Condition is :",prediction)