## Import libraries

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report 
import pandas as pd
import numpy as np

## Import dataset

In [8]:
df = pd.read_csv(r"C:\Users\manny\Documents\Hackathon\Crop_recommendation.csv")

# Data Dictionary

#### N - ratio of Nitrogen content in soil
#### P - ratio of Phosphorous content in soil
#### K - ratio of Potassium content in soil
#### temperature - temperature in degree Celsius
#### humidity - relative humidity in %
#### ph - ph value of the soil
#### rainfall - rainfall in mm

In [9]:
pd.DataFrame({"Features from dataset":df.columns})

Unnamed: 0,Features from dataset
0,N
1,P
2,K
3,temperature
4,humidity
5,ph
6,rainfall
7,label


In [10]:
df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,21,82,7,203,rice
1,85,58,41,22,80,7,227,rice
2,60,55,44,23,82,8,264,rice
3,74,35,40,26,80,7,243,rice
4,78,42,42,20,82,8,263,rice


## Celsius to Fahrenheit conversion

In [5]:
# df['temperature'] = (round(df['temperature']*(9/5)+32,0))

## See all the crops in the dataset

In [10]:
pd.DataFrame({"Crop":df['label'].unique()})

Unnamed: 0,Crop
0,rice
1,maize
2,chickpea
3,kidneybeans
4,pigeonpeas
5,mothbeans
6,mungbean
7,blackgram
8,lentil
9,pomegranate


In [11]:
pd.DataFrame({"Count":df.label.value_counts()})

Unnamed: 0,Count
maize,100
pomegranate,100
watermelon,100
mango,100
mungbean,100
lentil,100
apple,100
grapes,100
muskmelon,100
jute,100


## Checking the statistics for each feature

In [11]:
df.describe()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
count,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,25.615455,71.487727,6.5,103.466364
std,36.917334,32.985883,50.647931,5.080555,22.274015,0.786424,54.963047
min,0.0,5.0,5.0,9.0,14.0,4.0,20.0
25%,21.0,28.0,20.0,23.0,60.0,6.0,65.0
50%,37.0,51.0,32.0,26.0,80.0,6.0,95.0
75%,84.25,68.0,49.0,29.0,90.0,7.0,124.0
max,140.0,145.0,205.0,44.0,100.0,10.0,299.0


## Seeing relationship between features

In [13]:
df.corr()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
N,1.0,-0.23146,-0.140512,0.026501,0.190687,0.096761,0.059021
P,-0.23146,1.0,0.736232,-0.127538,-0.118732,-0.138022,-0.06384
K,-0.140512,0.736232,1.0,-0.160389,0.19086,-0.169477,-0.053461
temperature,0.026501,-0.127538,-0.160389,1.0,0.205326,-0.017919,-0.030082
humidity,0.190687,-0.118732,0.19086,0.205326,1.0,-0.008502,0.094418
ph,0.096761,-0.138022,-0.169477,-0.017919,-0.008502,1.0,-0.109167
rainfall,0.059021,-0.06384,-0.053461,-0.030082,0.094418,-0.109167,1.0


## Using .5 as a threshold, Potassium and Phosphorous were the only features that had a strong relationship

## Setting up the response and predictors

In [14]:
print(len(df))

X = df[['N','P','K','temperature','humidity','ph','rainfall']]
y = df['label']

# Spliting the data into training and test sets
# Since it's a smaller dataset, you would want to do a 60/40 or 70/30 split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

2200


## Making Logistic Regression Classifier and Hypertuning

In [15]:
# create LR parameters
params = {"C": np.logspace(-3,3,7),
         "penalty": ["l1","l2"],
         "solver": ["liblinear", "newton-cg", "lbfgs"]}

# use grid search CV with various parameters to find optimal solution
model = GridSearchCV(LogisticRegression(), 
                      param_grid=params, 
                      cv=10)

model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [16]:
train_data_dataset = pd.DataFrame(model.cv_results_)

In [17]:
# print out best parameters for the model
model.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}

In [18]:
# see the best cv score 
model.best_score_

0.9805194805194806

In [19]:
y_pred = model.predict(X_test)

X_test

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
804,37,71,16,79.322,68.52,7.32,46.14
976,22,9,44,76.496,88.88,5.74,112.19
79,81,41,38,72.824,83.73,7.52,200.91
1732,50,59,47,105.386,92.09,6.75,209.87
2135,88,35,35,81.608,58.46,6.78,117.94
...,...,...,...,...,...,...,...
366,20,69,15,74.192,22.77,5.93,107.41
1433,107,11,54,83.462,91.34,6.09,29.44
524,26,50,19,81.176,51.67,6.01,32.56
1971,102,37,25,77.558,77.92,5.91,72.83


# Make a dataset comparing the prediction to the actual labels

In [20]:
opt = pd.DataFrame({"Prediction":y_pred,"Actual":y_test})

In [21]:
opt

Unnamed: 0,Prediction,Actual
804,lentil,lentil
976,pomegranate,pomegranate
79,rice,rice
1732,papaya,papaya
2135,coffee,coffee
...,...,...
366,kidneybeans,kidneybeans
1433,muskmelon,muskmelon
524,mothbeans,mothbeans
1971,cotton,cotton


In [22]:
opt["Prediction Correct?"] = opt["Actual"] == opt["Prediction"]

In [23]:
opt['Prediction Correct?'].value_counts()

True     649
False     11
Name: Prediction Correct?, dtype: int64

In [24]:
round(647/660,2)

0.98

In [25]:
df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,69.584,82.0,6.5,202.94,rice
1,85,58,41,71.186,80.32,7.04,226.66,rice
2,60,55,44,73.4,82.32,7.84,263.96,rice
3,74,35,40,79.682,80.16,6.98,242.86,rice
4,78,42,42,68.234,81.6,7.63,262.72,rice


In [27]:
# Defining a new instance
Xnew = [[50,50,50, 75,80,7.53,230], [70,40,40,90,80,8.00,200]]

# Making the prediction
ynew = model.predict(Xnew)
print("X=%s, Predicted=%s" % (Xnew[0], ynew[0]))

X=[70, 40, 40, 90, 80, 8.0, 200], Predicted=jute




# What are the next steps?:

#### As we more collect data, we can consult with farmers on farming tactics:

#### Have a webpage where farmers can input their soil levels, temperature, etc to give recommendations.

#### What crop they should produce based on input.

#### How much water they would need to produce a crop efficiently.

#### The time of year to produce a certain crop efficiently.

#### Whether there soil is too acidic or basic for a crop.

#### If they would need to change the ratio of N,P, and K in soil.