<a href="https://colab.research.google.com/github/AjanakuBen/Kaggle-Data-Science-Projects/blob/main/Predictive_Modelling_for_Agriculture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

HOW MACHINE LEARNING HELPS FARMERS SELECT THE BEST CROPS

Measuring essential soil metrics such as nitrogen, phosphorus, and potassium levels and pH value is an important aspect of assessing soil condition. However, it can be an expensive and time-consuming process, which can force farmers to prioritize which metrics to measure based on their budget constraints.

In this project, we apply machine learning to build a multi-class classification model that predicts the type of "crop", while using techniques to avoid multicollinearity, i.e. the situation in which two or more features are highly correlated with each other.

In [14]:
#import the necessary libraries for the project.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the soil measurements CSV into the working DataFrame,
# decoding the file with the "latin" codec.
df = pd.read_csv(
    "/content/soil_measures.csv",
    encoding="latin",
)

DATA PREPROCESSING AND EXPLORATION

In [4]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,N,P,K,ph,crop
0,90,42,43,6.502985,rice
1,85,58,41,7.038096,rice
2,60,55,44,7.840207,rice
3,74,35,40,6.980401,rice
4,78,42,42,7.628473,rice


In [5]:
# Preview the last rows of the dataset.
df.tail()

Unnamed: 0,N,P,K,ph,crop
2195,107,34,32,6.780064,coffee
2196,99,15,27,6.086922,coffee
2197,118,33,30,6.362608,coffee
2198,117,32,34,6.758793,coffee
2199,104,18,30,6.779833,coffee


In [6]:
# Dataset dimensions as (rows, columns).
df.shape

(2200, 5)

In [7]:
# Column labels available in the dataset.
df.columns

Index(['N', 'P', 'K', 'ph', 'crop'], dtype='object')

In [8]:
# Per-column dtype and non-null count, used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   N       2200 non-null   int64  
 1   P       2200 non-null   int64  
 2   K       2200 non-null   int64  
 3   ph      2200 non-null   float64
 4   crop    2200 non-null   object 
dtypes: float64(1), int64(3), object(1)
memory usage: 86.1+ KB


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,N,P,K,ph
count,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,6.46948
std,36.917334,32.985883,50.647931,0.773938
min,0.0,5.0,5.0,3.504752
25%,21.0,28.0,20.0,5.971693
50%,37.0,51.0,32.0,6.425045
75%,84.25,68.0,49.0,6.923643
max,140.0,145.0,205.0,9.935091


In [12]:
# Display the whole frame; the notebook renders a truncated view (first and last rows).
df

Unnamed: 0,N,P,K,ph,crop
0,90,42,43,6.502985,rice
1,85,58,41,7.038096,rice
2,60,55,44,7.840207,rice
3,74,35,40,6.980401,rice
4,78,42,42,7.628473,rice
...,...,...,...,...,...
2195,107,34,32,6.780064,coffee
2196,99,15,27,6.086922,coffee
2197,118,33,30,6.362608,coffee
2198,117,32,34,6.758793,coffee


In [13]:
# Pairwise correlation (pandas default method: Pearson) between the numeric
# soil features, used to spot highly correlated feature pairs.
# `numeric_only=True` excludes the string-valued "crop" column explicitly:
# relying on it being dropped implicitly emits a FutureWarning on pandas 1.5
# and raises on pandas >= 2.0, where `numeric_only` defaults to False.
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,N,P,K,ph
N,1.0,-0.23146,-0.140512,0.096683
P,-0.23146,1.0,0.736232,-0.138019
K,-0.140512,0.736232,1.0,-0.169503
ph,0.096683,-0.138019,-0.169503,1.0


In [29]:
# Encode the categorical target: LabelEncoder maps each distinct crop name
# to an integer code.
label = LabelEncoder()
en_target = label.fit_transform(df["crop"])
# Build the series on df's own index so the codes stay aligned row-for-row
# with df when the two are concatenated column-wise later on; a default
# RangeIndex would only line up if df happens to use one as well.
crop_en = pd.Series(en_target, index=df.index, name="crop_en")

In [37]:
# Append the encoded target as an extra column next to the original data,
# then summarise every numeric column of the combined frame.
df_new = pd.concat(
    [df, crop_en],
    axis="columns",
)
df_new.describe()

Unnamed: 0,N,P,K,ph,crop_en
count,2200.0,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,6.46948,10.5
std,36.917334,32.985883,50.647931,0.773938,6.345731
min,0.0,5.0,5.0,3.504752,0.0
25%,21.0,28.0,20.0,5.971693,5.0
50%,37.0,51.0,32.0,6.425045,10.5
75%,84.25,68.0,49.0,6.923643,16.0
max,140.0,145.0,205.0,9.935091,21.0


In [33]:
# Correlation of each soil feature with the encoded crop label (the label's
# correlation with itself is dropped from the result).
# `numeric_only=True` excludes the string-valued "crop" column explicitly:
# relying on it being dropped implicitly emits a FutureWarning on pandas 1.5
# and raises on pandas >= 2.0, where `numeric_only` defaults to False.
# NOTE(review): crop_en holds integer codes for a nominal label, so these
# coefficients are only a rough indication - confirm before using them to
# rank or select features.
df_new.corr(numeric_only=True)["crop_en"].drop("crop_en")

  df_new.corr()['crop_en'].drop('crop_en')


N    -0.031130
P    -0.491006
K    -0.346417
ph   -0.012253
Name: crop_en, dtype: float64

In [34]:
# Target: the integer-encoded crop label.
y = df_new["crop_en"]
# Features: every remaining column once both label columns
# (raw text and encoded) are removed.
X = df_new.drop(columns=["crop", "crop_en"])

In [36]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [42]:
# Logistic-regression classifier; the solver is allowed up to 2000 iterations.
logic = LogisticRegression(max_iter=2000)

In [43]:
# Baseline: fit the classifier on the raw (unscaled) training features.
logic.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
# Predict the encoded crop label for every held-out row and display the result.
y_pred = logic.predict(X_test)
y_pred

array([16,  1,  6, 11, 16,  3, 17, 10,  1, 16, 15, 19, 21, 11, 14,  0,  0,
        4,  4,  0,  2,  7, 14,  9,  0, 20, 14, 21, 20,  5, 12, 20,  1,  5,
       20,  2,  7,  7, 10,  8,  4, 17,  4, 11, 10,  8,  6, 18,  3, 14, 17,
       19, 18, 21, 12, 13, 17, 17,  5, 18, 10, 14,  5,  1, 17, 20, 16,  4,
       11, 19,  1, 21, 15, 10,  1, 12,  9, 13, 14,  4,  5, 15, 15,  2,  4,
       19, 16,  6,  4, 17,  9, 14,  9,  6,  8,  0,  6,  3, 20, 15, 21, 19,
       16, 13,  9,  7, 18, 10,  8, 20,  0, 14, 13,  7, 20, 15,  2,  9,  1,
       16, 16,  3, 11, 12, 11, 13,  1, 19,  4, 10,  6,  3,  8, 13, 13,  5,
        7,  3, 20, 11,  6, 20, 19, 12, 20, 17,  1, 16, 18, 10, 11,  3,  6,
       21, 13, 11,  9, 21,  7, 14, 11,  3, 11, 11, 18,  5, 16, 11,  0, 19,
        0, 19, 15,  0,  6, 10, 10,  3,  2, 13,  0, 21,  6, 21, 15, 15, 16,
       21,  5, 18,  7,  3,  2,  5, 21,  5,  7,  7, 20, 17,  4,  0, 15, 13,
       19,  9, 13,  9, 17, 15,  3,  7, 11,  3,  1, 19, 19, 11, 16,  8,  6,
        6, 11,  0, 14,  2

In [48]:
# Weighted F1 of the baseline model: per-class F1 scores averaged with
# weights proportional to each class's support in y_test.
f1_score(y_true=y_test, y_pred=y_pred, average="weighted")

0.673633036234843

In [49]:
# Scaler used to standardize the features before refitting the model.
scale = StandardScaler()

In [51]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the test split (the test data never influences the fit).
X_train_scaled = scale.fit_transform(X_train)
X_test_scaled = scale.transform(X_test)

In [52]:
# Refit the same estimator object on the scaled training features; from here
# on `logic` is the scaled-data model, not the earlier fit on unscaled data.
logic.fit(X_train_scaled, y_train)

In [53]:
# Predictions of the refitted model on the scaled test features.
logic.predict(X_test_scaled)

array([16,  1,  6, 11, 16,  3, 20, 10,  1, 16, 15, 19, 21, 11, 14,  0,  0,
       16, 19,  0,  2,  7, 12,  9,  0, 20, 12, 21, 20,  5, 12, 20,  1,  5,
       20,  2,  7,  7, 10,  8,  4, 17,  4, 11, 10,  8,  6, 10,  3, 14, 17,
       19, 10, 21, 19, 13, 17, 17,  5, 10,  2, 13,  5,  1, 17, 20, 16,  4,
        2, 19,  1, 21, 15, 10,  1, 12,  9,  2, 14, 12,  5, 15, 15, 18,  4,
       19, 16,  6,  4, 17,  9, 14,  9,  6,  8,  0,  6,  3, 20, 15, 21, 19,
       16, 10,  9,  7, 18, 10,  8, 20,  0, 13, 14,  7, 17, 15,  2, 18,  1,
       16, 16,  3, 11, 12, 11,  2,  1, 19,  4, 14,  6, 17,  8, 13,  2, 11,
        7,  3, 20, 11,  6, 20, 19, 12, 20, 17,  1, 16,  9, 10, 11,  3,  6,
       21,  2, 11,  9, 21,  7, 13,  2,  3, 11, 11, 18,  5, 16, 11,  0, 12,
        0, 19, 15,  0,  6, 10, 10,  3,  2, 13,  0, 21,  6, 21, 15, 15, 16,
       15,  5,  9,  0,  3,  2,  5, 21,  5,  7,  7, 20, 17,  4,  0, 15, 14,
       19,  9, 13, 14, 17, 15,  3,  7, 11,  3,  1, 19, 19, 11, 16,  8,  6,
        6, 11,  7, 14,  2

In [55]:
# Weighted F1 of the model trained on the scaled features, kept in
# `model_performance` and displayed as the cell output.
y_pred_scaled = logic.predict(X_test_scaled)
model_performance = f1_score(y_test, y_pred_scaled, average="weighted")
model_performance

0.6790715736844755