In [131]:
# Libraries
from pathlib import Path
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [4]:
# Location of the crop-recommendation dataset.
# The absolute path below only exists on the original author's machine, so when
# it is missing fall back to a copy of the CSV in the current working directory.
file = Path(r"C:\Users\AKIN-JOHNSON\Desktop\Workspace\Crop recommendation\Crop_recommendation.csv")
if not file.exists():
    file = Path("Crop_recommendation.csv")

# Read the raw data and preview the first five rows
df = pd.read_csv(file)
df.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [20]:
# Swap the terse nutrient/target headers for self-describing column names
readable_names = {
    'N': 'Nitrogen_Ratio',
    'P': 'Phosphorous_Ratio',
    'K': 'Potassium_Ratio',
    'label': 'Recommendations',
}
df = df.rename(columns=readable_names)

In [22]:
# Show only the first row to confirm the new column names
df.head(1)

Unnamed: 0,Nitrogen_Ratio,Phosphorous_Ratio,Potassium_Ratio,temperature,humidity,ph,rainfall,Recommendations
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice


In [24]:
# Checking for missing values: count of NaN entries in each column
df.isna().sum()

Nitrogen_Ratio       0
Phosphorous_Ratio    0
Potassium_Ratio      0
temperature          0
humidity             0
ph                   0
rainfall             0
Recommendations      0
dtype: int64

In [26]:
# Checking for duplicates: number of rows that exactly repeat an earlier row
df.duplicated().sum()

0

In [28]:
# Checking the data type of each column
df.dtypes

Nitrogen_Ratio         int64
Phosphorous_Ratio      int64
Potassium_Ratio        int64
temperature          float64
humidity             float64
ph                   float64
rainfall             float64
Recommendations       object
dtype: object

In [33]:
# Tidy the crop names: first letter upper-case, remaining letters lower-case
tidy_labels = df['Recommendations'].str.capitalize()
df['Recommendations'] = tidy_labels
df

Unnamed: 0,Nitrogen_Ratio,Phosphorous_Ratio,Potassium_Ratio,temperature,humidity,ph,rainfall,Recommendations
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,Rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,Rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,Rice
4,78,42,42,20.130175,81.604873,7.628473,262.717340,Rice
...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,Coffee
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,Coffee
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,Coffee
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,Coffee


### Machine Learning

In [35]:
# I'm not dropping any columns from this dataset because I believe all columns are useful for the prediction

In [61]:
# Columns handed to the column transformer, which later becomes a pipeline step.
# NOTE: despite its name, `scaler` is the list of column names to scale,
# not a scaler object.
scaler = [
    'Nitrogen_Ratio',
    'Phosphorous_Ratio',
    'Potassium_Ratio',
    'temperature',
    'ph',
    'rainfall',
    'humidity',
]

# Column transformer: standardise the listed numerical columns and
# pass through, unchanged, any column that is not listed.
scaling_step = ('Standard_Scaler', StandardScaler(), scaler)
transformer = ColumnTransformer(transformers=[scaling_step], remainder='passthrough')

In [65]:
# Display the configured column transformer
transformer

In [67]:
# Chain the preprocessing step and the classifier into one estimator.
# The forest is seeded so repeated runs build the same trees.
forest = RandomForestClassifier(random_state=5)
pipeline = Pipeline(steps=[
    ('Column_transformer', transformer),
    ('Processor', forest),
])

In [71]:
# Display the assembled (not yet fitted) pipeline
pipeline

In [73]:
# Separate the target column (y) from the feature columns (X)
target_column = 'Recommendations'
y = df[target_column]
X = df.drop(columns=target_column)

In [81]:
# Turn the crop names into integer class ids.
# The encoder is kept out of the column transformer on purpose: the transformer
# only processes the feature columns in X, while the target y is a separate Series.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

In [83]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=5
)

# Report shapes in the order: train features, train target, test features, test target
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

(1760, 7)
(1760,)
(440, 7)
(440,)


In [87]:
# Fit the whole pipeline (scaling + random forest) on the training split
pipeline.fit(X_train, y_train)
model = pipeline  # fit() returns the pipeline itself, so `model` is that same object
model

In [89]:
# Predict the integer-encoded crop label for every row of the test split
y_test_pred = model.predict(X_test)
y_test_pred

array([ 7, 15,  4,  3, 21, 17, 17, 19, 18,  0,  4, 17, 19, 11,  2,  3,  6,
       15, 13, 19, 15,  3,  5, 19,  2,  8, 16, 21, 20, 16,  0, 13,  8,  8,
        1, 13, 17, 14,  9,  2, 12,  3, 11, 17,  1,  2,  4, 16, 16, 11,  5,
       13, 18, 16, 12, 16, 15, 18, 14, 13,  6,  2,  3,  6, 14, 14,  3, 17,
        4,  1,  8, 11, 19, 21,  1,  9, 17,  0,  7,  3,  3,  1,  0, 11,  0,
       15,  1,  3, 20, 17, 18, 16, 12,  5, 15, 19, 18, 21,  5,  7, 20,  4,
       11, 11, 13,  2, 16, 18, 18, 11, 18, 12, 12, 19, 10,  2, 18, 15,  0,
       10,  4, 15, 10, 11, 12,  5,  6,  2,  5, 18,  2, 11,  4,  5, 16,  4,
       14,  2, 11,  0, 16, 11,  0, 17, 15, 19,  2, 19, 13, 14,  7,  1, 10,
       20, 18,  5, 14, 13, 21,  8, 10,  0, 20, 14, 14, 16,  8,  9, 18, 18,
       15,  7, 11, 12,  9,  3, 21, 19, 10, 10, 21,  7, 17,  5,  0,  4, 21,
       17,  6,  8,  5,  3,  0, 19, 17,  7, 11, 21,  4,  1, 17, 11,  2,  6,
       20, 13,  5,  4, 11, 14, 13, 10,  3,  6, 15, 21, 11, 11,  2, 15, 17,
        0, 16,  5,  0,  6

In [125]:
# Evaluate the model: share of test rows predicted correctly, as a percentage
accuracy = 100 * accuracy_score(y_test, y_test_pred)
print(f'Accuracy Score = {accuracy:.2f}%')

Accuracy Score = 99.77%


In [133]:
# Hyperparameter tuning for the random forest inside the pipeline.
# Keys use the '<step name>__<parameter>' form, so 'Processor__...' targets the
# RandomForestClassifier step of the pipeline.
param_grid = {
    'Processor__n_estimators': [100, 200, 300],
    'Processor__max_depth': [10, 20, 30, None],
    'Processor__min_samples_split': [2, 5, 10],
    'Processor__min_samples_leaf': [1, 2, 4],
    'Processor__max_features': ['sqrt', 'log2']
}

# NOTE: despite the variable name, this is an exhaustive GridSearchCV, not a
# randomized search: 3 * 4 * 3 * 3 * 2 = 216 candidates, each scored with 5-fold
# cross-validation (1080 fits). n_jobs=-1 spreads those fits over all CPU cores;
# it only affects run time, not which parameters are selected.
randomized_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
randomized_search.fit(X_train, y_train)

# Summarise the winning configuration and its mean cross-validated accuracy
print("Best Estimator:", randomized_search.best_estimator_)
print("Best parameters:", randomized_search.best_params_)
print("Best cross-validation score:", randomized_search.best_score_)

Best Estimator: Pipeline(steps=[('Column_transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('Standard_Scaler',
                                                  StandardScaler(),
                                                  ['Nitrogen_Ratio',
                                                   'Phosphorous_Ratio',
                                                   'Potassium_Ratio',
                                                   'temperature', 'ph',
                                                   'rainfall', 'humidity'])])),
                ('Processor',
                 RandomForestClassifier(max_depth=10, min_samples_split=5,
                                        random_state=5))])
Best parameters: {'Processor__max_depth': 10, 'Processor__max_features': 'sqrt', 'Processor__min_samples_leaf': 1, 'Processor__min_samples_split': 5, 'Processor__n_estimators': 100}
Best cross-validation score: 0.99431818

In [137]:
# Access the best pipeline found by the search.
# The search was created without a `refit` argument, and GridSearchCV's default
# (refit=True) already refits the best estimator on the full data passed to
# fit(), i.e. the training split. It is therefore ready to use as-is and the
# extra fit() call on the same data was redundant.
best_pipeline = randomized_search.best_estimator_

# Predict on the held-out test split with the tuned pipeline
y_test_pred = best_pipeline.predict(X_test)

# Evaluate the tuned model: accuracy as a percentage
accuracy = accuracy_score(y_test, y_test_pred)
accuracy = accuracy * 100
print(f'Accuracy Score = {accuracy:.2f}%')

Accuracy Score = 99.77%


In [143]:
# Save the tuned pipeline and the label encoder.
# Both files are needed together: the pipeline was trained on integer-encoded
# targets, so the encoder is what maps its predictions back to crop names.
joblib.dump(best_pipeline, 'crop recommendation.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [145]:
!pip list

Package                           Version
--------------------------------- ------------------
absl-py                           2.1.0
aext-assistant                    4.0.15
aext-assistant-server             4.0.15
aext-core                         4.0.15
aext-core-server                  4.0.15
aext-panels                       4.0.15
aext-panels-server                4.0.15
aext-share-notebook               4.0.15
aext-share-notebook-server        4.0.15
aext-shared                       4.0.15
aiobotocore                       2.12.3
aiohttp                           3.9.5
aioitertools                      0.7.1
aiosignal                         1.2.0
alabaster                         0.7.16
altair                            5.0.1
anaconda-anon-usage               0.4.4
anaconda-catalogs                 0.2.0
anaconda-client                   1.12.3
anaconda-cloud-auth               0.5.1
anaconda-navigator                2.6.0
anaconda-project                  0.11.1
annotated-ty

