In [None]:
import pandas as pd

import seaborn as sns
import numpy as np

# STEP 1: READING AND PROCESSING DATA
Dataset used: https://www.kaggle.com/datasets/atharvaingle/crop-recommendation-dataset

In [None]:
# Read the crop-recommendation CSV from the local data directory.
csv_path = "./data/crop.rec.csv"
dataset = pd.read_csv(csv_path)
# Bare expression so the notebook renders the loaded frame.
dataset

In [None]:
# Count the missing entries in each column, then total them over the frame.
nulls_per_column = dataset.isna().sum()
nulls_per_column.sum()

There are no null values!

In [None]:
# Encode every distinct value of the "label" column as an integer class id.
#
# The unique values are sorted so that the id assigned to each label is the
# same on every run: iterating a set of strings follows hash order, which
# Python randomises per process, so a set-based ordering can change between
# kernel restarts (and with it every downstream number that uses the ids).
labels_unique = sorted(dataset["label"].unique())

# Dict lookup instead of list.index(): one pass over the labels rather than
# a linear search for every row.
label_to_id = {name: idx for idx, name in enumerate(labels_unique)}
dataset["label"] = dataset["label"].map(label_to_id)
dataset

In [None]:
# 80/20 split: sample 80% of the rows for training (fixed seed so the same
# rows are chosen each run) and keep every row whose index was not sampled
# as the test set.
train_dataset = dataset.sample(frac=0.8, random_state=42)
test_dataset = dataset.loc[~dataset.index.isin(train_dataset.index)]

# STEP 2: FEATURE VISUALISATION/SELECTION

In [None]:
# Pairwise scatter plots of the selected training features, with a KDE on
# the diagonal, followed by summary statistics (one row per column).
# NOTE(review): presumably the dataset also has a "P" column that is not
# plotted here -- confirm whether leaving it out is intentional.
pairplot_columns = ["N", "K", "temperature", "humidity", "ph", "rainfall"]
sns.pairplot(train_dataset[pairplot_columns], diag_kind="kde")
train_dataset.describe().T

In [None]:
# Classification features: every column except the encoded "label" target.
# drop() returns a new frame, so the split frames themselves are untouched.
X_train = train_dataset.drop(columns="label")
X_test = test_dataset.drop(columns="label")

In [None]:
# Classification targets: an independent copy of the encoded "label" column.
Y_train = train_dataset["label"].copy()
Y_test = test_dataset["label"].copy()

## Normalizing the Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train

# STEP 3: MODEL SELECTION

In [None]:
### INITIALIZATION ###

from sklearn.metrics import mean_absolute_error, mean_squared_error
import time


def run_model(MODEL: type):
    """Fit a fresh ``MODEL()`` on the current train split and evaluate it.

    The function reads the notebook-level ``X_train``, ``Y_train``,
    ``X_test`` and ``Y_test``, so its result depends on which
    data-preparation cells were executed most recently.

    Args:
        MODEL: An estimator class (not an instance) that can be built with
            no arguments and provides ``fit``, ``predict`` and ``score``.

    Returns:
        dict with four entries:
            "RMSE":  root mean squared error of the test predictions.
            "MAE":   mean absolute error of the test predictions.
            "score": whatever ``MODEL().score(X_test, Y_test)`` returns
                     (by scikit-learn convention: mean accuracy for
                     classifiers, R^2 for regressors).
            "time":  wall-clock seconds spent on fit + predict + score.
    """
    begin = time.time()

    m = MODEL()
    m.fit(X_train, Y_train)

    # Predict once and reuse the result for both error metrics.
    predictions = m.predict(X_test)

    ret = {}
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword is deprecated and later removed in newer scikit-learn
    # releases, while plain MSE works on every version.
    # Arguments are passed in the documented (y_true, y_pred) order.
    ret["RMSE"] = mean_squared_error(Y_test, predictions) ** 0.5
    ret["MAE"] = mean_absolute_error(Y_test, predictions)
    ret["score"] = m.score(X_test, Y_test)
    ret["time"] = time.time() - begin

    return ret

# 3.1 TESTING VARIOUS CLASSIFICATION MODELS

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Classifier classes to compare; run_model() instantiates each one with its
# default hyper-parameters.
clf_models = [
    SVC,
    KNeighborsClassifier,
    RandomForestClassifier,
    DecisionTreeClassifier,
]

for ModelType in clf_models:
    print(f"> testing model: {ModelType.__name__}")
    scores = run_model(ModelType)
    # Each label now prints its own metric (the MAE and RMSE values were
    # previously shown under each other's names).
    print(f"  MAE:        {scores['MAE']}")
    print(f"  RMSE:       {scores['RMSE']}")
    # run_model's "score" is a fraction; scale it to a percentage.
    print(f"  accuracy:   {scores['score'] * 100: .3}%")
    print(f"  time taken: {scores['time']:.3}s")

# 3.2 TESTING REGRESSION MODELS: TODO (DEBUGGING)

In [None]:
### SOME SETUP ###
# Regression task: predict the soil pH from the remaining columns.
# This rebinds X_train / X_test, the globals that run_model() reads, so from
# here on run_model() evaluates on the regression data.
# NOTE(review): the integer-encoded "label" column stays in the feature set
# as an ordinary numeric feature -- confirm that this is intended.
X_train = train_dataset.drop(columns="ph")
X_test = test_dataset.drop(columns="ph")

In [None]:
# Regression targets: an independent copy of the "ph" column of each split.
Y_train = train_dataset["ph"].copy()
Y_test = test_dataset["ph"].copy()

In [None]:
# Fresh scaler for the regression features: fitted on the training split
# only, then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train

In [None]:
from sklearn.linear_model import BayesianRidge, ARDRegression

# Regressor classes to compare; run_model() instantiates each one with its
# default hyper-parameters.
rg_models = [
    BayesianRidge,
    ARDRegression,
]

for ModelType in rg_models:
    print(f"> testing model: {ModelType.__name__}")
    scores = run_model(ModelType)
    # Each label now prints its own metric (the MAE and RMSE values were
    # previously shown under each other's names).
    print(f"  MAE:        {scores['MAE']}")
    print(f"  RMSE:       {scores['RMSE']}")
    # For scikit-learn regressors .score() is the coefficient of
    # determination (R^2), not an accuracy: it can be close to 0 (or even
    # negative) while MAE/RMSE are small in absolute terms.
    print(f"  R^2:        {scores['score']}")
    print(f"  time taken: {scores['time']:.3}s")

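^ Low error but low score. Note that for regressors `score` is R² (the coefficient of determination), not accuracy, so it can be low even when MAE/RMSE look small; still, check for bugs...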

# TODO:
1. *Regression* - Fix regression error.
2. *LSTM Cell* - Helpful for predicting given a range.

In [None]:
### TESTS ###
# Fit a single BayesianRidge on the current (regression) training split so
# its predictions can be inspected by hand.
reg = BayesianRidge()

reg.fit(X=X_train, y=Y_train)

In [None]:
# Show the model's test-set predictions next to the true pH values.
predicted_ph = reg.predict(X_test)
(predicted_ph, Y_test)