# ml_models.ipynb
- Implementation of Logistic Regression, Random Forest Classifier, Decision Tree Classifier, Gradient Boosting Classifier.
- The notebook evaluates and compares these models on three different datasets (dataset_v1, dataset_v2, and dataset_v3).
- For each fold of the K-Fold and Stratified K-Fold cross-validation, the notebook trains each model on the training split and evaluates its performance on the test split.

In [None]:
import os
import sys
from dotenv import load_dotenv

# sklearn
from sklearn.linear_model import LogisticRegression  # type:ignore
from sklearn.ensemble import (  # type:ignore
    RandomForestClassifier,
    GradientBoostingClassifier,
)
from sklearn.tree import DecisionTreeClassifier  # type:ignore
from sklearn.svm import SVC, LinearSVC  # type:ignore
from sklearn.model_selection import KFold, StratifiedKFold  # type:ignore
from sklearn.metrics import f1_score  # type:ignore
from sklearn.metrics import roc_curve, auc  # type:ignore

import numpy as np
import pandas as pd
import sqlalchemy as sq

# Matplotlib
from matplotlib import pyplot as plt  # type: ignore

sys.path.append("../")
from Shared.DataService import DataService

from Datasets.DataCreation import getDatasetV1, getDatasetV2, getDatasetV3

Purpose:
- The purpose of the provided code is to facilitate the initialization of various machine learning classifiers based on a given model type.

Pseudocode:
- The provided code defines a dictionary called model_dict, which maps strings to corresponding machine learning classifier classes. 
- It also defines a function called model_initializer, which initializes and returns an instance of a specified classifier based on the given model_type.

In [None]:
# Registry mapping a model-type name to the classifier class that implements
# it.  The keys are the names accepted by ``model_initializer``.
model_dict = dict(
    logistic_regression=LogisticRegression,
    random_forest=RandomForestClassifier,
    decision_tree=DecisionTreeClassifier,
    gradient_boost=GradientBoostingClassifier,
    # Currently disabled:
    # svc=SVC,
    # linear_svc=LinearSVC,
)


def model_initializer(model_type: str, random_state: int = 42, max_depth: int = 2):
    """Instantiate the classifier registered under ``model_type``.

    Args:
        model_type: Key into ``model_dict`` selecting the classifier class.
        random_state: Seed forwarded to the classifier's constructor.
        max_depth: Tree depth limit; only forwarded when ``model_type`` is
            ``"random_forest"`` and ignored for every other model type.

    Returns:
        A new, unfitted instance of the selected classifier.

    Raises:
        KeyError: If ``model_type`` is not a key of ``model_dict``.
    """
    # NOTE: network models, if ever added, would need a different
    # initialization path (training/test data passed in); none exist yet.
    estimator_cls = model_dict[model_type]
    init_kwargs = {"random_state": random_state}
    if model_type == "random_forest":
        init_kwargs["max_depth"] = max_depth
    return estimator_cls(**init_kwargs)

In [None]:
# Load dataset v1 through the project helper from Datasets.DataCreation.
# NOTE(review): the meaning of the ``None`` argument is defined by that helper
# and is not visible here -- confirm against its signature.
dataset_v1 = getDatasetV1(None)
# Bare expression: the notebook renders it as the cell's output.
dataset_v1

In [None]:
# Load dataset v2 through the project helper from Datasets.DataCreation.
# NOTE(review): the meaning of the ``None`` argument is defined by that helper
# and is not visible here -- confirm against its signature.
dataset_v2 = getDatasetV2(None)
# Bare expression: the notebook renders it as the cell's output.
dataset_v2

In [None]:
# Load dataset v3 through the project helper from Datasets.DataCreation.
# NOTE(review): the meaning of the ``None`` argument is defined by that helper
# and is not visible here -- confirm against its signature.
dataset_v3 = getDatasetV3(None)
# Bare expression: the notebook renders it as the cell's output.
dataset_v3

Purpose:
- The purpose of the provided code is to evaluate multiple machine learning models using k-fold cross-validation. 

Pseudocode:
- It takes input features X and corresponding labels y and performs k-fold cross-validation on the data. 
- For each fold, it trains different machine learning classifiers from the model_dict, evaluates their performance using various metrics.

In [None]:
def evaluate_models_kfold(X: np.ndarray, y: np.ndarray, numOfFolds: int) -> None:
    """Train and score every classifier in ``model_dict`` with K-fold CV.

    For each fold, every registered model is fitted on the training split and
    its mean accuracy, F1 score and ROC AUC on the held-out split are printed.

    Args:
        X: Feature array; samples are selected by indexing its first axis.
        y: Labels aligned with the samples of ``X``.  ``f1_score`` and
            ``roc_curve`` are called with their defaults, which expect a
            binary target.
        numOfFolds: Number of folds, passed to ``KFold(n_splits=...)``.

    Returns:
        None.  All results are printed.
    """
    # TODO: KFold does not shuffle by default, so folds are consecutive slices
    # of the data; the splitter still needs to be changed so that outliers
    # cannot all fall into one fold.
    kf = KFold(n_splits=numOfFolds)
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        print(f"Fold {i}: ")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for model_type in model_dict:
            model = model_initializer(model_type).fit(X_train, y_train.squeeze())
            y_pred = model.predict(X_test)
            # ``score`` on a classifier is the mean accuracy, not r^2, so the
            # message is labelled accordingly.
            print(
                f"model type: {model_type}, and its accuracy is : {model.score(X_test, y_test)} and f1_score is: {f1_score(y_test, y_pred)}"
            )
            # A ROC curve needs continuous scores; hard 0/1 predictions reduce
            # it to a single operating point.  Prefer class probabilities,
            # then decision values, and only fall back to the predicted labels
            # when the model offers neither.
            if hasattr(model, "predict_proba"):
                # Columns follow ``model.classes_`` (sorted), so the last one
                # is the positive class of a binary target.
                y_score = model.predict_proba(X_test)[:, -1]
            elif hasattr(model, "decision_function"):
                y_score = model.decision_function(X_test)
            else:
                y_score = y_pred
            fpr, tpr, _ = roc_curve(y_test, y_score)
            auc_ = auc(fpr, tpr)
            # plt.plot(fpr, tpr, marker=".")
            print("auc score: ", auc_)
        print(f"End the fold {i}\n")

Purpose:
- The purpose of the provided code is to evaluate multiple machine learning models using stratified k-fold cross-validation. 

Pseudocode:
- It takes input features X and corresponding labels y as NumPy arrays and performs stratified k-fold cross-validation on the data. 
- For each fold, it trains different machine learning classifiers from the model_dict, evaluates their performance using various metrics.

In [None]:
def evaluate_models_stratifiedKFold(
    X: np.ndarray, y: np.ndarray, numOfFolds: int
) -> None:
    """Train and score every classifier in ``model_dict`` with stratified K-fold CV.

    The folds are produced by ``StratifiedKFold.split(X, y)``, i.e. the labels
    are taken into account when splitting.  For each fold, every registered
    model is fitted on the training split and its mean accuracy, F1 score and
    ROC AUC on the held-out split are printed.

    Args:
        X: Feature array; samples are selected by indexing its first axis.
        y: Labels aligned with the samples of ``X``.  ``f1_score`` and
            ``roc_curve`` are called with their defaults, which expect a
            binary target.
        numOfFolds: Number of folds, passed to
            ``StratifiedKFold(n_splits=...)``.

    Returns:
        None.  All results are printed.
    """
    # TODO: shuffling is disabled, so the split is deterministic; the splitter
    # still needs to be changed so that outliers cannot all fall into one
    # fold.
    kf = StratifiedKFold(n_splits=numOfFolds, random_state=None, shuffle=False)
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):
        print(f"Fold {i}: ")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for model_type in model_dict:
            model = model_initializer(model_type).fit(X_train, y_train.squeeze())
            y_pred = model.predict(X_test)
            # ``score`` on a classifier is the mean accuracy, not r^2, so the
            # message is labelled accordingly.
            print(
                f"model type: {model_type}, and its accuracy is : {model.score(X_test, y_test)} and f1_score is: {f1_score(y_test, y_pred)}"
            )
            # A ROC curve needs continuous scores; hard 0/1 predictions reduce
            # it to a single operating point.  Prefer class probabilities,
            # then decision values, and only fall back to the predicted labels
            # when the model offers neither.
            if hasattr(model, "predict_proba"):
                # Columns follow ``model.classes_`` (sorted), so the last one
                # is the positive class of a binary target.
                y_score = model.predict_proba(X_test)[:, -1]
            elif hasattr(model, "decision_function"):
                y_score = model.decision_function(X_test)
            else:
                y_score = y_pred
            fpr, tpr, _ = roc_curve(y_test, y_score)
            auc_ = auc(fpr, tpr)
            # plt.plot(fpr, tpr, marker=".")
            print("auc score: ", auc_)
        print(f"End the fold {i}\n")

Purpose :

- The purpose of this code is to experiment on different datasets with different models.

Pseudocode :
- This code first prepares the input features and target labels from the dataset. 
- It proceeds to evaluate multiple machine learning models using both k-fold cross-validation and stratified k-fold cross-validation with 5 folds on different datasets.

# Experimenting on dataset v1

In [None]:
# Features of dataset v1: every column except the "has_ergot" label and
# "year"; the label vector is the "has_ergot" column.  Both are converted to
# NumPy arrays with length-1 axes removed.
X = np.squeeze(np.array(dataset_v1.drop(columns=["has_ergot", "year"])))
y = np.squeeze(np.array(dataset_v1["has_ergot"]))

# Print the arrays and their shapes as a quick sanity check.
print("X data: ", X)
print("X shape: {} \n".format(X.shape))
print("y data: ", y)
print("y shape: ", y.shape)

### K-Fold Cross Validation

In [None]:
# 5-fold cross-validation of all registered models on X, y.
evaluate_models_kfold(X=X, y=y, numOfFolds=5)

### Stratified KFold

Note :
- Plain K-fold does not take the output variable into account when splitting the data. For example, with ten samples of which 5 have incidence = True and 5 have incidence = False, K-fold can put all positive samples (incidence = True) in one bin and all negative samples in another. To avoid that, we can use Stratified K-fold, which approximately preserves the class proportions of the original dataset in each fold.

In [None]:
# Stratified 5-fold cross-validation of all registered models on X, y.
evaluate_models_stratifiedKFold(X=X, y=y, numOfFolds=5)

# Experimenting on dataset v2

In [None]:
# Features of dataset v2: every column except the "has_ergot" label and
# "year"; the label vector is the "has_ergot" column.  Both are converted to
# NumPy arrays with length-1 axes removed.
X_2 = np.array(dataset_v2.drop(columns=["has_ergot", "year"])).squeeze()
y_2 = np.array(dataset_v2["has_ergot"]).squeeze()

# Print the v2 arrays themselves (X_2, not the other dataset's X) so the data
# shown matches the shape reported on the next line.
print("X data: ", X_2)
print("X shape: {} \n".format(X_2.shape))
print("y data: ", y_2)
print("y shape: ", y_2.shape)

In [None]:
# 5-fold cross-validation of all registered models on X_2, y_2.
evaluate_models_kfold(X=X_2, y=y_2, numOfFolds=5)

In [None]:
# Stratified 5-fold cross-validation of all registered models on X_2, y_2.
evaluate_models_stratifiedKFold(X=X_2, y=y_2, numOfFolds=5)

# Experimenting on dataset v3

In [None]:
# Features of dataset v3: every column except the "incidence" label and
# "year"; the label vector is the "incidence" column.  Both are converted to
# NumPy arrays with length-1 axes removed.
X_3 = np.squeeze(np.array(dataset_v3.drop(columns=["incidence", "year"])))
y_3 = np.squeeze(np.array(dataset_v3["incidence"]))

# Print the arrays and their shapes as a quick sanity check.
print("X data: ", X_3)
print("X shape: {} \n".format(X_3.shape))
print("y data: ", y_3)
print("y shape: ", y_3.shape)

In [None]:
# 5-fold cross-validation of all registered models on X_3, y_3.
evaluate_models_kfold(X=X_3, y=y_3, numOfFolds=5)

In [None]:
# Stratified 5-fold cross-validation of all registered models on X_3, y_3.
evaluate_models_stratifiedKFold(X=X_3, y=y_3, numOfFolds=5)