<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Modelling" data-toc-modified-id="Modelling-1">Modelling</a></span><ul class="toc-item"><li><span><a href="#Baseline-model" data-toc-modified-id="Baseline-model-1.1">Baseline model</a></span></li></ul></li></ul></div>

<center><h1>Modelling</h1></center>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from tqdm import tqdm

from sklearn.model_selection import train_test_split, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
)
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import (
    RandomUnderSampler,
    RepeatedEditedNearestNeighbours,
    TomekLinks,
)

In [None]:
# Path to the input CSV, relative to the notebook's working directory.
# NOTE(review): the filename suggests outliers were already removed with an
# IQR rule upstream — confirm against the preprocessing notebook.
data = Path("./Data_removed_outlier_iqr.csv")

In [None]:
# Load the CSV into a DataFrame; every later cell reads from `df`.
df = pd.read_csv(data)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

## Baseline model

We start with a simple baseline model, a `DecisionTreeClassifier`, because tree-based models do not require the features to be scaled.

The data has already been shuffled, so we split it directly into `TRAIN` and `TEST` sets. The `TRAIN` set is then used for K-fold cross-validation.

In [None]:
# Feature matrix: the first 16 columns of the frame as a NumPy array.
X = df.iloc[:, :16].to_numpy()
# Target vector: integer category codes of the `Class` column.
y = df["Class"].astype("category").cat.codes.to_numpy()

In [None]:
# Standardised copy of the feature matrix.
# NOTE(review): the scaler is fitted on the full dataset; if these values are
# ever used for modelling, fit it on the training split only to avoid leaking
# test-set statistics.
X_scaled = StandardScaler().fit_transform(X)
# Backward-compatible alias for the original, misspelled name.
X_sclaed = X_scaled
# Labels are integer class codes, so they are passed through unchanged.
y_scaled = y

In [None]:
# Map every class label to its position in the categorical's category list.
lbl2idx = {
    label: code
    for code, label in enumerate(df.Class.astype("category").cat.categories)
}

In [None]:
# Display the label -> integer code mapping.
lbl2idx

In [None]:
# Hold out 30% of the samples as the test set. The fixed seed makes the split
# (and every score derived from it) reproducible across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Baseline classifier with default hyper-parameters. The seed is fixed because
# the tree permutes features at each split, so unseeded fits are not repeatable.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
def plot(metrics, metric_name, splits=None):
    """Plot a per-split metric curve and print its average.

    Args:
        metrics: Sequence of metric values, one entry per cross-validation
            split.
        metric_name: Label used for the y-axis and in the printed summary.
        splits: Number of splits. Defaults to ``len(metrics)``; when given it
            must equal the number of metric values.

    Raises:
        ValueError: If ``metrics`` is empty or ``splits`` does not match its
            length.
    """
    n_metrics = len(metrics)
    if n_metrics == 0:
        raise ValueError("metrics must contain at least one value")
    if splits is None:
        splits = n_metrics
    elif splits != n_metrics:
        raise ValueError(
            f"splits={splits} does not match the number of metrics ({n_metrics})"
        )

    # Splits are numbered from 1 on the x-axis.
    split_ids = list(range(1, splits + 1))
    plt.plot(split_ids, metrics)
    plt.xlabel("splits")
    plt.ylabel(f"{metric_name}")
    plt.show()

    # Average over the actual number of values rather than a hard-coded 10.
    print(f"Average validation {metric_name}: {sum(metrics) / n_metrics}")

In [None]:
# 10-fold cross-validation of the baseline tree on the training split,
# collecting the validation accuracy of every fold in `metrics`.
kfold = KFold(n_splits=10)
metrics = []

# The progress-bar total comes from the splitter so it cannot drift from n_splits.
for train_idx, val_idx in tqdm(
    kfold.split(X_train, y_train), total=kfold.get_n_splits()
):
    X_t, y_t = X_train[train_idx], y_train[train_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    # `fit` retrains the tree from scratch on this fold's training part.
    dt.fit(X_t, y_t)

    y_pred = dt.predict(X_val)

    # scikit-learn metrics take (y_true, y_pred).
    score = accuracy_score(y_val, y_pred)
    metrics.append(score)

In [None]:
# Refit on the complete training split before scoring on the held-out test
# set; after a cross-validation loop `dt` only holds the last fold's fit.
dt.fit(X_train, y_train)
y_pred_test = dt.predict(X_test)
# scikit-learn metrics take (y_true, y_pred).
accuracy_score(y_test, y_pred_test)

In [None]:
# Average the collected fold metrics over however many folds were run.
scores = np.mean(metrics, axis=0)
if np.ndim(scores) == 0:
    # `metrics` holds one scalar per fold (e.g. accuracy), so there is no
    # per-class breakdown to iterate over.
    print(f"Average validation score: {scores}")
else:
    # `metrics` holds one per-class array per fold (e.g. f1 with average=None).
    for c, score in zip(lbl2idx, scores):
        print(f"Averaged f1-score for class {c}: {score}")

In [None]:
# Balance the classes by randomly duplicating minority-class samples
# (seeded so the resampled set is reproducible).
# NOTE(review): resampling before the K-fold split lets copies of the same
# sample land in both the training and validation folds, which inflates the
# validation scores; consider resampling inside each fold instead.
X_sam, y_sam = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)

In [None]:
# 10-fold cross-validation on the oversampled training data, collecting the
# per-class F1 scores of every fold. The shuffle is seeded for reproducibility.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
metrics = []
for train_idx, val_idx in tqdm(kf.split(X_sam, y_sam), total=kf.get_n_splits()):
    # Fold-local names: the notebook-level X_train / y_train must not be
    # overwritten, otherwise re-running earlier cells uses a fold subset.
    X_t, y_t = X_sam[train_idx], y_sam[train_idx]
    X_val, y_val = X_sam[val_idx], y_sam[val_idx]
    dt.fit(X_t, y_t)

    y_pred = dt.predict(X_val)
    # scikit-learn metrics take (y_true, y_pred); average=None yields one
    # F1 value per class.
    metrics.append(f1_score(y_val, y_pred, average=None))

In [None]:
# Mean of the fold metrics along the fold axis; uses the actual number of
# folds instead of a hard-coded 10.
np.mean(metrics, axis=0)