<center><h1>Modelling</h1></center>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from tqdm import tqdm

from sklearn.model_selection import train_test_split, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_curve,
)
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import (
    RandomUnderSampler,
    RepeatedEditedNearestNeighbours,
    TomekLinks,
)

In [2]:
data = Path("./Data_removed_outlier_iqr.csv")

In [3]:
df = pd.read_csv(data)

In [4]:
df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,35280,717.703,264.99525,170.035245,1.558472,0.766994,35772,211.943132,0.703616,0.986246,0.860694,0.7998,0.007511,0.001896,0.63968,0.996923,DERMASON
1,83296,1142.638,446.765889,239.013317,1.869209,0.844861,84270,325.662035,0.702588,0.988442,0.801709,0.728932,0.005364,0.000934,0.531342,0.99319,CALI
2,35594,689.634,254.572928,178.441837,1.426644,0.713214,35966,212.884213,0.811629,0.989657,0.940479,0.836241,0.007152,0.002157,0.699298,0.99765,DERMASON
3,52710,872.7,326.039383,207.39945,1.572036,0.771592,53280,259.06072,0.677419,0.989302,0.869707,0.794569,0.006186,0.001521,0.63134,0.992488,SIRA
4,62855,1004.759,413.879306,194.299306,2.130112,0.882954,63781,282.894807,0.59834,0.985482,0.782395,0.68352,0.006585,0.000887,0.4672,0.995188,HOROZ


## Baseline model

We can start with a simple baseline model: `DecisionTree` as this would not require scaling the data

I have already shuffled the columns so I will directly split them into `TRAIN` and `TEST`. We will further use the `TRAIN` set to do KFold cross validation.

In [5]:
X = df.iloc[:, :16].values
y = df.Class.astype("category").cat.codes.values

In [6]:
X_sclaed = StandardScaler().fit_transform(X)
y_scaled = y

In [7]:
lbl2idx = {}
for k, v in enumerate(df.Class.astype("category").cat.categories):
    lbl2idx[v] = k

In [8]:
lbl2idx

{'BARBUNYA': 0,
 'BOMBAY': 1,
 'CALI': 2,
 'DERMASON': 3,
 'HOROZ': 4,
 'SEKER': 5,
 'SIRA': 6}

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [10]:
dt = DecisionTreeClassifier()

In [11]:
def plot(metrics, metric_name, splits=10):

    splits = list(range(1, splits + 1))
    plt.plot(splits, metrics)
    plt.xlabel("splits")
    plt.ylabel(f"{metric_name}")
    plt.show()

    print(f"Average validation {metric_name}: {sum(metrics)/10}")

In [12]:
kfold = KFold(n_splits=10)
metrics = []

for train_idx, val_idx in tqdm(kfold.split(X_train, y_train), total=10):
    X_t, y_t = X_train[train_idx], y_train[train_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    dt.fit(X_t, y_t)

    y_pred = dt.predict(X_val)

    score = accuracy_score(y_pred, y_val)
    metrics.append(score)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.01it/s]


In [13]:
y_pred_test = dt.predict(X_test)
accuracy_score(y_pred_test, y_test)

0.8782409761057448

In [14]:
scores = np.sum(metrics, axis=0) / 10
for c, score in zip(list(lbl2idx.keys()), scores):
    print(f"Averaged f1-score for class {c}: {score}")

TypeError: 'numpy.float64' object is not iterable

In [None]:
X_sam, y_sam = RandomOverSampler().fit_resample(X_train, y_train)

In [None]:
kf = KFold(n_splits=10, shuffle=True)
metrics = []
for train_idx, val_idx in tqdm(kf.split(X_sam, y_sam), total=kf.get_n_splits()):
    X_train, y_train = X_sam[train_idx], y_sam[train_idx]
    X_val, y_val = X_sam[val_idx], y_sam[val_idx]
    dt.fit(X_train, y_train)

    y_pred = dt.predict(X_val)
    metrics.append(f1_score(y_pred, y_val, average=None))

In [None]:
np.sum(metrics, axis=0) / 10