In [1]:
# make the project root importable by adding it to sys.path
import copy
import sys
from pathlib import Path
# The project root is assumed to sit two levels above the current working directory.
project_root = Path.cwd().parent.parent.resolve()
project_path = str(project_root)
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)
    
# imports
from common.utils import get_data, get_preprocessor
from common.custom_logistic_regression import CustomLogisticRegressionMulticlass

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the table and separate the "Target" label column from the feature columns.
data = get_data()
y = data["Target"]
X = data.drop(columns=["Target"])

# Columns are handed to the preprocessor by dtype: numeric vs. object.
numerical_column_names = X.select_dtypes(include=["number"]).columns.to_list()
categorical_column_names = X.select_dtypes(include=["object"]).columns.to_list()
preprocessor = get_preprocessor(numerical_column_names, categorical_column_names)

# NOTE(review): the preprocessor is fitted on every row before the K-fold split
# below, so anything it learns from the data also sees the rows later used as
# test folds. Consider fitting it on each training fold only.
X = preprocessor.fit_transform(X)

In [3]:
# Shuffled 5-fold split with a fixed seed; the fold lists built here are reused
# by every experiment cell below.
kfold = KFold(n_splits=5, shuffle=True, random_state=6)

# Each list receives one (X_train, X_test, y_train, y_test) tuple per fold.
datasets, datasets_oversampled, datasets_undersampled = [], [], []

for train_indices, test_indices in kfold.split(X, y):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

    # Variant 1: the training part exactly as split.
    datasets.append((X_train, X_test, y_train, y_test))

    # Variant 2: training part resampled with SMOTE; the test part is left as is.
    smote = SMOTE(random_state=6)
    X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
    datasets_oversampled.append(
        (X_train_oversampled, X_test, y_train_oversampled, y_test)
    )

    # Variant 3: training part resampled with RandomUnderSampler; test part left as is.
    rus = RandomUnderSampler(random_state=6)
    X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
    datasets_undersampled.append(
        (X_train_under, X_test, y_train_under, y_test)
    )

In [4]:
def _fold_metrics(y_true, y_pred):
    """Return accuracy and weighted precision/recall/F1 for one set of predictions."""
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, average="weighted"),
        "Recall": recall_score(y_true, y_pred, average="weighted"),
        "F1": f1_score(y_true, y_pred, average="weighted"),
    }


def train_and_pred(model, datasets):
    """Fit ``model`` on every fold and collect train/test classification metrics.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Used as an unfitted template. A deep copy is trained for each fold, so
        no state kept by ``fit`` can carry over from one fold to the next, and
        the object passed in is left untouched.
    datasets : iterable of ``(X_train, X_test, y_train, y_test)``
        One tuple per fold.

    Returns
    -------
    tuple
        ``(train_table, test_table, mean_train_accuracy, mean_test_accuracy)``.
        Each table is a DataFrame with one row per metric (Accuracy, Precision,
        Recall, F1) and one column per fold. The two means are ``np.mean`` over
        the per-fold accuracies.
    """
    results_train = {"Accuracy": [], "Precision": [], "Recall": [], "F1": []}
    results_test = {"Accuracy": [], "Precision": [], "Recall": [], "F1": []}

    for X_train, X_test, y_train, y_test in datasets:
        # Train a fresh copy per fold: refitting one shared instance would let
        # any state retained by fit() leak from earlier folds into later ones.
        fold_model = copy.deepcopy(model)
        fold_model.fit(X_train, y_train)

        # Score the training part first, then the held-out part.
        for results, X_part, y_part in (
            (results_train, X_train, y_train),
            (results_test, X_test, y_test),
        ):
            scores = _fold_metrics(y_part, fold_model.predict(X_part))
            for metric_name, value in scores.items():
                results[metric_name].append(value)

    return (
        pd.DataFrame(results_train).T,
        pd.DataFrame(results_test).T,
        np.mean(results_train["Accuracy"]),
        np.mean(results_test["Accuracy"]),
    )
    
    

# Sklearn - normal

In [5]:
# scikit-learn logistic regression trained on the folds without any resampling.
model = LogisticRegression(max_iter=200)
results_train, results_test, mean_train, mean_test = train_and_pred(
    model=model, datasets=datasets
)

# Mean accuracy over the folds, followed by the metric table (rows: metrics, columns: folds).
print(f"Train (accuracy mean: {mean_train})")
display(results_train)
print(f"Test (accuracy mean: {mean_test})")
display(results_test)

Train (accuracy mean: 0.809618312811401)


Unnamed: 0,0,1,2,3,4
Accuracy,0.809833,0.813224,0.810116,0.810964,0.803955
Precision,0.800962,0.80463,0.802034,0.802076,0.79491
Recall,0.809833,0.813224,0.810116,0.810964,0.803955
F1,0.79958,0.802943,0.800515,0.800923,0.793358


Test (accuracy mean: 0.7773548585014188)


Unnamed: 0,0,1,2,3,4
Accuracy,0.778531,0.768362,0.776271,0.768362,0.795249
Precision,0.766319,0.757346,0.762015,0.754844,0.782902
Recall,0.778531,0.768362,0.776271,0.768362,0.795249
F1,0.769135,0.751287,0.764688,0.753869,0.783462


# Sklearn - oversampled

In [6]:
# scikit-learn logistic regression trained on the SMOTE-resampled training folds.
model = LogisticRegression(max_iter=200)
results_train, results_test, mean_train, mean_test = train_and_pred(
    model=model, datasets=datasets_oversampled
)

# Mean accuracy over the folds, followed by the metric table (rows: metrics, columns: folds).
print(f"Train (accuracy mean: {mean_train})")
display(results_train)
print(f"Test (accuracy mean: {mean_test})")
display(results_test)

Train (accuracy mean: 0.7933596917211861)


Unnamed: 0,0,1,2,3,4
Accuracy,0.792374,0.804892,0.786727,0.795523,0.787282
Precision,0.795439,0.807743,0.78901,0.798034,0.789621
Recall,0.792374,0.804892,0.786727,0.795523,0.787282
F1,0.792697,0.80526,0.786863,0.795788,0.787625


Test (accuracy mean: 0.7561093642150472)


Unnamed: 0,0,1,2,3,4
Accuracy,0.744633,0.753672,0.748023,0.750282,0.783937
Precision,0.775777,0.76036,0.773131,0.76993,0.801843
Recall,0.744633,0.753672,0.748023,0.750282,0.783937
F1,0.756174,0.756608,0.7571,0.756787,0.790232


# Sklearn - undersampled

In [7]:
# scikit-learn logistic regression trained on the randomly undersampled training folds.
model = LogisticRegression(max_iter=200)
results_train, results_test, mean_train, mean_test = train_and_pred(
    model=model, datasets=datasets_undersampled
)

# Mean accuracy over the folds, followed by the metric table (rows: metrics, columns: folds).
print(f"Train (accuracy mean: {mean_train})")
display(results_train)
print(f"Test (accuracy mean: {mean_test})")
display(results_test)

Train (accuracy mean: 0.7735843401670015)


Unnamed: 0,0,1,2,3,4
Accuracy,0.778646,0.764423,0.776242,0.777778,0.770833
Precision,0.78063,0.767293,0.778234,0.778008,0.771769
Recall,0.778646,0.764423,0.776242,0.777778,0.770833
F1,0.778484,0.764491,0.775854,0.777254,0.770091


Test (accuracy mean: 0.7346343022215405)


Unnamed: 0,0,1,2,3,4
Accuracy,0.718644,0.735593,0.733333,0.728814,0.756787
Precision,0.765266,0.750526,0.766044,0.759903,0.792274
Recall,0.718644,0.735593,0.733333,0.728814,0.756787
F1,0.734879,0.741603,0.745309,0.738875,0.768336


# Custom - normal

In [8]:
# Project-local logistic regression trained on the folds without any resampling.
model = CustomLogisticRegressionMulticlass(
    epochs=600,
    batch_size=256,
    learning_rate=0.01,
)
results_train, results_test, mean_train, mean_test = train_and_pred(
    model=model, datasets=datasets
)

# Mean accuracy over the folds, followed by the metric table (rows: metrics, columns: folds).
print(f"Train (accuracy mean: {mean_train})")
display(results_train)
print(f"Test (accuracy mean: {mean_test})")
display(results_test)

Train (accuracy mean: 0.7818718460799198)


Unnamed: 0,0,1,2,3,4
Accuracy,0.778468,0.785533,0.782142,0.785533,0.777684
Precision,0.761711,0.77232,0.769103,0.769808,0.76249
Recall,0.778468,0.785533,0.782142,0.785533,0.777684
F1,0.753492,0.763625,0.761939,0.763464,0.754199


Test (accuracy mean: 0.7723838740189686)


Unnamed: 0,0,1,2,3,4
Accuracy,0.772881,0.762712,0.764972,0.762712,0.798643
Precision,0.758298,0.743379,0.738859,0.747045,0.788379
Recall,0.772881,0.762712,0.764972,0.762712,0.798643
F1,0.750831,0.735891,0.742843,0.735855,0.778375


# Custom - oversampled

In [9]:
# Project-local logistic regression trained on the SMOTE-resampled training folds.
model = CustomLogisticRegressionMulticlass(
    epochs=600,
    batch_size=256,
    learning_rate=0.01,
)
results_train, results_test, mean_train, mean_test = train_and_pred(
    model=model, datasets=datasets_oversampled
)

# Mean accuracy over the folds, followed by the metric table (rows: metrics, columns: folds).
print(f"Train (accuracy mean: {mean_train})")
display(results_train)
print(f"Test (accuracy mean: {mean_test})")
display(results_test)

Train (accuracy mean: 0.7522334144874236)


Unnamed: 0,0,1,2,3,4
Accuracy,0.751227,0.756162,0.751182,0.75602,0.746577
Precision,0.75454,0.759531,0.754396,0.759726,0.749122
Recall,0.751227,0.756162,0.751182,0.75602,0.746577
F1,0.750582,0.755528,0.750819,0.755829,0.745779


Test (accuracy mean: 0.7531709998210496)


Unnamed: 0,0,1,2,3,4
Accuracy,0.740113,0.760452,0.741243,0.742373,0.781674
Precision,0.766716,0.763318,0.766411,0.761372,0.800966
Recall,0.740113,0.760452,0.741243,0.742373,0.781674
F1,0.750098,0.761057,0.750422,0.748486,0.788213


# Custom - undersampled

In [10]:
# Project-local logistic regression trained on the randomly undersampled training folds.
model = CustomLogisticRegressionMulticlass(
    epochs=600,
    batch_size=256,
    learning_rate=0.01,
)
results_train, results_test, mean_train, mean_test = train_and_pred(
    model=model, datasets=datasets_undersampled
)

# Mean accuracy over the folds, followed by the metric table (rows: metrics, columns: folds).
print(f"Train (accuracy mean: {mean_train})")
display(results_train)
print(f"Test (accuracy mean: {mean_test})")
display(results_test)

Train (accuracy mean: 0.7225917395019488)


Unnamed: 0,0,1,2,3,4
Accuracy,0.720313,0.714209,0.72299,0.727322,0.728125
Precision,0.723128,0.716801,0.725866,0.728639,0.729998
Recall,0.720313,0.714209,0.72299,0.727322,0.728125
F1,0.718748,0.71276,0.721826,0.725103,0.725405


Test (accuracy mean: 0.7425459518879259)


Unnamed: 0,0,1,2,3,4
Accuracy,0.738983,0.748023,0.736723,0.723164,0.765837
Precision,0.772338,0.752986,0.760933,0.745541,0.790239
Recall,0.738983,0.748023,0.736723,0.723164,0.765837
F1,0.750807,0.749596,0.745655,0.730959,0.774052
