In [2]:
import csv
import os
from datetime import datetime

import pandas as pd
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier

from get_data import load_data
from utils import calculate_metrics

# --- Experiment configuration -------------------------------------------
# Name of the dataset handed to load_data.
datasource = "credit"
# Imbalance rate handed to load_data.
imb_rate = 0.01
# Labels treated as the minority / majority classes.
min_class = [1]
maj_class = [0]

# CSV header for the per-repetition metrics log.
columns = [
    "gmean", "fmeasure", "MCC", "MAE", "precision", "recall",
    "TP", "TN", "FP", "FN",
]

# Proportion of majority to minority rows. Hard-coded; it was derived as
#   (y_train.shape[0] - y_train.sum()) // y_train.sum()
scale = 578

In [3]:
def repeat_experiment(model, modelname: str, n_repetition: int=100, log_every: int=10):
    """Repeatedly load data, fit `model`, and log test-set metrics to a CSV file.

    Every repetition calls `load_data` again (presumably yielding a new split;
    confirm in `get_data`), refits `model` on the training part, predicts on the
    test part, and writes the dictionary returned by `calculate_metrics` as one
    row of `./logs_alt/{modelname}_{YYYYmmdd_HHMMSS}.csv`.

    Args:
        model: Estimator exposing `fit(X, y)` and `predict(X)`.
        modelname: Prefix used for the log file name.
        n_repetition: Number of load/fit/evaluate repetitions.
        log_every: Print the FN/FP counts every `log_every` repetitions;
            0 disables progress printing.
    """
    log_dir = "./logs_alt"
    # open() does not create parent directories; make sure the log folder exists
    # so a fresh checkout does not fail before the first repetition.
    os.makedirs(log_dir, exist_ok=True)
    log_path = f"{log_dir}/{modelname}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

    with open(log_path, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writeheader()

        for i in range(n_repetition):
            # The last two return values (validation split) are not used here.
            X_train, y_train, X_test, y_test, _, _ = load_data(datasource, imb_rate, min_class, maj_class, print_stats=False)
            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            stats = calculate_metrics(y_test, y_pred)  # Get stats as dictionary
            writer.writerow(stats)  # Write dictionary as row

            # Guard against log_every == 0, which would raise ZeroDivisionError.
            if log_every and i % log_every == 0:
                print(f"{i}: FN: {stats.get('FN')}, FP: {stats.get('FP')}")

In [4]:
# XGBoost run: the positive class is weighted by `scale`.
# NOTE(review): "binary:logitraw" emits raw margins, so predict() may be
# thresholding margins rather than probabilities. Confirm this is intended.
model = XGBClassifier(
    objective="binary:logitraw",
    scale_pos_weight=scale,
    eval_metric="aucpr",
    max_delta_step=1,
)
repeat_experiment(model, modelname="XGB")

0: FN: 17, FP: 11
10: FN: 17, FP: 8
20: FN: 14, FP: 8
30: FN: 16, FP: 12
40: FN: 15, FP: 11
50: FN: 17, FP: 11
60: FN: 17, FP: 6
70: FN: 16, FP: 8
80: FN: 17, FP: 7
90: FN: 15, FP: 7


In [5]:
# Logistic-regression run: class 1 is weighted by scale // 24.
model = LogisticRegression(
    C=0.01,
    max_iter=1_000,
    class_weight={1: scale // 24},
)
repeat_experiment(model, modelname="LR")

0: FN: 14, FP: 53
10: FN: 12, FP: 58
20: FN: 14, FP: 55
30: FN: 14, FP: 57
40: FN: 13, FP: 50
50: FN: 14, FP: 55
60: FN: 12, FP: 59
70: FN: 14, FP: 52
80: FN: 15, FP: 57
90: FN: 13, FP: 53


In [6]:
# SGD run with logistic loss: class 1 is weighted by scale // 24.
# NOTE(review): loss="log" was renamed "log_loss" in scikit-learn 1.1 and
# removed in 1.3. Switch the string when running on a newer release.
model = SGDClassifier(
    loss="log",
    alpha=0.001,
    max_iter=10_000,
    early_stopping=True,
    class_weight={1: scale // 24},
)
repeat_experiment(model, modelname="SGD")

0: FN: 15, FP: 48
10: FN: 12, FP: 6511
20: FN: 18, FP: 44
30: FN: 21, FP: 31
40: FN: 19, FP: 50
50: FN: 25, FP: 39
60: FN: 20, FP: 49
70: FN: 21, FP: 39
80: FN: 18, FP: 39
90: FN: 18, FP: 43


In [9]:
# Metric logs to summarise; the timestamps in the names are specific to one run.
fn_xgb = "./logs_alt/XGB_20200923_142438.csv"
fn_lr = "./logs_alt/LR_20200923_144822.csv"
fn_sgd = "./logs_alt/SGD_20200923_150542.csv"

# For each log print: mean FP, mean FN, mean f-measure (rounded to 6 decimals).
for fn in [fn_xgb, fn_lr, fn_sgd]:
    df = pd.read_csv(fn, sep=",")
    mean_fp = df["FP"].mean()
    mean_fn = df["FN"].mean()
    mean_fmeasure = df["fmeasure"].mean()
    print(mean_fp, mean_fn, mean_fmeasure.round(6))

9.19 16.39 0.865271
56.21 13.74 0.718327
793.13 19.89 0.699466
