In [1]:
import pandas as pd
import numpy as np
import warnings
from collections import Counter

warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, fbeta_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings

import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)
from optuna.samplers import TPESampler

from utils import get_subburst_preserved_train_test, lee_liu_score

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
import matplotlib.pyplot as plt

## Preprocessing


In [2]:
df = pd.read_csv("./data/features_extracted/combined_2021_23_catalog.csv")
pd.set_option("display.max_columns", None)
df.columns = df.columns.str.strip()

FEATURES = [
    "ra",
    "snr_fitb",
    "log_dm_exc_ymw16",
    "log_bc_width",
    "log_flux",
    "log_fluence",
    "sp_idx",
    "sp_run",
    "log_in_duration",
    "log_peak_freq",
    "log_fre_width",
    "log_T_B",
    "log_energy",
]
FEATURE_LABELS = [
    "Right Ascension",
    "SNR (fitburst)",
    "Excess DM (YMW16)",
    "Boxcar width",
    "Flux",
    "Fluence",
    "Spectral index",
    "Spectral running",
    "Rest-frame width",
    "Peak frequency",
    "Frequency width",
    "Brightness temperature",
    "Burst energy",
]
X = df[FEATURES]
y = df["is_repeater"]
df

Unnamed: 0,tns_name,repeater_name,ra,dec,gl,gb,exp_up,exp_low,bonsai_snr,bonsai_dm,snr_fitb,dm_fitb,dm_exc_ne2001,dm_exc_ymw16,bc_width,scat_time,flux,fluence,sub_num,width_fitb,sp_idx,sp_run,high_freq,low_freq,peak_freq,chi_sq,dof,flag_frac,is_repeater,is_pcc_candidate,catalog,redshift,fre_width,fre_width_ob,in_duration,energy,luminosity,T_B,log_dm_fitb,log_bonsai_dm,log_dm_exc_ne2001,log_dm_exc_ymw16,log_bc_width,log_scat_time,log_flux,log_fluence,log_width_fitb,log_high_freq,log_low_freq,log_peak_freq,log_fre_width,log_redshift,log_in_duration,log_energy,log_luminosity,log_T_B
0,FRB20180725A,-9999,93.420,67.070,147.29,21.29,30.0,-9999.0,19.20,716.6,33.20,715.80930,644.2,635.4,0.00295,0.001100,1.70,4.10,0,0.000296,38.20,-45.80,760.1,485.3,607.4,371857.954,371481,0.403,0,0,2021,0.640740,450.875425,274.8,0.180406,2.827944e+40,1.923870e+43,5.515622e+35,2.854797,2.855277,2.809021,2.803047,-2.530178,-2.958607,0.230449,0.612784,-3.528708,2.880871,2.686010,2.783475,2.654057,-0.193318,-0.743748,40.451471,43.284176,35.741595
1,FRB20180727A,-9999,197.720,26.420,24.76,85.60,10.4,-9999.0,10.40,642.1,12.20,642.13400,620.9,622.4,0.00295,0.001700,0.58,2.31,0,0.001390,3.80,-9.20,800.2,400.2,493.3,382969.318,381818,0.387,0,0,2021,0.614818,645.927163,400.0,0.860778,1.189571e+40,4.823143e+42,2.622746e+35,2.807626,2.807603,2.793022,2.794070,-2.530178,-2.769551,-0.236572,0.363612,-2.856985,2.903199,2.602277,2.693111,2.810184,-0.211253,-0.065109,40.075391,42.683330,35.418756
2,FRB20180729A,-9999,199.400,55.580,115.26,61.16,21.0,-9999.0,32.00,108.4,206.60,109.59418,78.8,86.8,0.00098,0.000157,11.70,17.00,0,0.000100,16.46,-30.21,692.7,400.2,525.6,264732.041,186953,0.399,0,0,2021,0.002248,293.157605,292.5,0.099776,1.070358e+36,7.383140e+38,4.845901e+32,2.039787,2.035029,1.896526,1.938520,-3.008774,-3.802995,1.068186,1.230449,-4.000000,2.840545,2.602277,2.720655,2.467101,-2.648161,-1.000975,36.029529,38.868241,32.685375
3,FRB20180729B,-9999,89.930,56.500,156.90,15.68,21.0,-9999.0,12.40,318.6,22.00,317.22350,223.2,198.8,0.00197,0.000660,0.92,1.20,0,0.000314,14.50,-14.60,800.2,441.8,657.5,425139.488,421337,0.323,0,0,2021,0.157566,414.871625,358.4,0.271259,4.966122e+38,4.407270e+41,3.166148e+34,2.501365,2.503246,2.348694,2.298416,-2.705534,-3.180456,-0.036212,0.079181,-3.503070,2.903199,2.645226,2.817896,2.617914,-0.802538,-0.566616,38.696017,41.644170,34.500531
4,FRB20180730A,-9999,57.390,87.190,125.11,25.11,270.0,214.0,69.50,849.2,89.80,848.90410,789.7,790.5,0.00492,0.002073,5.20,27.00,0,0.000468,4.27,-11.31,759.2,400.2,483.5,429165.844,417689,0.329,0,0,2021,0.802405,647.063272,359.0,0.259653,2.335510e+41,8.107252e+43,1.508095e+36,2.928859,2.929010,2.897462,2.897902,-2.308035,-2.683401,0.716003,1.431364,-3.329754,2.880356,2.602277,2.684396,2.810947,-0.095607,-0.585606,41.368382,43.908874,36.178429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,FRB20210313E,FRB20201221B,124.199,48.781,170.55,33.81,90.3,-9999.0,10.00,510.3,15.43,510.35400,459.5,464.4,0.02064,0.001730,0.25,1.13,0,0.001260,34.30,-183.00,491.8,400.2,439.6,352048.133,351911,43.475,1,0,2023,0.433921,131.347155,91.6,0.878710,2.534754e+39,8.041233e+41,1.421451e+33,2.707872,2.707826,2.662286,2.666892,-1.685290,-2.761954,-0.602060,0.053078,-2.899629,2.691789,2.602277,2.643058,2.118421,-0.362589,-0.056155,39.403936,41.905323,33.152732
726,FRB20210331F,-9999,122.070,72.350,142.57,31.55,135.0,108.0,24.28,288.3,50.27,288.42000,237.9,243.1,0.00885,0.002500,1.02,6.10,0,0.005000,58.00,-88.00,662.7,480.9,564.5,497640.634,492355,47.278,0,1,2023,0.175508,213.707304,181.8,4.253481,2.705696e+39,5.318324e+41,2.945764e+33,2.460025,2.459845,2.376394,2.385785,-2.053057,-2.602060,0.008600,0.785330,-2.301030,2.821317,2.682055,2.751664,2.329819,-0.755704,0.628745,39.432279,41.725775,33.469198
727,FRB20210331F,-9999,122.070,72.350,142.57,31.55,135.0,108.0,24.28,288.3,50.27,288.42000,237.9,243.1,0.00885,0.002500,1.02,6.10,1,0.001254,47.50,-119.20,578.4,428.0,497.6,497640.634,492355,47.278,0,1,2023,0.175508,176.796362,150.4,1.066773,2.385039e+39,4.688039e+41,3.791099e+33,2.460025,2.459845,2.376394,2.385785,-2.053057,-2.602060,0.008600,0.785330,-2.901702,2.762228,2.631444,2.696880,2.247473,-0.755704,0.028072,39.377495,41.670991,33.578765
728,FRB20210426B,-9999,122.070,72.350,142.57,31.55,135.0,108.0,12.49,292.8,31.13,288.92000,238.4,243.6,0.00786,0.010000,1.60,7.60,0,0.005000,-8.20,15.00,800.2,613.2,800.2,1872864.206,1866568,45.490,0,1,2023,0.176116,219.933602,187.0,4.251283,4.812697e+39,1.191640e+42,2.936170e+33,2.460778,2.466571,2.377306,2.386677,-2.104577,-2.000000,0.204120,0.880814,-2.301030,2.903199,2.787602,2.903199,2.342292,-0.754202,0.628520,39.682389,42.076145,33.467781


## Machine learning


In [3]:
np.random.seed(RANDOM_SEED)
# 1. We split the data into training and validation sets, but make sure that sub-bursts for a given burst are either all in the training set or all in the validation set
X_train, X_val, y_train, y_val = get_subburst_preserved_train_test(
    df, X, y, test_size=0.2, stratify=True
)
X_train = pd.DataFrame(X_train, columns=X.columns)
X_val = pd.DataFrame(X_val, columns=X.columns)
y_train = pd.Series(y_train)
y_val = pd.Series(y_val)
print(f"{X_train.shape=} {y_train.shape=} {X_val.shape=} {y_val.shape=}")

# 2. We scale the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_scaled = scaler.transform(X)
print(f"{X_scaled.shape=}\n{X_train_scaled.shape=} {X_val_scaled.shape=}")

# 3. We oversample the training set to balance the classes
print(f"Before SMOTE: {Counter(y_train)}")
sm = SMOTE()
X_train_scaled_sm, y_train_sm = sm.fit_resample(X_train_scaled, y_train)
print(f"{X_train_scaled_sm.shape=} {y_train_sm.shape=}")
print(f"After SMOTE: {Counter(y_train_sm)}")

X_train.shape=(586, 13) y_train.shape=(586,) X_val.shape=(144, 13) y_val.shape=(144,)
X_scaled.shape=(730, 13)
X_train_scaled.shape=(586, 13) X_val_scaled.shape=(144, 13)
Before SMOTE: Counter({0: 410, 1: 176})
X_train_scaled_sm.shape=(820, 13) y_train_sm.shape=(820,)
After SMOTE: Counter({0: 410, 1: 410})


In [4]:
NUM_TRIALS = 100
optimised_models = []
models_info = []

### Decision tree


In [5]:
np.random.seed(RANDOM_SEED)


def dt_objective(trial):
    np.random.seed(RANDOM_SEED)
    classifier_obj = DecisionTreeClassifier(
        min_samples_split=trial.suggest_int("min_samples_split", 2, 32),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 32),
        criterion=trial.suggest_categorical("criterion", ["gini", "entropy"]),
        random_state=RANDOM_SEED,
    )
    classifier_obj.fit(X_train_scaled_sm, y_train_sm)
    predictions = classifier_obj.predict(X_val_scaled)
    ll_score = lee_liu_score(y_val, predictions)
    return ll_score


sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(dt_objective, n_trials=NUM_TRIALS)
best_params = study.best_params
dt = DecisionTreeClassifier(**best_params, random_state=RANDOM_SEED)
dt.fit(X_train_scaled_sm, y_train_sm)
optimised_models.append(dt)
data = {
    "model": "DecisionTreeClassifier",
    "params": best_params,
    "ll_score": study.best_value,
}
print(data)
models_info.append(data)

{'model': 'DecisionTreeClassifier', 'params': {'min_samples_split': 20, 'min_samples_leaf': 2, 'criterion': 'gini'}, 'll_score': 1.7776479181884588}


### Random forest


In [6]:
np.random.seed(RANDOM_SEED)


def rf_objective(trial):
    np.random.seed(RANDOM_SEED)
    classifier_obj = RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 32),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 32),
        criterion=trial.suggest_categorical("criterion", ["gini", "entropy"]),
        random_state=RANDOM_SEED,
    )
    classifier_obj.fit(X_train_scaled_sm, y_train_sm)
    predictions = classifier_obj.predict(X_val_scaled)
    ll_score = lee_liu_score(y_val, predictions)
    return ll_score


sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(rf_objective, n_trials=NUM_TRIALS)
best_params = study.best_params

rf = RandomForestClassifier(**best_params, random_state=RANDOM_SEED)
rf.fit(X_train_scaled_sm, y_train_sm)
optimised_models.append(rf)
data = {
    "model": "RandomForestClassifier",
    "params": best_params,
    "ll_score": study.best_value,
}
print(data)
models_info.append(data)

{'model': 'RandomForestClassifier', 'params': {'n_estimators': 245, 'min_samples_split': 21, 'min_samples_leaf': 10, 'criterion': 'entropy'}, 'll_score': 2.0572476939541677}


### SVM


In [7]:
np.random.seed(RANDOM_SEED)


def svm_objective(trial):
    np.random.seed(RANDOM_SEED)
    classifier_obj = SVC(
        C=trial.suggest_float("C", 1e-2, 1e2, log=True),
        degree=trial.suggest_int("degree", 1, 8),
        kernel="linear",  # fix to linear so we can access coefficients later
        random_state=RANDOM_SEED,
    )
    classifier_obj.fit(X_train_scaled_sm, y_train_sm)
    predictions = classifier_obj.predict(X_val_scaled)
    ll_score = lee_liu_score(y_val, predictions)
    return ll_score


sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(svm_objective, n_trials=NUM_TRIALS)
best_params = study.best_params

svm = SVC(**best_params, kernel="linear", random_state=RANDOM_SEED)
svm.fit(X_train_scaled_sm, y_train_sm)
optimised_models.append(svm)
data = {"model": "SVC", "params": best_params, "ll_score": study.best_value}
print(data)
models_info.append(data)

{'model': 'SVC', 'params': {'C': 0.053370327626039576, 'degree': 2}, 'll_score': 2.2439737034331633}


### AdaBoost


In [8]:
np.random.seed(RANDOM_SEED)


def adaboost_objective(trial):
    np.random.seed(RANDOM_SEED)
    classifier_obj = AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1),
        algorithm=trial.suggest_categorical("algorithm", ["SAMME", "SAMME.R"]),
        random_state=RANDOM_SEED,
    )
    classifier_obj.fit(X_train_scaled_sm, y_train_sm)
    predictions = classifier_obj.predict(X_val_scaled)
    ll_score = lee_liu_score(y_val, predictions)
    return ll_score


sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(adaboost_objective, n_trials=NUM_TRIALS)
best_params = study.best_params

adaboost = AdaBoostClassifier(**best_params, random_state=RANDOM_SEED)
adaboost.fit(X_train_scaled_sm, y_train_sm)
optimised_models.append(adaboost)
data = {
    "model": "AdaBoostClassifier",
    "params": best_params,
    "ll_score": study.best_value,
}
print(data)
models_info.append(data)

{'model': 'AdaBoostClassifier', 'params': {'n_estimators': 379, 'learning_rate': 0.2799743084246729, 'algorithm': 'SAMME.R'}, 'll_score': 2.4371881945199942}


### LightGBM


In [9]:
np.random.seed(RANDOM_SEED)


def lgbm_objective(trial):
    np.random.seed(RANDOM_SEED)
    classifier_obj = LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1, log=True),
        subsample=trial.suggest_float("subsample", 0.1, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 1.0),
        random_state=RANDOM_SEED,
        verbosity=-1,
    )
    classifier_obj.fit(X_train_scaled_sm, y_train_sm)
    predictions = classifier_obj.predict(X_val_scaled)
    ll_score = lee_liu_score(y_val, predictions)
    return ll_score


sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(lgbm_objective, n_trials=NUM_TRIALS)
best_params = study.best_params

lightgbm = LGBMClassifier(**best_params, random_state=RANDOM_SEED, verbosity=-1)
lightgbm.fit(X_train_scaled_sm, y_train_sm)
optimised_models.append(lightgbm)
data = {"model": "LGBMClassifier", "params": best_params, "ll_score": study.best_value}
print(data)
models_info.append(data)

{'model': 'LGBMClassifier', 'params': {'n_estimators': 314, 'learning_rate': 0.7634745502856656, 'subsample': 0.5517416382671432, 'colsample_bytree': 0.8748572996659076}, 'll_score': 2.4067619743295414}


### XGBoost


In [10]:
np.random.seed(RANDOM_SEED)


def xgb_objective(trial):
    np.random.seed(RANDOM_SEED)
    classifier_obj = xgb.XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        eta=trial.suggest_float("eta", 1e-3, 1e1, log=True),
        gamma=trial.suggest_float("gamma", 1e-3, 1e1, log=True),
        min_child_weight=trial.suggest_float("min_child_weight", 1e-3, 1e1, log=True),
        max_delta_step=trial.suggest_float("max_delta_step", 1e-3, 1e1, log=True),
        max_leaves=trial.suggest_int("max_leaves", 2, 256),
        max_bin=trial.suggest_int("max_bin", 2, 256),
        subsample=trial.suggest_float("subsample", 0.1, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 1.0),
        random_state=RANDOM_SEED,
    )
    classifier_obj.fit(X_train_scaled_sm, y_train_sm)
    predictions = classifier_obj.predict(X_val_scaled)
    ll_score = lee_liu_score(y_val, predictions)

    return ll_score


sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(xgb_objective, n_trials=NUM_TRIALS)
best_params = study.best_params

xgb_model = xgb.XGBClassifier(**best_params)
xgb_model.fit(X_train_scaled_sm, y_train_sm)
optimised_models.append(xgb_model)
data = {"model": "XGBClassifier", "params": best_params, "ll_score": study.best_value}
print(data)
models_info.append(data)

{'model': 'XGBClassifier', 'params': {'n_estimators': 472, 'eta': 4.463589298990465, 'gamma': 0.015812457773552488, 'min_child_weight': 0.0013235039876917161, 'max_delta_step': 0.02148492608287815, 'max_leaves': 165, 'max_bin': 81, 'subsample': 0.8088819609029114, 'colsample_bytree': 0.4255638819241213}, 'll_score': 2.527100073046019}


In [11]:
0.02148492608287815 / (10 ** (-2))

2.148492608287815

### Logistic regression


In [12]:
np.random.seed(RANDOM_SEED)


def lr_objective(trial):
    np.random.seed(RANDOM_SEED)
    classifier_obj = LogisticRegression(
        tol=trial.suggest_float("tol", 1e-5, 1e-3, log=True),
        C=trial.suggest_float("C", 1e-2, 1e2, log=True),
        solver=trial.suggest_categorical("solver", ["liblinear", "newton-cholesky"]),
        max_iter=trial.suggest_int("max_iter", 100, 500),
        random_state=RANDOM_SEED,
    )
    classifier_obj.fit(X_train_scaled_sm, y_train_sm)
    predictions = classifier_obj.predict(X_val_scaled)
    ll_score = lee_liu_score(y_val, predictions)
    return ll_score


sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(lr_objective, n_trials=NUM_TRIALS)
best_params = study.best_params

lr = LogisticRegression(**best_params)
lr.fit(X_train_scaled_sm, y_train_sm)
optimised_models.append(lr)
data = {
    "model": "LogisticRegression",
    "params": best_params,
    "ll_score": study.best_value,
}
print(data)
models_info.append(data)

{'model': 'LogisticRegression', 'params': {'tol': 5.6115164153345e-05, 'C': 63.512210106407046, 'solver': 'liblinear', 'max_iter': 162}, 'll_score': 2.071360341630612}


### LDA


In [13]:
np.random.seed(RANDOM_SEED)


def lda_objective(trial):
    np.random.seed(RANDOM_SEED)
    classifier_obj = LinearDiscriminantAnalysis(
        solver=trial.suggest_categorical("solver", ["svd", "lsqr", "eigen"]),
        # Only used if the solver is SVD
        store_covariance=trial.suggest_categorical("store_covariance", [True, False]),
        tol=trial.suggest_float("tol", 1e-5, 1e-3, log=True),
    )
    if classifier_obj.solver != "svd":
        classifier_obj.shrinkage = trial.suggest_float("shrinkage", 0.0, 1.0)

    classifier_obj.fit(X_train_scaled_sm, y_train_sm)
    predictions = classifier_obj.predict(X_val_scaled)
    ll_score = lee_liu_score(y_val, predictions)
    return ll_score


sampler = TPESampler(seed=RANDOM_SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(lda_objective, n_trials=NUM_TRIALS)
best_params = study.best_params

lda = LinearDiscriminantAnalysis(**best_params)
lda.fit(X_train_scaled_sm, y_train_sm)
optimised_models.append(lda)
data = {
    "model": "LinearDiscriminantAnalysis",
    "params": best_params,
    "ll_score": study.best_value,
}
print(data)
models_info.append(data)

{'model': 'LinearDiscriminantAnalysis', 'params': {'solver': 'svd', 'store_covariance': False, 'tol': 0.000462258900102083}, 'll_score': 2.2439737034331633}


## Model Analysis


In [14]:
models_df = pd.DataFrame(models_info)
models_df

Unnamed: 0,model,params,ll_score
0,DecisionTreeClassifier,"{'min_samples_split': 20, 'min_samples_leaf': ...",1.777648
1,RandomForestClassifier,"{'n_estimators': 245, 'min_samples_split': 21,...",2.057248
2,SVC,"{'C': 0.053370327626039576, 'degree': 2}",2.243974
3,AdaBoostClassifier,"{'n_estimators': 379, 'learning_rate': 0.27997...",2.437188
4,LGBMClassifier,"{'n_estimators': 314, 'learning_rate': 0.76347...",2.406762
5,XGBClassifier,"{'n_estimators': 472, 'eta': 4.463589298990465...",2.5271
6,LogisticRegression,"{'tol': 5.6115164153345e-05, 'C': 63.512210106...",2.07136
7,LinearDiscriminantAnalysis,"{'solver': 'svd', 'store_covariance': False, '...",2.243974


### Retraining


To be able to analyse the models with greater confidence, we retrain each of them with their ideal hyperparameters on randomized training sets a 1000 times


In [15]:
np.random.seed(RANDOM_SEED)

models_recalls = [[] for _ in range(len(optimised_models))]
models_llscores = [[] for _ in range(len(optimised_models))]
candidates = {}

for j in range(1000):
    if j % 100 == 0:
        print(f"Trial {j}")
    X_train, X_val, y_train, y_val = get_subburst_preserved_train_test(
        df, X, y, test_size=0.2, stratify=True
    )
    X_train = pd.DataFrame(X_train, columns=X.columns)
    X_val = pd.DataFrame(X_val, columns=X.columns)
    y_train = pd.Series(y_train)
    y_val = pd.Series(y_val)

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_scaled = scaler.transform(X)

    sm = SMOTE()
    X_train_scaled_sm, y_train_sm = sm.fit_resample(X_train_scaled, y_train)

    for i, model in enumerate(optimised_models):
        predictions = model.predict(X_val_scaled)
        recall = recall_score(y_val, predictions)
        ll_score = lee_liu_score(y_val, predictions)
        models_recalls[i].append(recall)
        models_llscores[i].append(ll_score)

Trial 0
Trial 100
Trial 200
Trial 300
Trial 400
Trial 500
Trial 600
Trial 700
Trial 800
Trial 900


In [17]:
models_recalls_mean = [np.mean(recalls) for recalls in models_recalls]
models_recalls_std = [np.std(recalls) for recalls in models_recalls]
models_llscores_mean = [np.mean(llscores) for llscores in models_llscores]
models_llscores_std = [np.std(llscores) for llscores in models_llscores]

models_df["recall_mean"] = models_recalls_mean
models_df["recall_std"] = models_recalls_std
models_df["llscore_mean"] = models_llscores_mean
models_df["llscore_std"] = models_llscores_std

sorted_models = models_df.sort_values("recall_mean", ascending=False)
NUM_SELECTED_MODELS = 3
TOP_MODEL_INDICES = list(sorted_models[:NUM_SELECTED_MODELS].index)
TOP_MODELS = [optimised_models[i] for i in TOP_MODEL_INDICES]
TOP_MODEL_NAMES = sorted_models["model"].values[:NUM_SELECTED_MODELS]
sorted_models

Unnamed: 0,model,params,ll_score,recall_mean,recall_std,llscore_mean,llscore_std,fittime_mean,fittime_std
2,SVC,"{'C': 0.053370327626039576, 'degree': 2}",2.243974,0.844602,0.056898,2.069516,0.217661,0.004148,0.000518
7,LinearDiscriminantAnalysis,"{'solver': 'svd', 'store_covariance': False, '...",2.243974,0.843165,0.057591,2.09414,0.219028,0.001091,0.000217
6,LogisticRegression,"{'tol': 5.6115164153345e-05, 'C': 63.512210106...",2.07136,0.833235,0.06079,2.057242,0.224674,0.00114,0.000237
1,RandomForestClassifier,"{'n_estimators': 245, 'min_samples_split': 21,...",2.057248,0.823439,0.060648,2.081527,0.220618,0.302034,0.025809
3,AdaBoostClassifier,"{'n_estimators': 379, 'learning_rate': 0.27997...",2.437188,0.812825,0.065727,2.08964,0.239789,0.436932,0.019913
5,XGBClassifier,"{'n_estimators': 472, 'eta': 4.463589298990465...",2.5271,0.80891,0.06351,2.211336,0.250621,1.940661,2.417792
4,LGBMClassifier,"{'n_estimators': 314, 'learning_rate': 0.76347...",2.406762,0.79854,0.065712,2.153348,0.248306,0.153323,0.028639
0,DecisionTreeClassifier,"{'min_samples_split': 20, 'min_samples_leaf': ...",1.777648,0.695889,0.085825,1.602646,0.280352,0.007979,0.006386


In [18]:
models_df.to_csv("data/supervised_models_optimised.csv", index=False)
models_df[
    [
        "model",
        "recall_mean",
        "recall_std",
        "llscore_mean",
        "llscore_std",
    ]
].sort_values("recall_mean", ascending=False).to_latex(
    "tables/supervised_models_optimised.tex", index=False
)