In [1]:
import sys
import os
sys.path.append(os.path.abspath('..'))

import yaml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna

from src.data.prepare_data import prepare_data
from src.models.utils import train_splits, imbalanced_sampling, set_global_seed
from src.models.tuner import HyperParamSearch
from src.models.model import Classifier
from src.models.evaluation import Evaluation
from src.models.tracker import launch_mlflow, log_run

In [2]:
# Load the project configuration (YAML) from the parent directory.
with open("../config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)
# drop the (already closed) handle so it does not linger in the namespace
del config_file

# Fix the global seed taken from the config so reruns are reproducible.
set_global_seed(seed=config["general"]["seed"])

# Initialise experiment tracking (helper from src.models.tracker).
launch_mlflow()

# Name of the algorithm explored in this notebook; it is passed to the tuner
# and the classifier, and used as the key into the configured param grid.
ALGORITHM = "XGBClassifier"

In [3]:
# Read the raw CSV (path is configured relative to the project root, one level
# above this notebook) and run it through the project's preparation step.
df = prepare_data(
    df=pd.read_csv("../" + config["data_loader"]["path"])
)
display(df.head())

# Relative frequency of each class in the prepared data.
df["label"].value_counts(normalize=True)

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,label
0,-1.99658,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964,0
1,-1.99658,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342474,0
2,-1.996558,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160684,0
3,-1.996558,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534,0
4,-1.996537,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403,0


label
0    0.998273
1    0.001727
Name: proportion, dtype: float64

**Train and Test Set**
- Split the whole dataset into train, validation and test sets using stratified sampling
- Apply oversampling to the training set only, since positive instances are rare (about 0.17% of the rows)

In [4]:
# Hold out the test set first. Splits are stratified on the label so the rare
# positive class keeps the same proportion in every subset.
# Features are selected by dropping "label" by name (not by column position),
# so the target cannot leak into X if the column order ever changes.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="label"),
    df["label"],
    test_size=config["train_test_split"]["test_size"],
    random_state=config["general"]["seed"],
    shuffle=True,
    stratify=df["label"],
)

# Carve the validation set out of the remaining training data. The same
# test_size fraction from the config is reused, applied to that remainder.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=config["train_test_split"]["test_size"],
    random_state=config["general"]["seed"],
    shuffle=True,
    stratify=y_train,
)

# Sanity checks: the target is not among the features and no row was lost.
assert "label" not in X_train.columns, "target column leaked into features"
assert len(X_train) + len(X_val) + len(X_test) == len(df)

In [5]:
# Oversample the training split only; validation and test sets are left as-is.
X_train_rs, y_train_rs = imbalanced_sampling(
    method="over",
    X_train=X_train,
    y_train=y_train,
)

# Class proportions before and after resampling, printed side by side.
print(
    *(
        labels.value_counts(normalize=True)
        for labels in (y_train, y_train_rs)
    )
)

label
0    0.998273
1    0.001727
Name: proportion, dtype: float64 label
0    0.5
1    0.5
Name: proportion, dtype: float64


**Optimization & Evaluation**
- Hyper-parameters - search for the hyper-parameters that optimize the scoring metric on the validation set for the given algorithm
- Evaluation - retrieve the best hyper-parameters, rebuild the full training set (train + validation), apply oversampling, refit the model and evaluate it on the test set

In [6]:
# Tuner configured from the "optimization" section of the config for ALGORITHM.
tuner = HyperParamSearch(algorithm=ALGORITHM, config=config["optimization"])


def objective(trial) -> float:
    """Optuna objective: score one trial's hyper-parameters.

    Delegates to ``tuner.fit`` with the oversampled training data and the
    untouched validation split, and returns its result, which the study
    below maximizes.
    """
    score = tuner.fit(
        trial=trial,
        X=X_train_rs,
        y=y_train_rs,
        X_val=X_val,
        y_val=y_val,
    )
    return score


# In-memory study; the TPE sampler is seeded from the config.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=config["general"]["seed"]),
    direction="maximize",
)
study.optimize(objective, n_trials=config["optimization"]["n_trials"])

# Record the finished study and the tuner through the project's tracker.
log_run(experiment_name="ModelSelection", study=study, tuner=tuner)

[I 2025-06-23 18:46:15,828] A new study created in memory with name: no-name-ef8fa899-3a2e-47a3-9359-81be295efe82
[I 2025-06-23 18:46:17,322] Trial 0 finished with value: 0.99956 and parameters: {'n_estimators': 39, 'max_depth': 20, 'learning_rate': 0.36626497696389115, 'gamma': 5.986584841970366, 'subsample': 0.5780093202212182}. Best is trial 0 with value: 0.99956.
[I 2025-06-23 18:46:18,142] Trial 1 finished with value: 0.9929 and parameters: {'n_estimators': 17, 'max_depth': 4, 'learning_rate': 0.43322189674169265, 'gamma': 6.011150117432088, 'subsample': 0.8540362888980227}. Best is trial 0 with value: 0.99956.
[I 2025-06-23 18:46:18,932] Trial 2 finished with value: 0.99867 and parameters: {'n_estimators': 4, 'max_depth': 20, 'learning_rate': 0.41638887775941047, 'gamma': 2.1233911067827616, 'subsample': 0.5909124836035503}. Best is trial 0 with value: 0.99956.
[I 2025-06-23 18:46:19,997] Trial 3 finished with value: 0.99919 and parameters: {'n_estimators': 20, 'max_depth': 8, 'l

🏃 View run 23JUN2025 at: http://127.0.0.1:5000/#/experiments/710398032874731682/runs/54028a65ebf5474bb3153bbdef959ff9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/710398032874731682
🏃 View run XGBClassifier at: http://127.0.0.1:5000/#/experiments/710398032874731682/runs/1361e07b53a545ec887ae0ec80fd9d53
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/710398032874731682


In [7]:
# Rebuild the full training set (train + validation) under NEW names.
# Rebinding X_train / y_train here would make the cell non-idempotent: every
# re-run would append the validation rows again. Overwriting X_train_rs /
# y_train_rs would also let a later re-run of the tuning cell train on data
# that contains the validation set.
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])
X_train_full_rs, y_train_full_rs = imbalanced_sampling(
    method="over",
    X_train=X_train_full,
    y_train=y_train_full,
)

# Final hyper-parameters: the configured fixed parameters for ALGORITHM merged
# with the best trial's values (on a key clash the tuned value wins, since the
# right-hand operand of `|` takes precedence).
hyperparams = (
    config["optimization"]["param_grid"][ALGORITHM]["fixed"]
    | study.best_trial.params
)

# Refit on the oversampled full training set.
clf = Classifier(algorithm=ALGORITHM, **hyperparams)
clf.fit(X=X_train_full_rs, y=y_train_full_rs)

# Evaluate on the resampled training data and on the untouched test set.
# Named `evaluation` (not `eval`) so the Python builtin is not shadowed.
evaluation = Evaluation(clf=clf, threshold=0.5)
display(
    evaluation.fit(
        train=(X_train_full_rs, y_train_full_rs),
        test=(X_test, y_test),
    )
)

Unnamed: 0_level_0,accuracy,precision,recall,f1_score
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train,0.99991,0.99981,1.0,0.99991
test,0.9993,0.8,0.8,0.8
