In [1]:
import yaml
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import optuna

from src.data.prepare_data import prepare_data
from src.models.utils import train_splits, imbalanced_sampling
from src.models.tuner import HyperParamSearch, LabelWeightSearch
from src.models.model import Classifier
from src.models.evaluation import Evaluation

In [2]:
# read config
with open('config.yml', 'r') as file:
    config=yaml.load(file, Loader= yaml.SafeLoader)
del file

In [3]:
# load and prepare data
df = pd.read_csv(config['data_loader']['path'])
df = prepare_data(df=df)
display(df.head())

# check class distributions
df['label'].value_counts(normalize=True)

Unnamed: 0,time,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v21,v22,v23,v24,v25,v26,v27,v28,amount,label
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


label
0    0.998273
1    0.001727
Name: proportion, dtype: float64

**Train and Test Set**
- Split whole set into train and test sets using strat sampling
- Apply oversampling as the number of positive instances is small
- Split whole train set into multiple train sets

In [4]:
# split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:,:-1], df['label']
    , test_size=config['train_test_split']['test_size']
    , random_state=123
    , shuffle=True
    , stratify=df['label']
    )

In [5]:
# apply oversampling to the train set
X_train_rs, y_train_rs = imbalanced_sampling(
    method='over'
    , X_train=X_train
    , y_train=y_train
)

# check class distributions
print(
    y_train.value_counts(normalize=True)
    , y_train_rs.value_counts(normalize=True)
    )

label
0    0.998274
1    0.001726
Name: proportion, dtype: float64 label
0    0.5
1    0.5
Name: proportion, dtype: float64


In [6]:
# split train sets into multiple sets
train = train_splits(X_train_rs, y_train_rs, config['train_test_split'])

**Optimization**
- Hyper-parameters - search which hyper-parameters optimize scoring metric for the given algorithm
- sample_weight - search optimal weights for labels in order to optimize scoring metric

To avoid overfitting, these optimizations are done in different training sets.

In [7]:
# set tuner for hyperparam optimization
tuner = HyperParamSearch(config=config['optimization'], algorithm="XGBClassifier")

def objective(trial) -> float:
    return tuner.fit(X=train[1].iloc[:,:-1], y=train[1].iloc[:,-1], trial=trial)

# set study
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=10)

print(
    f"Best trial:\n",
    f"- params: {study.best_trial.params}\n",
    f"- score: {study.best_trial.value}", "\n"*2
)

[I 2025-06-09 17:15:41,915] A new study created in memory with name: no-name-7b21ffcd-ae70-4c13-8060-fba768c8d90c
[I 2025-06-09 17:15:50,898] Trial 0 finished with value: 0.9996297652452274 and parameters: {'n_estimators': 39, 'max_depth': 20, 'learning_rate': 0.36626497696389115, 'gamma': 5.986584841970366, 'subsample': 0.5780093202212182}. Best is trial 0 with value: 0.9996297652452274.
[I 2025-06-09 17:15:58,974] Trial 1 finished with value: 0.9963272693985885 and parameters: {'n_estimators': 17, 'max_depth': 4, 'learning_rate': 0.43322189674169265, 'gamma': 6.011150117432088, 'subsample': 0.8540362888980227}. Best is trial 0 with value: 0.9996297652452274.
[I 2025-06-09 17:16:04,584] Trial 2 finished with value: 0.9987449052491364 and parameters: {'n_estimators': 4, 'max_depth': 20, 'learning_rate': 0.41638887775941047, 'gamma': 2.1233911067827616, 'subsample': 0.5909124836035503}. Best is trial 0 with value: 0.9996297652452274.
[I 2025-06-09 17:16:12,041] Trial 3 finished with val

Best trial:
 - params: {'n_estimators': 67, 'max_depth': 8, 'learning_rate': 0.2605139425677276, 'gamma': 5.4671027934327965, 'subsample': 0.5924272277627636}
 - score: 0.9996297657249904 




In [8]:
# set label weight tuner using previous study
tuner = LabelWeightSearch(
    config = config["optimization"],
    estimator=Classifier(algorithm="XGBClassifier", **study.best_trial.params)
)

def objective(trial) -> float:
    return tuner.fit(X=train[2].iloc[:,:-1], y=train[2].iloc[:,-1], trial=trial)

# set study
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=10)

print(
    f"Best trial:\n",
    f"- params: {study.best_trial.params}\n",
    f"- score: {study.best_trial.value}", "\n"*2
)

[I 2025-06-09 17:17:44,967] A new study created in memory with name: no-name-a20a0046-be0d-4ced-9fe3-8646d2d4794e
[I 2025-06-09 17:18:06,402] Trial 0 finished with value: 0.999906 and parameters: {'weight_0': 38, 'weight_1': 96}. Best is trial 0 with value: 0.999906.
[I 2025-06-09 17:18:27,570] Trial 1 finished with value: 0.9999100000000001 and parameters: {'weight_0': 74, 'weight_1': 60}. Best is trial 1 with value: 0.9999100000000001.
[I 2025-06-09 17:18:50,296] Trial 2 finished with value: 0.9998800000000001 and parameters: {'weight_0': 16, 'weight_1': 16}. Best is trial 1 with value: 0.9999100000000001.
[I 2025-06-09 17:19:11,771] Trial 3 finished with value: 0.999718 and parameters: {'weight_0': 6, 'weight_1': 87}. Best is trial 1 with value: 0.9999100000000001.
[I 2025-06-09 17:19:28,921] Trial 4 finished with value: 0.999886 and parameters: {'weight_0': 61, 'weight_1': 71}. Best is trial 1 with value: 0.9999100000000001.
[I 2025-06-09 17:19:42,564] Trial 5 finished with value: 

Best trial:
 - params: {'weight_0': 84, 'weight_1': 22}
 - score: 0.9999399999999999 




In [9]:
# set algorithm best hyperparams and sample weight
class_weights = {int(key[-1]): value for key, value in study.best_trial.params.items()}
sample_weight = np.array([class_weights[class_id] for class_id in y_train_rs])
hyperparams = (
    config["optimization"]["param_grid"]["XGBClassifier"]["fixed"]
    | tuner.model.model.get_params()
)

# fit model on whole training set
clf = Classifier(algorithm="XGBClassifier", **hyperparams)
clf.fit(X=X_train_rs, y=y_train_rs, sample_weight=sample_weight)

# test set evaluation
eval = Evaluation(clf=clf, threshold=0.5)
eval.fit(
    train=(X_train_rs, y_train_rs),
    test=(X_test, y_test)
    )

Unnamed: 0_level_0,accuracy,precision,recall,f1_score
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train,1.0,1.0,1.0,1.0
test,0.99951,0.90909,0.8,0.85106
