In [2]:
import pickle

import catboost
import numpy as np
import optuna
import pandas as pd
from sklearn.metrics import accuracy_score, mean_absolute_error, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split


In [3]:
# Load the two prepared training frames from their pickle files.
train_1, train_2 = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/prepared_data/train_1.pkl',
        '../data/prepared_data/train_2.pkl',
    )
)

In [4]:
# Name of the label column used throughout the notebook.
TARGET = 'label'

# Modelling frames: the loaded data without the 'Timestamp' column.
dataset_1, dataset_2 = (
    frame.drop(columns=['Timestamp']) for frame in (train_1, train_2)
)
(dataset_1.shape, dataset_2.shape)

((54000, 226), (230400, 226))

In [5]:
# Labels of dataset_1 as a NumPy array.
y = dataset_1[TARGET].to_numpy()

# Constant float vectors with one entry per label, used as reference inputs for the metrics.
ones_list = np.full(len(y), 1.0)
zeros_list = np.full(len(y), 0.0)
# The first entry is forced to 1 — presumably so the positive class is present
# in the otherwise all-zero vector; TODO confirm the intent.
zeros_list[0] = 1

In [6]:
def calc_menrics(y_true, y_pred):
    """Return precision, recall and F1 for the given labels as a dict.

    Parameters
    ----------
    y_true
        Reference labels, passed as the first argument to each scorer.
    y_pred
        Predicted labels, passed as the second argument to each scorer.

    Returns
    -------
    dict
        Keys ``'precision_score'``, ``'recall_score'`` and ``'f1_score'``
        (in that order), each mapped to the corresponding scorer's result.
    """
    scorers = (
        ('precision_score', precision_score),
        ('recall_score', recall_score),
        ('f1_score', f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}


In [7]:
# Baseline: score a constant "always positive" prediction against the real labels.
# calc_menrics takes (y_true, y_pred), so the real labels `y` go first; passing them
# second reports precision and recall under each other's names.
calc_menrics(y, ones_list)

{'precision_score': 0.055203703703703706,
 'recall_score': 1.0,
 'f1_score': 0.10463136835085379}

In [8]:
# Baseline: score the (almost) all-zero vector as the prediction against the real labels.
# calc_menrics takes (y_true, y_pred), so the real labels `y` go first.
calc_menrics(y, zeros_list)

{'precision_score': 0.0, 'recall_score': 0.0, 'f1_score': 0.0}

In [9]:
def objective(trial, dataset):
    """Score one Optuna trial as the mean F1 of a CatBoost classifier over three folds.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` and ``suggest_float``; used to draw the
        seven hyperparameters listed in ``search_space``.
    dataset
        Frame holding the feature columns together with the ``TARGET`` column
        (indexed with ``.iloc`` and ``.drop(columns=...)``).

    Returns
    -------
    The ``np.mean`` of the per-fold F1 scores.
    """
    # (name, sampler, bounds, extra kwargs); rows are evaluated top to bottom,
    # so the parameters are always drawn in this order.
    search_space = (
        ("iterations", trial.suggest_int, (2, 10), {}),
        ("learning_rate", trial.suggest_float, (1e-3, 0.3), {"log": True}),
        ("depth", trial.suggest_int, (1, 3), {}),
        ("subsample", trial.suggest_float, (0.05, 1.0), {}),
        ("colsample_bylevel", trial.suggest_float, (0.05, 1.0), {}),
        ("min_data_in_leaf", trial.suggest_int, (1, 100), {}),
        ("l2_leaf_reg", trial.suggest_float, (1, 5), {}),
    )
    hyperparams = {
        name: sample(name, *bounds, **extra)
        for name, sample, bounds, extra in search_space
    }

    # Separate inputs from the label once; folds then only select rows.
    features = dataset.drop(columns=TARGET)
    labels = dataset[TARGET]

    def fold_f1(fit_rows, eval_rows):
        # A fresh single-threaded model is fit on the fold's training rows
        # and scored on the rows held out for that fold.
        model = catboost.CatBoostClassifier(thread_count=1, **hyperparams)
        model.fit(features.iloc[fit_rows], labels.iloc[fit_rows], verbose=0)
        predicted = model.predict(features.iloc[eval_rows])
        return f1_score(labels.iloc[eval_rows], predicted)

    splitter = KFold(n_splits=3, shuffle=False)
    fold_scores = [
        fold_f1(fit_rows, eval_rows)
        for fit_rows, eval_rows in splitter.split(dataset)
    ]
    return np.mean(fold_scores)



In [10]:
def _objective_on_dataset_2(trial):
    """Evaluate one trial with `objective` on dataset_2."""
    return objective(trial, dataset_2)


# Maximise the objective's return value over 1000 trials; n_jobs=-1 is forwarded to Study.optimize.
study = optuna.create_study(direction="maximize")
study.optimize(_objective_on_dataset_2, n_trials=1000, n_jobs=-1)

[I 2024-10-19 19:36:37,520] A new study created in memory with name: no-name-c5f7385d-927c-434d-a401-1b1f4f50ae43
[I 2024-10-19 19:36:53,238] Trial 4 finished with value: 0.0 and parameters: {'iterations': 6, 'learning_rate': 0.007839110443890516, 'depth': 2, 'subsample': 0.17327014190058115, 'colsample_bylevel': 0.06143105974701143, 'min_data_in_leaf': 92, 'l2_leaf_reg': 1.8021688685354027}. Best is trial 4 with value: 0.0.
[I 2024-10-19 19:36:53,282] Trial 6 finished with value: 0.009602710374119942 and parameters: {'iterations': 5, 'learning_rate': 0.14550597381269134, 'depth': 2, 'subsample': 0.8308444428963763, 'colsample_bylevel': 0.12706471488823995, 'min_data_in_leaf': 10, 'l2_leaf_reg': 2.3506326987118036}. Best is trial 6 with value: 0.009602710374119942.
[I 2024-10-19 19:36:53,859] Trial 2 finished with value: 0.014010507880910683 and parameters: {'iterations': 9, 'learning_rate': 0.025105086028149443, 'depth': 2, 'subsample': 0.46749783553078694, 'colsample_bylevel': 0.1070

In [11]:
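# Previously recorded parameter set, kept for reference only — the final model below uses study.best_params instead.
# best_params ={'iterations': 254, 'learning_rate': 0.2897722788222583, 'depth': 10, 'subsample': 0.653282163956773, 'colsample_bylevel': 0.8619983177427659, 'min_data_in_leaf': 56}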

In [15]:
# Final model: built from the best trial's parameters (thread_count=-1) and fit on dataset_1.
final_features = dataset_1.drop(columns=TARGET)
final_labels = dataset_1[TARGET]
classifier = catboost.CatBoostClassifier(thread_count=-1, **study.best_params)
classifier.fit(final_features, final_labels, verbose=3)


0:	learn: 0.6911375	total: 8.11ms	remaining: 8.11ms
1:	learn: 0.6891678	total: 16.1ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x24bc5ecb450>

In [16]:
# Serialize the fitted classifier to disk with pickle (the module is imported in the
# notebook's import cell rather than mid-notebook).
# NOTE(review): a pickle file must only ever be loaded from a trusted source, since
# unpickling can execute arbitrary code.
with open('../models/catboost_model.pkl', 'wb') as f:
    pickle.dump(classifier, f)

In [17]:
# Score the classifier on dataset_1 — the same frame it was fit on, so these are in-sample numbers.
predictions = classifier.predict(dataset_1.drop(columns=TARGET))

# Printed in the order: F1, recall, precision.
for scorer in (f1_score, recall_score, precision_score):
    print(scorer(dataset_1[TARGET], predictions))

0.7102684277801449
0.5592083193559209
0.9731465265615878


In [18]:
# Score the classifier on dataset_2, which it was not fit on.
predictions = classifier.predict(dataset_2.drop(columns=TARGET))

# Printed in the order: F1, recall, precision.
for scorer in (f1_score, recall_score, precision_score):
    print(scorer(dataset_2[TARGET], predictions))

0.098941156049297
0.2713316672616923
0.060501525806023616
