In [1]:
import pandas as pd
import numpy as np

import sklearn

import os

import pickle as pkl

import optuna


from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve,balanced_accuracy_score,f1_score,classification_report
from tqdm import tqdm

In [2]:
with open('./data/train_set_.pkl','rb') as f:
    train =  pkl.load(f)

with open('./data/val_set_.pkl','rb') as f:
    val =  pkl.load(f)
    
# with open('./data/test_set_5_slides_.pkl','rb') as f:
#     test =  pkl.load(f)

len(train),len(val)#,len(test)

(2, 2)

In [3]:
len(train[0]),len(val[0])#,len(test[0])

(134656, 33722)

In [4]:
train_x,train_y = train
val_x,val_y = val
# test_x,test_y = test

# ML models

In [5]:
len(train_x),len(val_x),len(train_x)+len(val_x),len(train_x+val_x)

(134656, 33722, 168378, 168378)

In [6]:
import gc
gc.collect()

0

In [9]:
def objective(trial):

    n_estimators_hp = trial.suggest_int('n_estimators', 10, 300)
    max_depth_hp =  trial.suggest_int('max_depth', 1, 20)
    min_child_samples_hp = trial.suggest_int('min_child_samples', 10, 50)
    num_leaves_hp = trial.suggest_int('num_leaves', 2, 15)
    learning_rate_hp = trial.suggest_float('learning_rate',1e-8,1)

    max_bin_hp = trial.suggest_int('max_bin',5,10)
    lambda_l2_hp = trial.suggest_float('reg_lambda',0,10)
    bagging_fraction_hp = trial.suggest_float('subsample',1e-8,1)
    
    model = LGBMClassifier(n_estimators = n_estimators_hp, 
                          max_depth = max_depth_hp, 
                          min_child_samples = min_child_samples_hp,
                          num_leaves = num_leaves_hp,
                          learning_rate = learning_rate_hp,
                          max_bin = max_bin_hp,
                          reg_lambda = lambda_l2_hp,
                          subsample = bagging_fraction_hp,
                          random_state = 0,is_unbalance=True)
    
    model.fit(train_x[:len(train[0])//2], train_y[:len(train[0])//2])
    pred_probs = model.predict_proba(val_x)

    roc_val = roc_auc_score(val_y, pred_probs[:,1])
    
    return roc_val

optuna.logging.set_verbosity(optuna.logging.INFO) 

study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=2022))
study.optimize(objective, n_trials=20)

[32m[I 2023-03-22 17:37:54,429][0m A new study created in memory with name: no-name-3d9583eb-58c3-46ab-9bb7-89729bb6d205[0m
[32m[I 2023-03-22 17:37:59,715][0m Trial 0 finished with value: 0.851985005746516 and parameters: {'n_estimators': 12, 'max_depth': 10, 'min_child_samples': 14, 'num_leaves': 2, 'learning_rate': 0.6854075973890158, 'max_bin': 7, 'reg_lambda': 8.976572264432697, 'subsample': 0.6474520742687456}. Best is trial 0 with value: 0.851985005746516.[0m
[32m[I 2023-03-22 17:39:01,668][0m Trial 1 finished with value: 0.9051666908954035 and parameters: {'n_estimators': 271, 'max_depth': 15, 'min_child_samples': 44, 'num_leaves': 13, 'learning_rate': 0.8335795857223447, 'max_bin': 10, 'reg_lambda': 3.680444370049888, 'subsample': 0.49483763493691135}. Best is trial 1 with value: 0.9051666908954035.[0m
[32m[I 2023-03-22 17:39:17,392][0m Trial 2 finished with value: 0.9024893556771251 and parameters: {'n_estimators': 108, 'max_depth': 13, 'min_child_samples': 50, 'num

In [None]:
hp = {'n_estimators': 223, 'max_depth': 3, 'min_child_samples': 32, 'num_leaves': 8, 'learning_rate': 0.3639262728213733, 'max_bin': 9, 'reg_lambda': 6.842497791502761, 'subsample': 0.18858002802827115}

model = LGBMClassifier(hp, random_state = 0,is_unbalance=True )
model.fit(train_x[:len(train[0])//2], train_y[:len(train[0])//2])

In [None]:
# with open('./data/test_set_5_slides_.pkl','rb') as f:
#     test =  pkl.load(f)

test_x,test_y = test

roc_auc_score(test_y, model.predict_proba(test_y)[:,1])