# Прогнозируем задержки самолетов

In [1]:
!pip install catboost lightgbm optuna -q    

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [3]:
RANDOM_STATE = 111
DATASET_PATH = 'https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/flight_delays_train.csv'

In [4]:
data = pd.read_csv(DATASET_PATH)

X = data.drop('dep_delayed_15min', axis=1)
y = data['dep_delayed_15min'] == 'Y'

X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732
1,c-4,c-20,c-3,1548,US,PIT,MCO,834
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423


Создайте список номеров колонок с категориальными признаками для бустингов




In [14]:
cat_features = [col for col in X.columns if X[col].dtypes == 'object']

Разобъем данные на обучение и контроль

In [15]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

In [16]:
Xtrain.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
41207,c-4,c-18,c-1,1457,CO,EWR,TPA,998
28283,c-11,c-1,c-2,1225,UA,DEN,BOS,1754
34619,c-6,c-16,c-5,1650,YV,IAD,CAE,401
8789,c-5,c-18,c-4,923,AA,SLC,DFW,988
38315,c-2,c-14,c-2,1839,AA,STL,SAN,1558


## Модели с параметрами по умолчанию

Обучите CatBoost с гиперпараметрами по умолчанию.

## Quiz
Чему равен ROC-AUC на тестовых данных? Ответ округлите до сотых.

In [22]:
cat = CatBoostClassifier()
cat.fit(Xtrain, ytrain, plot=True, cat_features=cat_features)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.065101
0:	learn: 0.6596421	total: 84.3ms	remaining: 1m 24s
1:	learn: 0.6303141	total: 202ms	remaining: 1m 40s
2:	learn: 0.6051089	total: 287ms	remaining: 1m 35s
3:	learn: 0.5837305	total: 426ms	remaining: 1m 46s
4:	learn: 0.5664102	total: 520ms	remaining: 1m 43s
5:	learn: 0.5516951	total: 652ms	remaining: 1m 48s
6:	learn: 0.5392252	total: 785ms	remaining: 1m 51s
7:	learn: 0.5275450	total: 922ms	remaining: 1m 54s
8:	learn: 0.5187290	total: 1.04s	remaining: 1m 54s
9:	learn: 0.5095510	total: 1.2s	remaining: 1m 59s
10:	learn: 0.5008009	total: 1.35s	remaining: 2m 1s
11:	learn: 0.4942997	total: 1.45s	remaining: 1m 59s
12:	learn: 0.4886401	total: 1.6s	remaining: 2m 1s
13:	learn: 0.4850381	total: 1.65s	remaining: 1m 56s
14:	learn: 0.4791028	total: 1.8s	remaining: 1m 58s
15:	learn: 0.4748786	total: 1.93s	remaining: 1m 58s
16:	learn: 0.4713617	total: 2.09s	remaining: 2m 1s
17:	learn: 0.4679934	total: 2.23s	remaining: 2m 1s
18:	learn: 0.4646427	total: 2.36s	remaining: 2m 1s

<catboost.core.CatBoostClassifier at 0x198c3029030>

In [24]:
ypred_cat = cat.predict_proba(Xtest)[:,1]
print(f'Roc-Auc: {roc_auc_score(ytest, ypred_cat)}')

Roc-Auc: 0.7667654664040842


Обучите LightGBM с гиперпараметрами по умолчанию.

## Quiz
Чему равен ROC-AUC на тестовых данных? Ответ округлите до сотых.

In [26]:
for c in X.columns:
    col_type = X[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        Xtrain[c] = Xtrain[c].astype('category')
        Xtest[c] = Xtest[c].astype('category')

In [27]:
light_clf = LGBMClassifier()
light_clf.fit(Xtrain, ytrain) 

[LightGBM] [Info] Number of positive: 14346, number of negative: 60654
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.191280 -> initscore=-1.441714
[LightGBM] [Info] Start training from score -1.441714


In [29]:
ypred_light = light_clf.predict_proba(Xtest)[:,1]
roc_auc_score(ytest, ypred_light)

0.7341149074685321

## Optuna

Выделим дополнительную валидационную выборку.

In [30]:
Xtrain_new, Xval, ytrain_new, yval = train_test_split(Xtrain, ytrain, test_size=0.25, random_state=RANDOM_STATE)

Создайте функцию objective_lgbm, в которой среди гиперпараметров

* num_leaves = trial.suggest_int("num_leaves", 10, 100)
* n_estimators = trial.suggest_int("n_estimators", 10, 1000)

подберите оптимальные, обучая LGBM на Xtrain_new, ytrain_new и проверяя качество (ROC-AUC) на Xval.

Используйте 30 эпох обучения Optuna.


In [40]:
import optuna

def objective_lgbm(trial):
    num_leaves = trial.suggest_int('num_leaves', 10, 100)
    n_estimators = trial.suggest_int('n_estimators', 10, 100)
    model = LGBMClassifier(num_leaves=num_leaves, n_estimators=n_estimators)
    model.fit(Xtrain_new, ytrain_new)
    ypred = model.predict_proba(Xval)[:, 1]
    roc_auc = roc_auc_score(yval, ypred)
    return roc_auc

study = optuna.create_study(direction="maximize")
study.optimize(objective_lgbm, n_trials=30)

best_params = study.best_params
print('Best parameters:', best_params)

[I 2023-11-03 13:31:40,676] A new study created in memory with name: no-name-510da0e2-e41e-42ce-a616-26e5313b6618


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:41,048] Trial 0 finished with value: 0.7234993863082714 and parameters: {'num_leaves': 61, 'n_estimators': 90}. Best is trial 0 with value: 0.7234993863082714.
[I 2023-11-03 13:31:41,213] Trial 1 finished with value: 0.7225607307024797 and parameters: {'num_leaves': 62, 'n_estimators': 32}. Best is trial 0 with value: 0.7234993863082714.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:41,334] Trial 2 finished with value: 0.7251566682766695 and parameters: {'num_leaves': 16, 'n_estimators': 36}. Best is trial 2 with value: 0.7251566682766695.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:41,884] Trial 3 finished with value: 0.7198568525303747 and parameters: {'num_leaves': 76, 'n_estimators': 95}. Best is trial 2 with value: 0.7251566682766695.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:42,291] Trial 4 finished with value: 0.723857470607704 and parameters: {'num_leaves': 83, 'n_estimators': 62}. Best is trial 2 with value: 0.7251566682766695.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:42,675] Trial 5 finished with value: 0.722000698626196 and parameters: {'num_leaves': 53, 'n_estimators': 89}. Best is trial 2 with value: 0.7251566682766695.
[I 2023-11-03 13:31:42,794] Trial 6 finished with value: 0.7245570470171483 and parameters: {'num_leaves': 36, 'n_estimators': 22}. Best is trial 2 with value: 0.7251566682766695.
[I 2023-11-03 13:31:42,888] Trial 7 finished with value: 0.7199469218053237 and parameters: {'num_leaves': 36, 'n_estimators': 12}. Best is trial 2 with value: 0.7251566682766695.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_col_

[I 2023-11-03 13:31:43,318] Trial 8 finished with value: 0.7218986420425906 and parameters: {'num_leaves': 58, 'n_estimators': 100}. Best is trial 2 with value: 0.7251566682766695.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:43,584] Trial 9 finished with value: 0.7238320706701549 and parameters: {'num_leaves': 38, 'n_estimators': 71}. Best is trial 2 with value: 0.7251566682766695.
[I 2023-11-03 13:31:43,740] Trial 10 finished with value: 0.724946132031726 and parameters: {'num_leaves': 11, 'n_estimators': 42}. Best is trial 2 with value: 0.7251566682766695.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:43,879] Trial 11 finished with value: 0.7248794160806529 and parameters: {'num_leaves': 10, 'n_estimators': 42}. Best is trial 2 with value: 0.7251566682766695.
[I 2023-11-03 13:31:44,028] Trial 12 finished with value: 0.7262889390179295 and parameters: {'num_leaves': 15, 'n_estimators': 43}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:44,220] Trial 13 finished with value: 0.7255826014740296 and parameters: {'num_leaves': 23, 'n_estimators': 50}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:44,657] Trial 14 finished with value: 0.725434422989436 and parameters: {'num_leaves': 100, 'n_estimators': 56}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:44,912] Trial 15 finished with value: 0.725760602043573 and parameters: {'num_leaves': 25, 'n_estimators': 74}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:45,191] Trial 16 finished with value: 0.723342381802213 and parameters: {'num_leaves': 27, 'n_estimators': 76}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:45,512] Trial 17 finished with value: 0.7233454151760498 and parameters: {'num_leaves': 44, 'n_estimators': 76}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:45,778] Trial 18 finished with value: 0.7259957067892608 and parameters: {'num_leaves': 23, 'n_estimators': 65}. Best is trial 12 with value: 0.7262889390179295.
[I 2023-11-03 13:31:45,983] Trial 19 finished with value: 0.7257638912441189 and parameters: {'num_leaves': 20, 'n_estimators': 59}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:46,244] Trial 20 finished with value: 0.7246446493916879 and parameters: {'num_leaves': 48, 'n_estimators': 49}. Best is trial 12 with value: 0.7262889390179295.
[I 2023-11-03 13:31:46,461] Trial 21 finished with value: 0.725477182596533 and parameters: {'num_leaves': 20, 'n_estimators': 62}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:46,706] Trial 22 finished with value: 0.7230124384407843 and parameters: {'num_leaves': 30, 'n_estimators': 62}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:46,837] Trial 23 finished with value: 0.7239499245530489 and parameters: {'num_leaves': 15, 'n_estimators': 29}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:47,061] Trial 24 finished with value: 0.7235350924075311 and parameters: {'num_leaves': 30, 'n_estimators': 53}. Best is trial 12 with value: 0.7262889390179295.
[I 2023-11-03 13:31:47,275] Trial 25 finished with value: 0.7249632176012284 and parameters: {'num_leaves': 17, 'n_estimators': 67}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:47,619] Trial 26 finished with value: 0.7226444865397142 and parameters: {'num_leaves': 40, 'n_estimators': 85}. Best is trial 12 with value: 0.7262889390179295.
[I 2023-11-03 13:31:47,815] Trial 27 finished with value: 0.7261283346646068 and parameters: {'num_leaves': 31, 'n_estimators': 42}. Best is trial 12 with value: 0.7262889390179295.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108


[I 2023-11-03 13:31:48,032] Trial 28 finished with value: 0.7262576550660704 and parameters: {'num_leaves': 31, 'n_estimators': 46}. Best is trial 12 with value: 0.7262889390179295.
[I 2023-11-03 13:31:48,304] Trial 29 finished with value: 0.7279810865121141 and parameters: {'num_leaves': 64, 'n_estimators': 44}. Best is trial 29 with value: 0.7279810865121141.


[LightGBM] [Info] Number of positive: 10730, number of negative: 45520
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 56250, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.190756 -> initscore=-1.445108
[LightGBM] [Info] Start training from score -1.445108
Best parameters: {'num_leaves': 64, 'n_estimators': 44}


Обучите модель с найденными гиперпараметрами на Xtrain, ytrain и оцените ROC-AUC на тестовых данных.

In [42]:
lgbm = LGBMClassifier(num_leaves=64, n_estimators=44)
lgbm.fit(Xtrain, ytrain)
ypred = lgbm.predict_proba(Xtest)[:,1]
roc_auc_score(ytest, ypred)

[LightGBM] [Info] Number of positive: 14346, number of negative: 60654
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.191280 -> initscore=-1.441714
[LightGBM] [Info] Start training from score -1.441714


0.7347706192474899

## Quiz

Чему равно количество листьев в LGBM после подбора гиперпараметров?

Ответ: num_leaves =64, но гиперпараметры сильно меняются от запуска к запуску