In [18]:
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [2]:
# Read the featurized dataset from the local parquet file.
FEATURES_PATH = './data/featurized.parquet'
df = pd.read_parquet(FEATURES_PATH)

In [3]:
# Positional split: every column except the last goes to X, the last column becomes y.
n_feature_cols = df.shape[1] - 1
X, y = df.iloc[:, :n_feature_cols], df.iloc[:, n_feature_cols]

In [4]:
# LightGBM parameters (taken from Kaggle).
# Key order is kept as in the original literal.
params = dict(
    # task / boosting setup
    objective='binary',
    boosting='gbdt',
    learning_rate=0.2,
    verbose=0,
    # tree size
    num_leaves=100,
    # row subsampling, re-drawn every iteration with a fixed seed
    bagging_fraction=0.95,
    bagging_freq=1,
    bagging_seed=1,
    # column subsampling with a fixed seed
    feature_fraction=0.9,
    feature_fraction_seed=1,
    # histogram resolution and number of boosting rounds
    max_bin=256,
    num_rounds=100,
    # evaluation metric reported on the validation set
    metric='auc',
)

In [9]:
# 5-fold CV producing out-of-fold (OOF) predictions and one model per fold.
# The splitter is seeded so that fold assignment -- and therefore the OOF
# predictions and every metric derived from them -- is reproducible on re-run.
CV_SEED = 1
preds = np.zeros_like(y, dtype=float)
models = []
cv = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)
for train_idx, test_idx in cv.split(X, y):
    train_ds = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    test_ds = lgb.Dataset(X.iloc[test_idx], y.iloc[test_idx])
    # Early stopping on the validation fold is acceptable here because a separate
    # holdout set exists ("./data/test.csv").
    # In a real-world application we would tune the model with CV and measure
    # the best model's performance on the holdout.
    # However, 100 boosting rounds is too few and the model is still underfit.
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` were removed from
    # lgb.train in LightGBM 4.0; when upgrading, pass
    # callbacks=[lgb.early_stopping(25), lgb.log_evaluation(5)] instead.
    model = lgb.train(params, train_set=train_ds, valid_sets=test_ds, verbose_eval=5, early_stopping_rounds=25)
    models.append(model)
    # Each row is scored by the model that did not see it during training.
    preds[test_idx] = model.predict(X.iloc[test_idx])



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's auc: 0.735304
[10]	valid_0's auc: 0.749804
[15]	valid_0's auc: 0.760779
[20]	valid_0's auc: 0.769853
[25]	valid_0's auc: 0.775135
[30]	valid_0's auc: 0.780003
[35]	valid_0's auc: 0.782621
[40]	valid_0's auc: 0.785615
[45]	valid_0's auc: 0.788041
[50]	valid_0's auc: 0.789811
[55]	valid_0's auc: 0.792023
[60]	valid_0's auc: 0.793357
[65]	valid_0's auc: 0.795003
[70]	valid_0's auc: 0.79588
[75]	valid_0's auc: 0.796627
[80]	valid_0's auc: 0.797364
[85]	valid_0's auc: 0.798325
[90]	valid_0's auc: 0.798783
[95]	valid_0's auc: 0.799573
[100]	valid_0's auc: 0.800103
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.800103
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 25 rounds
[5]	valid_0's auc: 0.73486
[10]	valid_0's auc: 0.749639
[15]	valid_0's auc: 0.760135
[20]	valid_0's

Измерим ROC-AUC (как в соревновании на Kaggle) и Mean User ROC-AUC (ROC-AUC, усреднённый по пользователям).

In [11]:
# ROC-AUC of the out-of-fold predictions over all rows at once
oof_auc = roc_auc_score(y_true=y, y_score=preds)
oof_auc

0.8002746071855672

In [12]:
# Per-row results: user id, true label and OOF prediction side by side.
res_columns = dict(msno=X['msno'], true=y, pred=preds)
res_ds = pd.DataFrame(res_columns)

In [15]:
def _user_auc(user_rows):
    """Return ROC-AUC for one user's rows, or NaN when it is undefined.

    Parameters
    ----------
    user_rows : pd.DataFrame
        Rows of a single user with columns 'true' (labels) and 'pred' (scores).

    Returns
    -------
    float
        ROC-AUC of 'pred' against 'true'; NaN if fewer than two distinct
        labels are present, since ROC-AUC needs both classes.
    """
    if user_rows['true'].nunique() < 2:
        return np.nan
    return roc_auc_score(user_rows['true'], user_rows['pred'])


# Select the value columns explicitly so the grouping key is not passed into
# the applied function (pandas >= 2.2 deprecates operating on grouping columns).
group_aucs = res_ds.groupby('msno')[['true', 'pred']].apply(_user_auc)

In [17]:
# Mean User ROC-AUC: average of the per-user AUCs, skipping users whose AUC is NaN
mean_user_auc = np.nanmean(group_aucs)
mean_user_auc

0.6823059012149877

In [21]:
# Persist the list of per-fold models to a single pickle file.
with open('oof_lgbm_models.pkl', mode='wb') as models_file:
    pickle.dump(models, models_file)