In [1]:
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_parquet('./data/featurized.parquet')

# Ranking requires sorting by group
sorted_df = df.sort_values(by='msno').reset_index(drop=True)

In [3]:
def mean_user_auc(y_true, y_pred, group):
    out = 0.
    i = 0
    valid_user_count = 0
    for j in np.cumsum(group):
        if j - i < 2 or len(set(y_true[i:j])) < 2:
            # Intractable, skip group
            continue
        out += roc_auc_score(y_true[i:j], y_pred[i:j])
        valid_user_count += 1
        i = j
    out /= valid_user_count
    
    return out

def lgb_mean_user_auc(y_pred, train_data):
    out = mean_user_auc(train_data.label, y_pred, train_data.group)
    return 'mean_user_auc', out, True


def train_lgb(df, params):
    X = df.iloc[:, 1:-1]
    groups = df.iloc[:, 0]
    y = df.iloc[:, -1]
    
    preds = np.zeros_like(y, dtype=float)
    models = []
    for train_idx, test_idx in GroupKFold(5).split(X, y, groups):
        train_group = groups.iloc[train_idx].groupby(groups.iloc[train_idx].values).count().tolist()
        test_group = groups.iloc[test_idx].groupby(groups.iloc[test_idx].values).count().tolist()

        train_ds = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx], group=train_group)
        test_ds = lgb.Dataset(X.iloc[test_idx], y.iloc[test_idx], group=test_group)

        # We can use early stopping because of the holdout set existence ("./data/test.csv")
        # In a real world application we would tune the model on a CV and 
        # measure the best model performance on the holdout
        # However, 100 boosting rounds is too few and the model is still underfit
        model = lgb.train(
            params, 
            train_set=train_ds, 
            valid_sets=test_ds, 
            fobj=None, 
            feval=lgb_mean_user_auc, 
            verbose_eval=5, 
            early_stopping_rounds=25
        )
        models.append(model)
        preds[test_idx] = model.predict(X.iloc[test_idx])
        
    return preds, models

## Ranker

In [4]:
# LGB params from kaggle, slightly modified
params = {
    'objective': 'lambdarank',
    'metric': 'None',
    'boosting': 'gbdt',
    'learning_rate': 0.2 ,
    'verbose': 0,
    'num_leaves': 100,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
    'num_rounds': 50,
}

In [5]:
preds, models = train_lgb(sorted_df, params)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.639422
[10]	valid_0's mean_user_auc: 0.645121
[15]	valid_0's mean_user_auc: 0.647666
[20]	valid_0's mean_user_auc: 0.64917
[25]	valid_0's mean_user_auc: 0.650575
[30]	valid_0's mean_user_auc: 0.651076
[35]	valid_0's mean_user_auc: 0.651168
[40]	valid_0's mean_user_auc: 0.651078
[45]	valid_0's mean_user_auc: 0.65108
[50]	valid_0's mean_user_auc: 0.650854
Did not meet early stopping. Best iteration is:
[33]	valid_0's mean_user_auc: 0.65138




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.63767
[10]	valid_0's mean_user_auc: 0.642331
[15]	valid_0's mean_user_auc: 0.645261
[20]	valid_0's mean_user_auc: 0.646927
[25]	valid_0's mean_user_auc: 0.647716
[30]	valid_0's mean_user_auc: 0.648157
[35]	valid_0's mean_user_auc: 0.648419
[40]	valid_0's mean_user_auc: 0.648462
[45]	valid_0's mean_user_auc: 0.648626
[50]	valid_0's mean_user_auc: 0.648456
Did not meet early stopping. Best iteration is:
[41]	valid_0's mean_user_auc: 0.648676




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.639359
[10]	valid_0's mean_user_auc: 0.645089
[15]	valid_0's mean_user_auc: 0.648028
[20]	valid_0's mean_user_auc: 0.649791
[25]	valid_0's mean_user_auc: 0.650263
[30]	valid_0's mean_user_auc: 0.650451
[35]	valid_0's mean_user_auc: 0.65089
[40]	valid_0's mean_user_auc: 0.650706
[45]	valid_0's mean_user_auc: 0.650915
[50]	valid_0's mean_user_auc: 0.650692
Did not meet early stopping. Best iteration is:
[37]	valid_0's mean_user_auc: 0.651115




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.636178
[10]	valid_0's mean_user_auc: 0.642076
[15]	valid_0's mean_user_auc: 0.644857
[20]	valid_0's mean_user_auc: 0.646914
[25]	valid_0's mean_user_auc: 0.647965
[30]	valid_0's mean_user_auc: 0.64833
[35]	valid_0's mean_user_auc: 0.649095
[40]	valid_0's mean_user_auc: 0.649119
[45]	valid_0's mean_user_auc: 0.649523
[50]	valid_0's mean_user_auc: 0.649354
Did not meet early stopping. Best iteration is:
[47]	valid_0's mean_user_auc: 0.649655




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.639239
[10]	valid_0's mean_user_auc: 0.645243
[15]	valid_0's mean_user_auc: 0.647813
[20]	valid_0's mean_user_auc: 0.649942
[25]	valid_0's mean_user_auc: 0.651406
[30]	valid_0's mean_user_auc: 0.651232
[35]	valid_0's mean_user_auc: 0.650999
[40]	valid_0's mean_user_auc: 0.65122
[45]	valid_0's mean_user_auc: 0.650813
[50]	valid_0's mean_user_auc: 0.649839
Did not meet early stopping. Best iteration is:
[28]	valid_0's mean_user_auc: 0.651464


Померяем ROC-AUC (как в соревновании на кэгле) и Mean User ROC-AUC

In [8]:
# OOF ROC-AUC
roc_auc_score(sorted_df.iloc[:, -1], preds)

0.678172545848034

In [10]:
# Mean User ROC-AUC
mean_user_auc(sorted_df.iloc[:, -1], preds, sorted_df.groupby('msno')['song_id'].count().tolist())

0.6506320865055217

In [11]:
with open('oof_lgbm_models.pkl', 'wb') as f:
    pickle.dump(models, f)