In [1]:
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_parquet('./data/featurized.parquet')

# Ranking requires sorting by group
sorted_df = df.sort_values(by='msno').reset_index(drop=True)

In [3]:
def mean_user_auc(y_true, y_pred, group):
    out = 0.
    i = 0
    valid_user_count = 0
    for j in np.cumsum(group):
        if j - i < 2 or len(set(y_true[i:j])) < 2:
            # Intractable, skip group
            continue
        out += roc_auc_score(y_true[i:j], y_pred[i:j])
        valid_user_count += 1
        i = j
    out /= valid_user_count
    
    return out

def lgb_mean_user_auc(y_pred, train_data):
    out = mean_user_auc(train_data.label, y_pred, train_data.group)
    return 'mean_user_auc', out, True


def train_lgb(df, params):
    X = df.iloc[:, 1:-1]
    groups = df.iloc[:, 0]
    y = df.iloc[:, -1]
    
    preds = np.zeros_like(y, dtype=float)
    models = []
    for train_idx, test_idx in GroupKFold(5).split(X, y, groups):
        train_group = groups.iloc[train_idx].groupby(groups.iloc[train_idx].values).count().tolist()
        test_group = groups.iloc[test_idx].groupby(groups.iloc[test_idx].values).count().tolist()

        train_ds = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx], group=train_group)
        test_ds = lgb.Dataset(X.iloc[test_idx], y.iloc[test_idx], group=test_group)

        # We can use early stopping because of the holdout set existence ("./data/test.csv")
        # In a real world application we would tune the model on a CV and 
        # measure the best model performance on the holdout
        # However, 100 boosting rounds is too few and the model is still underfit
        model = lgb.train(
            params, 
            train_set=train_ds, 
            valid_sets=test_ds, 
            fobj=None, 
            feval=lgb_mean_user_auc, 
            verbose_eval=5, 
            early_stopping_rounds=25
        )
        models.append(model)
        preds[test_idx] = model.predict(X.iloc[test_idx])
        
    return preds, models

## Ranker

In [4]:
# LGB params from kaggle, slightly modified
params = {
    'objective': 'lambdarank',
    'metric': 'None',
    'boosting': 'gbdt',
    'learning_rate': 0.2 ,
    'verbose': 0,
    'num_leaves': 100,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
    'num_rounds': 50,
}

In [5]:
preds, models = train_lgb(sorted_df, params)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.648818
[10]	valid_0's mean_user_auc: 0.652595
[15]	valid_0's mean_user_auc: 0.654164
[20]	valid_0's mean_user_auc: 0.655628
[25]	valid_0's mean_user_auc: 0.655526
[30]	valid_0's mean_user_auc: 0.655562
[35]	valid_0's mean_user_auc: 0.655026
[40]	valid_0's mean_user_auc: 0.654969
[45]	valid_0's mean_user_auc: 0.654528
Early stopping, best iteration is:
[24]	valid_0's mean_user_auc: 0.655895




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.644931
[10]	valid_0's mean_user_auc: 0.648527
[15]	valid_0's mean_user_auc: 0.650729
[20]	valid_0's mean_user_auc: 0.651409
[25]	valid_0's mean_user_auc: 0.651837
[30]	valid_0's mean_user_auc: 0.65165
[35]	valid_0's mean_user_auc: 0.651303
[40]	valid_0's mean_user_auc: 0.651079
[45]	valid_0's mean_user_auc: 0.651221
[50]	valid_0's mean_user_auc: 0.651552
Did not meet early stopping. Best iteration is:
[28]	valid_0's mean_user_auc: 0.651916




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.649525
[10]	valid_0's mean_user_auc: 0.652198
[15]	valid_0's mean_user_auc: 0.652867
[20]	valid_0's mean_user_auc: 0.653329
[25]	valid_0's mean_user_auc: 0.653784
[30]	valid_0's mean_user_auc: 0.654665
[35]	valid_0's mean_user_auc: 0.654707
[40]	valid_0's mean_user_auc: 0.654195
[45]	valid_0's mean_user_auc: 0.654314
[50]	valid_0's mean_user_auc: 0.654078
Did not meet early stopping. Best iteration is:
[32]	valid_0's mean_user_auc: 0.654844




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.646433
[10]	valid_0's mean_user_auc: 0.651047
[15]	valid_0's mean_user_auc: 0.653625
[20]	valid_0's mean_user_auc: 0.655441
[25]	valid_0's mean_user_auc: 0.655438
[30]	valid_0's mean_user_auc: 0.65533
[35]	valid_0's mean_user_auc: 0.655184
[40]	valid_0's mean_user_auc: 0.65503
[45]	valid_0's mean_user_auc: 0.655101
Early stopping, best iteration is:
[24]	valid_0's mean_user_auc: 0.655668




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.




Training until validation scores don't improve for 25 rounds
[5]	valid_0's mean_user_auc: 0.64813
[10]	valid_0's mean_user_auc: 0.651782
[15]	valid_0's mean_user_auc: 0.653855
[20]	valid_0's mean_user_auc: 0.65525
[25]	valid_0's mean_user_auc: 0.655511
[30]	valid_0's mean_user_auc: 0.654815
[35]	valid_0's mean_user_auc: 0.65416
[40]	valid_0's mean_user_auc: 0.653746
[45]	valid_0's mean_user_auc: 0.653606
[50]	valid_0's mean_user_auc: 0.653556
Early stopping, best iteration is:
[25]	valid_0's mean_user_auc: 0.655511


Померяем ROC-AUC (как в соревновании на кэгле) и Mean User ROC-AUC

In [6]:
# OOF ROC-AUC
roc_auc_score(sorted_df.iloc[:, -1], preds)

0.6812341920424946

In [7]:
# Mean User ROC-AUC
mean_user_auc(sorted_df.iloc[:, -1], preds, sorted_df.groupby('msno')['song_id'].count().tolist())

0.6549377243665163

In [8]:
with open('oof_lgbm_models.pkl', 'wb') as f:
    pickle.dump(models, f)