In [1]:
import gc
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb

from utils import timer, reduce_memory_usage
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
with timer('Load training data types'):
    train_features_dtypes = pd.read_csv('train_features_dtypes_nov_7.csv')
    train_features_dtypes = train_features_dtypes.loc[train_features_dtypes['features'] != 'click_time']
    feature_dtypes_dict = dict(zip(train_features_dtypes['features'], train_features_dtypes['dtype']))

[Load training data types done in 0.009 s.]


In [3]:
with timer('Load training data'):
    train = pd.read_csv('train_features_nov_7.csv', dtype=feature_dtypes_dict, usecols=train_features_dtypes['features'].tolist())
    print(f'Training data size: {train.shape}')

Training data size: (15000000, 30)
[Load training data done in 47.246 s.]


In [4]:
train.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,click_time_year,click_time_month,click_time_day,click_time_hour,...,ip_os_device_count,ip_os_channel_count,ip_app_device_count,ip_app_channel_count,ip_os_app_device_count,ip_os_app_channel_count,ip_os_device_channel_count,ip_app_device_channel_count,ip_device_os_click_time_prev_diff,ip_app_channel_click_time_prev_diff
0,70712,2,1,32,237,0,2017,11,7,0,...,3,3,82,69,3,3,3,69,,
1,45892,3,1,25,424,0,2017,11,7,0,...,144,6,318,11,28,4,6,11,,
2,37774,8,2,13,145,0,2017,11,7,0,...,41,14,15,158,2,14,2,15,0.0,
3,41179,2,1,13,122,0,2017,11,7,0,...,8,1,15,3,1,1,1,2,,
4,83111,15,1,8,245,0,2017,11,7,0,...,6,2,50,8,2,2,2,8,,


In [5]:
categorical_features = ['ip', 'app', 'device', 'os', 'channel']
numerical_features = [c for c in train.columns if c.split('_')[-1] in ['count', 'diff']]
features = categorical_features.copy() + numerical_features.copy()
target = 'is_attributed'

In [6]:
train[categorical_features].nunique()

ip         76268
app          384
device       999
os           295
channel      175
dtype: int64

In [None]:
# "high cardinal"

In [7]:
categorical_features.remove('ip')

# Logistic Regression

In [8]:
train[numerical_features].isnull().sum()

ip_count                                     0
os_count                                     0
app_count                                    0
device_count                                 0
channel_count                                0
ip_os_count                                  0
ip_app_count                                 0
ip_device_count                              0
ip_channel_count                             0
ip_os_app_count                              0
ip_os_device_count                           0
ip_os_channel_count                          0
ip_app_device_count                          0
ip_app_channel_count                         0
ip_os_app_device_count                       0
ip_os_app_channel_count                      0
ip_os_device_channel_count                   0
ip_app_device_channel_count                  0
ip_device_os_click_time_prev_diff       929241
ip_app_channel_click_time_prev_diff    3759524
dtype: int64

In [9]:
train[numerical_features[-2:]].max()

ip_device_os_click_time_prev_diff      16123.0
ip_app_channel_click_time_prev_diff    16161.0
dtype: float64

In [None]:
max_val = train['ip_device_os_click_time_prev_diff'].max()
train['ip_device_os_click_time_prev_diff'] = train['ip_device_os_click_time_prev_diff'].fillna(max_val + 1)

max_val = train['ip_app_channel_click_time_prev_diff'].max()
train['ip_app_channel_click_time_prev_diff'] = train['ip_app_channel_click_time_prev_diff'].fillna(max_val + 1)

In [None]:
def run_lr_kfold_cv(X_train, y_train, X_test=None, classifier_params=None, **kwargs):
    """
    Run logistic regression with K folds cross validation
    If test data is given, the final prediction (probabilities) for test data are averaged over all folds
    :param X_train          : np.array, training data
    :param y_train          : np.array, training label
    :param X_test           : np.array, test data
    :param classifier_params: dict, classifier training parameters
    :param kwargs           : other parameters needed
    :return:
    """

    folds = kwargs.get('n_folds', 5)
    shuffle = kwargs.get('shuffle', True)
    random_state = kwargs.get('random_state', 2020)
    
    # use sklearn KFold
    kfold = KFold(n_splits=folds, shuffle=shuffle, random_state=random_state)
    oof_preds = np.zeros(y_train.shape[0])
    oof_preds_proba = np.zeros(y_train.shape[0])
    output = {}
    clf_list = []

    if classifier_params is None:
        classifier_params = {}
    if not isinstance(classifier_params, dict):
        raise ValueError('Argument `classifier_params` has to be dictionary or None by default.')

    for n_fold, (trn_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
        X_train_, X_val_ = X_train[trn_idx], X_train[val_idx]
        y_train_, y_val_ = y_train[trn_idx], y_train[val_idx]
        
        clf = LogisticRegression(**classifier_params)
        
        # sklearn estimators fixed methods: .fit() - training, .predict() - test / validation
        clf.fit(X_train_, y_train_)
        oof_preds[val_idx] = clf.predict(X_val_)
        oof_preds_proba[val_idx] = clf.predict_proba(X_val_)[:, 1]
        
        clf_list.append(clf)

    # save out-of-fold predictions
    output['oof_preds'] = oof_preds
    output['oof_preds_proba'] = oof_preds_proba

    # run prediction on test data
    if X_test is not None:
        test_preds_proba = np.zeros(X_test.shape[0])
        for clf_ in clf_list:
            test_preds_proba += clf_.predict_proba(X_test)[:, 1] / folds
        output['test_preds_proba'] = test_preds_proba

    return output

# LightGBM

In [14]:
def run_lgbm_kfold_cv(X_train, y_train, X_test=None, features=None, features_categorical=None, train_params=None, **kwargs):
    """
    Run lightgbm with K folds cross validation
    :param X_train              : np.array, training data
    :param y_train              : np.array, training label
    :param X_test               : np.array, test data
    :param train_params         : dict, lightgbm training parameters
    :param features             : list (str), features
    :param features_categorical : list (str), categorical features
    :param kwargs               : other parameters needed for running lightgbm
    :return:
                                - lightgbm model
                                - validation data predictions
                                - test data predictions
                                - feature importance (gain and split) dataframe
    """
    folds = kwargs.get('n_folds', 5)
    shuffle = kwargs.get('shuffle', True)
    random_state = kwargs.get('random_state', 2020)
    num_boost_round = kwargs.get('num_boost_round', 1000)
    early_stopping_rounds = kwargs.get('early_stopping_rounds', 200)
    verbose_eval = kwargs.get('verbose_eval', 100)
    kfold = KFold(n_splits=folds, shuffle=shuffle, random_state=random_state)
    oof_preds = np.zeros(y_train.shape[0])
    oof_preds_proba = np.zeros(y_train.shape[0])
    output = {}
    clf_list = []
    
    if train_params is None:
        train_params = {}

    for n_fold, (trn_idx, val_idx) in enumerate(kfold.split(X_train, y_train)):
        print(f'\n\n\tRunning fold {n_fold + 1} . . .\n\n')
        X_train_, X_val_ = X_train[trn_idx], X_train[val_idx]
        y_train_, y_val_ = y_train[trn_idx], y_train[val_idx]
        
        X_train_lgb = lgb.Dataset(X_train_, y_train_, feature_name=features, categorical_feature=features_categorical)
        X_val_lgb = lgb.Dataset(X_val_, y_val_, feature_name=features, categorical_feature=features_categorical)
        
        clf = lgb.train(train_params, train_set=X_train_lgb,
                        valid_sets=[X_train_lgb, X_val_lgb],
                        num_boost_round=num_boost_round,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=verbose_eval)
        oof_preds[val_idx] = clf.predict(X_val_, num_iteration=clf.best_iteration)
        clf_list.append(clf)

    # save out-of-fold predictions
    output['oof_preds'] = oof_preds

    # run prediction on test data
    if X_test is not None:
        test_preds_proba = np.zeros(X_test.shape[0])
        for clf_ in clf_list:
            test_preds_proba += clf_.predict(X_test) / folds
        output['test_preds_proba'] = test_preds_proba

    return output

In [11]:
X_train = train[features].values
y_train = train[target].values
print(X_train.shape, y_train.shape)

(15000000, 25) (15000000,)


In [12]:
# clear space
del train
gc.collect()

0

In [15]:
train_params = {
    'metric'           : 'auc',
    'learning_rate'    : 0.2,
    'max_depth'        : 5,
    'num_leaves'       : 31,
    'min_data_in_leaf' : 20,
    'feature_fraction' : 0.6,
    'data_random_seed' : 2020,
    'lambda_l1'        : 1,
    'lambda_l2'        : 1
}
output = run_lgbm_kfold_cv(X_train, y_train, features=features, features_categorical=categorical_features, train_params=train_params)



	Running fold 1 . . .


Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.973609	valid_1's auc: 0.971338
[200]	training's auc: 0.975318	valid_1's auc: 0.971585
[300]	training's auc: 0.976429	valid_1's auc: 0.971192
[400]	training's auc: 0.977025	valid_1's auc: 0.97036
Early stopping, best iteration is:
[230]	training's auc: 0.975793	valid_1's auc: 0.972141


	Running fold 2 . . .


Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.973017	valid_1's auc: 0.971526
[200]	training's auc: 0.975146	valid_1's auc: 0.972534
[300]	training's auc: 0.976251	valid_1's auc: 0.971295
Early stopping, best iteration is:
[157]	training's auc: 0.974671	valid_1's auc: 0.972854


	Running fold 3 . . .


Training until validation scores don't improve for 200 rounds.
[100]	training's auc: 0.97383	valid_1's auc: 0.971382
[200]	training's auc: 0.97547	valid_1's auc: 0.971595
[300]	training's auc: 0.976487	valid_1's auc: 0.970669
Early 