In [1]:
import pandas as pd
import numpy as np
import collections
import random
from collections import Counter

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
%matplotlib inline

In [185]:
# Read the raw train / validation / test splits from the local data folder.
train, valid, test = [
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/validation.csv', './data/test.csv')
]

In [186]:
# Stack the feature columns of all three splits (label removed from the two
# labelled ones) so that encoders fitted on X_all see the values of every split.
# NOTE: no ignore_index is used, so each split keeps its own row labels and
# labels repeat across splits; later cells recover the splits by position.
feature_frames = [
    train.drop('click', axis=1),
    valid.drop('click', axis=1),
    test,
]
X_all = pd.concat(feature_frames, axis=0)
X_all.shape

(3038281, 24)

In [203]:
# Columns removed before modelling, grouped by the reason for dropping them.
# drop useless columns
drop_cols = ['bidprice', 'payprice', 'bidid', 'userid', 'url', 'urlid']
# for simplification, drop many-value features
smp_drop_cols = ['IP', 'domain', 'slotid', 'usertag']
# for simplification, drop categorical features
cat_feats = ['useragent', 'adexchange', 'slotvisibility', 'slotformat', 'creative', 'keypage']

# Remove each group in turn from both labelled splits.
train_data, valid_data = train, valid
for col_group in (drop_cols, smp_drop_cols, cat_feats):
    train_data = train_data.drop(col_group, axis=1)
    valid_data = valid_data.drop(col_group, axis=1)

### Deal with many-valued features

### usertag

In [60]:
# Split each comma-separated usertag string of X_all into a list of tags.
usertag = X_all['usertag'].str.split(',')

In [69]:
from sklearn.preprocessing import MultiLabelBinarizer

In [70]:
# Encoder used below to turn the per-row tag lists into an indicator matrix.
mlb = MultiLabelBinarizer()

In [73]:
# Fitted on the tag lists of all splits at once (they come from X_all),
# giving one row per X_all row and one column per distinct tag.
usertag_dm = mlb.fit_transform(usertag)

In [100]:
# Sanity check: (rows of X_all, number of distinct tags).
usertag_dm.shape

(3038281, 69)

In [188]:
def convt_cat(feat, data=None):
    """
    Convert the values of a categorical feature to integer category labels.

    input:
    @feat(str): name of the feature to be converted
    @data(DataFrame, optional): frame holding the column `feat`; when omitted,
        the notebook-level `X_all` (all splits stacked) is used
    return:
    @feat_int(Series of category dtype): same index and name as the source
        column, with the categories relabelled 0..k-1 in the order pandas
        assigned them; missing values stay missing
    """
    if data is None:
        data = X_all

    cat_series = pd.Series(data[feat], dtype='category', name=feat)
    # Count the categories pandas actually created. Counting distinct raw
    # values instead over-counts when the column holds NaN (NaN is never a
    # category), which makes rename_categories raise on the length mismatch.
    num_cat = len(cat_series.cat.categories)
    feat_int = cat_series.cat.rename_categories(list(range(num_cat)))

    return feat_int
   

In [189]:
def lgb_data(train_data, valid_data):
    """
    Build the LightGBM training / validation Dataset pair.

    Both frames must carry the label in a 'click' column; every other column
    is passed on as a feature. The validation Dataset references the training
    one. Returns (training Dataset, validation Dataset).
    """
    label_col = 'click'

    train_y = train_data[label_col].values
    valid_y = valid_data[label_col].values
    train_X = train_data.drop(label_col, axis=1)
    valid_X = valid_data.drop(label_col, axis=1)

    # categorical_feature is not passed, so LightGBM's default handling applies.
    train_set = lgb.Dataset(train_X, train_y, free_raw_data=False)
    valid_set = lgb.Dataset(valid_X, valid_y, reference=train_set, free_raw_data=False)

    return train_set, valid_set

In [190]:
# specify configurations as dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # gradient-boosted decision trees
    # NOTE(review): 'click' looks like a 0/1 label, yet a regression objective
    # is used; kept unchanged so results stay comparable with the recorded
    # runs — consider 'binary'.
    'objective': 'regression',
    # Ordered list instead of a set: the iteration order of a set of strings
    # is not stable across interpreter sessions, which made the metric order
    # non-deterministic. The order matches the recorded logs.
    'metric': ['auc', 'l2'],
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

### result for plain gbm

In [191]:
# Baseline datasets: only the columns left after the drops above.
lgb_train, lgb_valid = lgb_data(train_data, valid_data)

In [192]:
# Early stopping is requested through the callback API: the
# `early_stopping_rounds` keyword of lgb.train was removed in LightGBM 4.0,
# while the callback form is also accepted by older releases.
gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=[lgb_valid],
                callbacks=[lgb.early_stopping(5)])

[1]	valid_0's auc: 0.596453	valid_0's l2: 0.000664171
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.601192	valid_0's l2: 0.000664141
[3]	valid_0's auc: 0.618365	valid_0's l2: 0.000664083
[4]	valid_0's auc: 0.613268	valid_0's l2: 0.00066407
[5]	valid_0's auc: 0.619763	valid_0's l2: 0.00066406
[6]	valid_0's auc: 0.619762	valid_0's l2: 0.000664042
[7]	valid_0's auc: 0.618835	valid_0's l2: 0.000664015
[8]	valid_0's auc: 0.621834	valid_0's l2: 0.000663998
[9]	valid_0's auc: 0.621852	valid_0's l2: 0.000663992
[10]	valid_0's auc: 0.621792	valid_0's l2: 0.000663997
[11]	valid_0's auc: 0.621805	valid_0's l2: 0.000664006
[12]	valid_0's auc: 0.632818	valid_0's l2: 0.000664013
[13]	valid_0's auc: 0.633355	valid_0's l2: 0.000664023
[14]	valid_0's auc: 0.633712	valid_0's l2: 0.000664036
Early stopping, best iteration is:
[9]	valid_0's auc: 0.621852	valid_0's l2: 0.000663992


### result for plain gbm with basic categorical features

In [221]:
# # simpliest df
# basic_cat_feats = ['city']
# for cat in basic_cat_feats:
#     train_data[cat] = train_data[cat].astype('category')
#     valid_data[cat] = valid_data[cat].astype('category')

# # train_data.dtypes, valid_data.dtypes

In [26]:
# # 0.598
# lgb_train, lgb_valid = lgb_data(train_data, valid_data)
# gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=lgb_valid, early_stopping_rounds=5)



[1]	valid_0's auc: 0.50027	valid_0's l2: 0.000664198
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.594934	valid_0's l2: 0.00066416
[3]	valid_0's auc: 0.598325	valid_0's l2: 0.00066415
[4]	valid_0's auc: 0.598211	valid_0's l2: 0.000664168
[5]	valid_0's auc: 0.597553	valid_0's l2: 0.00066418
[6]	valid_0's auc: 0.601807	valid_0's l2: 0.000664186
[7]	valid_0's auc: 0.600082	valid_0's l2: 0.000664196
[8]	valid_0's auc: 0.603028	valid_0's l2: 0.000664197
Early stopping, best iteration is:
[3]	valid_0's auc: 0.598325	valid_0's l2: 0.00066415


### add basic categorical features

In [204]:
def add_cat(feat_names, train, valid, cat=True, evl=False):
    """
    Add extra categorical features cumulatively, one new column per feature.

    Each feature is integer-encoded with convt_cat() and the rows belonging
    to the train / validation split are appended as a new column. Optionally
    a LightGBM model is trained after every addition so the effect of each
    feature on the validation metrics can be inspected.

    input:
    @feat_names(list of str): names of the features to add, in order
    @train, valid(df): labelled frames to extend; the passed frames themselves
        are not modified
    @cat(bool): if True keep the added feature as pandas category dtype,
        otherwise cast the added column to int
    @evl(bool): if True train and evaluate after each added feature
    output:
    @train, valid(df): data after concatenating all requested features
    Prints the name of each feature (plus the training log when evl=True).
    """
    # Row counts never change below, so the split boundaries are fixed.
    n_train = train.shape[0]
    n_valid = valid.shape[0]

    for feat_name in feat_names:
        print('Add feature: {}'.format(feat_name))
        feat_int = convt_cat(feat_name)

        # convt_cat() encodes X_all, which stacks train, validation and test
        # in that order, so the splits are recovered by position. The slices
        # are re-labelled with the target frames' indices so the column-wise
        # concat pairs rows by position rather than depending on whatever
        # index labels X_all happens to carry.
        train_col = pd.Series(feat_int.iloc[:n_train].values,
                              index=train.index, name=feat_name)
        valid_col = pd.Series(feat_int.iloc[n_train:n_train + n_valid].values,
                              index=valid.index, name=feat_name)

        # concat additional column
        train = pd.concat([train, train_col], axis=1)
        valid = pd.concat([valid, valid_col], axis=1)

        if not cat:
            # Only the column just added needs converting; the other columns
            # are left exactly as they were passed in.
            train[feat_name] = train[feat_name].astype(int)
            valid[feat_name] = valid[feat_name].astype(int)

        if evl:
            lgb_train, lgb_valid = lgb_data(train, valid)

            # Callback form of early stopping: the `early_stopping_rounds`
            # keyword of lgb.train was removed in LightGBM 4.0.
            lgb.train(params,
                      lgb_train,
                      num_boost_round=20,
                      valid_sets=[lgb_valid],
                      callbacks=[lgb.early_stopping(5)])

    return train, valid
    

In [182]:
# # useful cat_feats: # 0.714
# # adexchange is disposable
# cat_feats = ['useragent', 'slotvisibility', 'slotformat', 'creative', 'keypage']
# # cat_feats = ['useragent', 'adexchange', 'slotvisibility', 'slotformat', 'creative', 'keypage']
# train_concated, v_contated = add_cat(cat_feats, train_data, valid_data)

In [205]:
### all cat_feats # 0.712
cat_feats = ['useragent', 'adexchange', 'slotvisibility', 'slotformat', 'creative', 'keypage']

# NOTE: train_data / valid_data are overwritten with their extended versions,
# so re-running this cell on its own appends the same columns a second time.
train_data, valid_data = add_cat(cat_feats, train_data, valid_data, evl=False)

Add feature: useragent
Add feature: adexchange
Add feature: slotvisibility
Add feature: slotformat
Add feature: creative
Add feature: keypage


In [53]:
# # experiment: set cat as int 0.712
# # ??? why there is no significant change regarding performance?
# cat_feats = ['useragent', 'adexchange', 'slotvisibility', 'slotformat', 'creative', 'keypage']

# train_concated, v_contated = add_cat(cat_feats, train_data, valid_data, cat=False)

Add feature: useragent
[1]	valid_0's auc: 0.684079	valid_0's l2: 0.000664095
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.683772	valid_0's l2: 0.000664024
[3]	valid_0's auc: 0.683711	valid_0's l2: 0.000663947
[4]	valid_0's auc: 0.683674	valid_0's l2: 0.000663901
[5]	valid_0's auc: 0.683266	valid_0's l2: 0.000663864
[6]	valid_0's auc: 0.683416	valid_0's l2: 0.000663836
Early stopping, best iteration is:
[1]	valid_0's auc: 0.684079	valid_0's l2: 0.000664095
Add feature: adexchange
[1]	valid_0's auc: 0.668821	valid_0's l2: 0.000664121
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.668645	valid_0's l2: 0.000664055
[3]	valid_0's auc: 0.682695	valid_0's l2: 0.000663976
[4]	valid_0's auc: 0.682698	valid_0's l2: 0.000663927
[5]	valid_0's auc: 0.682872	valid_0's l2: 0.000663884
[6]	valid_0's auc: 0.682431	valid_0's l2: 0.000663879
[7]	valid_0's auc: 0.682593	valid_0's l2: 0.000663819
[8]	valid_0's auc: 0.682891	valid_0's l2

### add usertag

In [206]:
# `usertag_df` is not created by any remaining cell, so it is built here from
# the indicator matrix fitted above: one column per tag, rows in X_all order.
# NOTE(review): the 'usertag_' column prefix is an assumption — the cell that
# originally built this frame is gone; confirm the intended column names.
usertag_df = pd.DataFrame(
    usertag_dm,
    columns=['usertag_{}'.format(tag) for tag in mlb.classes_],
)

# X_all stacks train, validation and test in that order, so the splits are
# recovered by position; the validation slice is re-indexed from 0 so it lines
# up with valid_data in the column-wise concat that follows.
usertag_train = usertag_df.iloc[:train.shape[0]]
usertag_valid = usertag_df.iloc[train.shape[0]: train.shape[0] + valid.shape[0]].reset_index(drop=True)

In [207]:
# Append the usertag indicator columns. `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally in pd.concat.
# NOTE: train_data / valid_data are overwritten, so re-running this cell on
# its own appends the same columns a second time.
train_data = pd.concat([train_data, usertag_train], axis=1)
valid_data = pd.concat([valid_data, usertag_valid], axis=1)

In [208]:
# Sanity check: both frames should end up with the same number of columns.
train_data.shape, valid_data.shape

((2430981, 84), (303925, 84))

In [209]:
lgb_train, lgb_valid = lgb_data(train_data, valid_data)
# Callback form of early stopping: the `early_stopping_rounds` keyword of
# lgb.train was removed in LightGBM 4.0; the callback also works on older releases.
gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=[lgb_valid],
                callbacks=[lgb.early_stopping(5)])



[1]	valid_0's auc: 0.690401	valid_0's l2: 0.000658959
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.765204	valid_0's l2: 0.000653869
[3]	valid_0's auc: 0.765207	valid_0's l2: 0.000649685
[4]	valid_0's auc: 0.76668	valid_0's l2: 0.000649383
[5]	valid_0's auc: 0.765719	valid_0's l2: 0.000645757
[6]	valid_0's auc: 0.766045	valid_0's l2: 0.00064172
[7]	valid_0's auc: 0.766303	valid_0's l2: 0.000638934
[8]	valid_0's auc: 0.766314	valid_0's l2: 0.000634889
[9]	valid_0's auc: 0.766333	valid_0's l2: 0.000631425
Early stopping, best iteration is:
[4]	valid_0's auc: 0.76668	valid_0's l2: 0.000649383


### add domain

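Adding the sparse `domain` feature lowers the validation AUC to about 0.70 (it was about 0.77 with the usertag columns alone).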

In [148]:
# Integer-encode `domain` over all rows of X_all; the result is a category-dtype Series.
domain_int = convt_cat('domain')

In [210]:
# Append the encoded domain column to temporary copies of the feature frames.
# - `.iloc` makes the positional split of the X_all-ordered column explicit.
# - `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
domain_col = domain_int.rename('domain')
train_tmp = pd.concat([train_data, domain_col.iloc[:train.shape[0]]], axis=1)
valid_tmp = pd.concat([valid_data, domain_col.iloc[train.shape[0]:train.shape[0] + valid.shape[0]]], axis=1)

In [212]:
lgb_train, lgb_valid = lgb_data(train_tmp, valid_tmp)
# Callback form of early stopping: the `early_stopping_rounds` keyword of
# lgb.train was removed in LightGBM 4.0; the callback also works on older releases.
gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=[lgb_valid],
                callbacks=[lgb.early_stopping(5)])



[1]	valid_0's auc: 0.700645	valid_0's l2: 0.000658923
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 0.701611	valid_0's l2: 0.000654044
[3]	valid_0's auc: 0.701606	valid_0's l2: 0.000649627
[4]	valid_0's auc: 0.701442	valid_0's l2: 0.000645991
[5]	valid_0's auc: 0.700951	valid_0's l2: 0.000642251
[6]	valid_0's auc: 0.700933	valid_0's l2: 0.000639166
[7]	valid_0's auc: 0.700822	valid_0's l2: 0.000636072
Early stopping, best iteration is:
[2]	valid_0's auc: 0.701611	valid_0's l2: 0.000654044


### CTR contest

In [174]:
# Load the feature file used for the CTR-contest experiment below
# (presumably a pre-built feature matrix for the contest test set — confirm).
ctr_test = pd.read_csv('./data/test_feature_03-21.csv')

In [175]:
# (rows, columns) of the loaded feature file.
ctr_test.shape

(18371, 29955)

In [176]:
from sklearn.model_selection import train_test_split

In [178]:
# Fix the seed so the split (and everything trained on it) is reproducible;
# without random_state every run shuffles the rows differently.
X_train_ctr, X_valid_ctr = train_test_split(ctr_test, random_state=42)
X_train_ctr.shape, X_valid_ctr.shape

((13778, 29955), (4593, 29955))

In [180]:
# Placeholder labels: every row is labelled 0.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
y_train_ctr = np.zeros(X_train_ctr.shape[0], dtype=int)
y_valid_ctr = np.zeros(X_valid_ctr.shape[0], dtype=int)

In [181]:
lgb_train_ctr = lgb.Dataset(X_train_ctr, y_train_ctr, free_raw_data=False) #  ,feature_name=X_train.columns
# The validation set must reference the training Dataset it is evaluated
# against; `lgb_train` belongs to the click model above and is built from a
# different feature frame.
lgb_valid_ctr = lgb.Dataset(X_valid_ctr, y_valid_ctr, reference=lgb_train_ctr, free_raw_data=False)

In [182]:
# %%time
# train
# NOTE: every label in this experiment is 0 (see the label cell above), so the
# logged auc / l2 values cannot measure model quality.
# Callback form of early stopping: the `early_stopping_rounds` keyword of
# lgb.train was removed in LightGBM 4.0; the callback also works on older releases.
gbm = lgb.train(params,
                lgb_train_ctr,
#                 num_boost_round=20,
                num_boost_round=10,
                valid_sets=[lgb_valid_ctr],
                callbacks=[lgb.early_stopping(5)])

[1]	valid_0's auc: 1	valid_0's l2: 0
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's auc: 1	valid_0's l2: 0
[3]	valid_0's auc: 1	valid_0's l2: 0
[4]	valid_0's auc: 1	valid_0's l2: 0
[5]	valid_0's auc: 1	valid_0's l2: 0
[6]	valid_0's auc: 1	valid_0's l2: 0
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's l2: 0
