In [6]:
import pandas as pd
import numpy as np
import collections
import random
from collections import Counter

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer

import lightgbm as lgb
%matplotlib inline

In [2]:
# Read the train / validation / test splits from the local ./data directory.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/validation.csv', './data/test.csv')
)

In [48]:
# LightGBM training configuration passed to the lgb.train calls below.
params = dict(
    task='train',
    boosting_type='gbdt',
    # NOTE(review): the target column is named 'click' and AUC is tracked, yet an
    # L2 regression objective is used -- confirm whether 'binary' was intended.
    objective='regression',
    metric={'l2', 'auc'},  # both metrics show up in the validation log
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [20]:
def lgb_data(train_data, valid_data, label_col='click'):
    """
    Create the LightGBM datasets used for training and validation.

    input:
    @train_data(df): training rows, label column included
    @valid_data(df): validation rows, label column included
    @label_col(str): name of the target column, 'click' by default
    return:
    @lgb_train, lgb_valid(lgb.Dataset): training dataset and a validation
        dataset that references it
    raises:
    KeyError when label_col is missing from either frame
    """
    # Separate the target from the features; the input frames are not modified.
    y_train = train_data[label_col].values
    y_valid = valid_data[label_col].values
    X_train = train_data.drop(columns=label_col)
    X_valid = valid_data.drop(columns=label_col)

    # free_raw_data=False asks LightGBM to keep the raw data after the Dataset
    # is constructed; the validation set is tied to the training set through
    # `reference`.
    lgb_train = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
    lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train, free_raw_data=False)

    return lgb_train, lgb_valid

# 1. Start with simple features
auc = 0.62

In [4]:
# Stack the feature columns of all three splits (train, then valid, then test).
# The index is not reset here; later cells slice this frame by row position.
X_all = pd.concat([
    train.drop(columns='click'),
    valid.drop(columns='click'),
    test,
])
X_all.shape

(3038281, 24)

In [67]:
# Columns excluded from the first, simplest model -- three groups:
# 1) columns treated as useless for this task
drop_cols = ['bidprice','payprice', 'bidid', 'userid', 'url', 'urlid' ]
# 2) features with many distinct values, left out for simplicity
smp_drop_cols = ['IP', 'domain', 'slotid', 'usertag']
# 3) categorical features, left out for simplicity
cat_feats = ['useragent', 'adexchange', 'slotvisibility', 'slotformat', 'creative', 'keypage']

# Remove all three groups in a single pass per frame.
train_data = train.drop(columns=drop_cols + smp_drop_cols + cat_feats)
valid_data = valid.drop(columns=drop_cols + smp_drop_cols + cat_feats)

In [21]:
# Fit on the simple columns only; validation metrics are printed per round.
lgb_train, lgb_valid = lgb_data(train_data, valid_data)
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=20,
    valid_sets=lgb_valid,
    early_stopping_rounds=5,
)

[1]	valid_0's l2: 0.000664171	valid_0's auc: 0.596453
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.000664141	valid_0's auc: 0.601192
[3]	valid_0's l2: 0.000664083	valid_0's auc: 0.618365
[4]	valid_0's l2: 0.00066407	valid_0's auc: 0.613268
[5]	valid_0's l2: 0.00066406	valid_0's auc: 0.619763
[6]	valid_0's l2: 0.000664042	valid_0's auc: 0.619762
[7]	valid_0's l2: 0.000664015	valid_0's auc: 0.618835
[8]	valid_0's l2: 0.000663998	valid_0's auc: 0.621834
[9]	valid_0's l2: 0.000663992	valid_0's auc: 0.621852
[10]	valid_0's l2: 0.000663997	valid_0's auc: 0.621792
[11]	valid_0's l2: 0.000664006	valid_0's auc: 0.621805
[12]	valid_0's l2: 0.000664013	valid_0's auc: 0.632818
[13]	valid_0's l2: 0.000664023	valid_0's auc: 0.633355
[14]	valid_0's l2: 0.000664036	valid_0's auc: 0.633712
Early stopping, best iteration is:
[9]	valid_0's l2: 0.000663992	valid_0's auc: 0.621852


# 2. Add basic categorical features
auc = 0.71

In [22]:
def convt_cat(feat, data=None):
    """
    Convert the string values of a categorical feature to integer labels.

    input:
    @feat(str): name of the feature to be converted
    @data(df): frame holding the column; defaults to the notebook-level X_all
    return:
    @feat_int(Series of category): same index and name as the source column,
        with the categories relabelled 0..k-1 in the order pandas assigned
        them; missing values stay missing
    """
    if data is None:
        data = X_all

    cat_series = pd.Series(data[feat], dtype='category', name=feat)
    # Count the categories on the categorical itself: counting distinct raw
    # values would also count missing entries, which are not categories, and
    # rename_categories would then reject the mismatched length.
    num_cat = len(cat_series.cat.categories)
    feat_int = cat_series.cat.rename_categories(list(range(num_cat)))

    return feat_int
   

In [23]:
def add_cat(feat_names, train, valid, cat=True, evl=False):
    """
    Append integer-encoded categorical columns to train/valid, one by one.

    A model can optionally be fit after every appended column, so the printed
    validation metrics show how each feature changes the performance.

    input:
    @feat_names(list): names of the features to append, in order
    @train, valid(df): frames that receive the new columns
    @cat(bool): True keeps the appended column as a pandas category;
        False casts every column of both frames to int
    @evl(bool): if True, train and evaluate after each appended feature
    output:
    @train, valid(df): the frames with all requested features appended
    """
    for feat_name in feat_names:
        print('Add feature: {}'.format(feat_name))
        encoded = convt_cat(feat_name).rename(feat_name)

        # The encoded column is sliced by row position: the first
        # train.shape[0] rows go to train, the next valid.shape[0] rows to valid.
        train = pd.concat([train, encoded.iloc[:train.shape[0]]], axis=1)
        valid_start = train.shape[0]
        valid_stop = valid_start + valid.shape[0]
        valid = pd.concat([valid, encoded.iloc[valid_start:valid_stop]], axis=1)

        if not cat:
            # Plain-integer mode: cast every column of both frames.
            for col in train.columns:
                train[col] = train[col].astype(int)
                valid[col] = valid[col].astype(int)

        if evl:
            lgb_train, lgb_valid = lgb_data(train, valid)
            lgb.train(
                params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_valid,
                early_stopping_rounds=5,
            )

    return train, valid
    

In [68]:
# With all six categorical features the recorded AUC was 0.712;
# leaving out 'adexchange' gave 0.714, so that feature is dispensable.
cat_feats = ['useragent', 'adexchange', 'slotvisibility', 'slotformat', 'creative', 'keypage']
train_data, valid_data = add_cat(
    feat_names=cat_feats,
    train=train_data,
    valid=valid_data,
    evl=False,
)

Add feature: useragent
Add feature: adexchange
Add feature: slotvisibility
Add feature: slotformat
Add feature: creative
Add feature: keypage


In [25]:
# Refit after the categorical columns have been appended to train_data / valid_data.
lgb_train, lgb_valid = lgb_data(train_data, valid_data)
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=20,
    valid_sets=lgb_valid,
    early_stopping_rounds=5,
)



[1]	valid_0's l2: 0.00066412	valid_0's auc: 0.663687
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.000664064	valid_0's auc: 0.6584
[3]	valid_0's l2: 0.000664024	valid_0's auc: 0.686115
[4]	valid_0's l2: 0.000663917	valid_0's auc: 0.682625
[5]	valid_0's l2: 0.000663871	valid_0's auc: 0.685961
[6]	valid_0's l2: 0.000663766	valid_0's auc: 0.698957
[7]	valid_0's l2: 0.000663739	valid_0's auc: 0.703411
[8]	valid_0's l2: 0.000663664	valid_0's auc: 0.70478
[9]	valid_0's l2: 0.000663651	valid_0's auc: 0.699383
[10]	valid_0's l2: 0.000663628	valid_0's auc: 0.69923
[11]	valid_0's l2: 0.000663569	valid_0's auc: 0.699907
[12]	valid_0's l2: 0.000663524	valid_0's auc: 0.707592
[13]	valid_0's l2: 0.000663506	valid_0's auc: 0.707623
[14]	valid_0's l2: 0.000663504	valid_0's auc: 0.707667
[15]	valid_0's l2: 0.00066344	valid_0's auc: 0.71277
[16]	valid_0's l2: 0.000663451	valid_0's auc: 0.712491
[17]	valid_0's l2: 0.000663471	valid_0's auc: 0.712577
[18]	valid_0's l2: 0

# 3. Add features with special structure
auc = 0.79

### usertag*
0.71 -> 0.77

In [26]:
# Split each comma-separated usertag string into a list of tags.
usertag = X_all['usertag'].str.split(',')

In [69]:
# Binarize the list-valued usertag feature into indicator columns.
# Column names carry the column position (usertag_0, usertag_1, ...),
# not the original tag value.
mlb = MultiLabelBinarizer()
usertag_dm = mlb.fit_transform(usertag)
usertag_df = pd.DataFrame(data=usertag_dm)
usertag_df = usertag_df.add_prefix('usertag_')
usertag_df.shape

(3038281, 69)

In [70]:
# Preview the first rows of the binarized usertag frame.
usertag_df.head()

Unnamed: 0,usertag_0,usertag_1,usertag_2,usertag_3,usertag_4,usertag_5,usertag_6,usertag_7,usertag_8,usertag_9,...,usertag_59,usertag_60,usertag_61,usertag_62,usertag_63,usertag_64,usertag_65,usertag_66,usertag_67,usertag_68
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [71]:
# usertag_df follows the row order of X_all (train, valid, test), so the
# splits are recovered by position. The validation slice gets a fresh 0-based
# index.
usertag_train = usertag_df.iloc[:len(train)]
usertag_valid = usertag_df.iloc[len(train):len(train) + len(valid)].reset_index(drop=True)

In [73]:
# Append the usertag indicator columns column-wise.
# `axis` is passed by keyword: pandas 2.0 made every pd.concat argument except
# `objs` keyword-only, so the positional form `pd.concat(objs, 1)` fails there.
# NOTE: this cell overwrites train_data / valid_data, so running it twice
# appends the usertag columns twice.
train_data = pd.concat([train_data, usertag_train], axis=1)
valid_data = pd.concat([valid_data, usertag_valid], axis=1)

train_data.shape, valid_data.shape

((2430981, 84), (303925, 84))

In [34]:
# Refit with the usertag indicator columns included.
lgb_train, lgb_valid = lgb_data(train_data, valid_data)
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=20,
    valid_sets=lgb_valid,
    early_stopping_rounds=5,
)



[1]	valid_0's l2: 0.000658959	valid_0's auc: 0.690401
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.000653869	valid_0's auc: 0.765204
[3]	valid_0's l2: 0.000649685	valid_0's auc: 0.765207
[4]	valid_0's l2: 0.000649383	valid_0's auc: 0.76668
[5]	valid_0's l2: 0.000645757	valid_0's auc: 0.765719
[6]	valid_0's l2: 0.00064172	valid_0's auc: 0.766045
[7]	valid_0's l2: 0.000638934	valid_0's auc: 0.766303
[8]	valid_0's l2: 0.000634889	valid_0's auc: 0.766314
[9]	valid_0's l2: 0.000631425	valid_0's auc: 0.766333
Early stopping, best iteration is:
[4]	valid_0's l2: 0.000649383	valid_0's auc: 0.76668


### domain

0.77 -> 0.70

In [35]:
# Integer-encode the 'domain' column of X_all.
domain_int = convt_cat('domain')

In [36]:
# Try 'domain' on temporary copies so train_data / valid_data stay untouched.
# Rows are taken by position (.iloc), and `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally in pd.concat.
train_tmp = pd.concat(
    [train_data, domain_int.rename('domain').iloc[:train.shape[0]]],
    axis=1,
)
valid_tmp = pd.concat(
    [valid_data, domain_int.rename('domain').iloc[train.shape[0]:train.shape[0] + valid.shape[0]]],
    axis=1,
)

In [37]:
# Evaluate the candidate feature set held in the temporary frames.
lgb_train, lgb_valid = lgb_data(train_tmp, valid_tmp)
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=20,
    valid_sets=lgb_valid,
    early_stopping_rounds=5,
)



[1]	valid_0's l2: 0.000658923	valid_0's auc: 0.700645
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.000654044	valid_0's auc: 0.701611
[3]	valid_0's l2: 0.000649627	valid_0's auc: 0.701606
[4]	valid_0's l2: 0.000645991	valid_0's auc: 0.701442
[5]	valid_0's l2: 0.000642251	valid_0's auc: 0.700951
[6]	valid_0's l2: 0.000639166	valid_0's auc: 0.700933
[7]	valid_0's l2: 0.000636072	valid_0's auc: 0.700822
Early stopping, best iteration is:
[2]	valid_0's l2: 0.000654044	valid_0's auc: 0.701611


### slotid*
0.77 -> 0.79

In [49]:
# Integer-encode the 'slotid' column of X_all.
slotid_int = convt_cat('slotid')

In [74]:
# Keep 'slotid': append it to the working frames.
# Rows are taken by position (.iloc), and `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally in pd.concat.
# NOTE: this cell overwrites train_data / valid_data, so running it twice
# appends a second 'slotid' column.
train_data = pd.concat(
    [train_data, slotid_int.rename('slotid').iloc[:train.shape[0]]],
    axis=1,
)
valid_data = pd.concat(
    [valid_data, slotid_int.rename('slotid').iloc[train.shape[0]:train.shape[0] + valid.shape[0]]],
    axis=1,
)

In [62]:
# Evaluate the working frames now that 'slotid' has been appended.
lgb_train, lgb_valid = lgb_data(train_data, valid_data)
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=20,
    valid_sets=lgb_valid,
    early_stopping_rounds=5,
)



[1]	valid_0's l2: 0.000659133	valid_0's auc: 0.66821
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.000654246	valid_0's auc: 0.706725
[3]	valid_0's l2: 0.0006498	valid_0's auc: 0.706691
[4]	valid_0's l2: 0.000646493	valid_0's auc: 0.706612
[5]	valid_0's l2: 0.000642733	valid_0's auc: 0.707554
[6]	valid_0's l2: 0.000639673	valid_0's auc: 0.70844
[7]	valid_0's l2: 0.000636337	valid_0's auc: 0.708435
[8]	valid_0's l2: 0.000636135	valid_0's auc: 0.726657
[9]	valid_0's l2: 0.00063329	valid_0's auc: 0.726684
[10]	valid_0's l2: 0.000630925	valid_0's auc: 0.726246
[11]	valid_0's l2: 0.000628911	valid_0's auc: 0.726119
[12]	valid_0's l2: 0.000628566	valid_0's auc: 0.783827
[13]	valid_0's l2: 0.000626868	valid_0's auc: 0.783773
[14]	valid_0's l2: 0.00062506	valid_0's auc: 0.783715
[15]	valid_0's l2: 0.000623285	valid_0's auc: 0.782891
[16]	valid_0's l2: 0.000620188	valid_0's auc: 0.785003
[17]	valid_0's l2: 0.000618499	valid_0's auc: 0.784848
[18]	valid_0's l2: 

In [75]:
# List the columns currently present in the working training frame.
train_data.columns

Index(['click', 'weekday', 'hour', 'region', 'city', 'slotwidth', 'slotheight',
       'slotprice', 'advertiser', 'useragent', 'adexchange', 'slotvisibility',
       'slotformat', 'creative', 'keypage', 'usertag_0', 'usertag_1',
       'usertag_2', 'usertag_3', 'usertag_4', 'usertag_5', 'usertag_6',
       'usertag_7', 'usertag_8', 'usertag_9', 'usertag_10', 'usertag_11',
       'usertag_12', 'usertag_13', 'usertag_14', 'usertag_15', 'usertag_16',
       'usertag_17', 'usertag_18', 'usertag_19', 'usertag_20', 'usertag_21',
       'usertag_22', 'usertag_23', 'usertag_24', 'usertag_25', 'usertag_26',
       'usertag_27', 'usertag_28', 'usertag_29', 'usertag_30', 'usertag_31',
       'usertag_32', 'usertag_33', 'usertag_34', 'usertag_35', 'usertag_36',
       'usertag_37', 'usertag_38', 'usertag_39', 'usertag_40', 'usertag_41',
       'usertag_42', 'usertag_43', 'usertag_44', 'usertag_45', 'usertag_46',
       'usertag_47', 'usertag_48', 'usertag_49', 'usertag_50', 'usertag_51',
       'u

### IP
0.79 -> 0.76

#### add IP directly

In [76]:
# Integer-encode the full 'IP' string of X_all as a single categorical feature.
IP_int = convt_cat('IP')

In [77]:
# Try the encoded IP on temporary copies so train_data / valid_data stay untouched.
# Rows are taken by position (.iloc), and `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally in pd.concat.
train_tmp = pd.concat(
    [train_data, IP_int.rename('IP').iloc[:train.shape[0]]],
    axis=1,
)
valid_tmp = pd.concat(
    [valid_data, IP_int.rename('IP').iloc[train.shape[0]:train.shape[0] + valid.shape[0]]],
    axis=1,
)
# evaluate the chosen features
lgb_train, lgb_valid = lgb_data(train_tmp, valid_tmp)
gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=lgb_valid, early_stopping_rounds=5)



[1]	valid_0's l2: 0.000659013	valid_0's auc: 0.696071
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.000654388	valid_0's auc: 0.708236
[3]	valid_0's l2: 0.000649837	valid_0's auc: 0.708024
[4]	valid_0's l2: 0.000645928	valid_0's auc: 0.707736
[5]	valid_0's l2: 0.000642579	valid_0's auc: 0.707477
[6]	valid_0's l2: 0.000639189	valid_0's auc: 0.710534
[7]	valid_0's l2: 0.000636679	valid_0's auc: 0.710484
[8]	valid_0's l2: 0.00063637	valid_0's auc: 0.705502
[9]	valid_0's l2: 0.000633388	valid_0's auc: 0.758492
[10]	valid_0's l2: 0.000630339	valid_0's auc: 0.75844
[11]	valid_0's l2: 0.000630126	valid_0's auc: 0.758157
[12]	valid_0's l2: 0.000628068	valid_0's auc: 0.757998
[13]	valid_0's l2: 0.00062641	valid_0's auc: 0.757892
[14]	valid_0's l2: 0.000624935	valid_0's auc: 0.757946
Early stopping, best iteration is:
[9]	valid_0's l2: 0.000633388	valid_0's auc: 0.758492


#### Split IP then add it

In [78]:
# Raw IP strings. The following cell reassigns `IPs` to the split version.
IPs = X_all['IP']

In [89]:
# Split every IP string on '.' into a list of its parts.
IPs = X_all['IP'].str.split('.')

In [99]:
# IPs_df = IPs.apply(lambda l: pd.Series(l))

In [100]:
%%time
IPs_df = pd.DataFrame.from_items(zip(IPs.index, IPs.values)).T

CPU times: user 2min 10s, sys: 10.3 s, total: 2min 20s
Wall time: 2min 23s


In [113]:
# Convert the first IP part to a categorical.
# NOTE(review): the next cell repeats this conversion for columns 0-2 through
# column assignment, so this cell looks redundant -- confirm before removing.
IPs_df.iloc[:,0] = IPs_df.iloc[:,0].astype('category')

In [123]:
# Make the first three IP parts (columns 0, 1, 2) categorical.
for i in range(3):
    IPs_df[i] = IPs_df[i].astype('category')

In [125]:
# Discard the fourth IP part (column 3); only the first three parts are kept.
IPs_df = IPs_df.drop(columns=3)

In [127]:
# Give the three remaining IP-part columns descriptive names.
IPs_df.columns = ['ip_0', 'ip_1', 'ip_2']

In [133]:
# Try the split IP columns on temporary copies so train_data / valid_data stay untouched.
# Rows are taken by position (.iloc), and `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally in pd.concat.
train_tmp = pd.concat(
    [train_data, IPs_df.iloc[:train.shape[0]]],
    axis=1,
)
valid_tmp = pd.concat(
    [valid_data, IPs_df.iloc[train.shape[0]:train.shape[0] + valid.shape[0]]],
    axis=1,
)
# evaluate the chosen features
lgb_train, lgb_valid = lgb_data(train_tmp, valid_tmp)
gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=lgb_valid, early_stopping_rounds=5)



[1]	valid_0's l2: 0.000659197	valid_0's auc: 0.654795
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.000654376	valid_0's auc: 0.662717
[3]	valid_0's l2: 0.000650257	valid_0's auc: 0.667516
[4]	valid_0's l2: 0.000645978	valid_0's auc: 0.707328
[5]	valid_0's l2: 0.000642422	valid_0's auc: 0.707627
[6]	valid_0's l2: 0.000639211	valid_0's auc: 0.709892
[7]	valid_0's l2: 0.000635922	valid_0's auc: 0.710061
[8]	valid_0's l2: 0.000633444	valid_0's auc: 0.709739
[9]	valid_0's l2: 0.000631139	valid_0's auc: 0.709625
[10]	valid_0's l2: 0.000628142	valid_0's auc: 0.709642
[11]	valid_0's l2: 0.000626348	valid_0's auc: 0.709803
[12]	valid_0's l2: 0.000624843	valid_0's auc: 0.709744
Early stopping, best iteration is:
[7]	valid_0's l2: 0.000635922	valid_0's auc: 0.710061


# 4. Add hand-crafted features
Construct a pipeline, as this would also be useful for the CVR competition.

Index(['click', 'weekday', 'hour', 'region', 'city', 'slotwidth', 'slotheight',
       'slotprice', 'advertiser', 'useragent', 'adexchange', 'slotvisibility',
       'slotformat', 'creative', 'keypage', 'usertag_0', 'usertag_1',
       'usertag_2', 'usertag_3', 'usertag_4', 'usertag_5', 'usertag_6',
       'usertag_7', 'usertag_8', 'usertag_9', 'usertag_10', 'usertag_11',
       'usertag_12', 'usertag_13', 'usertag_14', 'usertag_15', 'usertag_16',
       'usertag_17', 'usertag_18', 'usertag_19', 'usertag_20', 'usertag_21',
       'usertag_22', 'usertag_23', 'usertag_24', 'usertag_25', 'usertag_26',
       'usertag_27', 'usertag_28', 'usertag_29', 'usertag_30', 'usertag_31',
       'usertag_32', 'usertag_33', 'usertag_34', 'usertag_35', 'usertag_36',
       'usertag_37', 'usertag_38', 'usertag_39', 'usertag_40', 'usertag_41',
       'usertag_42', 'usertag_43', 'usertag_44', 'usertag_45', 'usertag_46',
       'usertag_47', 'usertag_48', 'usertag_49', 'usertag_50', 'usertag_51',
       'u