In [2]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import gc



In [3]:
# read_file

path = 'dataset/'
start_time = time.time()
train = pd.read_csv(path+"train.csv", skiprows=160000000, nrows=40000000)
test = pd.read_csv(path+"test.csv")
train.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']
print('[{}] Finished loading data'.format(time.time() - start_time))
train.head()

  interactivity=interactivity, compiler=compiler, result=result)


[97.2784919739] Finished loading data


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,11846,12,1,13,259,2017-11-09 08:17:26,,0
1,5147,19,0,0,347,2017-11-09 08:17:26,,0
2,11782,9,1,8,127,2017-11-09 08:17:26,,0
3,33867,35,1,19,21,2017-11-09 08:17:26,2017-11-09 09:05:37,1
4,110589,3,1,23,280,2017-11-09 08:17:26,,0


In [4]:
def dataPreProcessTime(df):
    df['datetime'] = pd.to_datetime(df['click_time'])
    df['dow']      = df['datetime'].dt.dayofweek.astype(int)
    df['month']    = df['datetime'].dt.month.astype(int)
    df['hour']     = df['datetime'].dt.hour.astype('uint8')
    df['day']      = df['datetime'].dt.day.astype('uint8')
    #df.drop(['click_time','datetime'], axis=1, inplace=True)
    return df

def feature_clicksofip(df):
    print("feature count ip")
    ip_count = df.groupby(['ip'])['os'].count().reset_index()
    ip_count.columns = ['ip', 'clicks_by_ip']
    df = pd.merge(df, ip_count, on='ip', how='left', sort=False)
    df['clicks_by_ip'] = df['clicks_by_ip'].astype('uint16')
    #merge.drop('ip', axis=1, inplace=True)
    del ip_count
    gc.collect()
    return df

def feature_ipdayhour(df):
    print("feature count ip_day_hour: ip+time feature")
    ipdayhour = df[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_day_hour'})
    df = df.merge(ipdayhour, on=['ip','day','hour'], how='left')
    del ipdayhour
    gc.collect()
    return df

def feature_ipapp(df):
    print("feature ip_app ")
    ip_app = df[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app'})
    df = df.merge(ip_app, on=['ip','app'], how='left')
    del ip_app
    gc.collect()
    return df

#def feature_ipappos(df):
#    print("feature ip_app_os")
#    ipappos = df[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os'})
#    df = df.merge(ipappos, on=['ip','app', 'os'], how='left')
#    del ipappos
#    gc.collect()
#    return df

#def feature_ipappchan(df):
#    print("feature ip_app_chan")
#    ipappchan = df[['ip','app', 'channel', 'os']].groupby(by=['ip', 'app', 'channel'])[['os']].count().reset_index().rename(index=str, columns={'os': 'ip_app_channel'})
#    df = df.merge(ipappchan, on=['ip','app', 'channel'], how='left')
#    del ipappchan
#    gc.collect()
#    return df 

def feature_appchan(df):
    print("feature app_channel ")
    ip_app = df[['os', 'app', 'channel']].groupby(by=['channel', 'app'])[['os']].count().reset_index().rename(index=str, columns={'os': 'app_chan'})
    df = df.merge(ip_app, on=['channel','app'], how='left')
    del ip_app
    gc.collect()
    return df

# Adding features with var and mean hour (inspired from nuhsikander's script)

def feature_ipdaychan_var_hour(df):
    print('grouping by : ip_day_chl_var_hour')
    ipdaychan = df[['ip','day','hour','channel']].groupby(by=['ip','day','channel'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_day_chan_var_hour'})
    df = df.merge(ipdaychan, on=['ip','day','channel'], how='left')
    del ipdaychan
    gc.collect()
    return df

def feature_ipappos_var_hour(df):
    print('grouping by : ip_app_os_var_hour')
    gp = df[['ip','app', 'os', 'hour']].groupby(by=['ip', 'app', 'os'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_app_os_var_hour'})
    df = df.merge(gp, on=['ip','app', 'os'], how='left')
    del gp
    gc.collect()
    return df

def feature_ipappchan_var_day(df):
    print('grouping by : ip_app_channel_var_day')
    gp = df[['ip','app', 'channel', 'day']].groupby(by=['ip', 'app', 'channel'])[['day']].var().reset_index().rename(index=str, columns={'day': 'ip_app_channel_var_day'})
    df = df.merge(gp, on=['ip','app', 'channel'], how='left')
    del gp 
    gc.collect()
    return df

def feature_ip_app_chl_mean_hour(df):
    print('grouping by : ip_app_chl_mean_hour')
    gp = df[['ip','app', 'channel','hour']].groupby(by=['ip', 'app', 'channel'])[['hour']].mean().reset_index().rename(index=str, columns={'hour': 'ip_app_channel_mean_hour'})
    df = df.merge(gp, on=['ip','app', 'channel'], how='left')
    del gp
    gc.collect()
    return df

def merge_feature(df):
    train = dataPreProcessTime(df)
    # combine feature
    train = feature_clicksofip(train)
    train = feature_ipdayhour(train)
    train = feature_ipapp(train)
    train = feature_ipappos(train)
    train = feature_ipappchan(train)
    train = feature_appchan(train)
    
    # combine the time feature
    train = feature_ipdaychan_var_hour(train)
    train = feature_ipappos_var_hour(train)
    train = feature_ipappchan_var_day(train)
    train = feature_ip_app_chl_mean_hour(train)
    del df
    return train

train=merge_feature(train)
train.head()

feature count ip
feature count ip_day_hour: ip+time feature
feature ip_app 
feature ip_app_os
feature ip_app_chan
feature app_channel 
grouping by : ip_day_chl_var_hour
grouping by : ip_app_os_var_hour
grouping by : ip_app_channel_var_day
grouping by : ip_app_chl_mean_hour


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,datetime,dow,...,clicks_by_ip,ip_day_hour,ip_app,ip_app_os,ip_app_channel,app_chan,ip_day_chan_var_hour,ip_app_os_var_hour,ip_app_channel_var_day,ip_app_channel_mean_hour
0,11846,12,1,13,259,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,...,136,13,23,7,5,378148,5.3,2.142857,0.0,10.4
1,5147,19,0,0,347,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,...,38794,5091,743,233,467,60929,3.344748,2.840351,0.0,9.920771
2,11782,9,1,8,127,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,...,833,51,127,6,35,619000,6.633613,6.666667,0.0,12.314286
3,33867,35,1,19,21,2017-11-09 08:17:26,2017-11-09 09:05:37,1,2017-11-09 08:17:26,3,...,457,55,1,1,1,4828,12.5,,,8.0
4,110589,3,1,23,280,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,...,3266,324,735,9,405,268708,5.391749,4.444444,0.0,11.397531


In [5]:
train_y = train['is_attributed']
# add inplace=True 直接替换
train_x = train.drop(['is_attributed', 'attributed_time','click_time','datetime'], axis=1)
train_x.head()

Unnamed: 0,ip,app,device,os,channel,dow,month,hour,day,clicks_by_ip,ip_day_hour,ip_app,ip_app_os,ip_app_channel,app_chan,ip_day_chan_var_hour,ip_app_os_var_hour,ip_app_channel_var_day,ip_app_channel_mean_hour
0,11846,12,1,13,259,3,11,8,9,136,13,23,7,5,378148,5.3,2.142857,0.0,10.4
1,5147,19,0,0,347,3,11,8,9,38794,5091,743,233,467,60929,3.344748,2.840351,0.0,9.920771
2,11782,9,1,8,127,3,11,8,9,833,51,127,6,35,619000,6.633613,6.666667,0.0,12.314286
3,33867,35,1,19,21,3,11,8,9,457,55,1,1,1,4828,12.5,,,8.0
4,110589,3,1,23,280,3,11,8,9,3266,324,735,9,405,268708,5.391749,4.444444,0.0,11.397531


In [6]:
sub = pd.DataFrame()
sub['click_id'] = test['click_id']
test.drop('click_id', axis=1, inplace=True)

print('[{}] Start XGBoost Training'.format(time.time() - start_time))

[425.188390017] Start XGBoost Training


In [7]:
params = {'eta': 0.1, 
          'max_depth': 4, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'alpha':4,
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 99, 
          'scale_pos_weight': 150,
          'silent': True}

In [8]:
del train
gc.collect()

700

In [9]:
from sklearn.model_selection import KFold
num_folds = 3
seed = 7 
kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

valid_result = []
for train_index, test_index in kf.split(train_x):
    evals_result = {}
    watchlist = [(xgb.DMatrix(train_x.ix[train_index], train_y.ix[train_index]), 'train'), (xgb.DMatrix(train_x.ix[test_index], train_y.ix[test_index]), 'valid')]
    dtrain = xgb.DMatrix(train_x.ix[train_index], train_y.ix[train_index])
    model = xgb.train(params, dtrain, 50 , watchlist, maximize=True, verbose_eval=40,evals_result=evals_result,early_stopping_rounds = 400)
    valid_result.append( evals_result['valid']['auc'][-1])
print('[{}] Finish XGBoost Training'.format(time.time() - start_time))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


[0]	train-auc:0.920384	valid-auc:0.920071
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 400 rounds.
[40]	train-auc:0.966182	valid-auc:0.965258
[49]	train-auc:0.968245	valid-auc:0.967207
[0]	train-auc:0.919926	valid-auc:0.920931
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 400 rounds.
[40]	train-auc:0.966618	valid-auc:0.966587
[49]	train-auc:0.968349	valid-auc:0.968236
[0]	train-auc:0.929462	valid-auc:0.928048
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 400 rounds.
[40]	train-auc:0.965749	valid-auc:0.965809
[49]	train-auc:0.967891	valid-auc:0.967795
[2120.5599668] Finish XGBoost Training


In [10]:
from xgboost import plot_importance
import matplotlib.pyplot as plt
plot_importance(model)
plt.gcf().savefig('feature_importance_xgb.png')

In [19]:
fscore=pd.Series(model.get_fscore()).sort_values(ascending = False)
print fscore

app                         190
app_chan                    127
clicks_by_ip                 76
channel                      75
device                       64
ip_app                       52
ip_day_hour                  50
os                           36
hour                         22
ip_app_os_var_hour           15
ip_app_os                    13
ip_app_channel               13
ip_app_channel_mean_hour     10
ip                            4
ip_day_chan_var_hour          3
dtype: int64


In [None]:
mean_result = reduce(lambda x, y: x + y, valid_result)/len(valid_result)
print mean_result

In [None]:
# deal with the test data
test=merge_feature(test)
test.drop(['click_time','datetime'], axis=1, inplace=True)
test.head()

In [None]:
sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('xgb_sub.csv',index=False)