In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb



In [2]:
# read_file

path = 'dataset/'
start_time = time.time()
train = pd.read_csv(path+"train.csv", skiprows=160000000, nrows=1000000)
test = pd.read_csv(path+"test.csv")
train.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']
print('[{}] Finished loading data'.format(time.time() - start_time))
train.head()

[48.3177630901] Finished loading data


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,11846,12,1,13,259,2017-11-09 08:17:26,,0
1,5147,19,0,0,347,2017-11-09 08:17:26,,0
2,11782,9,1,8,127,2017-11-09 08:17:26,,0
3,33867,35,1,19,21,2017-11-09 08:17:26,2017-11-09 09:05:37,1
4,110589,3,1,23,280,2017-11-09 08:17:26,,0


In [3]:
def dataPreProcessTime(df):
    df['datetime'] = pd.to_datetime(df['click_time'])
    df['dow']      = df['datetime'].dt.dayofweek.astype(int)
    df['month']    = df['datetime'].dt.month.astype(int)
    df['hour']     = df['datetime'].dt.hour.astype('uint8')
    df['day']      = df['datetime'].dt.day.astype('uint8')
    #df.drop(['click_time','datetime'], axis=1, inplace=True)
    return df

def feature_clicksofip(df):
    print("feature count ip")
    ip_count = df.groupby(['ip'])['os'].count().reset_index()
    ip_count.columns = ['ip', 'clicks_by_ip']
    merge = pd.merge(df, ip_count, on='ip', how='left', sort=False)
    merge['clicks_by_ip'] = merge['clicks_by_ip'].astype('uint16')
    #merge.drop('ip', axis=1, inplace=True)
    return merge

def feature_ipdayhour(df):
    print("feature count ip_day_hour: ip+time feature")
    ipdayhour = df[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_day_hour'})
    train_df = df.merge(ipdayhour, on=['ip','day','hour'], how='left')
    return train_df

def feature_ipapp(df):
    print("feature ip_app ")
    ip_app = df[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app'})
    train_df = df.merge(ip_app, on=['ip','app'], how='left')
    return train_df

def feature_ipappos(df):
    print("feature ip_app_os")
    ipappos = df[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os'})
    train_df = df.merge(ipappos, on=['ip','app', 'os'], how='left')
    return train_df

def feature_ipappchan(df):
    print("feature ip_app_chan")
    ipappchan = df[['ip','app', 'channel', 'os']].groupby(by=['ip', 'app', 'channel'])[['os']].count().reset_index().rename(index=str, columns={'os': 'ip_app_channel'})
    train_df = df.merge(ipappchan, on=['ip','app', 'channel'], how='left')
    return train_df 

def feature_appchan(df):
    print("feature app_channel ")
    ip_app = df[['os', 'app', 'channel']].groupby(by=['channel', 'app'])[['os']].count().reset_index().rename(index=str, columns={'os': 'app_chan'})
    train_df = df.merge(ip_app, on=['channel','app'], how='left')
    return train_df

# Adding features with var and mean hour (inspired from nuhsikander's script)

def feature_ipdaychan_var_hour(df):
    print('grouping by : ip_day_chl_var_hour')
    ipdaychan = df[['ip','day','hour','channel']].groupby(by=['ip','day','channel'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_day_chan_var_hour'})
    train_df = df.merge(ipdaychan, on=['ip','day','channel'], how='left')
    return train_df

def feature_ipappos_var_hour(df):
    print('grouping by : ip_app_os_var_hour')
    gp = df[['ip','app', 'os', 'hour']].groupby(by=['ip', 'app', 'os'])[['hour']].var().reset_index().rename(index=str, columns={'hour': 'ip_app_os_var_hour'})
    train_df = df.merge(gp, on=['ip','app', 'os'], how='left')
    return train_df

def feature_ipappchan_var_day(df):
    print('grouping by : ip_app_channel_var_day')
    gp = df[['ip','app', 'channel', 'day']].groupby(by=['ip', 'app', 'channel'])[['day']].var().reset_index().rename(index=str, columns={'day': 'ip_app_channel_var_day'})
    train_df = df.merge(gp, on=['ip','app', 'channel'], how='left')
    return train_df

def feature_ip_app_chl_mean_hour(df):
    print('grouping by : ip_app_chl_mean_hour')
    gp = df[['ip','app', 'channel','hour']].groupby(by=['ip', 'app', 'channel'])[['hour']].mean().reset_index().rename(index=str, columns={'hour': 'ip_app_channel_mean_hour'})
    train_df = df.merge(gp, on=['ip','app', 'channel'], how='left')
    return train_df

def merge_feature(df):
    train = dataPreProcessTime(df)
    # combine feature
    train = feature_clicksofip(train)
    train = feature_ipdayhour(train)
    train = feature_ipapp(train)
    train = feature_ipappos(train)
    train = feature_ipappchan(train)
    train = feature_appchan(train)
    
    # combine the time feature
    train = feature_ipdaychan_var_hour(train)
    train = feature_ipappos_var_hour(train)
    train = feature_ipappchan_var_day(train)
    train = feature_ip_app_chl_mean_hour(train)
    return train

train=merge_feature(train)
train.head()

feature count ip
feature count ip_day_hour: ip+time feature
feature ip_app 
feature ip_app_os
feature ip_app_chan
feature app_channel 
grouping by : ip_day_chl_var_hour
grouping by : ip_app_os_var_hour
grouping by : ip_app_channel_var_day
grouping by : ip_app_chl_mean_hour


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,datetime,dow,...,clicks_by_ip,ip_day_hour,ip_app,ip_app_os,ip_app_channel,app_chan,ip_day_chan_var_hour,ip_app_os_var_hour,ip_app_channel_var_day,ip_app_channel_mean_hour
0,11846,12,1,13,259,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,...,3,3,1,1,1,16721,,,,8
1,5147,19,0,0,347,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,...,2395,2395,91,30,61,6656,0.0,0.0,0.0,8
2,11782,9,1,8,127,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,...,19,19,5,2,3,22123,0.0,0.0,0.0,8
3,33867,35,1,19,21,2017-11-09 08:17:26,2017-11-09 09:05:37,1,2017-11-09 08:17:26,3,...,16,16,1,1,1,131,,,,8
4,110589,3,1,23,280,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,...,231,231,38,2,25,8996,0.0,0.0,0.0,8


In [5]:
train_y = train['is_attributed']
# add inplace=True 直接替换
train_x = train.drop(['is_attributed', 'attributed_time','click_time','datetime'], axis=1)
train_x.head()

Unnamed: 0,ip,app,device,os,channel,dow,month,hour,day,clicks_by_ip,ip_day_hour,ip_app,ip_app_os,ip_app_channel,app_chan,ip_day_chan_var_hour,ip_app_os_var_hour,ip_app_channel_var_day,ip_app_channel_mean_hour
0,11846,12,1,13,259,3,11,8,9,3,3,1,1,1,16721,,,,8
1,5147,19,0,0,347,3,11,8,9,2395,2395,91,30,61,6656,0.0,0.0,0.0,8
2,11782,9,1,8,127,3,11,8,9,19,19,5,2,3,22123,0.0,0.0,0.0,8
3,33867,35,1,19,21,3,11,8,9,16,16,1,1,1,131,,,,8
4,110589,3,1,23,280,3,11,8,9,231,231,38,2,25,8996,0.0,0.0,0.0,8


In [6]:
sub = pd.DataFrame()
sub['click_id'] = test['click_id']
test.drop('click_id', axis=1, inplace=True)

print('[{}] Start XGBoost Training'.format(time.time() - start_time))

[321.465833902] Start XGBoost Training


In [7]:
params = {'eta': 0.1, 
          'max_depth': 4, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'alpha':4,
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 99, 
          'scale_pos_weight': 150,
          'silent': True}

In [8]:
from sklearn.model_selection import KFold
num_folds = 3
seed = 7 
kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

valid_result = []
for train_index, test_index in kf.split(train_x):
    evals_result = {}
    watchlist = [(xgb.DMatrix(train_x.ix[train_index], train_y.ix[train_index]), 'train'), (xgb.DMatrix(train_x.ix[test_index], train_y.ix[test_index]), 'valid')]
    dtrain = xgb.DMatrix(train_x.ix[train_index], train_y.ix[train_index])
    model = xgb.train(params, dtrain, 30 , watchlist, maximize=True, verbose_eval=10,evals_result=evals_result,early_stopping_rounds = 10)
    valid_result.append( evals_result['valid']['auc'][-1])
print('[{}] Finish XGBoost Training'.format(time.time() - start_time))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


[0]	train-auc:0.878237	valid-auc:0.882876
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[10]	train-auc:0.938617	valid-auc:0.945867
[20]	train-auc:0.941719	valid-auc:0.947534
[29]	train-auc:0.951702	valid-auc:0.953533
[0]	train-auc:0.910001	valid-auc:0.902138
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[10]	train-auc:0.94233	valid-auc:0.933782
[20]	train-auc:0.950295	valid-auc:0.939488
[29]	train-auc:0.958447	valid-auc:0.945101
[0]	train-auc:0.90818	valid-auc:0.904568
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[10]	train-auc:0.938722	valid-auc:0.933822
[20]	train-auc:0.948601	valid-auc:0.942925
[29]	train-auc:0.959374	valid-auc:0.948537
[391.146960974] Finish XGBoost Training


In [9]:
from xgboost import plot_importance
import matplotlib.pyplot as plt
plot_importance(model)
plt.gcf().savefig('feature_importance_xgb.png')

In [10]:
mean_result = reduce(lambda x, y: x + y, valid_result)/len(valid_result)
print mean_result

0.949057


In [None]:
# deal with the test data
test=merge_feature(test)
test.drop(['click_time','datetime'], axis=1, inplace=True)
test.head()

feature count ip
feature count ip_day_hour: ip+time feature
feature ip_app 
feature ip_app_os
feature ip_app_chan
feature app_channel 
grouping by : ip_day_chl_var_hour
grouping by : ip_app_os_var_hour
grouping by : ip_app_channel_var_day


In [15]:
sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('xgb_sub.csv',index=False)