In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb



In [2]:
# read_file

path = 'dataset/'
start_time = time.time()
train = pd.read_csv(path+"train.csv", skiprows=160000000, nrows=1000000)
test = pd.read_csv(path+"test.csv")
train.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']
print('[{}] Finished loading data'.format(time.time() - start_time))
train.head()

[50.6012411118] Finished loading data


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,11846,12,1,13,259,2017-11-09 08:17:26,,0
1,5147,19,0,0,347,2017-11-09 08:17:26,,0
2,11782,9,1,8,127,2017-11-09 08:17:26,,0
3,33867,35,1,19,21,2017-11-09 08:17:26,2017-11-09 09:05:37,1
4,110589,3,1,23,280,2017-11-09 08:17:26,,0


In [3]:
def dataPreProcessTime(df):
    df['datetime'] = pd.to_datetime(df['click_time'])
    df['dow']      = df['datetime'].dt.dayofweek.astype(int)
    df['month']    = df['datetime'].dt.month.astype(int)
    df['hour']     = df['datetime'].dt.hour.astype(int)
    df['day']      = df['datetime'].dt.day.astype(int)
    #df.drop(['click_time','datetime'], axis=1, inplace=True)
    return df
def feature_clicksofip(df):
    ip_count = df.groupby(['ip'])['os'].count().reset_index()
    ip_count.columns = ['ip', 'clicks_by_ip']
    merge = pd.merge(df, ip_count, on='ip', how='left', sort=False)
    merge['clicks_by_ip'] = merge['clicks_by_ip'].astype('uint16')
    #merge.drop('ip', axis=1, inplace=True)
    return merge

def feature_ipdayhour(df):
    ipdayhour = df[['ip','day','hour','channel']].groupby(by=['ip','day','hour'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_day_hour'})
    train_df = df.merge(ipdayhour, on=['ip','day','hour'], how='left')
    return train_df

def feature_ipapp(df):
    ip_app = df[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app'})
    train_df = df.merge(ip_app, on=['ip','app'], how='left')
    return train_df

def feature_ipappos(df):
    ipappos = df[['ip','app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={'channel': 'ip_app_os'})
    train_df = df.merge(ipappos, on=['ip','app', 'os'], how='left')
    return train_df

def merge_feature(df):
    train = dataPreProcessTime(df)
    train = feature_clicksofip(train)
    train = feature_ipdayhour(train)
    train = feature_ipapp(train)
    train = feature_ipappos(train)
    return train

train=merge_feature(train)
train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,datetime,dow,month,hour,day,clicks_by_ip,ip_day_hour,ip_app,ip_app_os
0,11846,12,1,13,259,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,11,8,9,3,3,1,1
1,5147,19,0,0,347,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,11,8,9,2395,2395,91,30
2,11782,9,1,8,127,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,11,8,9,19,19,5,2
3,33867,35,1,19,21,2017-11-09 08:17:26,2017-11-09 09:05:37,1,2017-11-09 08:17:26,3,11,8,9,16,16,1,1
4,110589,3,1,23,280,2017-11-09 08:17:26,,0,2017-11-09 08:17:26,3,11,8,9,231,231,38,2


In [4]:
train_y = train['is_attributed']
# add inplace=True 直接替换
train_x = train.drop(['is_attributed', 'attributed_time','ip','click_time','datetime'], axis=1)
train_x.head()

Unnamed: 0,app,device,os,channel,dow,month,hour,day,clicks_by_ip,ip_day_hour,ip_app,ip_app_os
0,12,1,13,259,3,11,8,9,3,3,1,1
1,19,0,0,347,3,11,8,9,2395,2395,91,30
2,9,1,8,127,3,11,8,9,19,19,5,2
3,35,1,19,21,3,11,8,9,16,16,1,1
4,3,1,23,280,3,11,8,9,231,231,38,2


In [5]:
sub = pd.DataFrame()
sub['click_id'] = test['click_id']
test.drop('click_id', axis=1, inplace=True)

print('[{}] Start XGBoost Training'.format(time.time() - start_time))

[61.7221109867] Start XGBoost Training


In [6]:
params = {'eta': 0.1, 
          'max_depth': 4, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'alpha':4,
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 99, 
          'scale_pos_weight': 150,
          'silent': True}

In [7]:
from sklearn.model_selection import KFold
num_folds = 3
seed = 7 
kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

valid_result = []
for train_index, test_index in kf.split(train_x):
    evals_result = {}
    watchlist = [(xgb.DMatrix(train_x.ix[train_index], train_y.ix[train_index]), 'train'), (xgb.DMatrix(train_x.ix[test_index], train_y.ix[test_index]), 'valid')]
    dtrain = xgb.DMatrix(train_x.ix[train_index], train_y.ix[train_index])
    model = xgb.train(params, dtrain, 30 , watchlist, maximize=True, verbose_eval=10,evals_result=evals_result,early_stopping_rounds = 10)
    valid_result.append( evals_result['valid']['auc'][-1])
print('[{}] Finish XGBoost Training'.format(time.time() - start_time))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


[0]	train-auc:0.816175	valid-auc:0.825637
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[10]	train-auc:0.93783	valid-auc:0.944474
[20]	train-auc:0.939696	valid-auc:0.946379
[29]	train-auc:0.944666	valid-auc:0.950579
[0]	train-auc:0.868059	valid-auc:0.859855
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[10]	train-auc:0.943815	valid-auc:0.935
[20]	train-auc:0.947927	valid-auc:0.936869
[29]	train-auc:0.950042	valid-auc:0.938138
[0]	train-auc:0.858342	valid-auc:0.844812
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 10 rounds.
[10]	train-auc:0.942426	valid-auc:0.937725
[20]	train-auc:0.947164	valid-auc:0.940798
[29]	train-auc:0.951574	valid-auc:0.942178
[105.892889023] Finish XGBoost Training


In [8]:
from xgboost import plot_importance
import matplotlib.pyplot as plt
plot_importance(model)
plt.gcf().savefig('feature_importance_xgb.png')

In [9]:
mean_result = reduce(lambda x, y: x + y, valid_result)/len(valid_result)
print mean_result

0.943631666667


In [14]:
# deal with the test data
#test=merge_feature(test)
test.drop(['ip','click_time','datetime'], axis=1, inplace=True)
test.head()

Unnamed: 0,app,device,os,channel,dow,month,hour,day,clicks_by_ip,ip_day_hour,ip_app,ip_app_os
0,9,1,3,107,4,11,4,10,91,34,28,1
1,9,1,3,466,4,11,4,10,2083,403,289,5
2,21,1,19,128,4,11,4,10,2135,229,312,24
3,15,1,13,111,4,11,4,10,1201,239,42,23
4,12,1,13,328,4,11,4,10,208,60,24,7


In [15]:
sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('xgb_sub.csv',index=False)