In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb



In [4]:
path = 'dataset/'

In [5]:
def dataPreProcessTime(df):
    df['click_time'] = pd.to_datetime(df['click_time']).dt.date
    df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    return df

In [6]:
start_time = time.time()

In [7]:
train = pd.read_csv(path+"train.csv", skiprows=160000000, nrows=40000000)
test = pd.read_csv(path+"test.csv")
train.columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'attributed_time', 'is_attributed']

print('[{}] Finished loading data'.format(time.time() - start_time))

  interactivity=interactivity, compiler=compiler, result=result)


[81.2262809277] Finished loading data


In [8]:
train = dataPreProcessTime(train)
test = dataPreProcessTime(test)

y = train['is_attributed']
train.drop(['is_attributed', 'attributed_time'], axis=1, inplace=True)

In [9]:
sub = pd.DataFrame()
sub['click_id'] = test['click_id']
test.drop('click_id', axis=1, inplace=True)

print('[{}] Start XGBoost Training'.format(time.time() - start_time))

[509.827786922] Start XGBoost Training


In [10]:
params = {'eta': 0.1, 
          'max_depth': 4, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'alpha':4,
          'objective': 'binary:logistic', 
          'eval_metric': 'auc', 
          'random_state': 99, 
          'scale_pos_weight': 150,
          'silent': True}
          
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, random_state=99)

In [11]:
watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
model = xgb.train(params, xgb.DMatrix(x1, y1), 270, watchlist, maximize=True, verbose_eval=10)

print('[{}] Finish XGBoost Training'.format(time.time() - start_time))

[0]	train-auc:0.936604	valid-auc:0.932967
[10]	train-auc:0.954738	valid-auc:0.95207
[20]	train-auc:0.957782	valid-auc:0.954578
[30]	train-auc:0.957866	valid-auc:0.95454
[40]	train-auc:0.96087	valid-auc:0.957605
[50]	train-auc:0.962488	valid-auc:0.959326
[60]	train-auc:0.964568	valid-auc:0.961497
[70]	train-auc:0.965797	valid-auc:0.96279
[80]	train-auc:0.966744	valid-auc:0.963732
[90]	train-auc:0.967409	valid-auc:0.964292
[100]	train-auc:0.967917	valid-auc:0.964733
[110]	train-auc:0.968352	valid-auc:0.965054
[120]	train-auc:0.968835	valid-auc:0.965434
[130]	train-auc:0.969139	valid-auc:0.96565
[140]	train-auc:0.96939	valid-auc:0.965807
[150]	train-auc:0.969644	valid-auc:0.96598
[160]	train-auc:0.969848	valid-auc:0.966119
[170]	train-auc:0.970068	valid-auc:0.966258
[180]	train-auc:0.970262	valid-auc:0.966374
[190]	train-auc:0.97041	valid-auc:0.966493
[200]	train-auc:0.970609	valid-auc:0.966572
[210]	train-auc:0.970733	valid-auc:0.966665
[220]	train-auc:0.970878	valid-auc:0.966754
[230]	t

In [12]:
sub['is_attributed'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
sub.to_csv('xgb_sub.csv',index=False)