In [1]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
# Directory prefix of the input CSVs; it is concatenated directly with the
# file name, so the trailing slash is required.
path = './input/'

# Both files share the first six columns; only the last column differs
# (the label in train, the submission id in test).
train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_columns = train_columns[:-1] + ['click_id']

# Explicit narrow unsigned dtypes for the integer columns. 'click_time' is
# not listed, so pandas infers its dtype on load.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Read train first, then test, restricting each file to the columns above.
train, test = (
    pd.read_csv(path + csv_name, usecols=wanted, dtype=dtypes)
    for csv_name, wanted in (('train.csv', train_columns), ('test.csv', test_columns))
)

In [3]:
# Sanity check: show (rows, columns) of the train frame, then the test frame.
print(*(frame.shape for frame in (train, test)))

(184903890, 7) (18790469, 7)


In [4]:
# Training labels: pop() returns the column and removes it from `train`
# in a single step.
y_train = train.pop('is_attributed').values
# Submission ids. Take an explicit copy so that adding the prediction column
# later writes to an independent frame instead of a slice of `test`
# (which would raise pandas' SettingWithCopyWarning).
sub = test[['click_id']].copy()
del test['click_id']
# Stack train on top of test so features are computed over both at once.
# nrow_train records where to cut the combined frame apart again.
nrow_train = train.shape[0]
# ignore_index gives the combined frame a fresh 0..N-1 index instead of
# repeating the test row labels, which would otherwise duplicate train's.
data = pd.concat([train, test], axis=0, ignore_index=True)
# Drop the originals and reclaim their memory before feature engineering.
del train, test
gc.collect()

0

In [5]:
# Frequency features: for every id-like column, the number of rows in the
# combined frame that share the same value (counted over non-null
# 'click_time' entries).
for key in ('ip', 'app', 'device', 'os', 'channel'):
    data[key + '_cnts'] = data.groupby([key])['click_time'].transform('count')

# Convert the timestamp column to datetimes, then derive calendar features
# in the order: day of month, hour of day, day of week.
data['click_time'] = pd.to_datetime(data['click_time'])
for feature, attribute in (('days', 'day'),
                           ('hours_in_day', 'hour'),
                           ('day_of_week', 'dayofweek')):
    data[feature] = getattr(data['click_time'].dt, attribute)

# Cut the combined frame back apart: the first nrow_train rows are the
# training set, the remainder is the test set.
train, test = data.iloc[:nrow_train], data.iloc[nrow_train:]
del data
gc.collect()

13

In [6]:
# Model inputs: every column except the raw timestamp.
cols = [name for name in train.columns if name != 'click_time']

In [8]:
# Booster parameters for the native xgb.train API.
params = {
    'eta': 0.2,
    # NOTE(review): max_leaves is documented for loss-guided growth; it may
    # have no effect with the tree method selected here -- confirm.
    'max_leaves': 2**9 - 1,
    'max_depth': 9,
    'subsample': 0.7,
    'colsample_bytree': 0.9,
    'objective': 'binary:logistic',
    'scale_pos_weight': 9,
    'eval_metric': 'auc',
    # 'nthread' and 'seed' are the native-API parameter names. The
    # scikit-learn wrapper names ('n_jobs', 'random_state') are not
    # guaranteed to be honoured by xgb.train on every XGBoost release.
    'nthread': 24,
    'seed': 2020,
    # Kept for the XGBoost version this notebook was run with; newer
    # releases replace it with 'verbosity'.
    'silent': True,
}

# Hold out 20% of the training rows (random split) to monitor AUC and drive
# early stopping.
trn_x, val_x, trn_y, val_y = train_test_split(
    train[cols], y_train, test_size=0.2, random_state=2020
)
dtrain = xgb.DMatrix(trn_x, label=trn_y)
dvalid = xgb.DMatrix(val_x, label=val_y)
# The DMatrix objects hold the data now; free the pandas/numpy splits.
del trn_x, val_x, trn_y, val_y
gc.collect()

# The last entry of the watchlist ('valid') is the one used for early stopping.
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=watchlist,
    early_stopping_rounds=20,
    verbose_eval=10,
)

[21:32:02] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
[0]	train-auc:0.963463	valid-auc:0.962897
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[10]	train-auc:0.969029	valid-auc:0.968335
[20]	train-auc:0.971793	valid-auc:0.970815
[30]	train-auc:0.974514	valid-auc:0.972974
[40]	train-auc:0.976252	valid-auc:0.973998
[50]	train-auc:0.977784	valid-auc:0.974714
[60]	train-auc:0.978459	valid-auc:0.975059
[70]	train-auc:0.979047	valid-auc:0.975191
[80]	train-auc:0.979595	valid-auc:0.975238
[90]	train-auc:0.980047	valid-auc:0.975299
[100]	train-auc:0.980454	valid-auc:0.975339
[110]	train-auc:0.980931	valid-auc:0.97536
[120]	train-auc:0.981351	valid-auc:0.975342
[130]	train-auc:0.981737	valid-auc:0.975344
Stopping. Best iteration:
[113]	train-auc:0.981055	valid-auc:0.975366



In [10]:
# Score the test rows using only the trees up to the best early-stopping
# iteration.
dtest = xgb.DMatrix(test[cols])
predictions = model.predict(dtest, ntree_limit=model.best_ntree_limit)

# assign() returns a new frame with the extra column, so nothing is written
# into a slice of `test` (avoids SettingWithCopyWarning). The former
# placeholder assignment of None was overwritten immediately and is dropped.
sub = sub.assign(is_attributed=predictions)
sub.to_csv('talkingdata_baseline.csv', index=False)