In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from xgboost import XGBRegressor
import gc



In [2]:
traincolumns = ['ip','app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
train = pd.read_csv('datasets/train.csv',skiprows=range(1,149903891), nrows=10000000, usecols = traincolumns)
test = pd.read_csv('datasets/test.csv')
sub = pd.DataFrame()
sub['click_id'] = test['click_id']
train = train.dropna()

In [3]:
def preprocessClicktime(df):
    # Make some new features with click_time column
    df["datetime"] = pd.to_datetime(df['click_time'])
    df['dow']      = df['datetime'].dt.dayofweek
    df['woy']      = df['datetime'].dt.week
    df['day']      = df['datetime'].dt.day
    df['hour']     = df['datetime'].dt.hour
    df['minute']   = df['datetime'].dt.minute
    df['second']   = df['datetime'].dt.second
    return df

In [4]:
train = preprocessClicktime(train)
train = train.drop(['click_time','datetime'],axis=1)
test = preprocessClicktime(test)
test = test.drop(['click_id','click_time','datetime'],axis=1)

y = train['is_attributed']
train = train.drop(['is_attributed'], axis=1)

In [5]:
# Some feature engineering
nrow_train = train.shape[0]
merge = pd.concat([train, test])
del train, test
gc.collect()

276

In [6]:
# Count the number of clicks by ip
ip_count = merge.groupby('ip')['app'].count().reset_index()
ip_count.columns = ['ip', 'clicks_by_ip']
merge = pd.merge(merge, ip_count, on='ip', how='left', sort=False)
merge.drop('ip', axis=1, inplace=True)

In [7]:
train = merge[:nrow_train]
test = merge[nrow_train:]
del merge
gc.collect()

7

Some good defaults (you'll be better able to select settings to try once you understand how XGBoost works):

In [25]:
xgb_param_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.0, 1.0, 1.5],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 6, 7, 8, 10]
        }


xgb_param_grid_large = {
        'learning_rate': [0.1, 0.05, 0.2],
        'n_estimators': [50, 100, 500, 600],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.0, 1.0, 1.5],
        'subsample': [0.6, 1.0],
        'colsample_bytree': [0.5, 0.6, 0.8],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'reg_lambda': [1, ]
        }


Let's try them out on our regression task:

In [26]:
# To place it on the GPU
xgb_reg = XGBRegressor(tree_method='gpu_hist', predictor='gpu_predictor')
# On the CPU
#xgb_reg = XGBRegressor()

In [33]:
from sklearn.model_selection import RandomizedSearchCV

r_xgb_reg = RandomizedSearchCV(xgb_reg, xgb_param_grid_large, n_iter=10, verbose=20)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(train,y, random_state=42)

In [35]:
r_xgb_reg.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] subsample=1.0, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=7, learning_rate=0.05, gamma=0.0, colsample_bytree=0.5 
[CV]  subsample=1.0, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=7, learning_rate=0.05, gamma=0.0, colsample_bytree=0.5, score=0.41112952659061336, total=  33.5s
[CV] subsample=1.0, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=7, learning_rate=0.05, gamma=0.0, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.3s remaining:    0.0s


[CV]  subsample=1.0, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=7, learning_rate=0.05, gamma=0.0, colsample_bytree=0.5, score=0.41342581760577646, total=  33.3s
[CV] subsample=1.0, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=7, learning_rate=0.05, gamma=0.0, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  subsample=1.0, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=7, learning_rate=0.05, gamma=0.0, colsample_bytree=0.5, score=0.4154350341283367, total=  33.5s
[CV] subsample=1.0, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=1.0, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.8min remaining:    0.0s


[CV]  subsample=1.0, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=1.0, colsample_bytree=0.8, score=0.41309925117544843, total=  34.0s
[CV] subsample=1.0, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=1.0, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.4min remaining:    0.0s


[CV]  subsample=1.0, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=1.0, colsample_bytree=0.8, score=0.4173960811049907, total=  34.3s
[CV] subsample=1.0, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=1.0, colsample_bytree=0.8 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.0min remaining:    0.0s


[CV]  subsample=1.0, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=6, learning_rate=0.05, gamma=1.0, colsample_bytree=0.8, score=0.418120259650414, total=  34.1s
[CV] subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=10, max_depth=7, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.6min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=10, max_depth=7, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.40022754434651997, total=  30.4s
[CV] subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=10, max_depth=7, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.1min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=10, max_depth=7, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.4028941475500455, total=  30.3s
[CV] subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=10, max_depth=7, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.6min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=10, max_depth=7, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.4038258402460403, total=  30.5s
[CV] subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=9, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.2min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=9, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.3741123670079538, total=  45.8s
[CV] subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=9, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  6.0min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=9, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.3811332222459698, total=  45.1s
[CV] subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=9, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:  6.8min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=1, max_depth=9, learning_rate=0.1, gamma=1.5, colsample_bytree=0.6, score=0.3837608439174919, total=  45.8s
[CV] subsample=0.6, reg_lambda=1, n_estimators=100, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=1.0, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  7.5min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=100, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=1.0, colsample_bytree=0.6, score=0.37849848097867056, total=   7.2s
[CV] subsample=0.6, reg_lambda=1, n_estimators=100, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=1.0, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:  7.7min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=100, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=1.0, colsample_bytree=0.6, score=0.3806196655007582, total=   7.3s
[CV] subsample=0.6, reg_lambda=1, n_estimators=100, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=1.0, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:  7.8min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=100, min_child_weight=5, max_depth=4, learning_rate=0.2, gamma=1.0, colsample_bytree=0.6, score=0.3793501117627993, total=   7.2s
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  8.0min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.5, score=0.39703099278891246, total=  38.6s
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:  8.7min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.5, score=0.40048964450714086, total=  38.3s
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.5 


[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:  9.3min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=1, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.5, score=0.4005689818252508, total=  38.4s
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 10.0min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.6, score=0.39621380924854843, total=  36.8s
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed: 10.6min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.6, score=0.3995179023530553, total=  36.5s
[CV] subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.6 
[CV]  subsample=0.6, reg_lambda=1, n_estimators=600, min_child_weight=5, max_depth=7, learning_rate=0.1, gamma=0.0, colsample_bytree=0.6, score=0.4012588147558674, total=  36.9s
[CV] subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=5, max_depth=8, learning_rate=0.05, gamma=0.0, colsample_bytree=0.8 
[CV]  subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=5, max_depth=8, learning_rate=0.05, gamma=0.0, colsample_bytree=0.8, score=0.3996384795930832, total=  34.3s
[CV] subsample=0.6, reg_lambda=1, n_estimators=500, min_child_weight=5, max_depth=8, learning_rate=0.05, gamma=0.0, colsample_bytree=0.8 
[CV]  subsample=0.6, reg_lambda=1, n_estimators=500,

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 14.5min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear',
       predictor='gpu_predictor', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, tree_method='gpu_hist'),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'learning_rate': [0.1, 0.05, 0.2], 'n_estimators': [50, 100, 500, 600], 'min_child_weight': [1, 5, 10], 'gamma': [0.0, 1.0, 1.5], 'subsample': [0.6, 1.0], 'colsample_bytree': [0.5, 0.6, 0.8], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'reg_lambda': [1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=20)

In [36]:
r_xgb_reg.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=1.0, learning_rate=0.05,
       max_delta_step=0, max_depth=6, min_child_weight=1, missing=None,
       n_estimators=600, n_jobs=1, nthread=None, objective='reg:linear',
       predictor='gpu_predictor', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1.0, tree_method='gpu_hist')

In [37]:
r_xgb_reg.score(X_test, y_test)

0.41581337166763044

In [40]:
xgb_reg = r_xgb_reg.best_estimator_

In [41]:
xgb_reg.save_model('0001.model')

In [43]:
bst = xgb.Booster({'nthread': 4})  # init model
bst.load_model('0001.model')  # load data

In [86]:
xgtest = xgb.DMatrix(test.values)

In [88]:
bst.predict(xgtest)