In [25]:
from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# Load the pre-engineered feature file and keep the rows labelled 0 through 10000 as the
# training sample (label-based .loc slicing includes both end points), then keep only the
# columns at positions 1..425, which drops the column at position 0.
train = pd.read_csv('Train000Features2.csv').loc[:10000].iloc[:, 1:426]

In [26]:
# Preview the first five rows of the training frame
train.head(n=5)

Unnamed: 0,click,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Monday,Morning,Day,...,week_site_2585f751fd,week_site_211fbe01fe,week_site_241fbe01fe,week_site_261fbe01fe,week_site_231fbe01fe,week_site_221fbe01fe,week_site_251fbe01fe,week_site_281fbe01fe,week_site_271fbe01fe,week_site_291fbe01fe
0,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# Build the hold-out sample from the same file: rows labelled 10001 through 13001
# (both end points included), restricted to the same column positions (1..425)
# that were kept for the training frame.
test = pd.read_csv('Train000Features2.csv').loc[10001:13001].iloc[:, 1:426]

In [28]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression as LR

# Define our cross validation technique: five contiguous, unshuffled folds.
# random_state is intentionally not passed: KFold only uses it when shuffle=True, and
# recent scikit-learn releases raise a ValueError when it is set while shuffle is False.
# For seeded, shuffled folds use KFold(n_splits=5, shuffle=True, random_state=42) instead
# (this changes the fold assignment and therefore every cross-validated score below).
cv = KFold(n_splits=5)

In [29]:
# Dimensions of the candidate-feature matrix (column positions 1..423 of train)
train.iloc[:, slice(1, 424)].shape

(10001, 423)

In [31]:
# Forward stepwise feature selection wrapped around a default logistic regression.
# Every candidate subset is scored with cross-validated negative log loss using the
# splitter defined in `cv`.
lr = LR()
sfs1 = SFS(
    estimator=lr,
    # Change the features number accordingly.
    k_features=423,
    forward=True,
    floating=False,
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=8,
    verbose=2,
)

# Run the selection over column positions 1..423 of train, with 'click' as the target
sfs1 = sfs1.fit(train.iloc[:, 1:424], train['click'])

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 423 out of 423 | elapsed:    4.6s finished

[2019-12-17 15:37:21] Features: 1/423 -- score: -0.44132475156889595[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 422 out of 422 | elapsed:    3.1s finished

[2019-12-17 15:37:24] Features: 2/423 -- score: -0.4361286742243684[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 421 out of 421 | elapsed:    3.5s finished

[2019-12-17 15:37:28] Features: 3/423 -- score: -0.43339814408482163[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 276 tasks    

[Parallel(n_jobs=8)]: Done 401 out of 401 | elapsed:    7.0s finished

[2019-12-17 15:39:14] Features: 23/423 -- score: -0.4187070513442402[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 276 tasks      | elapsed:    4.9s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    7.0s finished

[2019-12-17 15:39:21] Features: 24/423 -- score: -0.4182407424201517[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 276 tasks      | elapsed:    4.9s
[Parallel(n_jobs=8)]: Done 384 out of 399 | elapsed:    6.7s remaining:    0.3s
[Parallel(n_jobs=8)]: Done 399 out of 399 | elapsed:    7.0s finished

[2019-12-17 15:39:28] Features: 25/423 -- score: -0.41778802692743167[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 ta

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    3.4s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    8.1s
[Parallel(n_jobs=8)]: Done 379 out of 379 | elapsed:    8.7s finished

[2019-12-17 15:42:18] Features: 45/423 -- score: -0.41288622380304263[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    4.0s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    9.2s
[Parallel(n_jobs=8)]: Done 378 out of 378 | elapsed:    9.9s finished

[2019-12-17 15:42:28] Features: 46/423 -- score: -0.4127275084810088[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    4.1s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    9.4s
[Parallel(n_jobs=8)]: 

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    4.9s
[Parallel(n_jobs=8)]: Done 357 out of 357 | elapsed:   11.6s finished

[2019-12-17 15:46:16] Features: 67/423 -- score: -0.4105103913461777[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    4.6s
[Parallel(n_jobs=8)]: Done 356 out of 356 | elapsed:   10.9s finished

[2019-12-17 15:46:27] Features: 68/423 -- score: -0.41044604370583804[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    4.7s
[Parallel(n_jobs=8)]: Done 355 out of 355 | elapsed:   11.0s finished

[2019-12-17 15:46:38] Features: 69/423 -- score: -0.4103777481326677[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent 

[Parallel(n_jobs=8)]: Done 333 out of 333 | elapsed:  1.2min finished

[2019-12-17 16:02:59] Features: 91/423 -- score: -0.40930930198283083[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    6.3s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   31.6s
[Parallel(n_jobs=8)]: Done 332 out of 332 | elapsed:  1.2min finished

[2019-12-17 16:04:09] Features: 92/423 -- score: -0.4092817482223401[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    6.2s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   31.7s
[Parallel(n_jobs=8)]: Done 331 out of 331 | elapsed:  1.0min finished

[2019-12-17 16:05:11] Features: 93/423 -- score: -0.40925470532527974[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed

In [32]:
# Tabulate the selector's metric dictionary; after the transpose each dictionary entry is a row
output = pd.DataFrame.from_dict(sfs1.get_metric_dict()).T

# Take the feature names recorded on the row with the highest average CV score
# (the first such row if several tie) and materialise them as a plain list
features_fw = list(
    output.loc[output['avg_score'] == max(output['avg_score']), 'feature_names'].iloc[0]
)

# Training columns restricted to the selected features
features_fw_lr = train[features_fw]

In [33]:
# Report the columns retained for the logistic regression
print('Chosen features for logistic regression:', features_fw_lr.columns)

Chosen features for logistic regression: Index(['Day', 'Night', 'C1_1007', 'C1_1012', 'banner_pos_7',
       'site_category_28905ebd', 'site_category_335d28a8',
       'site_category_3e814130', 'site_category_70fb0e29',
       'site_category_72722551',
       ...
       'site_device_c4e18dd60bcabeaf', 'site_device_f3845767711ee120',
       'site_device_c4e18dd636b67a2a', 'site_device_f38457674ea23a13',
       'week_app_22ecad2386', 'week_app_23ecad2386', 'week_app_24ecad2386',
       'week_app_2692f5800b', 'week_app_2992f5800b', 'week_site_251fbe01fe'],
      dtype='object', length=101)


In [42]:
# Refit a default logistic regression on only the forward-selected columns to predict 'click'
lr_final = LR().fit(features_fw_lr, train['click'])

# Predicted class probabilities for the hold-out rows; column 1 (presumably the click=1 class)
# is scored against the true labels with log loss
yhat = pd.DataFrame(lr_final.predict_proba(test[features_fw]))
print(
    "Logistic Regression Out-Sample Performance:",
    log_loss(test['click'], np.asarray(yhat[1], dtype=np.float64)),
)

Logistic Regression Out-Sample Performance: 0.4127336197367702




In [4]:
# XGBoost classifier with fixed hyperparameters
booster = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.05,
    gamma=0.1,
    colsample_bytree=0.6,
    random_state=42,
)

# Forward stepwise selection around the booster, scored by cross-validated negative log loss.
# Only column positions 1..343 of train are offered as candidates in this run.
sfs2 = SFS(
    estimator=booster,
    # Change the features number accordingly.
    k_features=343,
    forward=True,
    floating=False,
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=8,
    verbose=2,
)

# Run the selection with 'click' as the target
sfs2 = sfs2.fit(train.iloc[:, 1:344], train['click'])

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   10.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done 343 out of 343 | elapsed:  5.3min finished

[2019-12-17 12:14:02] Features: 1/343 -- score: -0.4359453514248132[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   56.2s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  2.5min
[Parallel(n_jobs=8)]: Done 342 out of 342 | elapsed:  6.5min finished

[2019-12-17 12:20:34] Features: 2/343 -- score: -0.4304299298016354[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   52.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  4.6min
[Parallel(n_jobs=8)]: Done 341 out of 341 | elapsed:  8.6min finished

[2019-12-17 12:29:12] Features: 3/343 -- score: -0.42687391815

In [5]:
# Tabulate the XGBoost selector's metric dictionary; after the transpose each entry is a row
output = pd.DataFrame.from_dict(sfs2.get_metric_dict()).T

# Take the feature names recorded on the row with the highest average CV score
# (the first such row if several tie) and materialise them as a plain list
xgb_features_fw = list(
    output.loc[output['avg_score'] == max(output['avg_score']), 'feature_names'].iloc[0]
)

# Training columns restricted to the selected features
features_fw_xgb = train[xgb_features_fw]

# Report the retained columns
print('Chosen features for XGBoost:', features_fw_xgb.columns)

Chosen features for XGBoost: Index(['site_category_28905ebd', 'C16_250', 'C18_2', 'site_id_e151e245',
       'C21_71'],
      dtype='object')


In [46]:
# Refit the XGBoost classifier on only its forward-selected columns to predict 'click'
xgb_final = booster.fit(features_fw_xgb, train['click'])

# Predicted class probabilities for the hold-out rows; column 1 (presumably the click=1 class)
# is scored against the true labels with log loss
yhat_xgb = pd.DataFrame(xgb_final.predict_proba(test[xgb_features_fw]))
print(
    "XGB Out-Sample Performance:",
    log_loss(test['click'], np.asarray(yhat_xgb[1], dtype=np.float64)),
)

XGB Out-Sample Performance: 0.4129710191817372


In [81]:
############################################## Hyperparameter tuning with XGBoost ###########################################
# We use GridSearchCV from scikit-learn to tune learning_rate and max_depth by cross-validated
# negative log loss, and couple that with stepwise-forward-selected features in order to improve
# the model as much as possible.
# The search object gets its own name (xgb_grid) rather than rebinding `booster`: `booster` is the
# XGBClassifier used by the forward-selection cell and the plain XGBoost refit cell, and
# overwriting it with a GridSearchCV would make re-running either of those cells nest a full
# grid search inside them.
xgb_grid = GridSearchCV(estimator=xgb.XGBClassifier(colsample_bytree=0.6, gamma=0.1, random_state=42, n_estimators=200),
                        param_grid=[{'learning_rate': [0.02, 0.05, 0.1],
                                     'max_depth': [3, 5, 7, 9, 11, 13]}],
                        cv=cv,
                        n_jobs=4,
                        scoring='neg_log_loss')

# Fit the grid search on the selected features.
# NOTE(review): features_fw_lr / features_fw are the columns chosen by the logistic-regression
# forward selection, not by the XGBoost selection (features_fw_xgb) - confirm this is intended.
xgb_final = xgb_grid.fit(features_fw_lr, train['click'])

# Calculate y hat to then measure the model performance on test data, using log loss as our function
yhat_xgb = pd.DataFrame(xgb_final.predict_proba(test[features_fw]))
print("XGB Out-Sample Performance:", log_loss(test['click'], np.asarray(yhat_xgb.iloc[:, 1], dtype=np.float64)))

XGB Out-Sample Performance: 0.4113717111693126


In [None]:
# LightGBM: a gradient boosting framework built on tree-based learners, designed to be
# distributed and efficient.
# Documentation: https://lightgbm.readthedocs.io/en/latest/
# The classifier below uses no depth limit (max_depth=-1) and a learning rate of 0.1.
lgbm = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,
    num_leaves=31,
    random_state=42,
)

# Forward stepwise selection around the LightGBM model, scored by cross-validated negative log loss.
# Only column positions 1..343 of train are offered as candidates in this run.
sfs3 = SFS(
    estimator=lgbm,
    # Change the features number accordingly.
    k_features=343,
    forward=True,
    floating=False,
    scoring='neg_log_loss',
    cv=cv,
    n_jobs=8,
    verbose=2,
)

# Run the selection with 'click' as the target
sfs3 = sfs3.fit(train.iloc[:, 1:344], train['click'])

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    4.6s
[Parallel(n_jobs=8)]: Done 343 out of 343 | elapsed:    9.4s finished

[2019-12-17 17:03:14] Features: 1/343 -- score: -0.4413258046814545[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    4.8s
[Parallel(n_jobs=8)]: Done 342 out of 342 | elapsed:   11.0s finished

[2019-12-17 17:03:25] Features: 2/343 -- score: -0.4361569598657439[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    8.1s
[Parallel(n_jobs=8)]: Done 341 out of 341 | elapsed:   15.2s finished

[2019-12-17 17:03:41] Features: 3/343 -- score: -0.43334230413

In [None]:
# Assign the LightGBM selection output to a DataFrame for increased clarity and manipulation
output = pd.DataFrame.from_dict(sfs3.get_metric_dict()).T

# Retrieve the feature_names of the best performing iteration of the LightGBM selection
lgb_features_fw = output[output.avg_score==max(output.avg_score)].feature_names

# Create a list of the selected features
lgb_features_fw = list(list(lgb_features_fw)[0])

# Select the significant features from the train dataset.
# This must index with the LightGBM list (lgb_features_fw); the previous code reused
# xgb_features_fw here, so the LightGBM model was handed XGBoost's feature subset.
features_fw_lgb = train[lgb_features_fw]

# Print significant features (label corrected from "XGBoost" to "LightGBM")
print('Chosen features for LightGBM:', features_fw_lgb.columns)

In [None]:
# Fit the LightGBM model on the features chosen by its own stepwise forward selection to predict 'click'.
# The training columns are indexed with lgb_features_fw directly so that the model is trained on
# exactly the same columns it is asked to predict on below.
lgb_final = lgbm.fit(train[lgb_features_fw], train['click'])

# Calculate y hat to then measure the model performance on test data, using log loss as our function
yhat_lgb = pd.DataFrame(lgb_final.predict_proba(test[lgb_features_fw]))
print("LightGBM Out-Sample Performance:", log_loss(test['click'], np.asarray(yhat_lgb.iloc[:, 1], dtype=np.float64)))

In [82]:
##################################################### Ensemble Models ################################################
# Blend the two models by averaging, row by row, the column-1 probabilities from the logistic
# regression (yhat) and the XGBoost model (yhat_xgb), then score the blend with log loss
yhat_ensembled = 0.5 * (yhat.iloc[:, 1] + yhat_xgb.iloc[:, 1])
print("Ensembled Models Out-Sample Performance:", log_loss(test['click'], yhat_ensembled))

Ensembled Models Out-Sample Performance: 0.41081692013874477
