# XGBoost implementation

## Initialisation

In [1]:
import os
from datetime import date

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss

try:
    from sklearn import cross_validation
except ImportError:
    # scikit-learn 0.20 removed sklearn.cross_validation; model_selection provides
    # the same train_test_split, so bind it under the name the notebook uses.
    from sklearn import model_selection as cross_validation

%matplotlib inline



In [2]:
# Today's date as an ISO "YYYY-MM-DD" string, used to stamp log and submission file names.
todayDate = date.today().isoformat()

In [3]:
# Open today's log file in ../logs/ (relative to the working directory).
filename = 'logs/' + todayDate + '-xgb_log.txt'
log_path = os.path.join(os.getcwd(), '..', filename)

# Create the directory that will actually hold the file. The previous check
# looked at 'logs/' under the working directory, while the file itself is
# opened one level up, so open() could fail on a missing '../logs' folder.
log_dir = os.path.dirname(log_path)
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# The handle stays open on purpose: a later cell writes the scores and closes it.
f = open(log_path, 'w')

## Data preparation

In [4]:
# Read the training, tournament and example-prediction CSVs from ../data.
train = pd.read_csv(os.getcwd() + '/../data/numerai_training_data.csv')
test = pd.read_csv(os.getcwd() + '/../data/numerai_tournament_data.csv')
example = pd.read_csv(os.getcwd() + '/../data/example_predictions.csv')

# Training set: 'target' is the label, every other column is a feature.
y = train['target']
X = train.drop('target', axis=1)

# Tournament set: 't_id' identifies the rows and is kept out of the features.
ID = test['t_id']
Xtest = test.drop('t_id', axis=1)

In [5]:
# Mean of the target column; for 0/1 labels this is the share of positives.
y.sum()/y.size

0.5051702657807309

In [7]:
# (rows, columns) of the training feature frame.
X.shape

(96320, 21)

In [9]:
# (rows, columns) of the tournament feature frame; the column count should match X.
Xtest.shape

(135270, 21)

In [14]:
# Column dtypes and non-null counts of the training features.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96320 entries, 0 to 96319
Data columns (total 21 columns):
feature1     96320 non-null float64
feature2     96320 non-null float64
feature3     96320 non-null float64
feature4     96320 non-null float64
feature5     96320 non-null float64
feature6     96320 non-null float64
feature7     96320 non-null float64
feature8     96320 non-null float64
feature9     96320 non-null float64
feature10    96320 non-null float64
feature11    96320 non-null float64
feature12    96320 non-null float64
feature13    96320 non-null float64
feature14    96320 non-null float64
feature15    96320 non-null float64
feature16    96320 non-null float64
feature17    96320 non-null float64
feature18    96320 non-null float64
feature19    96320 non-null float64
feature20    96320 non-null float64
feature21    96320 non-null float64
dtypes: float64(21)
memory usage: 15.4 MB


## Train-validation split

In [6]:
# Keep 15% of the rows aside for validation; the fixed seed makes the split repeatable.
Xtr, Xval, ytr, yval = cross_validation.train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

## XGBoost

In [10]:
# DMatrix containers for the full training set (with labels) and the tournament set.
dtrain = xgb.DMatrix(data=X, label=y)
dtest = xgb.DMatrix(data=Xtest)

In [11]:
# DMatrix containers for the train/validation split. The validation matrix
# carries no labels; yval is compared against the predictions separately.
dtrain_tr = xgb.DMatrix(data=Xtr, label=ytr)
dtest_val = xgb.DMatrix(data=Xval)

### Model 1: 100 boosting rounds

In [16]:
# Booster settings: binary logistic objective, evaluated with log loss.
param = dict(eval_metric='logloss', objective='binary:logistic')
# Number of boosting rounds for this run.
num_round = 100

In [17]:
# Fit model 1 on the training part of the split.
xgb1 = xgb.train(params=param, dtrain=dtrain_tr, num_boost_round=num_round)

In [23]:
# Model 1 predictions on the training and validation matrices.
xgb1_pred_tr, xgb1_pred_val = xgb1.predict(dtrain_tr), xgb1.predict(dtest_val)

In [24]:
# Score model 1 on both splits, record the results in the log file, then close it.
xgb1_logloss_train = log_loss(ytr, xgb1_pred_tr)
xgb1_logloss_val = log_loss(yval, xgb1_pred_val)
f.writelines([
    'Train logloss: ' + str(xgb1_logloss_train) + '\n',
    'Validation logloss: ' + str(xgb1_logloss_val) + '\n',
])
f.close()

### Model 2: a single boosting round

In [54]:
# Same booster settings as before: binary logistic objective, log-loss metric.
param = dict(eval_metric='logloss', objective='binary:logistic')
# This run uses a single boosting round.
num_round = 1

In [55]:
# Fit model 2 on the training part of the split.
xgb2 = xgb.train(params=param, dtrain=dtrain_tr, num_boost_round=num_round)

In [56]:
# Model 2 predictions on the training and validation matrices.
xgb2_pred_tr, xgb2_pred_val = xgb2.predict(dtrain_tr), xgb2.predict(dtest_val)

In [57]:
# Score model 2 on both splits and show the results inline.
xgb2_logloss_train = log_loss(ytr, xgb2_pred_tr)
xgb2_logloss_val = log_loss(yval, xgb2_pred_val)

train_msg = 'Train logloss: ' + str(xgb2_logloss_train) + '\n'
val_msg = 'Validation logloss: ' + str(xgb2_logloss_val) + '\n'
print(train_msg)
print(val_msg)

Train logloss: 0.6907986937

Validation logloss: 0.692806807179



Refit using all data

In [58]:
# Refit on the full training set, reusing the most recently defined param / num_round.
xgb_bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

In [59]:
# Predictions of the refitted model on the tournament data.
xgb_bst_pred = xgb_bst.predict(data=dtest)

In [60]:
# Display the tournament predictions (the array repr is abbreviated automatically).
xgb_bst_pred

array([ 0.50418621,  0.48460144,  0.49575031, ...,  0.51249832,
        0.49575031,  0.51249832], dtype=float32)

## Submission

In [61]:
# Submission frame: one 'probability' column, indexed by the tournament row ids.
# Column labels are passed as a list; the previous set literal is unordered and
# is rejected as column labels by recent pandas releases.
xgb_submit = pd.DataFrame(xgb_bst_pred, index=ID, columns=['probability'])

In [62]:
# Write the submission to ../output/, creating the folder first so that
# to_csv does not fail on a fresh checkout (same guard as used for the log folder).
submit_path = '../output/' + todayDate + '-xgb_submit2.csv'
submit_dir = os.path.dirname(submit_path)
if not os.path.exists(submit_dir):
    os.makedirs(submit_dir)
xgb_submit.to_csv(submit_path)