In [1]:
import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [2]:
import pandas as pd
import numpy as np
from math import ceil
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb

In [3]:
#load data
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')

y_target = pd.read_csv('train.csv')['TARGET']
train_id = pd.read_csv('train.csv')['ID']
test_id = pd.read_csv('test.csv')['ID']

In [4]:
# split data into training and testing set(50% to train set)
X_train, X_test, y_train, y_test = train_test_split(train, y_target, test_size=0.5, random_state=290977)

In [5]:
## Feature selection
clf = ExtraTreesClassifier(random_state=1729)
selector = clf.fit(X_train, y_train)

## recursive feature elimination from ExtraTrees Classifier
fs = SelectFromModel(selector, prefit=True)

# transform data for model validation
X_train_sm = fs.transform(X_train)
X_test_sm = fs.transform(X_test)

# also transform data for submission
test_sm = fs.transform(test)

In [6]:
# check how many features were selected
X_train_sm.shape, X_test_sm.shape, test_sm.shape

((38010, 36), (38010, 36), (75818, 36))

In [7]:
## Train Model

# generate sparse matrices
dtrain = xgb.DMatrix(X_train_sm, label = y_train)
dtest = xgb.DMatrix(X_test_sm, label = y_test)

dtest_sub = xgb.DMatrix(test_sm)

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.792781      0.015975        0.811858       0.002268
1       0.808090      0.013118        0.828260       0.001705
2       0.823661      0.010825        0.841187       0.002971
3       0.823754      0.009578        0.847781       0.001819
4       0.824687      0.010644        0.853044       0.002105
5       0.822878      0.009528        0.856971       0.002095
6       0.821971      0.009129        0.860796       0.002468
7       0.818319      0.007070        0.865071       0.002276
8       0.819049      0.007861        0.869082       0.002527
9       0.817260      0.008214        0.872492       0.002246
Best Interaction:  4
building model and testing on test set
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039490	train-error:0.039674
[2]	eval-error:0.039569	train-error:0.039726
[3]	eval-error:0.039305	train-error:0.039621
[4]	eval-error:0.039384	train-error:0.039647
[5]	eval

In [8]:
# function to build model
def optimise_xgb(param):
    model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
    cross_val_auc =model_cv.T.loc['test-auc-mean'].max()
    watchlist  = [(dtest,'eval'), (dtrain,'train')]
    xgb_model = xgb.train(param, dtrain, num_round, watchlist)
    prediction = xgb_model.predict(dtest)
    test_auc = roc_auc_score(y_test, prediction, average='macro')
    return cross_val_auc, test_auc

In [9]:
#optimise max_depth
num_round = 10
param_list = []
cross_val_list = []
test_auc_list = []
for i in range(1,8):
    param = {'max_depth':i, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
    cross_val_auc, test_auc = optimise_xgb(param)
    cross_val_list.append(cross_val_auc), test_auc_list.append(test_auc), param_list.append(i)

final_vals = pd.DataFrame([cross_val_list,test_auc_list]).T
final_vals.columns = ['cross_val_auc', 'test_auc']
final_vals.index = param_list
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039937	train-error:0.040147
[8]	eval-error:0.039937	train-error:0.040147
[9]	eval-error:0.039911	train-error:0.040147
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039963	train-error:0.040147
[5]	eval-error:0.039569	train-error:0.040042
[6]	eval-error:0.039542	train-error:0.039884
[7]	eval-error:0.039647	train-error:0.039911
[8]	eval-error:0.039437	train-error:0.039911
[9]	eval-error:0.039595	train-error:0.039937
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    3
test_auc         2
dtype: int64

In [10]:
#optimise eta
num_round = 10
param_list = []
cross_val_list = []
test_auc_list = []
for i in [0.1, 0.3, 0.6, 0.9]:
    param = {'max_depth':3, 'eta': i, 'silent':1, 'objective':'binary:logistic'}
    cross_val_auc, test_auc = optimise_xgb(param)
    cross_val_list.append(cross_val_auc), test_auc_list.append(test_auc), param_list.append(i)

final_vals = pd.DataFrame([cross_val_list,test_auc_list]).T
final_vals.columns = ['cross_val_auc', 'test_auc']
final_vals.index = param_list
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039358	train-error:0.039779
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039358	train-error:0.039779
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    0.9
test_auc         0.6
dtype: float64

In [11]:
#optimise gamma
num_round = 10
param_list = []
cross_val_list = []
test_auc_list = []
for i in [0, 1, 10, 100]:
    param = {'max_depth':2, 'eta': 0.9, 'gamma':i, 'silent':1, 'objective':'binary:logistic'}
    cross_val_auc, test_auc = optimise_xgb(param)
    cross_val_list.append(cross_val_auc), test_auc_list.append(test_auc), param_list.append(i)

final_vals = pd.DataFrame([cross_val_list,test_auc_list]).T
final_vals.columns = ['cross_val_auc', 'test_auc']
final_vals.index = param_list
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039384	train-error:0.039779
[9]	eval-error:0.039437	train-error:0.039753
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039384	train-error:0.039779
[9]	eval-error:0.039437	train-error:0.039753
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    10
test_auc          0
dtype: int64

In [12]:
# optimise min child weight'
num_round = 10
param_list = []
cross_val_list = []
test_auc_list = []
for i in [1, 10, 100]:
    param = {'max_depth':2, 'eta': 0.6, 'gamma':0, 'min_child_weight': i, 'silent':1, 'objective':'binary:logistic'}
    cross_val_auc, test_auc = optimise_xgb(param)
    cross_val_list.append(cross_val_auc), test_auc_list.append(test_auc), param_list.append(i)

final_vals = pd.DataFrame([cross_val_list,test_auc_list]).T
final_vals.columns = ['cross_val_auc', 'test_auc']
final_vals.index = param_list
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039384	train-error:0.039779
[9]	eval-error:0.039437	train-error:0.039753
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039411	train-error:0.039726
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    10
test_auc          1
dtype: int64

In [13]:
#optimise subsample
num_round = 10
param_list = []
cross_val_list = []
test_auc_list = []
for i in [0.1, 0.3, 0.5, 0.8,1]:
    param = {'max_depth':2, 'eta': 0.6, 'gamma':10, 'min_child_weight': 1, 'subsample':i,
             'silent':1, 'objective':'binary:logistic'}
    cross_val_auc, test_auc = optimise_xgb(param)
    cross_val_list.append(cross_val_auc), test_auc_list.append(test_auc), param_list.append(i)

final_vals = pd.DataFrame([cross_val_list,test_auc_list]).T
final_vals.columns = ['cross_val_auc', 'test_auc']
final_vals.index = param_list
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039805
[7]	eval-error:0.039358	train-error:0.039805
[8]	eval-error:0.039358	train-error:0.039937
[9]	eval-error:0.039332	train-error:0.039911
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039463	train-error:0.039753
[9]	eval-error:0.039437	train-error:0.039726
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    1
test_auc         1
dtype: float64

In [14]:
# optimise lambda
num_round = 10
param_list = []
cross_val_list = []
test_auc_list = []
for i in [0.1, 1, 10, 100 ]:
    param = {'max_depth':2, 'eta': 0.6, 'gamma':10, 'min_child_weight': 1, 'subsample':1, 'lambda': i,
             'silent':1, 'objective':'binary:logistic'}
    cross_val_auc, test_auc = optimise_xgb(param)
    cross_val_list.append(cross_val_auc), test_auc_list.append(test_auc), param_list.append(i)

final_vals = pd.DataFrame([cross_val_list,test_auc_list]).T
final_vals.columns = ['cross_val_auc', 'test_auc']
final_vals.index = param_list
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039358	train-error:0.039779
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039358	train-error:0.039700
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    10.0
test_auc          0.1
dtype: float64

In [15]:
# optimise alpha
num_round = 10
param_list = []
cross_val_list = []
test_auc_list = []
for i in [0, 0.3, 0.6, 1, 10 ]:
    param = {'max_depth':2, 'eta': 0.6, 'gamma':10, 'min_child_weight': 1, 'subsample':0.5, 'lambda': 10, 'alpha':i,
             'silent':1, 'objective':'binary:logistic'}
    cross_val_auc, test_auc = optimise_xgb(param)
    cross_val_list.append(cross_val_auc), test_auc_list.append(test_auc), param_list.append(i)

final_vals = pd.DataFrame([cross_val_list,test_auc_list]).T
final_vals.columns = ['cross_val_auc', 'test_auc']
final_vals.index = param_list
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039358	train-error:0.039779
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039358	train-error:0.039779
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    1
test_auc         1
dtype: float64

In [13]:
# do crossvalidation
print ('running cross validation')
seed_set = 0
param = {'max_depth':2, 'eta': 0.6, 'gamma':10, 'min_child_weight': 1, 'subsample':1, 'lambda': 10, 'alpha':0.3,
             'silent':1, 'objective':'binary:logistic', 'seed': seed_set}
num_round = 40

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'})
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

# average the predictions of 10 xgboosts on different seedds
prediction = np.zeros(X_test.shape[0])
for seed_set in [1,2,3,4,5,6,7,8,9,10]:
    xgb_extraTreesParam = xgb.train(param, dtrain, num_round, watchlist)
    prediction = prediction + xgb_extraTreesParam.predict(dtest)
    i = i + 10

prediction = prediction/10
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
    test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0        0.720463      0.019804        0.722324       0.004688
1        0.724810      0.020068        0.728660       0.003141
2        0.778931      0.020352        0.784862       0.004757
3        0.794773      0.013949        0.801444       0.004147
4        0.795925      0.016113        0.808174       0.003236
5        0.809745      0.011975        0.818244       0.003949
6        0.813388      0.009822        0.824161       0.003916
7        0.817871      0.009718        0.827843       0.002623
8        0.820256      0.008691        0.830904       0.003340
9        0.822358      0.010175        0.833683       0.003171
10       0.824227      0.010336        0.835922       0.002907
11       0.824960      0.010508        0.837675       0.002367
12       0.824386      0.010209        0.838575       0.001660
13       0.824314      0.010208        0.838905       0.001461
14       0.824314      0.01020

In [14]:
# Save predictions on X_test for model stacking
#prediction = xgb_extraTreesParam.predict(dtest)
submission = pd.DataFrame({"TARGET": prediction})
submission.to_csv("test_xgb_ExtraTreesFeat.csv", index=False)

y_test = pd.DataFrame({'label': y_test})
y_test.to_csv('test_label.csv', index=False)

In [15]:
##  Submission

prediction = np.zeros(test.shape[0])
for seed_set in [1,2,3,4,5,6,7,8,9,10]:
    xgb_extraTreesParam = xgb.train(param, dtrain, num_round, watchlist)
    prediction = prediction + xgb_extraTreesParam.predict(dtest_sub)
    i = i + 10

prediction = prediction/10
#prediction = xgb_extraTreesParam.predict(dtest_sub)
submission = pd.DataFrame({"ID":test_id, "TARGET": prediction})
submission.to_csv("submission_xgb_ExtraTreesFeat.csv", index=False)

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039358	train-error:0.039779
[10]	eval-error:0.039358	train-error:0.039779
[11]	eval-error:0.039358	train-error:0.039779
[12]	eval-error:0.039358	train-error:0.039779
[13]	eval-error:0.039358	train-error:0.039779
[14]	eval-error:0.039358	train-error:0.039779
[15]	eval-error:0.039358	train-error:0.039779
[16]	eval-error:0.039358	train-error:0.039779
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]

In [19]:
#  save features used for later
feat_imp = pd.Series(clf.feature_importances_, index = X_train.columns.values)
feat_imp = feat_imp.sort_values(ascending=False)
feat_imp.to_csv('extraTreesFeatures.csv')