In [1]:
import os
# Make the MinGW-w64 toolchain directory visible to this process via PATH.
# NOTE(review): machine-specific absolute Windows path — presumably required so
# the locally built xgboost can find its MinGW runtime; confirm, and consider
# moving it to configuration.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
# Prepend only once so re-running the cell does not keep growing PATH, and
# tolerate an environment in which PATH is not set at all.
_current_path = os.environ.get('PATH', '')
if mingw_path not in _current_path.split(';'):
    os.environ['PATH'] = mingw_path + ';' + _current_path

In [2]:
import pandas as pd
import numpy as np
from math import ceil
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb

In [3]:
# Load data.
# Feature tables come from the *_clean.csv files.
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')

# The label column and the submission IDs are read from the raw CSV files.
y_target = pd.read_csv('train.csv').loc[:, 'TARGET']
test_id = pd.read_csv('test.csv').loc[:, 'ID']

In [4]:
# Columns kept for modelling (41 names). The same list object is exposed under
# three names: boruta_vars, boruta_big_vars and vars_to_keep.
boruta_vars = [
    'var38', 'var15',
    'imp_op_var39_comer_ult3', 'imp_op_var41_comer_ult3',
    'imp_op_var41_efect_ult1', 'imp_op_var41_efect_ult3', 'imp_op_var41_ult1',
    'imp_op_var39_efect_ult1', 'imp_op_var39_efect_ult3', 'imp_op_var39_ult1',
    'ind_var37_cte', 'ind_var37_0',
    'num_op_var41_hace2', 'num_op_var41_ult1', 'num_op_var41_ult3',
    'num_op_var39_ult1', 'num_op_var39_ult3',
    'num_var37_med_ult2', 'num_var37_0',
    'saldo_var5', 'saldo_var30', 'saldo_var37', 'saldo_var42',
    'num_var22_hace2', 'num_var22_hace3', 'num_var22_ult3',
    'num_med_var22_ult3', 'num_med_var45_ult3', 'num_meses_var5_ult3',
    'num_op_var39_comer_ult1', 'num_op_var39_comer_ult3',
    'num_op_var41_comer_ult1', 'num_op_var41_comer_ult3',
    'num_var45_hace2', 'num_var45_hace3', 'num_var45_ult1', 'num_var45_ult3',
    'saldo_medio_var5_hace2', 'saldo_medio_var5_ult1', 'saldo_medio_var5_ult3',
    'no0',
]
boruta_big_vars = vars_to_keep = boruta_vars

# Restrict both frames to the selected columns, in the order listed above.
train = train.loc[:, boruta_vars]
test = test.loc[:, boruta_vars]

In [5]:
# Hold out half of the labelled rows for evaluation; the fixed random_state
# keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y_target,
    random_state=290977,
    test_size=0.5,
)

In [6]:
# Show (rows, columns) for the training half, the hold-out half and the
# unlabelled test frame, in that order.
tuple(frame.shape for frame in (X_train, X_test, test))

((38010, 41), (38010, 41), (75818, 41))

In [7]:
## Train Model

# Wrap the feature frames in xgboost DMatrix containers.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Unlabelled test frame, scored later for the submission file.
dtest_sub = xgb.DMatrix(test)

# do crossvalidation
print('running cross validation')
param = {'max_depth': 4, 'eta': 1, 'silent': 0, 'objective': 'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, seed=0)
print(model_cv)
# idxmax() returns the zero-based row label of the best CV round, so the
# number of boosting rounds that produced it is bestIter + 1.
bestIter = model_cv['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print('building model and testing on test set')
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
# Train for 1.5x the best CV round count. Converting the zero-based label to a
# count first avoids requesting 0 rounds when the very first round is best.
num_round = ceil((int(bestIter) + 1) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
# NOTE(review): no early stopping is configured here; whether best_iteration is
# available in that case depends on the installed xgboost version — confirm.
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.794546      0.011784        0.813062       0.003687
1       0.808805      0.012272        0.829165       0.001919
2       0.820587      0.010261        0.839291       0.002635
3       0.823001      0.007834        0.846492       0.002167
4       0.823291      0.009767        0.852774       0.003164
5       0.822619      0.011264        0.856619       0.002764
6       0.823006      0.011576        0.861671       0.002521
7       0.822896      0.011316        0.864345       0.003345
8       0.819530      0.011835        0.867473       0.002743
9       0.816680      0.011707        0.871138       0.003531
Best Interaction:  4
building model and testing on test set
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039490	train-error:0.039674
[2]	eval-error:0.039569	train-error:0.039726
[3]	eval-error:0.039516	train-error:0.039674
[4]	eval-error:0.039463	train-error:0.039700
[5]	eval

In [8]:
# Helper used by the parameter sweeps in the following cells.
def optimise_xgb(param):
    """Score one xgboost parameter set by cross-validation and on the hold-out split.

    Reads the notebook-level names ``dtrain``, ``dtest``, ``y_test`` and
    ``num_round``; they must be defined before this function is called.

    Parameters
    ----------
    param : dict
        Booster parameters, passed unchanged to ``xgb.cv`` and ``xgb.train``.

    Returns
    -------
    tuple
        ``(cross_val_auc, test_auc)``: the highest ``test-auc-mean`` over the
        5-fold CV rounds, and the ROC AUC of a model trained on ``dtrain``
        when predicting ``dtest`` against ``y_test``.
    """
    cv_history = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'}, seed=0)
    best_cv_auc = cv_history['test-auc-mean'].max()

    # The evaluation list makes xgb.train log eval/train metrics every round.
    booster = xgb.train(param, dtrain, num_round, [(dtest, 'eval'), (dtrain, 'train')])
    holdout_auc = roc_auc_score(y_test, booster.predict(dtest), average='macro')
    return best_cv_auc, holdout_auc

In [9]:
# Sweep max_depth over 1..7 with the other parameters held fixed.
num_round = 10  # read as a global by optimise_xgb
param_list = list(range(1, 8))
cross_val_list = []
test_auc_list = []
for depth in param_list:
    param = {'max_depth': depth, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
    cv_auc, holdout_auc = optimise_xgb(param)
    cross_val_list.append(cv_auc)
    test_auc_list.append(holdout_auc)

# One row per candidate value; idxmax() reports the best candidate per metric.
final_vals = pd.DataFrame(
    {'cross_val_auc': cross_val_list, 'test_auc': test_auc_list},
    index=param_list,
)
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039937
[8]	eval-error:0.039358	train-error:0.039937
[9]	eval-error:0.039384	train-error:0.039884
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039595	train-error:0.039884
[5]	eval-error:0.039647	train-error:0.039911
[6]	eval-error:0.039384	train-error:0.039726
[7]	eval-error:0.039700	train-error:0.039989
[8]	eval-error:0.039674	train-error:0.039937
[9]	eval-error:0.039647	train-error:0.039858
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    3
test_auc         2
dtype: int64

In [10]:
# Sweep the learning rate (eta) with max_depth fixed at 4.
num_round = 10  # read as a global by optimise_xgb
param_list = [0.1, 0.3, 0.6, 0.9]
cross_val_list = []
test_auc_list = []
for eta in param_list:
    param = {'max_depth': 4, 'eta': eta, 'silent': 1, 'objective': 'binary:logistic'}
    cv_auc, holdout_auc = optimise_xgb(param)
    cross_val_list.append(cv_auc)
    test_auc_list.append(holdout_auc)

# One row per candidate value; idxmax() reports the best candidate per metric.
final_vals = pd.DataFrame(
    {'cross_val_auc': cross_val_list, 'test_auc': test_auc_list},
    index=param_list,
)
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039726
[2]	eval-error:0.039358	train-error:0.039726
[3]	eval-error:0.039358	train-error:0.039726
[4]	eval-error:0.039358	train-error:0.039726
[5]	eval-error:0.039332	train-error:0.039726
[6]	eval-error:0.039358	train-error:0.039726
[7]	eval-error:0.039332	train-error:0.039726
[8]	eval-error:0.039332	train-error:0.039726
[9]	eval-error:0.039332	train-error:0.039753
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039726
[2]	eval-error:0.039358	train-error:0.039726
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039753
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039726
[8]	eval-error:0.039358	train-error:0.039726
[9]	eval-error:0.039358	train-error:0.039753
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039726
[2]	eval-e

cross_val_auc    0.6
test_auc         0.6
dtype: float64

In [11]:
# Sweep gamma with max_depth=4 and eta=0.6 held fixed.
num_round = 10  # read as a global by optimise_xgb
param_list = [0, 1, 10, 100]
cross_val_list = []
test_auc_list = []
for gamma in param_list:
    param = {'max_depth': 4, 'eta': 0.6, 'gamma': gamma, 'silent': 1,
             'objective': 'binary:logistic'}
    cv_auc, holdout_auc = optimise_xgb(param)
    cross_val_list.append(cv_auc)
    test_auc_list.append(holdout_auc)

# One row per candidate value; idxmax() reports the best candidate per metric.
final_vals = pd.DataFrame(
    {'cross_val_auc': cross_val_list, 'test_auc': test_auc_list},
    index=param_list,
)
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039726
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039411	train-error:0.039674
[4]	eval-error:0.039411	train-error:0.039700
[5]	eval-error:0.039463	train-error:0.039595
[6]	eval-error:0.039463	train-error:0.039595
[7]	eval-error:0.039411	train-error:0.039542
[8]	eval-error:0.039437	train-error:0.039463
[9]	eval-error:0.039411	train-error:0.039463
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039726
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039411	train-error:0.039674
[4]	eval-error:0.039411	train-error:0.039674
[5]	eval-error:0.039490	train-error:0.039621
[6]	eval-error:0.039490	train-error:0.039621
[7]	eval-error:0.039411	train-error:0.039595
[8]	eval-error:0.039516	train-error:0.039542
[9]	eval-error:0.039490	train-error:0.039595
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    0
test_auc         1
dtype: int64

In [12]:
# Sweep min_child_weight with max_depth=4, eta=0.6 and gamma=1 held fixed.
num_round = 10  # read as a global by optimise_xgb
param_list = [1, 10, 100]
cross_val_list = []
test_auc_list = []
for child_weight in param_list:
    param = {'max_depth': 4, 'eta': 0.6, 'gamma': 1, 'min_child_weight': child_weight,
             'silent': 1, 'objective': 'binary:logistic'}
    cv_auc, holdout_auc = optimise_xgb(param)
    cross_val_list.append(cv_auc)
    test_auc_list.append(holdout_auc)

# One row per candidate value; idxmax() reports the best candidate per metric.
final_vals = pd.DataFrame(
    {'cross_val_auc': cross_val_list, 'test_auc': test_auc_list},
    index=param_list,
)
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039726
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039411	train-error:0.039674
[4]	eval-error:0.039411	train-error:0.039674
[5]	eval-error:0.039490	train-error:0.039621
[6]	eval-error:0.039490	train-error:0.039621
[7]	eval-error:0.039411	train-error:0.039595
[8]	eval-error:0.039516	train-error:0.039542
[9]	eval-error:0.039490	train-error:0.039595
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039411	train-error:0.039779
[6]	eval-error:0.039490	train-error:0.039779
[7]	eval-error:0.039384	train-error:0.039779
[8]	eval-error:0.039490	train-error:0.039621
[9]	eval-error:0.039490	train-error:0.039647
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    1
test_auc         1
dtype: int64

In [13]:
# Sweep subsample with the previously chosen tree parameters held fixed.
# NOTE(review): the saved output of the min_child_weight sweep reports 1 as the
# best value on both metrics, yet 10 is fixed here — confirm which is intended.
num_round = 10  # read as a global by optimise_xgb
param_list = [0.1, 0.3, 0.5, 0.8, 1]
cross_val_list = []
test_auc_list = []
for subsample in param_list:
    param = {'max_depth': 4, 'eta': 0.6, 'gamma': 1, 'min_child_weight': 10,
             'subsample': subsample, 'silent': 1, 'objective': 'binary:logistic'}
    cv_auc, holdout_auc = optimise_xgb(param)
    cross_val_list.append(cv_auc)
    test_auc_list.append(holdout_auc)

# One row per candidate value; idxmax() reports the best candidate per metric.
final_vals = pd.DataFrame(
    {'cross_val_auc': cross_val_list, 'test_auc': test_auc_list},
    index=param_list,
)
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039832
[7]	eval-error:0.039542	train-error:0.039911
[8]	eval-error:0.039621	train-error:0.039911
[9]	eval-error:0.039700	train-error:0.039832
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039384	train-error:0.039779
[5]	eval-error:0.039437	train-error:0.039647
[6]	eval-error:0.039411	train-error:0.039674
[7]	eval-error:0.039569	train-error:0.039832
[8]	eval-error:0.039621	train-error:0.039700
[9]	eval-error:0.039753	train-error:0.039779
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    1.0
test_auc         0.8
dtype: float64

In [14]:
# Sweep the 'lambda' regularisation parameter with the other settings held fixed.
num_round = 10  # read as a global by optimise_xgb
param_list = [0.1, 1, 10, 100]
cross_val_list = []
test_auc_list = []
for reg_lambda in param_list:
    param = {'max_depth': 4, 'eta': 0.6, 'gamma': 1, 'min_child_weight': 10,
             'subsample': 1, 'lambda': reg_lambda,
             'silent': 1, 'objective': 'binary:logistic'}
    cv_auc, holdout_auc = optimise_xgb(param)
    cross_val_list.append(cv_auc)
    test_auc_list.append(holdout_auc)

# One row per candidate value; idxmax() reports the best candidate per metric.
final_vals = pd.DataFrame(
    {'cross_val_auc': cross_val_list, 'test_auc': test_auc_list},
    index=param_list,
)
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039437	train-error:0.039700
[6]	eval-error:0.039463	train-error:0.039700
[7]	eval-error:0.039463	train-error:0.039647
[8]	eval-error:0.039542	train-error:0.039647
[9]	eval-error:0.039542	train-error:0.039674
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039411	train-error:0.039779
[6]	eval-error:0.039490	train-error:0.039779
[7]	eval-error:0.039384	train-error:0.039779
[8]	eval-error:0.039490	train-error:0.039621
[9]	eval-error:0.039490	train-error:0.039647
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    10
test_auc         10
dtype: float64

In [15]:
# Sweep the 'alpha' regularisation parameter with the other settings held fixed.
# NOTE(review): the saved output of the lambda sweep reports 10 as the best
# value on both metrics, yet 0.1 is fixed here — confirm which is intended.
num_round = 10  # read as a global by optimise_xgb
param_list = [0, 0.3, 0.6, 1, 10]
cross_val_list = []
test_auc_list = []
for reg_alpha in param_list:
    param = {'max_depth': 4, 'eta': 0.6, 'gamma': 1, 'min_child_weight': 10,
             'subsample': 1, 'lambda': 0.1, 'alpha': reg_alpha,
             'silent': 1, 'objective': 'binary:logistic'}
    cv_auc, holdout_auc = optimise_xgb(param)
    cross_val_list.append(cv_auc)
    test_auc_list.append(holdout_auc)

# One row per candidate value; idxmax() reports the best candidate per metric.
final_vals = pd.DataFrame(
    {'cross_val_auc': cross_val_list, 'test_auc': test_auc_list},
    index=param_list,
)
final_vals.idxmax()

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039437	train-error:0.039700
[6]	eval-error:0.039463	train-error:0.039700
[7]	eval-error:0.039463	train-error:0.039647
[8]	eval-error:0.039542	train-error:0.039647
[9]	eval-error:0.039542	train-error:0.039674
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039384	train-error:0.039753
[6]	eval-error:0.039463	train-error:0.039700
[7]	eval-error:0.039411	train-error:0.039700
[8]	eval-error:0.039358	train-error:0.039542
[9]	eval-error:0.039358	train-error:0.039542
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-e

cross_val_auc    1.0
test_auc         0.3
dtype: float64

In [9]:
# do crossvalidation
print('running cross validation')
# Was misspelled `seed_seet`, which left `seed_set` undefined on a fresh kernel
# and made the dict below raise NameError.
seed_set = 0
param = {'max_depth': 4, 'eta': 0.6, 'gamma': 1, 'min_child_weight': 10, 'subsample': 1,
         'lambda': 0.1, 'alpha': 0,
         'silent': 1, 'objective': 'binary:logistic', 'seed': seed_set}
num_round = 40

model_cv = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'auc'})
print(model_cv)
# idxmax() returns the zero-based row label of the best CV round, so the
# number of boosting rounds that produced it is bestIter + 1.
bestIter = model_cv['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print('building model and testing on test set')
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
# Train for 1.5x the best CV round count. Converting the zero-based label to a
# count first avoids requesting 0 rounds when the very first round is best.
num_round = ceil((int(bestIter) + 1) * 1.5)

# Average the hold-out predictions of one xgboost model per seed.
seeds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
prediction = np.zeros(X_test.shape[0])
for seed_set in seeds:
    # The seed has to be written into param; rebinding the loop variable alone
    # never reaches xgb.train, so every model would be trained identically.
    # NOTE(review): with subsample=1 and no column sampling the seed may have
    # little or no effect on the trained trees — confirm the averaging adds value.
    param['seed'] = seed_set
    xgb_extraTreesParam = xgb.train(param, dtrain, num_round, watchlist)
    prediction = prediction + xgb_extraTreesParam.predict(dtest)

prediction = prediction / len(seeds)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
    test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0        0.799736      0.008268        0.818959       0.002285
1        0.807840      0.010605        0.827151       0.001994
2        0.814515      0.008969        0.833443       0.002419
3        0.819675      0.011359        0.838655       0.001546
4        0.823675      0.009228        0.842478       0.002555
5        0.826621      0.008777        0.846644       0.001436
6        0.826998      0.009952        0.849780       0.001479
7        0.827021      0.009507        0.852429       0.001675
8        0.827213      0.009285        0.855271       0.003033
9        0.828323      0.008489        0.857184       0.003448
10       0.828623      0.008965        0.860307       0.002435
11       0.827788      0.009229        0.861818       0.002303
12       0.827087      0.011415        0.864050       0.003118
13       0.827149      0.011002        0.865435       0.002695
14       0.826933      0.01096

In [10]:
## Save predictions for model stacking
# Writes the current `prediction` array as a single TARGET column, without the
# row index.
submission = pd.DataFrame(data={"TARGET": prediction})
submission.to_csv(path_or_buf="test_xgb_boruta_big.csv", index=False)

In [18]:
## Submission

# Average the predictions of one xgboost model per seed on the unlabelled test
# set. Relies on `param`, `num_round` and `watchlist` from the final
# cross-validation cell.
seeds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
prediction = np.zeros(test.shape[0])
for seed_set in seeds:
    # Write the seed into param so each model is actually trained with it.
    param['seed'] = seed_set
    xgb_extraTreesParam = xgb.train(param, dtrain, num_round, watchlist)
    prediction = prediction + xgb_extraTreesParam.predict(dtest_sub)

# Turn the accumulated sum into a mean; without this the TARGET column holds
# the sum of all models' outputs rather than a probability.
prediction = prediction / len(seeds)

submission = pd.DataFrame({"ID": test_id, "TARGET": prediction})
submission.to_csv("submission_xgb_boruta_big.csv", index=False)