In [1]:
import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [15]:
import pandas as pd
import numpy as np
from math import ceil

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb

In [4]:
# load data
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')

y_target = pd.read_csv('train.csv')['TARGET']
test_id = pd.read_csv('test.csv')['ID']

In [8]:
## Feature selection
features = pd.read_csv('extraTreesFeatures.csv')['feature']
features.head()

0                     var38
1                     var15
2     saldo_medio_var5_ult3
3    saldo_medio_var5_hace3
4            num_var45_ult3
Name: feature, dtype: object

In [9]:
## Third Model, with features selected from small Boruta Classifier
boruta_vars = ['var15', 'num_var37_med_ult2', 'num_var37_0', 'saldo_var5', 'saldo_var30', 'saldo_var37', 'saldo_var42',
               'num_var22_hace3', 'num_var22_ult3', 'num_med_var45_ult3', 'num_meses_var5_ult3', 'num_var45_hace2',
               'num_var45_hace3', 'num_var45_ult3', 'saldo_medio_var5_hace2', 'saldo_medio_var5_ult1', 'saldo_medio_var5_ult3',
               'no0']

In [11]:
## Fourth Model, with features selected from small Boruta + ExtraTrees Classifier
vars_to_keep = list(features[0:40])

diff_vars =[]
for var in boruta_vars:
    if var not in vars_to_keep:
        diff_vars.append(var)
        
diff_vars

['num_var37_med_ult2', 'num_var37_0', 'saldo_var37']

In [12]:
for var in diff_vars:
    vars_to_keep.append(var)

len(vars_to_keep)

43

In [13]:
# split data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(train[vars_to_keep], y_target, test_size=0.5,random_state=290977)
X_train.shape

(38010, 43)

In [16]:
## Train Model

# generate sparse matrices
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)

dtest_sub = xgb.DMatrix(test[vars_to_keep])

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.792807      0.015997        0.811858       0.002268
1       0.807573      0.012799        0.828273       0.001659
2       0.822881      0.010505        0.841285       0.002909
3       0.822094      0.010254        0.847865       0.001935
4       0.823297      0.010107        0.852761       0.002285
5       0.823300      0.010420        0.857237       0.002504
6       0.822049      0.010456        0.860843       0.002523
7       0.819952      0.013085        0.864827       0.002846
8       0.817771      0.012292        0.868454       0.002220
9       0.816232      0.013941        0.871742       0.002989
Best Interaction:  5
building model and testing on test set
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039490	train-error:0.039700
[2]	eval-error:0.039595	train-error:0.039674
[3]	eval-error:0.039305	train-error:0.039700
[4]	eval-error:0.039621	train-error:0.039516
[5]	eval

### For each selected:

1) Generate 4 binary variables segregated by quartiles.

2) Generate the log transformation.

3) Select the best of the 6 against the target variable, and drop the rest.

In [17]:
# categorisation by quantiles + add log transformation
def feat_eng(col, data):
    a = False
    try:
        temp = pd.qcut(data[col], [0, .25, .5, .75, 1], labels = ['FirstQ','SecondQ','ThirdQ','FourthQ'])
        temp_dummies = pd.get_dummies(temp)
        a = True
    except:
        pass
    
    if a:
        temp_dummies['log'] = np.log(data[col]+1)
        temp_dummies['asis'] = data[col]
    else:
        temp_dummies = pd.DataFrame(index=data.index, columns= ['asis', 'log'])
        temp_dummies['log'] = np.log(data[col]+1)
        temp_dummies['asis'] = data[col]
        
    return temp_dummies

In [28]:
# feature selection function
def check_features(Y, features):
    X_train, X_test, y_train, y_test = train_test_split(features, Y, test_size=0.5, random_state=42)
    clf = ExtraTreesClassifier(random_state=1729)
    selector = clf.fit(X_train, y_train)
    
    feat_imp = pd.Series(clf.feature_importances_, index = features.columns.values)
    feat_imp = feat_imp.sort_values(ascending=False)
    #selector = SelectKBest(f_regression, k = 1)
    #selector.fit(X_train, Y_train)
    
    return feat_imp.index[0]

In [29]:
feat_eng('var38', train).shape, y_target.shape

((76020, 6), (76020,))

In [30]:
check_features(y_target, feat_eng('var38', train))

'asis'

In [31]:
import warnings
warnings.filterwarnings('ignore')

train_sm = train[vars_to_keep]
test_sm = test[vars_to_keep]

for col in vars_to_keep:
    var = feat_eng(col, train_sm)
    var.log.fillna(var.log.mean(), inplace = True)

    temp = check_features(y_target,var)
    col_name = col + '_' + temp
    train_sm[col_name] = var[temp]
    train_sm.drop(col, axis = 1, inplace = True)
    
    # for the test set
    var = feat_eng(col, test_sm)
    var.log.fillna(var.log.mean(), inplace = True)
    test_sm[col_name] = var[temp]
    test_sm.drop(col, axis = 1, inplace = True)

In [32]:
train_sm.shape, test_sm.shape

((76020, 43), (75818, 43))

In [34]:
train_sm.columns

Index(['var38_asis', 'var15_log', 'saldo_medio_var5_ult3_asis',
       'saldo_medio_var5_hace3_asis', 'num_var45_ult3_log', 'no0_FourthQ',
       'num_var45_hace3_log', 'num_var45_hace2_asis', 'num_var22_ult3_asis',
       'num_var45_ult1_log', 'num_var22_hace3_asis', 'var36_asis',
       'num_med_var45_ult3_log', 'num_meses_var5_ult3_asis',
       'saldo_medio_var5_hace2_asis', 'saldo_var30_asis',
       'saldo_medio_var5_ult1_asis', 'num_var22_hace2_asis',
       'saldo_var42_asis', 'saldo_var5_asis', 'ind_var30_asis',
       'num_meses_var39_vig_ult3_asis', 'num_var22_ult1_log',
       'num_med_var22_ult3_log', 'ind_var5_asis',
       'imp_op_var41_comer_ult3_log', 'imp_op_var39_efect_ult3_asis',
       'num_op_var41_ult3_log', 'imp_ent_var16_ult1_asis',
       'num_op_var39_efect_ult3_log', 'num_op_var39_comer_ult3_log',
       'num_op_var39_ult3_log', 'num_op_var39_ult1_asis',
       'num_op_var41_comer_ult3_log', 'imp_op_var39_comer_ult3_asis',
       'imp_op_var41_ult1_log', 'nu

In [35]:
test_sm.columns

Index(['var38_asis', 'var15_log', 'saldo_medio_var5_ult3_asis',
       'saldo_medio_var5_hace3_asis', 'num_var45_ult3_log', 'no0_FourthQ',
       'num_var45_hace3_log', 'num_var45_hace2_asis', 'num_var22_ult3_asis',
       'num_var45_ult1_log', 'num_var22_hace3_asis', 'var36_asis',
       'num_med_var45_ult3_log', 'num_meses_var5_ult3_asis',
       'saldo_medio_var5_hace2_asis', 'saldo_var30_asis',
       'saldo_medio_var5_ult1_asis', 'num_var22_hace2_asis',
       'saldo_var42_asis', 'saldo_var5_asis', 'ind_var30_asis',
       'num_meses_var39_vig_ult3_asis', 'num_var22_ult1_log',
       'num_med_var22_ult3_log', 'ind_var5_asis',
       'imp_op_var41_comer_ult3_log', 'imp_op_var39_efect_ult3_asis',
       'num_op_var41_ult3_log', 'imp_ent_var16_ult1_asis',
       'num_op_var39_efect_ult3_log', 'num_op_var39_comer_ult3_log',
       'num_op_var39_ult3_log', 'num_op_var39_ult1_asis',
       'num_op_var41_comer_ult3_log', 'imp_op_var39_comer_ult3_asis',
       'imp_op_var41_ult1_log', 'nu

In [36]:
#split train and test
X_train, X_test, y_train, y_test = train_test_split(train_sm, y_target, test_size=0.5,random_state=290977)

# generate sparse matrices
dtrain = xgb.DMatrix(X_train, label = y_train)
dtest = xgb.DMatrix(X_test, label = y_test)

dtest_sub = xgb.DMatrix(test_sm)

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.792822      0.016001        0.811849       0.002284
1       0.807667      0.012847        0.828312       0.001710
2       0.822200      0.011259        0.841347       0.002937
3       0.821298      0.009447        0.847474       0.002175
4       0.820283      0.009913        0.852643       0.001985
5       0.818479      0.009924        0.857877       0.002100
6       0.817198      0.012364        0.860587       0.002436
7       0.814936      0.012671        0.863123       0.003341
8       0.812908      0.014672        0.866481       0.003700
9       0.811923      0.012893        0.869675       0.003933
Best Interaction:  2
building model and testing on test set
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039490	train-error:0.039700
[2]	eval-error:0.039595	train-error:0.039674
Best Inter:  2
Roc AUC test:  0.824771161321


In [38]:
# do crossvalidation
print ('running cross validation')
seed_set = 0
param = {'max_depth':2, 'eta': 0.6, 'gamma':10, 'min_child_weight': 1, 'subsample':1, 'lambda': 10, 'alpha':0.3,
             'silent':1, 'objective':'binary:logistic', 'seed': seed_set}
num_round = 40

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'})
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

# average the predictions of 10 xgboosts on different seedds
prediction = np.zeros(X_test.shape[0])
for seed_set in [1,2,3,4,5,6,7,8,9,10]:
    xgb_extraTreesParam = xgb.train(param, dtrain, num_round, watchlist)
    prediction = prediction + xgb_extraTreesParam.predict(dtest)

prediction = prediction/10
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
    test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0        0.720463      0.019804        0.722324       0.004688
1        0.724810      0.020068        0.728660       0.003141
2        0.778931      0.020352        0.784862       0.004757
3        0.794773      0.013949        0.801444       0.004147
4        0.795925      0.016113        0.808174       0.003236
5        0.809745      0.011975        0.818244       0.003949
6        0.813388      0.009822        0.824161       0.003916
7        0.817854      0.009701        0.827853       0.002620
8        0.820999      0.009185        0.830891       0.003329
9        0.823019      0.010525        0.833746       0.003176
10       0.824611      0.010469        0.835726       0.002783
11       0.825642      0.010880        0.837216       0.002139
12       0.825156      0.010721        0.838003       0.001664
13       0.825156      0.010721        0.838003       0.001664
14       0.825156      0.01072

In [39]:
# Save predictions on X_test for model stacking
#prediction = xgb_extraTreesParam.predict(dtest)
submission = pd.DataFrame({"TARGET": prediction})
submission.to_csv("test_xgb_EngFeat.csv", index=False)

In [40]:
prediction = np.zeros(test.shape[0])
for seed_set in [1,2,3,4,5,6,7,8,9,10]:
    xgb_extraTreesParam = xgb.train(param, dtrain, num_round, watchlist)
    prediction = prediction + xgb_extraTreesParam.predict(dtest_sub)

prediction = prediction/10
#prediction = xgb_extraTreesParam.predict(dtest_sub)
submission = pd.DataFrame({"ID":test_id, "TARGET": prediction})
submission.to_csv("submission_xgb_EngFeat.csv", index=False)

[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval-error:0.039358	train-error:0.039779
[6]	eval-error:0.039358	train-error:0.039779
[7]	eval-error:0.039358	train-error:0.039779
[8]	eval-error:0.039358	train-error:0.039779
[9]	eval-error:0.039358	train-error:0.039779
[10]	eval-error:0.039358	train-error:0.039779
[11]	eval-error:0.039358	train-error:0.039779
[12]	eval-error:0.039358	train-error:0.039779
[13]	eval-error:0.039358	train-error:0.039779
[14]	eval-error:0.039358	train-error:0.039779
[15]	eval-error:0.039358	train-error:0.039779
[16]	eval-error:0.039358	train-error:0.039779
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039358	train-error:0.039779
[4]	eval-error:0.039358	train-error:0.039779
[5]