In [1]:
import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [2]:
import pandas as pd
import numpy as np
from math import ceil

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

In [3]:
#load data
train = pd.read_csv('train_nonCorr.csv')
test = pd.read_csv('test_nonCorr.csv')

y = pd.read_csv('train.csv')['TARGET']
test_id = pd.read_csv('test.csv')['ID']

train.shape, test.shape

((76020, 12), (75818, 12))

In [4]:
# separate train and test set, train 50%
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.5,random_state=290977)

In [5]:
# Normalise features
std_model = StandardScaler()
X_train = std_model.fit_transform(X_train)
X_test = std_model.transform(X_test)

test = std_model.transform(test)

In [6]:
# Make polynomial features
poly = PolynomialFeatures(degree = 3, include_bias=False)
poly.fit(X_train)

PolynomialFeatures(degree=3, include_bias=False, interaction_only=False)

In [7]:
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

test_poly = poly.transform(test)

X_train_poly.shape, X_test_poly.shape

((38010, 454), (38010, 454))

In [8]:
## Feature selection
clf = ExtraTreesClassifier(random_state=1729)
selector = clf.fit(X_train_poly, y_train)

# clf.feature_importances_ 
fs = SelectFromModel(selector, prefit=True)

X_train_poly_sm = fs.transform(X_train_poly)
X_test_poly_sm = fs.transform(X_test_poly)

test_poly_sm = fs.transform(test_poly)

X_train_poly_sm.shape, X_test_poly_sm.shape

((38010, 158), (38010, 158))

In [9]:
# classifier from xgboost

# generate sparse matrices
dtrain = xgb.DMatrix(X_train_poly_sm, label = y_train)
dtest = xgb.DMatrix(X_test_poly_sm, label = y_test)

dtest_sub = xgb.DMatrix(test_poly_sm)

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.784845      0.014412        0.793491       0.004886
1       0.797112      0.013032        0.818932       0.003563
2       0.800776      0.011269        0.830045       0.003786
3       0.805375      0.009634        0.839997       0.002118
4       0.802310      0.010891        0.847614       0.002061
5       0.798261      0.008634        0.854278       0.002105
6       0.794799      0.011496        0.859929       0.002294
7       0.791985      0.013527        0.865184       0.001881
8       0.788798      0.013258        0.869741       0.002291
9       0.786763      0.014562        0.874341       0.003238
Best Interaction:  3
building model and testing on test set
[0]	eval-error:0.039621	train-error:0.039674
[1]	eval-error:0.039358	train-error:0.039700
[2]	eval-error:0.039384	train-error:0.039437
[3]	eval-error:0.039437	train-error:0.039437
[4]	eval-error:0.039621	train-error:0.039411
Best Int

In [10]:
# Select K best features
fs = SelectKBest(f_classif, k = 40).fit(X_train_poly, y_train)

X_train_poly_sm = fs.transform(X_train_poly)
X_test_poly_sm = fs.transform(X_test_poly)

test_poly_sm = fs.transform(test_poly)

X_train_poly_sm.shape, X_test_poly_sm.shape

((38010, 40), (38010, 40))

In [11]:
# classifier from xgboost

# generate sparse matrices
dtrain = xgb.DMatrix(X_train_poly_sm, label = y_train)
dtest = xgb.DMatrix(X_test_poly_sm, label = y_test)

dtest_sub = xgb.DMatrix(test_poly_sm)

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.796389      0.013274        0.803325       0.004273
1       0.802455      0.011711        0.818759       0.005482
2       0.805381      0.011407        0.830278       0.003689
3       0.808906      0.007471        0.838884       0.003542
4       0.806351      0.009997        0.845838       0.003046
5       0.806891      0.012146        0.852174       0.003545
6       0.806784      0.011806        0.857144       0.002704
7       0.805296      0.011760        0.861432       0.001735
8       0.803195      0.010570        0.864589       0.001307
9       0.802498      0.010263        0.867349       0.002370
Best Interaction:  3
building model and testing on test set
[0]	eval-error:0.039516	train-error:0.039516
[1]	eval-error:0.039516	train-error:0.039569
[2]	eval-error:0.039516	train-error:0.039542
[3]	eval-error:0.039569	train-error:0.039542
[4]	eval-error:0.039595	train-error:0.039358
Best Int

In [12]:
## Save predictions for model stacking
prediction = xgb_model.predict(dtest)
submission = pd.DataFrame({"TARGET": prediction})
submission.to_csv("test_xgb_polynomial.csv", index=False)

In [13]:
## Submission
prediction = xgb_model.predict(dtest_sub)
submission = pd.DataFrame({"ID":test_id, "TARGET": prediction})
submission.to_csv("submission_xgb_polynomial.csv", index=False)

In [14]:
# classifier from xgboost: LINEAR

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic', 'booster':'gblinear'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.770650      0.020034        0.771281       0.006621
1       0.772637      0.019822        0.776092       0.006030
2       0.772934      0.019834        0.778971       0.004638
3       0.773545      0.020544        0.781734       0.003400
4       0.778826      0.017183        0.785304       0.003420
5       0.779889      0.016951        0.786826       0.003541
6       0.780436      0.017135        0.787575       0.003256
7       0.780526      0.016490        0.787846       0.003422
8       0.781256      0.016656        0.788221       0.003374
9       0.781381      0.016798        0.788489       0.003400
Best Interaction:  9
building model and testing on test set
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039384	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039805
[3]	eval-error:0.039358	train-error:0.039805
[4]	eval-error:0.039358	train-error:0.039858
[5]	eval

In [16]:
## Save predictions for model stacking
prediction = xgb_model.predict(dtest)
submission = pd.DataFrame({"TARGET": prediction})
submission.to_csv("test_xgb_polynomial_gblinear.csv", index=False)

In [17]:
## Submission
prediction = xgb_model.predict(dtest_sub)
submission = pd.DataFrame({"ID":test_id, "TARGET": prediction})
submission.to_csv("submission_xgb_polynomial_gblinear.csv", index=False)

In [15]:
# logit
logit = LogisticRegression()
logit.fit(X_train_poly_sm, y_train)

print("Roc AUC test: ", roc_auc_score(y_test,logit.predict_proba(X_test_poly_sm)[:,1],
              average='macro'))

print("Roc AUC train: ", roc_auc_score(y_train, logit.predict_proba(X_train_poly_sm)[:,1],
              average='macro'))

Roc AUC test:  0.781378897157
Roc AUC train:  0.788900315912
