In [1]:
import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

In [2]:
import pandas as pd
import numpy as np
from math import ceil

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif

import xgboost as xgb

In [3]:
#load data
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')

y_target = pd.read_csv('train.csv')['TARGET']
test_id = pd.read_csv('test.csv')['ID']

In [4]:
# split data into train and test set (train set 50%)
X_train, X_test, y_train, y_test = train_test_split(train, y_target, test_size=0.5,random_state=290977)

In [5]:
# to select Principal components it is necessary to standarise the features
std_model = StandardScaler()
X_train = std_model.fit_transform(X_train)
X_test = std_model.transform(X_test)

test = std_model.transform(test)

In [6]:
# Principal component analysis
pca_mod = PCA(n_components = 8)
pca_mod.fit(X_train)

PCA(copy=True, n_components=8, whiten=False)

In [7]:
pca_mod.explained_variance_ratio_

array([ 0.08721071,  0.05937304,  0.04304532,  0.03885253,  0.03249657,
        0.03159644,  0.03070391,  0.02816655])

In [8]:
X_train_pca = pca_mod.transform(X_train)
X_test_pca = pca_mod.transform(X_test)

X_train_pca.shape, X_test_pca.shape

((38010, 8), (38010, 8))

In [9]:
## Model 1: Train Model

# generate sparse matrices
dtrain = xgb.DMatrix(X_train_pca, label = y_train)
dtest = xgb.DMatrix(X_test_pca, label = y_test)

#dtest_sub = xgb.DMatrix(test)

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.747377      0.014812        0.767501       0.004081
1       0.771801      0.010288        0.794503       0.004627
2       0.779852      0.011254        0.804728       0.002806
3       0.782828      0.010473        0.814308       0.002713
4       0.782498      0.011473        0.822538       0.003407
5       0.780539      0.013045        0.827215       0.003172
6       0.778380      0.015297        0.831498       0.003136
7       0.776859      0.014850        0.836088       0.003893
8       0.779498      0.012895        0.839943       0.004661
9       0.778201      0.012507        0.844654       0.004594
Best Interaction:  3
building model and testing on test set
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039411	train-error:0.039805
[3]	eval-error:0.039411	train-error:0.039805
[4]	eval-error:0.039411	train-error:0.039805
Best Int

In [10]:
# Truncated single value decomposition for dimensionality reduction
svd_mod = TruncatedSVD(n_components =10)
svd_mod.fit(X_train)

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=5,
       random_state=None, tol=0.0)

In [11]:
svd_mod.explained_variance_ratio_

array([ 0.08721071,  0.05937302,  0.04304514,  0.03885128,  0.03249106,
        0.03159044,  0.03069211,  0.02816362,  0.02506688,  0.02292882])

In [12]:
X_train_svd = svd_mod.transform(X_train)
X_test_svd = svd_mod.transform(X_test)

test_svd = svd_mod.transform(test)

In [13]:
# generate sparse matrices
dtrain = xgb.DMatrix(X_train_svd, label = y_train)
dtest = xgb.DMatrix(X_test_svd, label = y_test)

dtest_sub = xgb.DMatrix(test_svd)

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
   test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0       0.747882      0.013762        0.766662       0.004329
1       0.767692      0.011574        0.790811       0.006006
2       0.780323      0.012822        0.807599       0.003461
3       0.778694      0.011714        0.816967       0.004401
4       0.784280      0.009520        0.823267       0.004024
5       0.782257      0.010463        0.830404       0.003309
6       0.783902      0.010277        0.835638       0.002592
7       0.783510      0.010866        0.839936       0.002060
8       0.780411      0.008867        0.845243       0.001993
9       0.777490      0.009596        0.849053       0.001800
Best Interaction:  4
building model and testing on test set
[0]	eval-error:0.039358	train-error:0.039779
[1]	eval-error:0.039358	train-error:0.039779
[2]	eval-error:0.039358	train-error:0.039779
[3]	eval-error:0.039411	train-error:0.039884
[4]	eval-error:0.039358	train-error:0.039779
[5]	eval

In [14]:
# save prediction for model stacking
prediction = xgb_model.predict(dtest)
submission = pd.DataFrame({"TARGET": prediction})
submission.to_csv("test_xgb_SDV.csv", index=False)

In [15]:
# submission
prediction = xgb_model.predict(dtest_sub)
submission = pd.DataFrame({"ID":test_id, "TARGET": prediction})
submission.to_csv("submission_xgb_svd.csv", index=False)

In [None]:
# Try boosting on a linear model instead of trees
# Truncated single value decomposition for dimensionality reduction
svd_mod = TruncatedSVD(n_components =30)
svd_mod.fit(X_train)

X_train_svd = svd_mod.transform(X_train)
X_test_svd = svd_mod.transform(X_test)

test_svd = svd_mod.transform(test)

dtrain = xgb.DMatrix(X_train_svd, label = y_train)
dtest = xgb.DMatrix(X_test_svd, label = y_test)

dtest_sub = xgb.DMatrix(test_svd)
# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic', 'booster':'gblinear'}
num_round = 20

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

running cross validation
    test-auc-mean  test-auc-std  train-auc-mean  train-auc-std
0        0.744949      0.024065        0.745082       0.004998
1        0.747328      0.023331        0.747275       0.006187
2        0.750062      0.023265        0.750510       0.005432
3        0.752460      0.023043        0.753355       0.004885
4        0.753547      0.023186        0.755064       0.005101
5        0.754174      0.022992        0.755648       0.005216
6        0.754509      0.022799        0.756160       0.005344
7        0.754945      0.022396        0.756557       0.005344
8        0.755624      0.022143        0.757176       0.005392
9        0.755697      0.021908        0.757353       0.005585
10       0.756030      0.021821        0.757727       0.005525
11       0.756284      0.021343        0.757995       0.005571
12       0.756302      0.020941        0.758258       0.005479
13       0.756361      0.020835        0.758464       0.005459
14       0.756465      0.02094

In [None]:
# kernel pca for dimensionality reduction
kpca = KernelPCA(kernel="rbf")
kpca.fit(X_train)

In [None]:
X_train_kpca = kpca.transform(X_train)
X_test_kpca = kpca.transform(X_test)

X_train_kpca.shape

In [None]:
## Feature selection
clf = ExtraTreesClassifier(random_state=1729)
selector = clf.fit(X_train_kpca, y_train)

## recursive feature elimination from ExtraTrees Classifier
fs = SelectFromModel(selector, prefit=True)

# transform data for model validatio
X_train_sm = fs.transform(X_train_kpca)
X_test_sm = fs.transform(X_test_kpca)

# also transform data for submission
#test_sm = fs.transform(test)

In [None]:
X_train_sm.shape

In [None]:
# generate sparse matrices
dtrain = xgb.DMatrix(X_train_sm, label = y_train)
dtest = xgb.DMatrix(X_test_sm, label = y_test)

#dtest_sub = xgb.DMatrix(test)

# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

In [None]:
fs2 = SelectKBest(f_classif, k=45)
fs2.fit(X_train_kpca, y_train)

# transform data for model validatio
X_train_sm2 = fs2.transform(X_train_kpca)
X_test_sm2 = fs2.transform(X_test_kpca)

# also transform data for submission
#test_sm = fs.transform(test)
dtrain = xgb.DMatrix(X_train_sm2, label = y_train)
dtest = xgb.DMatrix(X_test_sm2, label = y_test)

print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic'}
num_round = 10

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))

In [None]:
dtrain = xgb.DMatrix(X_train_sm2, label = y_train)
dtest = xgb.DMatrix(X_test_sm2, label = y_test)

dtest_sub = xgb.DMatrix(test_svd)
# do crossvalidation
print ('running cross validation')
param = {'max_depth':4, 'eta':1, 'silent':0, 'objective':'binary:logistic', 'booster':'gblinear'}
num_round = 20

model_cv = xgb.cv(param, dtrain, num_round, nfold = 5,  metrics={'auc'}, seed = 0)
print(model_cv.T.T)
bestIter = model_cv.T.loc['test-auc-mean'].idxmax()
print("Best Interaction: ", bestIter)
print('=======================')
print ('building model and testing on test set')
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = ceil(int(bestIter) * 1.5)

xgb_model = xgb.train(param, dtrain, num_round, watchlist)
prediction = xgb_model.predict(dtest)
print('Best Inter: ', xgb_model.best_iteration)
print("Roc AUC test: ", roc_auc_score(y_test, prediction, average='macro'))