In [1]:
import os

# Directory holding the MinGW-w64 toolchain binaries on this Windows machine.
# NOTE(review): presumably required so the xgboost import below can find its
# runtime DLLs -- confirm, and consider moving this path into configuration.
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'

# Prepend (not append) so this directory is searched before the rest of PATH.
os.environ['PATH'] = ';'.join((mingw_path, os.environ['PATH']))

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, KernelPCA, TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier

import xgboost as xgb

In [3]:
# Load the train and test feature tables from the "nonCorr" CSV files.
# NOTE(review): the file names suggest correlated columns were already
# removed upstream -- confirm against the preprocessing notebook.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_nonCorr.csv', 'test_nonCorr.csv')
)

# Show (rows, columns) of both tables as the cell output.
(train.shape, test.shape)

((76020, 12), (75818, 12))

In [4]:
# Load the label column: the target lives in the original train.csv, not in
# the feature files loaded above.
# NOTE(review): assumes train.csv rows are in the same order as the feature
# tables -- confirm.
y = pd.read_csv('train.csv').loc[:, 'TARGET']

In [5]:
# Split the labelled data into a fitting part and a hold-out part.
# test_size=0.6 sends 60% of the rows to the hold-out set; the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.6,
    random_state=12345,
)

In [6]:
# Standardisation of the train and hold-out sets.
# The scaler is fitted on the training split only and then reused unchanged
# for the hold-out split and for the competition test set.
std_model = StandardScaler().fit(X_train)

# NOTE: X_train / X_test are overwritten with their scaled arrays, so this
# cell must not be run twice without re-running the split above.
X_train = std_model.transform(X_train)
X_test = std_model.transform(X_test)

# Scaled copy of the competition test set (kept under a separate name).
test_std = std_model.transform(test)

In [7]:
# Polynomial feature creation: expand the standardised features with all
# polynomial terms up to degree 3, fitted on the training split.
poly = PolynomialFeatures(degree=3).fit(X_train)

# Apply the same expansion to both splits ...
X_train_poly, X_test_poly = (poly.transform(part) for part in (X_train, X_test))

# ... and to the standardised competition test set.
test_poly = poly.transform(test_std)

In [8]:
# Univariate feature selection: keep the 15 polynomial features with the
# highest f_classif score against the training labels.
fs = SelectKBest(score_func=f_classif, k=15)
fs.fit(X_train_poly, y_train)

# Reduce both splits to the selected columns.
X_train_poly_sm, X_test_poly_sm = (
    fs.transform(part) for part in (X_train_poly, X_test_poly)
)

# Same column subset for the competition test set.
test_poly_sm = fs.transform(test_poly)



In [9]:
# Model 1: xgboost classifier on the 15 selected polynomial features.
m1_xgb_poly = xgb.XGBClassifier(
    n_estimators=110,
    max_depth=4,
    nthread=-1,
    seed=1729,
)
# The hold-out split is passed as eval_set with the "auc" metric;
# verbose=False keeps the per-round log out of the cell output.
m1_xgb_poly.fit(
    X_train_poly_sm,
    y_train,
    eval_set=[(X_test_poly_sm, y_test)],
    eval_metric="auc",
    verbose=False,
)

# Column 1 of predict_proba is used as the score for each row
# (assumes TARGET is a 0/1 label so column 1 is the positive class -- confirm).
m1_scores_test = m1_xgb_poly.predict_proba(X_test_poly_sm)[:, 1]
m1_scores_train = m1_xgb_poly.predict_proba(X_train_poly_sm)[:, 1]

# Report hold-out AUC first, then training AUC, to show the fit/generalisation gap.
print("Roc AUC test: ", roc_auc_score(y_test, m1_scores_test, average='macro'))

print("Roc AUC train: ", roc_auc_score(y_train, m1_scores_train, average='macro'))

Roc AUC test:  0.806826064463
Roc AUC train:  0.858078760109


In [10]:
# Truncated singular value decomposition for dimensionality reduction.
# random_state is pinned because the default TruncatedSVD solver is
# randomized; without it the components (and every score below) change from
# run to run.
svd_mod = TruncatedSVD(n_components=10, random_state=1729)
svd_mod.fit(X_train)

X_train_svd = svd_mod.transform(X_train)
X_test_svd = svd_mod.transform(X_test)

# Project the standardised competition test set as well, mirroring the
# scaling / polynomial / selection cells, so this model can be scored on the
# same representation it was fitted on.
test_svd = svd_mod.transform(test_std)

# Model 2: xgboost classifier on the 10 SVD components.
m2_xgb_svd = xgb.XGBClassifier(n_estimators=110, nthread=-1, max_depth=4, seed=1729)
# The hold-out split is passed as eval_set with the "auc" metric;
# verbose=False keeps the per-round log out of the cell output.
m2_xgb_svd.fit(X_train_svd, y_train, eval_metric="auc", verbose=False,
               eval_set=[(X_test_svd, y_test)])

# Column 1 of predict_proba is used as the score
# (assumes TARGET is a 0/1 label -- confirm).
print("Roc AUC test: ", roc_auc_score(y_test, m2_xgb_svd.predict_proba(X_test_svd)[:, 1],
                                      average='macro'))

print("Roc AUC train: ", roc_auc_score(y_train, m2_xgb_svd.predict_proba(X_train_svd)[:, 1],
                                       average='macro'))

Roc AUC test:  0.802631435195
Roc AUC train:  0.887209460736


In [11]:
# Reload the data from the "clean" CSV files.
# NOTE: this rebinds `train`, `test`, `X_train`, `X_test`, `y_train` and
# `y_test`; the earlier feature matrices survive only under their derived
# names (X_train_poly_sm, X_train_svd, test_std, ...).
train = pd.read_csv('train_clean.csv')
test = pd.read_csv('test_clean.csv')

# Columns used for this model.
# NOTE(review): the name suggests they came from a Boruta feature-selection
# run -- confirm where the list was produced.
boruta_vars = [
    'var38',
    'var15',
    'num_var37_med_ult2',
    'num_var37_0',
    'saldo_var5',
    'saldo_var30',
    'saldo_var37',
    'saldo_var42',
    'num_var22_hace3',
    'num_var22_ult3',
    'num_med_var45_ult3',
    'num_meses_var5_ult3',
    'num_var45_hace2',
    'num_var45_hace3',
    'num_var45_ult3',
    'saldo_medio_var5_hace2',
    'saldo_medio_var5_ult1',
    'saldo_medio_var5_ult3',
    'no0',
]

# 60/40 fit/hold-out split on the selected columns, with a fixed seed.
# NOTE(review): reuses `y` loaded from train.csv, so train_clean.csv must have
# the same rows in the same order -- confirm.
boruta_features = train[boruta_vars]
X_train, X_test, y_train, y_test = train_test_split(
    boruta_features,
    y,
    test_size=0.4,
    random_state=1729,
)

# Model 3: xgboost classifier on the raw (unscaled) selected columns.
m3_xgb = xgb.XGBClassifier(
    n_estimators=110,
    max_depth=4,
    nthread=-1,
    seed=1729,
)
# The hold-out split is passed as eval_set with the "auc" metric;
# verbose=False keeps the per-round log out of the cell output.
m3_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="auc",
    verbose=False,
)

# Column 1 of predict_proba is used as the score
# (assumes TARGET is a 0/1 label -- confirm).
m3_scores_test = m3_xgb.predict_proba(X_test)[:, 1]
m3_scores_train = m3_xgb.predict_proba(X_train)[:, 1]

print("Roc AUC test: ", roc_auc_score(y_test, m3_scores_test, average='macro'))

print("Roc AUC train: ", roc_auc_score(y_train, m3_scores_train, average='macro'))

Roc AUC test:  0.8247663035
Roc AUC train:  0.876585413816


In [None]:
## # Submission
# Score the competition test set with each model, feeding every model the
# same representation it was fitted on:
#   m1_xgb_poly -> standardised -> degree-3 polynomial -> SelectKBest(k=15)
#   m2_xgb_svd  -> standardised -> TruncatedSVD(10 components)
#   m3_xgb      -> raw selected columns of the reloaded (clean) test set
# Passing test_std to model 1 or test_poly to model 2 (as before) gives the
# models a different number of columns than they were trained with.
probs_1 = m1_xgb_poly.predict_proba(test_poly_sm)
probs_2 = m2_xgb_svd.predict_proba(svd_mod.transform(test_std))
probs_3 = m3_xgb.predict_proba(test[boruta_vars])

In [None]:
# Weighted blend of the three models' scores (column 1 of predict_proba).
# Model 3 gets five times the weight of the other two. The sum is divided by
# the total weight (7), not by 5, so the blend stays inside [0, 1]; the
# ranking of rows (and therefore AUC) is the same as with the old divisor.
w_poly, w_svd, w_boruta = 1, 1, 5
prediction = (
    w_poly * probs_1[:, 1]
    + w_svd * probs_2[:, 1]
    + w_boruta * probs_3[:, 1]
) / (w_poly + w_svd + w_boruta)

# Reuse the sample submission file as the template for the output layout.
# NOTE(review): assumes its rows are in the same order as the test files
# scored above -- confirm.
submission = pd.read_csv('sample_submission.csv')
# Bracket assignment always writes a column; attribute assignment would
# silently create a plain attribute if the column were missing.
submission['TARGET'] = prediction
submission.to_csv("Final_submission.csv", index=False)