# Importing Libraries

In [None]:
import numpy as np # support for multi-dimensional arrays and matrices
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
sns.set(style="white", color_codes=True)

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel, VarianceThreshold

# Input data files are available in the "./input/" directory.

from subprocess import check_output
print(check_output(["ls", "."]).decode("utf8"))

# Importing Datas

In [5]:
print('Load data...')
train = pd.read_csv("./train.csv")
train_id = train['ID'].values
target = train['TARGET'].values

test = pd.read_csv("./test.csv")
test_id = test['ID'].values

Load data...


In [6]:
train = train.drop(['ID','TARGET'],axis=1)
test = test.drop(['ID'],axis=1)

In [8]:
# adding zero counts
train["zeroes"] = (train == 0).astype(int).sum(axis=1)
test["zeroes"] = (test == 0).astype(int).sum(axis=1)

# remove constant columns (std = 0)
remove = []
for col in train.columns:
    if train[col].std() == 0:
        remove.append(col)

train.drop(remove, axis=1, inplace=True)
test.drop(remove, axis=1, inplace=True)

print(train.shape, test.shape,target.shape)

(76020, 336) (75818, 336) (76020,)


In [9]:
Cols = train.columns.values.tolist()
clf = GradientBoostingClassifier(random_state = 8001)

print(train.head(3))
print('a')

selector = clf.fit(train, target)
importances = selector.feature_importances_
fs = SelectFromModel(selector, prefit=True)
train = fs.transform(train)
test = fs.transform(test)
print(train.shape, test.shape)

   var3  var15  imp_ent_var16_ult1  imp_op_var39_comer_ult1  \
0     2     23                 0.0                      0.0   
1     2     34                 0.0                      0.0   
2     2     23                 0.0                      0.0   

   imp_op_var39_comer_ult3  imp_op_var40_comer_ult1  imp_op_var40_comer_ult3  \
0                      0.0                      0.0                      0.0   
1                      0.0                      0.0                      0.0   
2                      0.0                      0.0                      0.0   

   imp_op_var40_efect_ult1  imp_op_var40_efect_ult3  imp_op_var40_ult1  \
0                      0.0                      0.0                0.0   
1                      0.0                      0.0                0.0   
2                      0.0                      0.0                0.0   

    ...    saldo_medio_var33_hace2  saldo_medio_var33_hace3  \
0   ...                        0.0                      0.0   
1  

In [10]:
selectedCols = train.shape[1]
sortedCols = [col for importance, col  in sorted(zip(importances, Cols))]
sortedCols = sortedCols[0:selectedCols]
train = pd.DataFrame(train)
test = pd.DataFrame(test)
train.columns = sortedCols
test.columns = sortedCols

print(sortedCols[0:5])

['delta_imp_amort_var18_1y3', 'delta_imp_amort_var34_1y3', 'delta_imp_aport_var13_1y3', 'delta_imp_aport_var17_1y3', 'delta_imp_aport_var33_1y3']


In [11]:
train = train.replace(np.inf, 999999)
train = train.replace(-np.inf, -999999)
train = train.replace(np.nan, -1)
test = test.replace(np.inf, 999999)
test = test.replace(-np.inf, -999999)
test = test.replace(np.nan, -1)

In [12]:
# Second round of gradient boosting
Cols = train.columns.values.tolist()
clf = GradientBoostingClassifier(random_state=1729)
selector = clf.fit(train, target)

print(train)
print('a')
importances = selector.feature_importances_
fs = SelectFromModel(selector, prefit=True)
train = fs.transform(train)
test = fs.transform(test)
print(train.shape, test.shape)

selectedCols = train.shape[1]
sortedCols = [col for importance, col  in sorted(zip(importances, Cols))]
sortedCols = sortedCols[0:selectedCols]

       delta_imp_amort_var18_1y3  delta_imp_amort_var34_1y3  \
0                            2.0                       23.0   
1                            2.0                       34.0   
2                            2.0                       23.0   
3                            2.0                       37.0   
4                            2.0                       39.0   
5                            2.0                       23.0   
6                            2.0                       27.0   
7                            2.0                       26.0   
8                            2.0                       45.0   
9                            2.0                       25.0   
10                           2.0                       42.0   
11                           2.0                       26.0   
12                           2.0                       51.0   
13                           2.0                       43.0   
14                           2.0                       

In [13]:
import xgboost as xgb



In [14]:
from sklearn.cross_validation import KFold

In [15]:
# Create an empty array for prediction
predictedResult = np.zeros(train.shape[0])

# Split dataset into k = 10 consecutive folds
# Each fold is used once as a validation while the k - 1 remaining folds form the training set
kf = KFold(train.shape[0], n_folds=10)

testPred = []

for trainIndex, testIndex in kf:
    trainFold, testFold = train[trainIndex], train[testIndex]
    trainFoldTarget, testFoldTarget = target[trainIndex], target[testIndex]
    
    xgbc = xgb.XGBClassifier(n_estimators = 560, # number of boosted trees
                             learning_rate = 0.0202047, # step size shrinkage used in update to prevent overfitting
                             max_depth = 5, # maximum depth of a tree
                             subsample = 0.6815, # subsample ratio of the training set (Stochastic gradient boosting)
                             colsample_bytree = 0.701) # subsample features
    
    xgbc.fit(trainFold, trainFoldTarget)
    xgbpred =xgbc.predict_proba(testFold)[:,1]
    testPred.append(xgbc.predict_proba(test)[:,1])
    predictedResult[testIndex] = xgbpred
    
    print(testFoldTarget.shape)
    # Print the AUC
    print(roc_auc_score(testFoldTarget, xgbpred))

0.836487426064
0.836798285403
0.8232672329
0.831966713627
0.843268800343
0.839548695888
0.842184643425
0.862213695787
0.847774466934
0.825447708348


In [16]:
print(roc_auc_score(target, predictedResult))
testPred = np.average(np.array(testPred), axis =0)
pd.DataFrame({"ID": test_id, "TARGET": testPred}).to_csv('submission.csv',index=False)

0.838327659232
