In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import gc
gc.enable()
import xgboost as xgb
import utilities

In [3]:
train = utilities.read_csv('../../datatmp/data/trainjoincollnoisy.csv')
test = utilities.read_csv('../../datatmp/data/testjoincollnoisy.csv')

In [4]:
target = train['TARGET'].astype(int).copy()
train.drop('TARGET',axis=1, inplace=True)
testIds = test['SK_ID_CURR'].astype(int).copy()
train.drop('SK_ID_CURR', axis=1, inplace=True)
test.drop('SK_ID_CURR', axis=1, inplace=True)

In [5]:
# Due to the last version of lgb, we got an error related to special chars in column names. Replacing the blanks in the following fixed it
train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train.columns]
test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test.columns]

In [6]:
impFeats = []
importances = pd.read_csv('../../datatmp/data/importances.csv')

for feat in importances:
    if importances.loc[0][feat] > 0:
        impFeats.append(feat)

In [7]:
# Final train test
train = train[impFeats]
test = test[impFeats]

In [8]:
# K-Fold xgboost train and predict
from sklearn.model_selection import KFold

def trainNpredXgb(train, test, y, nFolds):

    testPreds = np.zeros(test.shape[0])

    # Unbalanced data: use cv
    # Set seed to make the experiment reproducible
    folds = KFold(n_splits=nFolds, shuffle=True, random_state=42)

    for fold, (trainIds, valIds) in enumerate(folds.split(train, y)):

        trainX, trainY = train.iloc[trainIds], y.iloc[trainIds]
        valX, valY = train.iloc[valIds], y.iloc[valIds]

        # Params taken from https://www.kaggle.com/tunguz/xgb-simple-features#L277
        clf = xgb.XGBClassifier(learning_rate =0.01, n_estimators=10000, max_depth=4, min_child_weight=5, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', 
            nthread=4, scale_pos_weight=2.5, seed=27, reg_lambda = 1.2)

        
        clf.fit(trainX, trainY, eval_set=[(valX, valY)], eval_metric= 'auc', verbose= 100, early_stopping_rounds=200)
        

        # Average best iteration preds for test 
        testPreds += clf.predict_proba(test)[:, 1] / folds.n_splits 

        # Free space
        del trainX, trainY, valX, valY
        gc.collect()

    return testPreds

In [9]:
%time predictions = trainNpredXgb(train, test, target, 2)

[0]	validation_0-auc:0.698847
Will train until validation_0-auc hasn't improved in 200 rounds.
[100]	validation_0-auc:0.732321
[200]	validation_0-auc:0.742045
[300]	validation_0-auc:0.752157
[400]	validation_0-auc:0.760184
[500]	validation_0-auc:0.765545
[600]	validation_0-auc:0.76919
[700]	validation_0-auc:0.771902
[800]	validation_0-auc:0.774142
[900]	validation_0-auc:0.775943
[1000]	validation_0-auc:0.777439
[1100]	validation_0-auc:0.778572
[1200]	validation_0-auc:0.779606
[1300]	validation_0-auc:0.780447
[1400]	validation_0-auc:0.781295
[1500]	validation_0-auc:0.781946
[1600]	validation_0-auc:0.782517
[1700]	validation_0-auc:0.783078
[1800]	validation_0-auc:0.783607
[1900]	validation_0-auc:0.783993
[2000]	validation_0-auc:0.784334
[2100]	validation_0-auc:0.784683
[2200]	validation_0-auc:0.784933
[2300]	validation_0-auc:0.78517
[2400]	validation_0-auc:0.785417
[2500]	validation_0-auc:0.78569
[2600]	validation_0-auc:0.785979
[2700]	validation_0-auc:0.786116
[2800]	validation_0-auc:0.

In [10]:
# Create submission csv 
subdf = pd.DataFrame({'SK_ID_CURR': testIds, 'TARGET': predictions})

In [11]:
subdf.to_csv('../../datatmp/data/submissionxgb.csv', index=False)