In [31]:
# display inline plots
%matplotlib inline

# import libraries for numerical and scientific computing
import numpy as np
import scipy as sp

# import matplotlib for plotting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# import pandas for data wrangling and munging
import pandas as pd

# pandas display configuration: wider console output, more visible columns,
# and HTML rendering of frames inside the notebook
for option_name, option_value in [('display.width', 500),
                                  ('display.max_columns', 100),
                                  ('display.notebook_repr_html', True)]:
    pd.set_option(option_name, option_value)

# import plotting library built on top of matplotlib
import seaborn as sns

# plot styling for every figure rendered after this cell:
# the "whitegrid" style preset and the "poster" context preset
sns.set_style("whitegrid")
sns.set_context("poster")

import warnings
# NOTE(review): no category, module or message is given, so this installs a
# blanket 'ignore' filter for all warnings raised after this point. That keeps
# the output clean but also hides deprecation warnings from the libraries used
# below -- consider narrowing it.
warnings.filterwarnings('ignore')

In [32]:
# read the training and test CSV files; both frames are indexed by Loan_ID
csv_options = dict(index_col='Loan_ID')
loan_train = pd.read_csv('./data/train_u6lujuX.csv', **csv_options)
loan_test = pd.read_csv('./data/test_Y3wMUE5.csv', **csv_options)

In [33]:
# Execute the project's data helper script so the names it defines become
# available in this notebook.
# NOTE(review): presumably this is where get_label_encoded_data and
# get_dummy_variable_data (called in later cells) come from -- confirm in
# scripts/data.py.
%run scripts/data.py

In [34]:
# separate the target column (Loan_Status) from the predictor columns
target_feature = loan_train['Loan_Status']
features_cols = loan_train.columns.drop('Loan_Status')
loan_train_features_df = loan_train.loc[:, features_cols]

In [35]:
# partition the predictor columns by dtype: object-typed columns vs. the rest
object_typed_features = loan_train_features_df.select_dtypes(include=['object'])
other_typed_features = loan_train_features_df.select_dtypes(exclude=['object'])
obj_cols = object_typed_features.columns
non_obj_cols = other_typed_features.columns

In [36]:
# Label-encoded variant of the data. The helper is not defined in this notebook
# (presumably it comes from scripts/data.py); it receives the training
# predictors, the test frame and the object-dtype column names, and its result
# is unpacked as a (train, test) pair.
# NOTE(review): the exact encoding scheme lives in the helper -- confirm there.
train_df_enc, test_df_enc = get_label_encoded_data(loan_train_features_df, loan_test, obj_cols)

In [37]:
# One-hot (dummy-variable) variant of the data. The helper is not defined in
# this notebook (presumably it comes from scripts/data.py); it receives the
# training predictors, the test frame and both column groups (non-object first,
# then object), and its result is unpacked as a (train, test) pair.
# NOTE(review): how the dummy columns are aligned between train and test is
# decided inside the helper -- confirm there.
train_df_hot, test_df_hot = get_dummy_variable_data(loan_train_features_df, loan_test, non_obj_cols, obj_cols)

## Missing Values

In [38]:
def fill_missing_values_enc(train_df, test_df, func):
    """Impute missing LoanAmount, Loan_Amount_Term and Credit_History values.

    The inputs are left untouched; imputed copies are returned.

    For LoanAmount and Loan_Amount_Term the fill value is ``func`` applied to
    the *training* column, and that single value is used for both the training
    and the test copy. Deriving the test fill value from the test frame would
    give the same feature a different meaning at fit time and at predict time.

    Missing Credit_History entries are filled with the constant 1.0 in both
    frames.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing the columns 'LoanAmount', 'Loan_Amount_Term' and
        'Credit_History'.
    func : callable
        Aggregate taking a column (pandas Series) and returning the scalar
        used as fill value, e.g. ``np.mean``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Imputed copies of ``train_df`` and ``test_df``, in that order.
    """
    train_df_cpy = train_df.copy()
    test_df_cpy = test_df.copy()

    # Learn each fill value on the training data only, then apply it to both
    # frames so train and test rows are preprocessed identically.
    for column in ('LoanAmount', 'Loan_Amount_Term'):
        fill_value = func(train_df[column])
        train_df_cpy[column] = train_df_cpy[column].fillna(fill_value)
        test_df_cpy[column] = test_df_cpy[column].fillna(fill_value)

    # Credit_History gets a fixed fill value rather than an aggregate.
    for frame in (train_df_cpy, test_df_cpy):
        frame['Credit_History'] = frame['Credit_History'].fillna(1.0)

    return train_df_cpy, test_df_cpy

In [39]:
# impute the label-encoded frames; np.mean is the aggregate used for the
# LoanAmount and Loan_Amount_Term fill values
train_df_enc_mean, test_df_enc_mean = fill_missing_values_enc(train_df_enc, test_df_enc, np.mean)

In [40]:
# impute the one-hot encoded frames with the same function (despite its `_enc`
# suffix it only touches LoanAmount, Loan_Amount_Term and Credit_History)
train_df_hot_mean, test_df_hot_mean = fill_missing_values_enc(train_df_hot, test_df_hot, np.mean)

## Stacking

In [41]:
# set seed for NumPy's global random number generator
# NOTE(review): the estimators below are created without an explicit
# random_state; presumably they draw from this global generator, in which case
# results are only repeatable when the cells run in the same order from a fresh
# kernel -- confirm, or pass random_state explicitly.
np.random.seed(42)

In [52]:
# 10-fold stratified cross-validation splits over the training target.
# sklearn.cross_validation was removed in scikit-learn 0.20, so prefer
# sklearn.model_selection and fall back to the legacy module on old installs.
try:
    from sklearn.model_selection import StratifiedKFold
except ImportError:
    from sklearn.cross_validation import StratifiedKFold
    skf = StratifiedKFold(y=target_feature, n_folds=10)
else:
    # The modern splitter only produces folds through .split(X, y). Materialise
    # them into a list so `skf` still supports len(skf) and repeated iteration
    # over (train_index, cv_index) pairs, which is how later cells use it.
    # Stratification depends on y only, so a placeholder X of matching length
    # is sufficient.
    skf = list(StratifiedKFold(n_splits=10).split(np.zeros(len(target_feature)), target_feature))

In [78]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
import xgboost as xgb

In [None]:
# Earlier base-learner candidates, disabled and kept for reference only (they are not part of `clfs`):
# (train_df_enc_mean, test_df_enc_mean, RandomForestClassifier(n_estimators=500, criterion='entropy')),
# (train_df_hot_mean, test_df_hot_mean, RandomForestClassifier(n_estimators=500, criterion='entropy')),
# (train_df_enc_mean, test_df_enc_mean, GradientBoostingClassifier(n_estimators=500, learning_rate=0.01)),
# (train_df_hot_mean, test_df_hot_mean, GradientBoostingClassifier(n_estimators=500, learning_rate=0.01)),
        

In [125]:
# Base learners for the stack, as (train frame, test frame, estimator) triples.
# Every entry is a gradient-boosting model on the one-hot, mean-imputed frames;
# each (subsample, min_samples_leaf) setting is listed twice and gets its own
# estimator instance.
gbm_settings = ((.8, 10), (.8, 10), (.7, 12), (.7, 12), (.6, 15), (.6, 15))
clfs = [(train_df_hot_mean, test_df_hot_mean,
         GradientBoostingClassifier(n_estimators=500, learning_rate=0.01,
                                    subsample=subsample_rate, min_samples_leaf=leaf_size))
        for subsample_rate, leaf_size in gbm_settings]

In [126]:
# one row per sample and one column per base classifier, for the training and
# test sets respectively; both start out zero-filled
n_base_models = len(clfs)
blend_train = np.zeros((len(train_df_enc_mean), n_base_models))
blend_test = np.zeros((len(test_df_enc_mean), n_base_models))

In [127]:
# score table: one row per classifier, one column per cross-validation fold
n_classifiers, n_folds = len(clfs), len(skf)
cv_results = np.zeros((n_classifiers, n_folds))

In [128]:
from sklearn.metrics import accuracy_score

In [129]:
# encode the target labels as integers ('Y' -> 1, 'N' -> 0) in a NumPy array
target_map = {'Y': 1, 'N': 0}
encoded_target = target_feature.map(target_map)
target_feature_bin = np.array(encoded_target)

In [None]:
# Level-0 stacking pass. For every base classifier:
#   * blend_train[:, j] receives its out-of-fold predictions on the training rows
#   * blend_test[:, j] receives its test-set predictions averaged over the folds
#   * cv_results[j, i] receives its accuracy on fold i

# The binary target is identical for every classifier, so bind it once before
# the loop. Y_dev stays a module-level name (cells outside this loop may read
# it) and is now defined even when `clfs` is empty.
Y_dev = target_feature_bin

for j, data_clf in enumerate(clfs):
    # each entry is a (train frame, test frame, estimator) triple
    train_frame, test_frame, clf = data_clf
    X_dev = train_frame.values
    X_test = test_frame.values

    print('\nTraining classifier [%s]: %s' % (j, clf))
    # test-set predictions of this classifier, one column per fold
    blend_test_j = np.zeros((X_test.shape[0], len(skf)))

    for i, (train_index, cv_index) in enumerate(skf):
        X_train, Y_train = X_dev[train_index], Y_dev[train_index]
        X_cv, Y_cv = X_dev[cv_index], Y_dev[cv_index]

        # estimators exposing fit_cv are additionally handed the held-out fold
        if hasattr(clf, 'fit_cv'):
            clf.fit_cv(X_train, Y_train, [(X_cv, Y_cv)])
        else:
            clf.fit(X_train, Y_train)

        # out-of-fold predictions become this classifier's meta-feature column
        one_result = clf.predict(X_cv)
        blend_train[cv_index, j] = one_result
        cv_score = accuracy_score(Y_cv, one_result)
        cv_results[j, i] = cv_score
        print('Fold [%s] Accuracy Score = %0.5f' % (i, cv_score))
        blend_test_j[:, i] = clf.predict(X_test)

    # average the per-fold test predictions into a single column
    blend_test[:, j] = blend_test_j.mean(axis=1)
    print('Clf_%d Mean Accuracy Score = %0.5f (%0.5f)' % (j, cv_results[j].mean(), cv_results[j].std()))


Training classifier [0]: GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=10, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              random_state=None, subsample=0.8, verbose=0,
              warm_start=False)
Fold [0] Accuracy Score = 0.79365
Fold [1] Accuracy Score = 0.82540
Fold [2] Accuracy Score = 0.73770
Fold [3] Accuracy Score = 0.75410
Fold [4] Accuracy Score = 0.78689
Fold [5] Accuracy Score = 0.78689

In [112]:
# mean accuracy over all classifiers and folds; written as a %-formatted call so
# it prints the same text under Python 2 and Python 3
print("CV-Results %s" % cv_results.mean())

CV-Results 0.802923063579


In [113]:
from sklearn.linear_model import LogisticRegressionCV

In [114]:
# candidate C values handed to LogisticRegressionCV: powers of ten from 1e-4 to 1e3
Cs = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]

In [115]:
# Level-1 (meta) model: a cross-validated logistic regression fitted on the base
# classifiers' training-set prediction columns.
# Cs is passed by keyword because current scikit-learn releases reject
# positional constructor arguments; the keyword form works on old releases too.
bclf = LogisticRegressionCV(Cs=Cs)
# Fit against target_feature_bin directly rather than Y_dev, a variable bound
# inside the stacking loop cell, so this cell does not depend on that loop's
# leftover state.
bclf.fit(blend_train, target_feature_bin)

# final predictions for the test rows, still in the encoded 0/1 form
Y_test_predict = bclf.predict(blend_test)

In [116]:
# Execute the project's helper script so the names it defines become available
# in this notebook.
# NOTE(review): presumably this is where inverse_mapping and create_submissions
# (used in the following cells) come from -- confirm in scripts/helper.py.
%run scripts/helper.py

In [117]:
# translate every encoded prediction back through inverse_mapping
Y_test_predict_labels = list(map(inverse_mapping, Y_test_predict))

In [118]:
# Hand the test Loan_IDs (the index of loan_test), the predicted labels and an
# output file name to the submission helper.
# NOTE(review): presumably this writes 'stacking_4.csv' to disk; the output
# directory and file layout are decided by create_submissions -- confirm in
# scripts/helper.py.
create_submissions(loan_test.index.values, Y_test_predict_labels, 'stacking_4.csv')