In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
%matplotlib inline

In [55]:
df = pd.read_csv('train_data.csv')

In [56]:
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
       'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [57]:
df.shape

(23999, 25)

In [58]:
df = df.replace({'EDUCATION': 0}, 5)
df = df.replace({'EDUCATION': 6}, 5)
df = df.replace({'MARRIAGE': 0}, 3)

In [59]:
X = df.drop(['ID', 'default payment next month'], axis=1)
y = df['default payment next month']

In [60]:
def encode_and_concat_feature_train(X_train, feature_name):
    """
    Helper function for transforming training data.  It takes in the full X dataframe and
    feature name, makes a one-hot encoder, and returns the encoder as well as the dataframe
    with that feature transformed into multiple columns of 1s and 0s
    """
    # make a one-hot encoder and fit it to the training data
    ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
    single_feature_df = X_train[[feature_name]]
    ohe.fit(single_feature_df)
    
    # call helper function that actually encodes the feature and concats it
    X_train = encode_and_concat_feature(X_train, feature_name, ohe)
    
    return ohe, X_train

In [61]:
def encode_and_concat_feature(X, feature_name, ohe):
    """
    Helper function for transforming a feature into multiple columns of 1s and 0s. Used
    in both training and testing steps.  Takes in the full X dataframe, feature name, 
    and encoder, and returns the dataframe with that feature transformed into multiple
    columns of 1s and 0s
    """
    # create new one-hot encoded df based on the feature
    single_feature_df = X[[feature_name]]
    feature_array = ohe.transform(single_feature_df).toarray()
    ohe_df = pd.DataFrame(feature_array, columns=ohe.categories_[0], index=X.index)
    
    # drop the old feature from X and concat the new one-hot encoded df
    X = X.drop(feature_name, axis=1)
    X = pd.concat([X, ohe_df], axis=1)
    
    return X

In [62]:
categorical_feature_names = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0','PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6' ]

encoders = {}

for categorical_feature in categorical_feature_names:
    ohe, X = encode_and_concat_feature_train(X, categorical_feature)
    encoders[categorical_feature] = ohe

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [64]:
ss = StandardScaler()

X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [73]:
estimators = [  
              ('rf', RandomForestClassifier(n_estimators = 100, class_weight='balanced')),
              ('log', LogisticRegression(class_weight='balanced')),
              ('grad', GradientBoostingClassifier())]
stack = StackingClassifier(estimators = estimators, final_estimator = LogisticRegression(class_weight='balanced'), cv = 5)
stack.fit(X_train, y_train);
stack.score(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.8998833268514917

In [74]:
stack.score(X_test, y_test)

0.7851666666666667

In [75]:
def metrics(y_true, y_pred):
    print('Accuracy: ' + str(accuracy_score(y_true, y_pred)))
    print('Precision: ' + str(precision_score(y_true, y_pred)))
    print('Recall: ' + str(recall_score(y_true, y_pred)))
    print('F1: ' + str(f1_score(y_true, y_pred)))
    print('\n')

In [76]:
metrics(y_test, stack.predict(X_test))

Accuracy: 0.7851666666666667
Precision: 0.5161699429296132
Recall: 0.6074626865671642
F1: 0.5581076448405896




In [72]:
print(classification_report(y_test, stack.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      4660
           1       0.70      0.37      0.48      1340

    accuracy                           0.82      6000
   macro avg       0.77      0.66      0.69      6000
weighted avg       0.81      0.82      0.80      6000



In [29]:
for i in stack.estimators_:
    print(i)
    metrics(y_test, i.predict(X_test))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Accuracy: 0.8215
Precision: 0.6835106382978723
Recall: 0.38158871566443947
F1: 0.4897570271557885


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Accuracy: 0.822166666666666

In [30]:
from sklearn.utils import resample

In [34]:
X_train

array([[-0.9036827 , -0.92385231, -0.03691078, ..., -0.02472891,
        -0.03729474, -0.00745397],
       [ 2.24579984,  0.80648337,  3.68530682, ..., -0.02472891,
        -0.03729474, -0.00745397],
       [-1.13413264,  0.26575347, -0.56634565, ..., -0.02472891,
        -0.03729474, -0.00745397],
       ...,
       [-0.36596617, -0.16683045, -0.69088831, ..., -0.02472891,
        -0.03729474, -0.00745397],
       [-0.67323276, -0.05868447, -0.31878971, ..., -0.02472891,
        -0.03729474, -0.00745397],
       [-0.9036827 , -0.92385231, -0.09084498, ..., -0.02472891,
        -0.03729474, -0.00745397]])

In [47]:
# concatenate our training data back together
X_df = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
not_dfd = X_df[X_df['default payment next month']==0]
dfd = X_df[X_df['default payment next month']==1]

# upsample minority
dfd_upsampled = resample(dfd,
                          replace=True, # sample with replacement
                          n_samples=len(not_dfd), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_dfd, dfd_upsampled])


In [53]:
upsampled['default payment next month'].value_counts()

1    14030
0    14030
Name: default payment next month, dtype: int64

In [43]:
X = upsampled.drop(['default payment next month'], axis=1)
y = upsampled['default payment next month']

In [44]:
stack.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

StackingClassifier(cv=5,
                   estimators=[('rf',
                                RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=None,
                                                       max_features='auto',
                                                       max_leaf_nodes=None,
                                                       max_samples=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                        

In [46]:
metrics(y_test, stack.predict(X_test))

Accuracy: 0.8028333333333333
Precision: 0.6733333333333333
Recall: 0.22628827483196415
F1: 0.33873672442705427


