In [None]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline

## Data Analysis and Cleaning

In [None]:
# Load the raw dataset: the first spreadsheet row is skipped (skiprows=[0])
# and the "ID" column is used as the index.
df = pd.read_excel('default_of_credit_card_clients.xls', index_col="ID", skiprows=[0])

# Quick sanity check of the loaded frame's dimensions (rows, columns)
print(df.shape)

#print(df.head())
#print(df.describe())
#print(df.info())

# One-hot encoding for categorical features: each resulting binary feature takes the value 1 or 0
# - scikit-learn would otherwise treat the integer category codes as numerical features
# - string labels can't be used instead because scikit-learn only accepts numbers

# One-hot encode 'SEX', 'EDUCATION' and 'MARRIAGE', then drop each source column.
# Codes that get no indicator act as the base values:
# female, other_education, other_marital_status
one_hot_spec = [
    ('SEX', [('male', 1)]),
    ('EDUCATION', [('grad_school', 1), ('university', 2), ('high_school', 3)]),
    ('MARRIAGE', [('married', 1), ('single', 2)]),
]
for source_col, indicators in one_hot_spec:
    for indicator_name, code in indicators:
        # 1 where the row carries this category code, 0 otherwise
        df[indicator_name] = (df[source_col] == code).astype('int')
    df.drop(source_col, axis=1, inplace=True)

# From the documentation, we can infer that a PAY_n value <= 0 means "not delayed"
pay_n_features = ['PAY_{}'.format(n) for n in (0, 2, 3, 4, 5, 6)]

# Show the raw distribution of every PAY_n column before it is modified
for col in pay_n_features:
    hist = df[col].hist(bins=10)
    print("Plotting for column {}".format(col))
    plt.show()

# Collapse every non-positive PAY_n value to 0 (existing zeros are simply rewritten as 0)
for pay_n in pay_n_features:
    not_delayed = df[pay_n] <= 0
    df.loc[not_delayed, pay_n] = 0

# Shorter name for the target column
df.rename(columns={'default payment next month': 'default'}, inplace=True)

# Show every column when displaying a frame, then peek at 5 random rows
pd.set_option('display.max_columns', None)
display(df.sample(5))

## Building Machine Learning Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, precision_recall_curve
from sklearn.preprocessing import RobustScaler

In [None]:
# Feature scaling to get more accurate representation and better learning performance
'''
Most machine learning algorithms take into account only the magnitude of the measurements, not the units of those measurements.
The feature with a very high magnitude (number) may affect the prediction a lot more than an equally important feature.
e.g. the AGE (within certain fixed range) and the PAY_AMTn (monetary) features have very different ranges of values

RobustScaler:
The Robust Scaler uses statistics that are robust to outliers.
This usage of interquartiles means that they focus on the parts where the bulk of the data is.
This makes them very suitable for working with outliers.
Notice that after Robust scaling, the distributions are brought into the same scale and overlap, but the outliers remain outside of bulk of the new distributions.
'''
# plot the distribution of all data
for col in df.columns:
    hist = df[col].hist(bins=10)
    print("Plotting for column {}".format(col))
    plt.show()

# Features are every column except the target
x = df.drop('default', axis=1)
y = df['default']

# Split BEFORE scaling so that no statistic of the test rows leaks into preprocessing.
# stratify=y keeps the proportion of each class the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=123, stratify=y)

# Fit the scaler on the training part only, then apply the same transformation to the
# test part, so that all features are rescaled to a comparable range.
rb_scaler = RobustScaler()
x_train = rb_scaler.fit_transform(x_train)
x_test = rb_scaler.transform(x_test)

In [None]:
def c_matrix(CM, labels=['pay', 'default']):
    """Wrap a confusion matrix in a labelled DataFrame with totals.

    CM is the square matrix of counts; `labels` names both its rows and its
    columns. The rows axis is named 'TRUE' and the columns axis 'PREDICTION'.
    A 'Total' row (column sums) is appended first, then a 'Total' column
    (row sums, including the new 'Total' row).
    """
    table = pd.DataFrame(CM, index=labels, columns=labels)
    table.index.name, table.columns.name = 'TRUE', 'PREDICTION'
    # Column sums go into a new bottom row ...
    table.loc['Total'] = table.sum(axis=0)
    # ... and row sums (grand total included) into a new right-hand column
    table['Total'] = table.sum(axis=1)
    return table

## Evaluating Model Performance

In [None]:
# Empty results table: one row per evaluation metric, one column per model.
# Each model's cell fills in its own column after it has been evaluated.
metric_names = ['accuracy', 'precision', 'recall']
model_names = ['NULL', 'LogisticReg', 'DecisionTree', 'NaiveBayes', 'NeuralNet']
metrics = pd.DataFrame(columns=model_names, index=metric_names)

### In this application

1. Accuracy: Overall, how often does the model correctly classify defaulters and non-defaulters?
2. Precision: When the model predicts a default, how often is it correct?
3. Recall: What proportion of actual defaulters does the model correctly identify?

### Which metric to use?
1. False positive: A person who will pay predicted as defaulter
2. False negative: A person who will default predicted as payer

#### False negatives are worse => look for better recall

## The Null model: always predict the most common category

In [None]:
# Baseline that every real model has to beat: always answer with the most
# common category of the training labels (which is 'pay').
majority_class = y_train.value_counts().idxmax()
y_predicted = np.full(y_test.size, majority_class)

# Record the three evaluation metrics in the 'NULL' column
for metric_name, scorer in (('accuracy', accuracy_score),
                            ('precision', precision_score),
                            ('recall', recall_score)):
    metrics.loc[metric_name, 'NULL'] = scorer(y_true=y_test, y_pred=y_predicted)

# Confusion matrix with totals (displayed as the cell output)
CM = confusion_matrix(y_true=y_test, y_pred=y_predicted)
c_matrix(CM)

## <font color=red>1. Logistic Regression</font>

In [None]:
# import the model class
from sklearn.linear_model import LogisticRegression

# Build the classifier (all CPU cores, fixed seed) and fit it on the training data;
# fit() returns the estimator itself, so the two steps can be chained.
log_reg = LogisticRegression(random_state=15, n_jobs=-1).fit(x_train, y_train)

# Predict the test set and record the three metrics in the 'LogisticReg' column
y_predicted = log_reg.predict(x_test)
for metric_name, scorer in (('accuracy', accuracy_score),
                            ('precision', precision_score),
                            ('recall', recall_score)):
    metrics.loc[metric_name, 'LogisticReg'] = scorer(y_true=y_test, y_pred=y_predicted)

# Confusion matrix with totals (displayed as the cell output)
CM = confusion_matrix(y_true=y_test, y_pred=y_predicted)
c_matrix(CM)

## <font color=red>2. Decision Tree Classifier</font>

In [None]:
# import the model class
from sklearn.tree import DecisionTreeClassifier

# Build and fit the tree.
#   min_samples_split => minimum number of samples required to split an internal node
#   min_samples_leaf  => minimum number of samples required to be at a leaf node
dec_tree = DecisionTreeClassifier(
    min_samples_split=30,
    min_samples_leaf=10,
    random_state=10,
).fit(x_train, y_train)

# Predict the test set and record the three metrics in the 'DecisionTree' column
y_predicted = dec_tree.predict(x_test)
for metric_name, scorer in (('accuracy', accuracy_score),
                            ('precision', precision_score),
                            ('recall', recall_score)):
    metrics.loc[metric_name, 'DecisionTree'] = scorer(y_true=y_test, y_pred=y_predicted)

# Confusion matrix with totals (displayed as the cell output)
CM = confusion_matrix(y_true=y_test, y_pred=y_predicted)
c_matrix(CM)

## <font color=red>3. Naive Bayes Classifier</font>

In [None]:
# import the model class
from sklearn.naive_bayes import GaussianNB# for features with continuous values

# Build the Gaussian naive Bayes classifier and fit it on the training data
nb_classifier = GaussianNB().fit(x_train, y_train)

# Predict the test set and record the three metrics in the 'NaiveBayes' column
y_predicted = nb_classifier.predict(x_test)
for metric_name, scorer in (('accuracy', accuracy_score),
                            ('precision', precision_score),
                            ('recall', recall_score)):
    metrics.loc[metric_name, 'NaiveBayes'] = scorer(y_true=y_test, y_pred=y_predicted)

# Confusion matrix with totals (displayed as the cell output)
CM = confusion_matrix(y_true=y_test, y_pred=y_predicted)
c_matrix(CM)

## <font color=red>4. Feed Forward Deep Neural Networks</font>

### Hyperparameter Tuning for Sequential Model (Using GridSearchCV)
To use a Keras model with scikit-learn, we need to wrap it in the KerasClassifier or KerasRegressor class. Both classes accept a function that creates and returns a Keras model.
1. Tuning batch size and epochs

In [None]:
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.constraints import unit_norm
from keras.wrappers.scikit_learn import KerasClassifier

def cc_default_classifier():
    """Build and compile the feed-forward network used for the batch-size/epochs search.

    The input width is read from the global `x_train`. Four ReLU hidden layers
    (64, 32, 32, 16 units), each with a unit-norm kernel constraint and followed
    by 50% dropout, feed a single sigmoid output unit. The model is compiled
    with the 'adam' optimizer and binary cross-entropy loss, tracking accuracy.
    """
    n_features = x_train.shape[1]

    # Weight constraints provide an approach to reduce overfitting on the
    # training data and improve performance on new data.
    net = Sequential()
    for position, units in enumerate((64, 32, 32, 16)):
        # Only the very first layer declares the input shape
        extra = {'input_shape': (n_features,)} if position == 0 else {}
        net.add(Dense(units, activation='relu', kernel_constraint=unit_norm(), **extra))
        net.add(Dropout(0.5))
    net.add(Dense(1, activation='sigmoid'))

    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

# Wrap the model factory so scikit-learn can treat it as an estimator
model = KerasClassifier(build_fn=cc_default_classifier)

# Candidate values: every (batch_size, epochs) combination is tried
batch_sizes = [24, 32]
epochs = [30, 50]
params = dict(batch_size=batch_sizes, epochs=epochs)

# 3-fold cross-validated grid search on the training data
clf = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=2)
clf.fit(np.array(x_train), np.array(y_train))

In [None]:
# Best cross-validated score together with the parameters that produced it
print("Best mean test score and best parameters:")
print(clf.best_score_, clf.best_params_)
print()

# One line per candidate: mean test score followed by its parameter set
print("List of Mean test scores and respective parameters:")
means = clf.cv_results_['mean_test_score']
parameters = clf.cv_results_['params']
for position, mean in enumerate(means):
    print(mean, parameters[position])

2. Tuning optimizer

In [None]:
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.constraints import unit_norm
from keras.layers.core import Dense, Activation, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

def cc_default_classifier(optimizer):
    """Build and compile the feed-forward network with a configurable optimizer.

    `optimizer` is passed straight to `compile`. The input width is read from
    the global `x_train`. Four ReLU hidden layers (64, 32, 32, 16 units), each
    with a unit-norm kernel constraint and followed by 50% dropout, feed a
    single sigmoid output unit; the loss is binary cross-entropy and accuracy
    is tracked.
    """
    n_features = x_train.shape[1]

    # Weight constraints provide an approach to reduce overfitting on the
    # training data and improve performance on new data.
    net = Sequential()
    for position, units in enumerate((64, 32, 32, 16)):
        # Only the very first layer declares the input shape
        extra = {'input_shape': (n_features,)} if position == 0 else {}
        net.add(Dense(units, activation='relu', kernel_constraint=unit_norm(), **extra))
        net.add(Dropout(0.5))
    net.add(Dense(1, activation='sigmoid'))

    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

# Wrap the model factory with a fixed batch size and epoch count
# (only the optimizer is searched in this step)
model = KerasClassifier(build_fn=cc_default_classifier, batch_size=24, epochs=50)

# Candidate optimizers, forwarded to cc_default_classifier(optimizer=...)
params = dict(optimizer=['SGD', 'Adagrad', 'Adadelta', 'Adam'])

# 3-fold cross-validated grid search on the training data
clf = GridSearchCV(estimator=model, param_grid=params, cv=3, verbose=2)
clf.fit(np.array(x_train), np.array(y_train))

In [None]:
# Best cross-validated score together with the parameters that produced it
print("Best mean test score and best parameters:")
print(clf.best_score_, clf.best_params_)
print()

# One line per candidate: mean test score followed by its parameter set
print("List of Mean test scores and respective parameters:")
means = clf.cv_results_['mean_test_score']
parameters = clf.cv_results_['params']
for position, mean in enumerate(means):
    print(mean, parameters[position])

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.constraints import unit_norm
from keras.callbacks import Callback

input_dim = x_train.shape[1]

# Final network: four ReLU hidden layers (64, 32, 32, 16 units), each followed by 50% dropout.
# Weight constraints provide an approach to reduce overfitting on the training data
# and improve performance on new data.
model = Sequential()
for layer_index, n_units in enumerate((64, 32, 32, 16)):
    # Only the very first layer declares the input shape
    first_layer_kwargs = {'input_shape': (input_dim,)} if layer_index == 0 else {}
    model.add(Dense(n_units, activation='relu', kernel_constraint=unit_norm(), **first_layer_kwargs))
    model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='Adadelta', metrics=['accuracy'])

class BatchLogger(Callback):
    """Callback that keeps a history of every metric listed in self.params['metrics'].

    Values are appended in on_epoch_end, so each stored list grows by at most
    one entry per call to that hook.
    """

    def on_train_begin(self, epoch, logs={}):
        # Start with one empty history list per tracked metric name.
        # Neither argument is used here.
        self.log_values = {name: [] for name in self.params['metrics']}

    def on_epoch_end(self, batch, logs={}):
        # Store only the tracked metrics that are actually present in `logs`
        reported = (name for name in self.params['metrics'] if name in logs)
        for name in reported:
            self.log_values[name].append(logs[name])

    def get_values(self, metric_name, window):
        """Return the rolling mean (window size `window`) of one metric's history as a Series."""
        history_series = pd.Series(self.log_values[metric_name])
        return history_series.rolling(window, center=False).mean()

bl = BatchLogger()

# Train with 20% of the training rows held out for validation;
# the logger callback records the metric values along the way.
history = model.fit(
    np.array(x_train), np.array(y_train),
    batch_size=24,
    epochs=50,
    verbose=1,
    validation_split=0.2,
    callbacks=[bl],
)

# evaluate the model: evaluate() returns (loss, accuracy); only accuracy is reported
_, train_acc = model.evaluate(x_train, y_train, verbose=0)
_, test_acc = model.evaluate(x_test, y_test, verbose=0)
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

# plot history: training accuracy vs. accuracy on the held-out validation split
for history_key, curve_label in (('acc', 'train'), ('val_acc', 'test')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
import itertools
from sklearn.metrics import roc_curve, auc, roc_auc_score, log_loss, accuracy_score, confusion_matrix

def plot_cm(ax, y_true, y_pred, classes, title, th=0.5, cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on `ax`.

    `y_pred` holds scores that are turned into 0/1 labels with the strict
    threshold `y_pred > th`. `classes` supplies the tick labels for both axes,
    `title` the axes title and `cmap` the colour map. Every cell is annotated
    with its count.
    """
    predicted_labels = (y_pred > th).astype(int)
    matrix = confusion_matrix(y_true, predicted_labels)

    ax.imshow(matrix, interpolation='nearest', cmap=cmap)
    ax.set_title(title)

    # Same tick positions and labels on both axes
    positions = np.arange(len(classes))
    ax.set_xticks(positions)
    ax.set_yticks(positions)
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)

    # White text on cells above half of the largest count, black text otherwise
    midpoint = matrix.max() / 2.
    n_rows, n_cols = matrix.shape
    for row in range(n_rows):
        for col in range(n_cols):
            count = matrix[row, col]
            ax.text(col, row, count,
                    horizontalalignment="center",
                    color="white" if count > midpoint else "black")
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

def plot_auc(ax, y_train, y_train_pred, y_test, y_test_pred, th=0.5):
    """Plot the train and test ROC curves on `ax`, labelled with accuracy and AUC.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    y_train, y_test : true binary labels of the two splits.
    y_train_pred, y_test_pred : predicted scores for the two splits; they are
        used as-is for the ROC curves and thresholded with `> th` for accuracy.
    th : decision threshold used only for the accuracy figures.

    The legend shows, for each split, the accuracy at `th` and the area under
    the ROC curve. Nothing is returned.
    """
    y_train_pred_labels = (y_train_pred > th).astype(int)
    y_test_pred_labels = (y_test_pred > th).astype(int)

    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred)
    roc_auc_train = auc(fpr_train, tpr_train)
    acc_train = accuracy_score(y_train, y_train_pred_labels)

    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred)
    roc_auc_test = auc(fpr_test, tpr_test)
    acc_test = accuracy_score(y_test, y_test_pred_labels)

    # Legend texts carry the summary numbers so the figure can be read on its own
    train_text = 'train acc = {:.3f}, auc = {:.2f}'.format(acc_train, roc_auc_train)
    test_text = 'test acc = {:.3f}, auc = {:.2f}'.format(acc_test, roc_auc_test)

    ax.plot(fpr_train, tpr_train, label=train_text)
    ax.plot(fpr_test, tpr_test, label=test_text)

    # Diagonal reference line (unlabelled, so it stays out of the legend)
    ax.plot([0, 1], [0, 1], 'k--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC curve')
    ax.legend(loc='lower right')

In [None]:
# evaluate() returns the loss first, then the compiled metric (accuracy)
score = model.evaluate(np.array(x_test), np.array(y_test), verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test log loss:', test_loss)
print('Test accuracy:', test_accuracy)

In [None]:
# Loss and accuracy curves recorded by the BatchLogger callback.
# The logger appends one value per metric in on_epoch_end, so the x-axis counts epochs,
# and the 'val_*' series come from the validation split held out during fit().
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

loss_ax.set_title('loss, per epoch')
loss_ax.plot(bl.get_values('loss', 1), 'b-', label='train')
loss_ax.plot(bl.get_values('val_loss', 1), 'r-', label='validation')
loss_ax.set_xlabel('epoch')
loss_ax.legend()

acc_ax.set_title('accuracy, per epoch')
acc_ax.plot(bl.get_values('acc', 1), 'b-', label='train')
acc_ax.plot(bl.get_values('val_acc', 1), 'r-', label='validation')
acc_ax.set_xlabel('epoch')
acc_ax.legend()

plt.show()

In [None]:
# Predicted scores for both splits (first and only output column)
y_train_pred = model.predict_on_batch(np.array(x_train))[:, 0]
y_test_pred = model.predict_on_batch(np.array(x_test))[:, 0]

# Three panels side by side: train confusion matrix, test confusion matrix, ROC curves
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

for axis, truth, scores, split_name in ((ax[0], y_train, y_train_pred, 'TRAIN'),
                                        (ax[1], y_test, y_test_pred, 'TEST')):
    plot_cm(axis, truth, scores, [0, 1], 'Confusion matrix ({})'.format(split_name))

plot_auc(ax[2], y_train, y_train_pred, y_test, y_test_pred)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

'''
# predict probabilities for test set
y_pred_probs = model.predict(x_test, verbose=0)
# predict crisp classes for test set
y_pred_classes = model.predict_classes(x_test, verbose=0)

# reduce to 1d array
y_pred_probs = y_pred_probs[:, 0]
y_pred_classes = y_pred_classes[:, 0]


# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred_classes)
print('F1 score: %f' % f1)


# kappa
kappa = cohen_kappa_score(y_test, y_pred_classes)
print('Cohens kappa: %f' % kappa)
# ROC AUC
auc = roc_auc_score(y_test, y_pred_probs)
print('ROC AUC: %f' % auc)
# confusion matrix
matrix = confusion_matrix(y_test, y_pred_classes)
print(matrix)
'''
# Probabilities and crisp class labels for the test set, each reduced to a 1d array
y_pred_probs = model.predict(x_test, verbose=0)[:, 0]
y_pred_classes = model.predict_classes(x_test, verbose=0)[:, 0]

# Record the three metrics in the 'NeuralNet' column
for metric_name, scorer in (('accuracy', accuracy_score),
                            ('precision', precision_score),
                            ('recall', recall_score)):
    metrics.loc[metric_name, 'NeuralNet'] = scorer(y_test, y_pred_classes)

# Confusion matrix with totals (displayed as the cell output)
CM = confusion_matrix(y_test, y_pred_classes)
c_matrix(CM)

## Metrics Analysis and Visualization

In [None]:
# Show every metric as a percentage
metrics * 100

In [None]:
# Horizontal bar chart: one group per metric, one bar per model
fig, ax = plt.subplots(figsize=(8, 5))
metrics.plot.barh(ax=ax)
ax.grid()

In [None]:
# Precision and recall can be traded off by moving the classification threshold.
# predict_proba returns one probability column per target value (0 and 1 here);
# column index 1 is used as the score for the 'default' outcome.
nb_default_proba = nb_classifier.predict_proba(x_test)[:, 1]
precision_nb, recall_nb, thresholds_nb = precision_recall_curve(y_true=y_test, probas_pred=nb_default_proba)

lr_default_proba = log_reg.predict_proba(x_test)[:, 1]
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_true=y_test, probas_pred=lr_default_proba)

In [None]:
# Precision (x) against recall (y) for both classifiers, with a reference line at recall = 0.5
fig, ax = plt.subplots(figsize=(8, 5))
for curve_label, curve_precision, curve_recall in (('NaiveBayes', precision_nb, recall_nb),
                                                   ('LogisticReg', precision_lr, recall_lr)):
    ax.plot(curve_precision, curve_recall, label=curve_label)
ax.set(xlabel='Precision', ylabel='Recall', title='Precision-Recall Curve')
ax.hlines(y=0.5, xmin=0, xmax=1, color='red')
ax.legend()
ax.grid()

# Logistic regression is better than Naive Bayes

## Confusion Matrix for modified Logistic Regression Classifier

In [None]:
fig, ax = plt.subplots(figsize=(8,5))
print(thresholds_lr)
print(precision_lr)
# precision_recall_curve returns one more precision/recall value than thresholds:
# element i belongs to thresholds[i] and the final element (precision 1, recall 0)
# has no threshold. Drop the LAST element so the curves line up with the thresholds.
ax.plot(thresholds_lr, precision_lr[:-1], label='Precision')
ax.plot(thresholds_lr, recall_lr[:-1], label='Recall')
ax.set_xlabel('Classfication Threshold')
ax.set_ylabel('Precision, Recall')
ax.set_title('Logistic Regression Classifier: Precision-Recall')
# Reference line at 0.6 on the precision/recall axis
ax.hlines(y=0.6, xmin=0, xmax=1, color='red')
ax.legend()
ax.grid()

## Classifier with threshold of 0.2

In [None]:
# Lower the classification threshold from the default 0.5 to 0.2:
# a customer is flagged as a defaulter as soon as the predicted probability reaches 0.2
y_pred_proba = log_reg.predict_proba(x_test)[:, 1]
flagged = y_pred_proba >= 0.2
y_predicted = flagged.astype('int')

# confusion matrix, plus recall and precision as percentages
CM = confusion_matrix(y_true=y_test, y_pred=y_predicted)
recall_pct = 100 * recall_score(y_true=y_test, y_pred=y_predicted)
print("Recall: ", recall_pct)
precision_pct = 100 * precision_score(y_true=y_test, y_pred=y_predicted)
print("Precision: ", precision_pct)
c_matrix(CM)

## Final Predictive Model (Logistic Regression)

In [None]:
def predict_default(new_data, threshold=0.2):
    """Classify one customer with the fitted logistic regression model.

    Parameters
    ----------
    new_data : object exposing `.values` (e.g. a pandas Series) with the
        feature values of a single customer. The values are used positionally,
        so they must be in the same order as the columns the scaler and model
        were fitted on.
    threshold : probability of default at or above which the customer is
        reported as a defaulter (0.2 by default).

    Returns
    -------
    "Will default" or "Will pay".
    """
    # reshape(1, -1): a single row with as many columns as there are values,
    # which is the 2d layout the scaler and the model expect
    data = new_data.values.reshape(1, -1)
    # Apply the scaler fitted on the training features (`rb_scaler`)
    data = rb_scaler.transform(data)
    # Probability of the second class column for the single row
    prob = log_reg.predict_proba(data)[0][1]
    if prob >= threshold:
        return "Will default"
    else:
        return "Will pay"

In [None]:
# Customers whose target is 0, i.e. who did not default
pay = df.loc[df['default'] == 0]

In [None]:
# First five non-defaulting customers
pay.head(5)

In [None]:
from collections import OrderedDict

# Feature values of a hypothetical customer. The PAY_n keys use the same
# (upper-case) names as the training columns so they can be matched by label.
new_customer = OrderedDict([
    ('LIMIT_BAL', 4000), ('AGE', 50),
    ('PAY_0', -1), ('PAY_2', -1), ('PAY_3', -1),
    ('PAY_4', 0), ('PAY_5', -1), ('PAY_6', 0),
    ('BILL_AMT1', 500), ('BILL_AMT2', 35509), ('BILL_AMT3', 689),
    ('BILL_AMT4', 0), ('BILL_AMT5', 0), ('BILL_AMT6', 0),
    ('PAY_AMT1', 0), ('PAY_AMT2', 35509), ('PAY_AMT3', 0),
    ('PAY_AMT4', 0), ('PAY_AMT5', 0), ('PAY_AMT6', 0),
    ('male', 1), ('grad_school', 0), ('university', 1), ('high_school', 0),
    ('married', 1), ('single', 0),
])
new_customer = pd.Series(new_customer)

# predict_default() feeds the values to the scaler/model positionally, so put them
# in exactly the column order of the training features (every df column but the target).
feature_columns = df.drop('default', axis=1).columns
missing = [name for name in feature_columns if name not in new_customer.index]
if missing:
    raise KeyError("new_customer is missing features: {}".format(missing))
new_customer = new_customer.reindex(feature_columns)

# Same cleaning as the training data: PAY_n values <= 0 are collapsed to 0
new_customer[pay_n_features] = new_customer[pay_n_features].clip(lower=0)

predict_default(new_customer)

In [None]:
'''
for x in negative.index[0:100]:
    print(predict_default(negative.loc[x].drop('default')))
'''