# Lab 2: Supervised Learning
## Dataset: Default of Credit Card Clients
Dataset Link: https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients

### <font color=red>Note that:<br>- The hyperparameter tuning cells have been converted to markdown; change the cell type back to code to run them<br>- The results of hyperparameter tuning may also differ when the cells are re-run</font>

In [None]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
%matplotlib inline

## Data Analysis and Cleaning

In [None]:
# Load the spreadsheet: the first row is skipped and rows are indexed by the "ID" column
df = pd.read_excel('default_of_credit_card_clients.xls', index_col="ID", skiprows=[0])

# Uncomment for a quick look at the raw data
#print(df.head())
#print(df.describe())
#print(df.info())

# One-hot encoding for categorical features: each binary feature takes the value 1 or 0
# - Scikit-learn might otherwise treat the category codes as numerical features
# - string labels can't be used because Scikit-learn only accepts numbers

# obtain the one-hot encoding of columns 'SEX', 'EDUCATION', 'MARRIAGE'
# The base values are: female, other_education, other_marital_status
df['male'] = (df['SEX'] == 1).astype('int')
df.drop('SEX', axis=1, inplace=True)

df['grad_school'] = (df['EDUCATION'] == 1).astype('int')
df['university'] = (df['EDUCATION'] == 2).astype('int')
df['high_school'] = (df['EDUCATION'] == 3).astype('int')
df.drop('EDUCATION', axis=1, inplace=True)

df['married'] = (df['MARRIAGE'] == 1).astype('int')
df['single'] = (df['MARRIAGE'] == 2).astype('int')
df.drop('MARRIAGE', axis=1, inplace=True)

# From the dataset documentation we infer that a PAY_n value <= 0 means the payment was not delayed
pay_n_features = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
for col in pay_n_features:
    df[col].hist(bins=10)
    print("Plotting for column {}".format(col))
    plt.show()
# collapse every non-positive PAY_n value (<= 0) to 0, i.e. "not delayed"
df[pay_n_features] = df[pay_n_features].clip(lower=0)

df.rename(columns={'default payment next month': 'default'}, inplace=True)

# show every column when displaying the frame
pd.options.display.max_columns = None
display(df.sample(5))

## Building Machine Learning Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, precision_recall_curve, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import RobustScaler

In [None]:
# Feature scaling to get more accurate representation and better learning performance
#
# Most machine learning algorithms take into account only the magnitude of the measurements,
# not the units of those measurements. A feature with a very high magnitude may affect the
# prediction a lot more than an equally important feature,
# e.g. the AGE (within a certain fixed range) and the PAY_AMTn (monetary) features have very
# different ranges of values.
#
# RobustScaler:
# The Robust Scaler uses statistics that are robust to outliers (the interquartile range), so it
# focuses on the part where the bulk of the data is. After robust scaling the distributions are
# brought onto the same scale and overlap, but the outliers remain outside the bulk of the new
# distributions.

# plot the distribution of all data
for col in df.columns:
    df[col].hist(bins=10)
    print("Plotting for column {}".format(col))
    plt.show()

x = df.drop('default', axis=1)
y = df['default']
# stratify=y keeps the proportion of each class in the train and test parts equal to the proportion in y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=123, stratify=y)

# Fit the scaler on the training part only, then apply the same transformation to the test part.
# Fitting on the full data set would leak test-set statistics into training.
# NOTE: `x` now stays the unscaled feature frame; only x_train / x_test are rescaled.
rb_scaler = RobustScaler()
x_train = rb_scaler.fit_transform(x_train)
x_test = rb_scaler.transform(x_test)

In [None]:
def c_matrix(CM, labels=('pay', 'default')):
    """Wrap a confusion matrix in a labelled DataFrame with row and column totals.

    Parameters
    ----------
    CM : array-like
        Square matrix of counts, one row and one column per entry in `labels`.
    labels : sequence of str
        Names used for both the rows and the columns, in matrix order.

    Returns
    -------
    pandas.DataFrame
        The counts with the row axis named 'TRUE' and the column axis named
        'PREDICTION', plus a 'Total' row (column sums) and a 'Total' column
        (row sums).
    """
    # copy into a list so any sequence works and the default is never shared mutable state
    names = list(labels)
    table = pd.DataFrame(data=CM, index=names, columns=names)
    table.index.name = 'TRUE'
    table.columns.name = 'PREDICTION'
    # the row of column sums is added first so the 'Total' column also covers it
    table.loc['Total'] = table.sum()
    table['Total'] = table.sum(axis=1)
    return table

## Evaluating Model Performance

In [None]:
# Table of evaluation metrics: one row per metric, one column per model.
# Cells start as NaN and are filled in after each model is evaluated.
# dtype=float keeps the scores numeric instead of falling back to object dtype.
metrics = pd.DataFrame(
    index=['accuracy', 'precision', 'recall', 'f1-score', 'AUC'],
    columns=['LogisticReg', 'DecisionTree', 'NeuralNet'],
    dtype=float,
)

## <font color=red>1. Logistic Regression</font>

### Hyperparameter Tuning for Logistic Regression (Using GridSearchCV)
Tuning regularization penalty and regularization hyperparameter

```python
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# create an instance of the model
# 'liblinear' is set explicitly because it supports both the 'l1' and 'l2' penalties searched
# below; the current default solver ('lbfgs') rejects 'l1'.
log_reg = LogisticRegression(solver='liblinear')

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
# logspace => return numbers spaced evenly on a log scale
C = np.logspace(0, 4, 10)

# Create hyperparameter options
params = dict(C=C, penalty=penalty)

# Create grid search using 3-fold cross validation.
# liblinear ignores n_jobs, so the parallelism is requested on the grid search instead.
clf = GridSearchCV(log_reg, params, verbose=0, cv=3, n_jobs=-1)

# Fit grid search
clf.fit(x_train, y_train)

# Display the best score and best parameters
print("Best mean test score and best parameters:")
print(clf.best_score_, clf.best_params_)
print()

# Loop through and display each pair of mean test score and parameter
print("List of Mean test scores and respective parameters:")
means = clf.cv_results_['mean_test_score']
parameters = clf.cv_results_['params']
for mean, parameter in zip(means, parameters):
    print(mean, parameter)
```

Actual Machine Learning Model

In [None]:
# import the model class
from sklearn.linear_model import LogisticRegression

# create an instance of the model with the tuned hyperparameters
# solver='liblinear' is required for penalty='l1' (the current default 'lbfgs' solver raises on
# 'l1'); n_jobs is omitted because liblinear ignores it.
log_reg = LogisticRegression(solver='liblinear', C=2.7825594022071245, penalty='l1')

# train the model using the training data
log_reg.fit(x_train, y_train)

## evaluate the model performance and log the metrics
y_predicted = log_reg.predict(x_test)
# predicted probabilities; keep the column for the positive outcome only
probs_log_reg = log_reg.predict_proba(x_test)[:, 1]
metrics.loc['accuracy', 'LogisticReg'] = accuracy_score(y_pred=y_predicted, y_true=y_test)
metrics.loc['precision', 'LogisticReg'] = precision_score(y_pred=y_predicted, y_true=y_test)
metrics.loc['recall', 'LogisticReg'] = recall_score(y_pred=y_predicted, y_true=y_test)
metrics.loc['f1-score', 'LogisticReg'] = f1_score(y_pred=y_predicted, y_true=y_test)
# AUC is computed from the probabilities, not from the hard class predictions
metrics.loc['AUC', 'LogisticReg'] = roc_auc_score(y_test, probs_log_reg)

# construct the confusion matrix (last expression, so the notebook displays it)
CM = confusion_matrix(y_pred=y_predicted, y_true=y_test)
c_matrix(CM)

## <font color=red>2. Decision Tree Classifier</font>

### Hyperparameter Tuning for Decision Tree Classifier (Using GridSearchCV)
- Tuning max depth, min sample split and criterion

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# create an instance of the model
# min_samples_split is not set here: the grid below supplies it for every candidate,
# so a constructor value would never be used.
dec_tree = DecisionTreeClassifier(min_samples_leaf=10)

# Create max depth space
depths = [None, 5, 10]

# Create minimum sample split space
samples_splits = [2, 4]

# Create criterion space
criteria = ['gini', 'entropy']

# Create hyperparameter options
params = dict(max_depth=depths, min_samples_split=samples_splits, criterion=criteria)

# Create grid search using 3-fold cross validation
clf = GridSearchCV(dec_tree, params, verbose=0, cv=3)

# Fit grid search
clf.fit(x_train, y_train)

# Display the best score and best parameters
print("Best mean test score and best parameters:")
print(clf.best_score_, clf.best_params_)
print()

# Loop through and display each pair of mean test score and parameter
print("List of Mean test scores and respective parameters:")
means = clf.cv_results_['mean_test_score']
parameters = clf.cv_results_['params']
for mean, parameter in zip(means, parameters):
    print(mean, parameter)
```

Actual Machine Learning Model

In [None]:
# Decision tree: build, train and score the model with the tuned hyperparameters
from sklearn.tree import DecisionTreeClassifier

# min_samples_split => minimum number of samples required to split an internal node
# min_samples_leaf  => minimum number of samples required to be at a leaf node
dec_tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=10,
)

# fit on the training data
dec_tree.fit(x_train, y_train)

# hard class predictions and positive-class probabilities for the test set
y_predicted = dec_tree.predict(x_test)
probs_dec_tree = dec_tree.predict_proba(x_test)[:, 1]

# metrics computed from the hard predictions
label_scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1-score': f1_score,
}
for metric_name, scorer in label_scorers.items():
    metrics.loc[metric_name, 'DecisionTree'] = scorer(y_true=y_test, y_pred=y_predicted)
# AUC is computed from the probabilities instead
metrics.loc['AUC', 'DecisionTree'] = roc_auc_score(y_test, probs_dec_tree)

# confusion matrix with totals (last expression, so the notebook displays it)
CM = confusion_matrix(y_true=y_test, y_pred=y_predicted)
c_matrix(CM)

## <font color=red>3. Feed Forward Deep Neural Networks</font>

### Hyperparameter Tuning for Sequential Model (Using GridSearchCV)
To use a Keras model with scikit-learn, we need the KerasClassifier or KerasRegressor wrapper classes. Both accept a function that creates and returns a Keras model.
1. Tuning batch size and epochs

```python
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
# layers are imported from the public `keras.layers` package; `keras.layers.core` is not a
# public import path in current Keras releases
from keras.layers import Dense, Activation, Dropout
from keras.constraints import unit_norm
# NOTE(review): keras.wrappers.scikit_learn is no longer shipped with recent Keras/TensorFlow
# releases - confirm the installed version still provides it before running this cell.
from keras.wrappers.scikit_learn import KerasClassifier

def cc_default_classifier():
    """Build and compile the feed-forward network used for the grid search.

    Returns a compiled Sequential model: Dense layers of 64, 32, 32 and 16 ReLU units, each
    followed by Dropout(0.5), and a single sigmoid output unit. The input width is read from
    the global `x_train`.
    """
    input_dim = x_train.shape[1]

    # Weight constraints (unit_norm) provide an approach to reduce the overfitting of a deep
    # learning neural network model on the training data and improve the performance of the
    # model on new data
    neuralNet = Sequential()
    neuralNet.add(Dense(64, input_shape=(input_dim,), activation='relu', kernel_constraint=unit_norm()))
    neuralNet.add(Dropout(0.5))
    for units in (32, 32, 16):
        neuralNet.add(Dense(units, activation='relu', kernel_constraint=unit_norm()))
        neuralNet.add(Dropout(0.5))
    neuralNet.add(Dense(1, activation='sigmoid'))

    neuralNet.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return neuralNet

neuralNet = KerasClassifier(build_fn=cc_default_classifier)

batch_sizes = [24, 32]
epochs = [30, 50]
params = {
    'batch_size': batch_sizes,
    'epochs': epochs,
}

# grid search using 3-fold cross validation
clf = GridSearchCV(neuralNet, params, verbose=2, cv=3)
clf.fit(np.array(x_train), np.array(y_train))

# Display the best score and best parameters
print("Best mean test score and best parameters:")
print(clf.best_score_, clf.best_params_)
print()

# Loop through and display each pair of mean test score and parameter
print("List of Mean test scores and respective parameters:")
means = clf.cv_results_['mean_test_score']
parameters = clf.cv_results_['params']
for mean, parameter in zip(means, parameters):
    print(mean, parameter)
```

2. Tuning optimizer

```python
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.constraints import unit_norm
# layers are imported from the public `keras.layers` package; `keras.layers.core` is not a
# public import path in current Keras releases
from keras.layers import Dense, Activation, Dropout
# NOTE(review): keras.wrappers.scikit_learn is no longer shipped with recent Keras/TensorFlow
# releases - confirm the installed version still provides it before running this cell.
from keras.wrappers.scikit_learn import KerasClassifier

def cc_default_classifier(optimizer):
    """Build the feed-forward network and compile it with the given optimizer.

    `optimizer` is passed straight to `compile`; the grid search below supplies it by name.
    Returns a compiled Sequential model: Dense layers of 64, 32, 32 and 16 ReLU units, each
    followed by Dropout(0.5), and a single sigmoid output unit. The input width is read from
    the global `x_train`.
    """
    input_dim = x_train.shape[1]

    # Weight constraints provide an approach to reduce the overfitting of a deep learning neural
    # network model on the training data and improve the performance of the model on new data
    neuralNet = Sequential()
    neuralNet.add(Dense(64, input_shape=(input_dim,), activation='relu', kernel_constraint=unit_norm()))
    neuralNet.add(Dropout(0.5))
    for units in (32, 32, 16):
        neuralNet.add(Dense(units, activation='relu', kernel_constraint=unit_norm()))
        neuralNet.add(Dropout(0.5))
    neuralNet.add(Dense(1, activation='sigmoid'))

    neuralNet.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return neuralNet

# epochs and batch size are fixed here; only the optimizer is searched
neuralNet = KerasClassifier(build_fn=cc_default_classifier, epochs=50, batch_size=24)

params = {'optimizer':['SGD', 'Adagrad', 'Adam']}

# grid search using 3-fold cross validation
clf = GridSearchCV(neuralNet, params, verbose=2, cv=3)
clf.fit(np.array(x_train), np.array(y_train))

# Display the best score and best parameters
print("Best mean test score and best parameters:")
print(clf.best_score_, clf.best_params_)
print()

# Loop through and display each pair of mean test score and parameter
print("List of Mean test scores and respective parameters:")
means = clf.cv_results_['mean_test_score']
parameters = clf.cv_results_['params']
for mean, parameter in zip(means, parameters):
    print(mean, parameter)
```

Actual Machine Learning Model

In [None]:
from keras.models import Sequential
# layers are imported from the public `keras.layers` package; `keras.layers.core` is not a
# public import path in current Keras releases
from keras.layers import Dense, Activation, Dropout
from keras.constraints import unit_norm
from keras.callbacks import Callback

input_dim = x_train.shape[1]

# Weight constraints provide an approach to reduce the overfitting of a deep learning neural
# network model on the training data and improve the performance of the model on new data
neuralNet = Sequential()
neuralNet.add(Dense(64, input_shape=(input_dim,), activation='relu', kernel_constraint=unit_norm()))
neuralNet.add(Dropout(0.5))
neuralNet.add(Dense(32, activation='relu', kernel_constraint=unit_norm()))
neuralNet.add(Dropout(0.5))
neuralNet.add(Dense(32, activation='relu', kernel_constraint=unit_norm()))
neuralNet.add(Dropout(0.5))
neuralNet.add(Dense(16, activation='relu', kernel_constraint=unit_norm()))
neuralNet.add(Dropout(0.5))
neuralNet.add(Dense(1,  activation='sigmoid'))

neuralNet.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

# train with the tuned batch size / epochs; 20% of the training data is held out for validation
history = neuralNet.fit(np.array(x_train), np.array(y_train),
              batch_size=24, epochs=50, verbose=1,
              validation_split=0.2)

## evaluate the model performance and log the metrics
# predicted probabilities for the test set, reduced from shape (n, 1) to a 1d array
y_pred_probs = neuralNet.predict(x_test, verbose=0)[:, 0]
# crisp classes for the test set: Sequential.predict_classes has been removed from Keras, so the
# sigmoid output is thresholded at 0.5, which is what predict_classes did for a single output unit
y_pred_classes = (y_pred_probs > 0.5).astype('int32')

metrics.loc['accuracy', 'NeuralNet'] = accuracy_score(y_test, y_pred_classes)
metrics.loc['precision', 'NeuralNet'] = precision_score(y_test, y_pred_classes)
metrics.loc['recall', 'NeuralNet'] = recall_score(y_test, y_pred_classes)
metrics.loc['f1-score', 'NeuralNet'] = f1_score(y_test, y_pred_classes)
# AUC is computed from the probabilities, not from the thresholded classes
metrics.loc['AUC', 'NeuralNet'] = roc_auc_score(y_test, y_pred_probs)

# construct the confusion matrix (last expression, so the notebook displays it)
CM = confusion_matrix(y_test, y_pred_classes)
c_matrix(CM)

## Metrics Analysis and Visualization

In [None]:
# show the metrics table scaled by 100, i.e. as percentages
100 * metrics

In [None]:
# horizontal bar chart of the metrics table: one group per metric, one bar per model
fig, ax = plt.subplots(figsize=(8, 5))
metrics.plot.barh(ax=ax)
ax.grid()

In [None]:
# Plot ROC curves for all models, using each model's positive-class probability on the test set
fpr_lr, tpr_lr, thresholds_lr = roc_curve(y_test, log_reg.predict_proba(x_test)[:,1])
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, dec_tree.predict_proba(x_test)[:,1])
fpr_nn, tpr_nn, thresholds_nn = roc_curve(y_test, neuralNet.predict(x_test, verbose=0)[:, 0])

fig, ax = plt.subplots(figsize=(8,5))
# dashed diagonal as the reference line
ax.plot([0, 1], [0, 1], linestyle='--')
ax.plot(fpr_lr, tpr_lr, label='LogisticReg')
ax.plot(fpr_dt, tpr_dt, label='DecisionTree')
ax.plot(fpr_nn, tpr_nn, label='NeuralNet')
# label the axes so the figure can be read on its own
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend()
ax.grid()

In [None]:
# Compute the precision-recall curve of every model from its positive-class score on the test set
scores_lr = log_reg.predict_proba(x_test)[:, 1]
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_test, scores_lr)

scores_dt = dec_tree.predict_proba(x_test)[:, 1]
precision_dt, recall_dt, thresholds_dt = precision_recall_curve(y_test, scores_dt)

scores_nn = neuralNet.predict(x_test, verbose=0)[:, 0]
precision_nn, recall_nn, thresholds_nn = precision_recall_curve(y_test, scores_nn)

In [None]:
# Draw one curve per model with precision on the x-axis and recall on the y-axis
fig, ax = plt.subplots(figsize=(8, 5))
curves = (
    ('LogisticReg', precision_lr, recall_lr),
    ('DecisionTree', precision_dt, recall_dt),
    ('NeuralNet', precision_nn, recall_nn),
)
for model_label, precision_vals, recall_vals in curves:
    ax.plot(precision_vals, recall_vals, label=model_label)
ax.set_xlabel('Precision')
ax.set_ylabel('Recall')
ax.set_title('Precision-Recall Curves')
ax.legend()
ax.grid()