In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
from plotnine import *
import plotnine
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import make_scorer
import xgboost as xgb
import joblib

In [None]:
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)

After importing the data set into Python, `df_train` is our data frame. A data frame offers many functions and methods that produce specific outputs about its characteristics. For example, the `columns` attribute lists all the column names.

In [None]:
# Load the training data, keeping only the columns at positions 1..13
# (the column at position 0 is skipped)
df_train = pd.read_csv(
    'loan_prediction.csv',
    usecols = list(range(1, 14))
)

In [None]:
# Report the size of the training data, then preview its first rows
print(f'Data dimension: {df_train.shape[0]} rows and {df_train.shape[1]} columns')
df_train.head()

In [None]:
# Load the testing data straight from the remote CSV (requires network access)
df_test = pd.read_csv(
    'https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_test.csv'
)

In [None]:
# Report the size of the testing data, then preview its first rows
print(f'Data dimension: {df_test.shape[0]} rows and {df_test.shape[1]} columns')
df_test.head()

The `info` method shows the metadata of the columns in a data frame. It indirectly indicates the measurement scale of each column through its data type. However, the inferred types can be misleading, so we must adjust the column types based on the characteristics of each column.

In [None]:
# Data frame metadata of the training data (columns, non-null counts, dtypes)
df_train.info()

In [None]:
# Treat Credit_History as a categorical (object) column and the target as an integer
train_column_types = {'Credit_History': object, 'Loan_Status': int}
df_train = df_train.astype(train_column_types)
# List the columns that are now stored as objects
df_train.select_dtypes(include = 'object').dtypes

In [None]:
# Frequency table of every categorical (object) column in the training data
for column_name in df_train.select_dtypes('object'):
    print(df_train[column_name].value_counts(),'\n')

#### Handle missing values

In [None]:
# Check missing values: number of NaN entries per column of the training data
df_train.isna().sum()

**Note**: The decision to remove missing values should be based on business logic. The concept of *garbage in, garbage out* applies. Without relevant domain knowledge of the loan problem, imputation can lead to biased results.

Instead of bluntly dropping all the missing values, we inspect the relevant variables in the data to decide how each of them should be handled in the subsequent analysis.

##### `Dependents`

In [None]:
# Count the rows of the training data without a Dependents value
print(f"Number of missing dependents is about {df_train['Dependents'].isna().sum()} rows")

In [None]:
# Replace missing values with "0"
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated and does not
# update the data frame when pandas Copy-on-Write is enabled.
df_train['Dependents'] = df_train['Dependents'].fillna(value = '0')

##### `Self_Employed`

In [None]:
# Count the rows of the training data without a Self_Employed value
print(f"Number of missing Self_Employed is about {df_train['Self_Employed'].isna().sum()} rows")

In [None]:
# Replace missing values with "No"
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated and does not
# update the data frame when pandas Copy-on-Write is enabled.
df_train['Self_Employed'] = df_train['Self_Employed'].fillna(value = 'No')

##### `Loan_Amount_Term`

In [None]:
# Summary statistics of the loan term, computed separately for each loan status
df_train.groupby('Loan_Status')[['Loan_Amount_Term']].describe()

In [None]:
# 20th percentile of the loan term in the training data
print(f"Percentile 20th: {df_train['Loan_Amount_Term'].quantile(0.2)}")

In [None]:
# Replace missing values with "360"
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated and does not
# update the data frame when pandas Copy-on-Write is enabled.
df_train['Loan_Amount_Term'] = df_train['Loan_Amount_Term'].fillna(value = 360)

##### `Credit_History`

In [None]:
# Cross tabulation of credit history and loan status (with margins), built as
# one chain: flatten the index, clear the columns' name, drop the trailing
# totals row and give the columns readable labels
df_cred_hist = (
    pd.crosstab(df_train['Credit_History'], df_train['Loan_Status'], margins = True)
    .reset_index()
    .rename_axis(None, axis = 1)
    .iloc[:-1]
    .rename(columns = {'Credit_History':'Credit History', 0:'No', 1:'Yes'})
)
df_cred_hist

In [None]:
# Rows whose credit history is missing, split by loan status
credit_is_missing = df_train['Credit_History'].isna()
pos_cred_hist0 = df_train[credit_is_missing & df_train['Loan_Status'].eq(0)]
pos_cred_hist1 = df_train[credit_is_missing & df_train['Loan_Status'].eq(1)]
print(f'Number of rows with Loan_Status is No but Credit_History is NaN  : {len(pos_cred_hist0)}')
print(f'Number of rows with Loan_Status is Yes but Credit_History is NaN : {len(pos_cred_hist1)}')

In [None]:
# Fill a missing credit history with the row's own loan status (0 -> 0.0, 1 -> 1.0);
# rows that already have a credit history keep their value.
# NOTE(review): this fills a feature from the target, and the testing data drops
# these rows instead — confirm this is intended.
credit_history = df_train['Credit_History'].astype(float)
loan_status = df_train['Loan_Status']
fill_from_status = credit_history.isna() & loan_status.isin([0, 1])
df_train['Credit_History'] = credit_history.mask(fill_from_status, loan_status.astype(float))

##### `Gender` and `Loan Amount`

In [None]:
# Drop every row of the training data that still contains a missing value
df_train = df_train.dropna(axis = 0, how = 'any')

In [None]:
# Check missing values again: NaN count per column after the handling steps above
df_train.isna().sum()

### Testing data

#### Scale measurement

In [None]:
# Data frame metadata of the testing data (columns, non-null counts, dtypes)
df_test.info()

In [None]:
# Treat Credit_History as a categorical (object) column in the testing data
test_column_types = {'Credit_History': object}
df_test = df_test.astype(test_column_types)
# List the columns that are now stored as objects
df_test.select_dtypes(include = 'object').dtypes

In [None]:
# Frequency table of every categorical (object) column in the testing data
for column_name in df_test.select_dtypes('object'):
    print(df_test[column_name].value_counts(),'\n')

#### Handle missing values

In [None]:
# Check missing values: number of NaN entries per column of the testing data
df_test.isna().sum()

##### `Dependents`

In [None]:
# Count the rows of the testing data without a Dependents value
print(f"Number of missing values in Dependents is about {df_test['Dependents'].isna().sum()} rows")

In [None]:
# Replace missing values with "0"
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated and does not
# update the data frame when pandas Copy-on-Write is enabled.
df_test['Dependents'] = df_test['Dependents'].fillna(value = '0')

##### `Self_Employed`

In [None]:
# Count the rows of the testing data without a Self_Employed value
print(f"Number of missing values in Self_Employed is about {df_test['Self_Employed'].isna().sum()} rows")

In [None]:
# Replace missing values with "No"
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated and does not
# update the data frame when pandas Copy-on-Write is enabled.
df_test['Self_Employed'] = df_test['Self_Employed'].fillna(value = 'No')

##### `Loan_Amount_Term`

In [None]:
# Replace missing values with "360"
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which is deprecated and does not
# update the data frame when pandas Copy-on-Write is enabled.
df_test['Loan_Amount_Term'] = df_test['Loan_Amount_Term'].fillna(value = 360)

##### `Gender`, `Married`, `LoanAmount` and `Credit_History`

In [None]:
# Drop every row of the testing data that still contains a missing value
df_test = df_test.dropna(axis = 0, how = 'any')

In [None]:
# Check missing values again: NaN count per column after the handling steps above
df_test.isna().sum()

## Exploratory data analysis

### The composition of default and not default customers

In [None]:
# Count the customers per loan status, then replace the 0/1 codes with labels
df_viz_1 = (
    df_train
    .groupby(['Loan_Status'])['Loan_ID']
    .count()
    .reset_index(name = 'Total')
    .assign(
        Loan_Status = lambda counts: counts['Loan_Status'].map({0: 'Not default', 1: 'Default'})
    )
)

In [None]:
# Show the aggregated counts per loan status
df_viz_1

In [None]:
# Pie chart of the customer counts per loan status, drawn on an explicit axes
fig, ax = plt.subplots(figsize = (6.4,4.8))

# One colour per slice, in the row order of df_viz_1
colors = ['#80797c','#981220']

# Pull the first slice away from the centre
explode = (0.1, 0)

# 'Total' and 'Loan_Status' are looked up as columns of df_viz_1
ax.pie(
    x = 'Total',
    labels = 'Loan_Status',
    data = df_viz_1,
    explode = explode,
    colors = colors,
    autopct = '%1.1f%%',
    shadow = False,
    startangle = 140
)

# Title, and equal aspect ratio so the pie is drawn as a circle
ax.set_title('Number of customers by loan status', fontsize = 18)
ax.axis('equal')
plt.show()

### The composition of loan status by the dependents

In [None]:
# Count the customers per loan status and number of dependents,
# then replace the 0/1 status codes with labels
df_viz_2 = (
    df_train
    .groupby(['Loan_Status', 'Dependents'])['Loan_ID']
    .count()
    .reset_index(name = 'Total')
    .assign(
        Loan_Status = lambda counts: counts['Loan_Status'].map({0: 'Not default', 1: 'Default'})
    )
)

In [None]:
# Show the aggregated counts per loan status and number of dependents
df_viz_2

In [None]:
# Stacked bar chart of loan status within each dependents group.
# position = 'fill' rescales every bar to a total height of 1, so the y axis
# shows proportions rather than raw counts and is labelled accordingly.
plotnine.options.figure_size = (8, 4.8)
(
    ggplot(
        data = df_viz_2
    )+
    geom_bar(
        aes(
            x = 'Dependents',
            y = 'Total',
            fill = 'Loan_Status'
        ),
        stat = 'identity',
        position = 'fill',
        width = 0.5
    )+
    labs(
        title = 'The composition of loan status by the dependents',
        fill = 'Loan status'
    )+
    xlab(
        'Dependents'
    )+
    ylab(
        'Proportion'
    )+
    scale_x_discrete(
        limits = ['0', '1', '2', '3+']
    )+
    scale_fill_manual(
        values = ['#981220','#80797c'],
        labels = ['Default', 'Not Default']
    )+
    theme_minimal()
)

### The composition of default customer by the educations

In [None]:
# Count the customers per loan status and education level,
# then replace the 0/1 status codes with labels
df_viz_3 = (
    df_train
    .groupby(['Loan_Status', 'Education'])['Loan_ID']
    .count()
    .reset_index(name = 'Total')
    .assign(
        Loan_Status = lambda counts: counts['Loan_Status'].map({0: 'Not default', 1: 'Default'})
    )
)

In [None]:
# Show the aggregated counts per loan status and education level
df_viz_3

In [None]:
# Stacked bar chart of loan status within each education group.
# position = 'fill' rescales every bar to a total height of 1, so the y axis
# shows proportions rather than raw counts and is labelled accordingly.
plotnine.options.figure_size = (8, 4.8)
(
    ggplot(
        data = df_viz_3
    )+
    geom_bar(
        aes(
            x = 'Education',
            y = 'Total',
            fill = 'Loan_Status'
        ),
        stat = 'identity',
        position = 'fill',
        width = 0.5
    )+
    labs(
        title = 'The composition of loan status by the education',
        fill = 'Loan status'
    )+
    xlab(
        'Educations'
    )+
    ylab(
        'Proportion'
    )+
    scale_x_discrete(
        limits = ['Graduate', 'Not Graduate']
    )+
    scale_fill_manual(
        values = ['#981220','#80797c'],
        labels = ['Default', 'Not Default']
    )+
    theme_minimal()
)

### The distribution of applicant incomes by loan status

In [None]:
# Keep the applicant income and the loan status, with a fresh 0..n-1 index,
# and replace the 0/1 status codes with labels
df_viz_4 = (
    df_train[['ApplicantIncome', 'Loan_Status']]
    .reset_index(drop = True)
    .assign(
        Loan_Status = lambda rows: rows['Loan_Status'].map({0: 'Not default', 1: 'Default'})
    )
)

In [None]:
# Show the first rows of the applicant income / loan status data
df_viz_4.head()

In [None]:
# Density of the applicant income, one filled curve per loan status
plotnine.options.figure_size = (8, 4.8)
(
    ggplot(data = df_viz_4)
    + geom_density(
        aes(x = 'ApplicantIncome', fill = 'Loan_Status'),
        color = 'white',
        alpha = 0.85
    )
    + labs(
        title = 'The distribution of applicant incomes by loan status',
        x = 'Applicant income',
        y = 'Density'
    )
    + scale_fill_manual(
        name = 'Loan Status',
        values = ['#981220','#80797c'],
        labels = ['Default', 'Not Default']
    )
    + theme_minimal()
)

### The distribution of loan amount by loan status

In [None]:
# Keep the loan amount and the loan status, with a fresh 0..n-1 index,
# and replace the 0/1 status codes with labels
df_viz_5 = (
    df_train[['LoanAmount', 'Loan_Status']]
    .reset_index(drop = True)
    .assign(
        Loan_Status = lambda rows: rows['Loan_Status'].map({0: 'Not default', 1: 'Default'})
    )
)

In [None]:
# Show the first rows of the loan amount / loan status data
df_viz_5.head()

In [None]:
# Density of the loan amount, one filled curve per loan status
plotnine.options.figure_size = (8, 4.8)
(
    ggplot(data = df_viz_5)
    + geom_density(
        aes(x = 'LoanAmount', fill = 'Loan_Status'),
        color = 'white',
        alpha = 0.85
    )
    + labs(
        title = 'The distribution of loan amount by loan status',
        x = 'Loan amount',
        y = 'Density'
    )
    + scale_fill_manual(
        name = 'Loan Status',
        values = ['#981220','#80797c'],
        labels = ['Default', 'Not Default']
    )
    + theme_minimal()
)

## One-hot encoder

In [None]:
# Tag every testing row with the sentinel Loan_Status 999 so the rows can be
# told apart from the training rows after stacking
df_test = df_test.assign(Loan_Status = 999)
# Stack the training rows on top of the testing rows
df_concat = pd.concat([df_train, df_test], axis = 0)

In [None]:
# Remove the Loan_ID column from the combined data
df_concat = df_concat.drop(columns = ['Loan_ID'])

In [None]:
# Names of the categorical columns that will be one-hot encoded
cols_obj_train = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]
print(cols_obj_train)

In [None]:
# One-hot encode the categorical columns, dropping the first level of each
df_concat = pd.get_dummies(df_concat, columns = cols_obj_train, drop_first = True)
print(f'Dimension data: {df_concat.shape[0]} rows and {df_concat.shape[1]} columns')
df_concat.head()

## Data partitioning

In [None]:
# Unique values of Loan_Status and their counts (999 marks the testing rows)
df_concat['Loan_Status'].value_counts()

In [None]:
# Training set: the rows carrying a real label (0 or 1), re-indexed from 0
df_train = (
    df_concat
    .loc[lambda frame: frame['Loan_Status'].isin([0, 1])]
    .reset_index(drop = True)
)
print(f'Dimension data: {df_train.shape[0]} rows and {df_train.shape[1]} columns')
df_train.head()

In [None]:
# Testing set: the rows carrying the sentinel label 999, re-indexed from 0
df_test = (
    df_concat
    .loc[lambda frame: frame['Loan_Status'].eq(999)]
    .reset_index(drop = True)
)
print(f'Data dimension: {df_test.shape[0]} rows and {df_test.shape[1]} columns')
df_test.head()

In [None]:
# Separate the features (every column except Loan_Status) from the target
df_train_final = df_train.reset_index(drop = True)
X = df_train_final.drop(columns = ['Loan_Status'], errors = 'ignore')
y = df_train_final['Loan_Status']

# Hold out 30% of the training data for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X , y, test_size = 0.3, random_state = 42)
print('Data dimension of training set   :', X_train.shape)
print('Data dimension of validation set :', X_val.shape)

# Features of the testing set (its Loan_Status only holds the sentinel)
X_test = df_test.drop(columns = ['Loan_Status'], errors = 'ignore')
print('Data dimension of testing set    :', X_test.shape)

## Machine learning model development

In [None]:
# XGBoost model: classifier with a binary logistic objective
# NOTE(review): `use_label_encoder` may be deprecated or ignored by newer
# xgboost releases — confirm against the installed version
xgb_model = xgb.XGBClassifier(
    objective = 'binary:logistic',
    use_label_encoder = False
)

In [None]:
# Hyper-parameter grid explored by the grid search
params = dict(
    eta = np.arange(0.1, 0.26, 0.05),
    min_child_weight = np.arange(1, 5, 0.5).tolist(),
    gamma = [5],
    subsample = np.arange(0.5, 1.0, 0.11).tolist(),
    colsample_bytree = np.arange(0.5, 1.0, 0.11).tolist(),
)

In [None]:
# Scorers evaluated for every candidate of the grid search, keyed by metric name
scorers = dict(
    f1_score = make_scorer(f1_score),
    precision_score = make_scorer(precision_score),
    recall_score = make_scorer(recall_score),
    accuracy_score = make_scorer(accuracy_score),
)

In [None]:
# k-fold cross validation (10 shuffled folds)
# The shuffle is seeded so that the fold assignment, and therefore the selected
# hyper-parameters, are reproducible after Restart & Run All.
skf = KFold(n_splits = 10, shuffle = True, random_state = 42)

In [None]:
# Set up the grid search CV
# The splitter object itself is passed as `cv`: a `skf.split(...)` generator can
# only be consumed once, so fitting this search a second time would find it empty.
# The best estimator is refitted on the whole training set by accuracy.
grid = GridSearchCV(
    estimator = xgb_model,
    param_grid = params,
    scoring = scorers,
    n_jobs = -1,
    cv = skf,
    refit = 'accuracy_score'
)

In [None]:
# Fit the model: run the grid search on the training split
grid.fit(X = X_train, y = y_train)

In [None]:
# Best parameters found by the grid search (selected by the refit metric)
grid.best_params_

In [None]:
# Predict the labels of the validation set with the refitted best estimator
predicted = grid.predict(X_val)

In [None]:
# Model evaluation - validation data
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and F1 are
# symmetric, but precision and recall trade places when the arguments are
# swapped, so the true labels must come first.
accuracy_baseline = accuracy_score(y_val, predicted)
recall_baseline = recall_score(y_val, predicted)
precision_baseline = precision_score(y_val, predicted)
f1_baseline = f1_score(y_val, predicted)

print('Accuracy for baseline   :{}'.format(round(accuracy_baseline, 5)))
print('Recall for baseline     :{}'.format(round(recall_baseline, 5)))
print('Precision for baseline  :{}'.format(round(precision_baseline, 5)))
print('F1 Score for baseline   :{}'.format(round(f1_baseline, 5)))

## Store the ML model

In [None]:
# Store the best estimator of the grid search into a pickle file
filename = '../bin/xgboostModel.pkl'
# Create the target directory first; joblib.dump fails if it does not exist
Path(filename).parent.mkdir(parents = True, exist_ok = True)
joblib.dump(grid.best_estimator_, filename)