## Predict Credit Card Approval

### Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### UDFs

In [None]:
def count_plot(x, dataframe, ax=None, **kwargs):
    """Draw a count plot of one column, bars ordered by descending frequency.

    Parameters
    ----------
    x : str
        Name of the column in ``dataframe`` whose values are counted.
    dataframe : pandas.DataFrame
        Data containing column ``x``.
    ax : matplotlib.axes.Axes, optional
        Axes handed to ``sns.countplot``.
    **kwargs
        Only ``mode`` is consulted: ``mode='horizontal'`` places the
        categories on the y-axis; anything else places them on the x-axis.

    Returns
    -------
    None
    """
    # Most frequent category first.
    order = dataframe[x].value_counts().index

    # .get() avoids the KeyError that kwargs['mode'] raised when exactly one
    # keyword other than ``mode`` was supplied.
    if kwargs.get('mode') == 'horizontal':
        drawn_ax = sns.countplot(y=x, data=dataframe, ax=ax, order=order)
    else:
        drawn_ax = sns.countplot(x=x, data=dataframe, ax=ax, order=order)

    # Remove the frame on the axes that was actually drawn on; plt.box()
    # targets the *current* axes, which need not be ``ax``.
    drawn_ax.set_frame_on(False)
    return None

    
def pie_plot(df_col, fig_size, title):
    """Show a pie chart of ``df_col`` on a new figure.

    Parameters
    ----------
    df_col : pandas.Series
        ``df_col.values`` give the wedge sizes and ``df_col.index`` the
        legend labels.
    fig_size : tuple
        Figure size forwarded to ``plt.subplots(figsize=...)``.
    title : str
        Title placed above the chart.

    Returns
    -------
    None
    """
    _, axes = plt.subplots(figsize=fig_size)
    axes.pie(
        df_col.values,
        autopct='%1.2f%%',
        shadow=False,
        startangle=90,
    )
    # Equal axis scaling so the pie is drawn as a circle.
    axes.axis('equal')
    # Legend anchored just outside the upper-right corner of the axes.
    axes.legend(labels=df_col.index, bbox_to_anchor=(1.05, 1))
    axes.set_title(title)
    plt.show()
    return None


def stacked_vBar_plot(dataframe, value, index, column, xlabel, fig_size, scale='linear', with_percent=True):
    """Plot a stacked vertical bar chart of row-wise percentage shares.

    Rows of ``dataframe`` are counted per (``index``, ``column``) pair, each
    count is converted into a percentage of its row total, and the result is
    drawn as one stacked bar per ``index`` value.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data.
    value : str
        Column passed as ``values`` to ``pivot_table`` (counted with ``len``).
    index : str
        Column whose values become the bars.
    column : str
        Column whose values become the stacked segments.
    xlabel : str
        Label of the x-axis.
    fig_size : tuple
        Figure size in inches.
    scale : str, optional
        Accepted for interface compatibility; not used by the body.
    with_percent : bool, optional
        When true, write each segment's height as a percentage next to it.

    Returns
    -------
    None

    Notes
    -----
    The legend title ('NPS Type') and the y-axis label are hard-coded.
    """
    # Count table with an extra 'All' row/column holding the totals.
    counts = dataframe.pivot_table(
        values=[value],
        index=[index],
        columns=[column],
        aggfunc=len,
        margins=True,
    )

    # Divide every row by its total (the last column) and express as percent.
    row_totals = counts.iloc[:, -1]
    shares = counts.div(row_totals, axis=0).mul(100, axis=0).round(2)

    # Drop the totals column and the totals row before plotting.
    plot_data = shares.iloc[:, :-1].drop('All')

    # Legend entries: second level of the column index, minus the 'All' marker.
    legend_labels = plot_data.columns.levels[1].tolist()
    legend_labels.remove('All')

    ax = plot_data.plot.bar(stacked=True)
    ax.figure.set_size_inches(fig_size)
    ax.grid(False)
    plt.legend(labels=legend_labels, bbox_to_anchor=(1.05, 1), title='NPS Type')
    plt.xlabel(xlabel)
    plt.ylabel('%GT Count of NPS_Type')

    if with_percent:
        # Label every bar segment with its height, slightly inset from the
        # left edge and vertically centred.
        for segment in ax.patches:
            seg_x, seg_y = segment.get_xy()
            seg_width = segment.get_width()
            seg_height = segment.get_height()
            label_xy = (seg_x + seg_width/8, seg_y + seg_height/2)
            ax.annotate('{:.2f}%'.format(seg_height), label_xy)

    plt.box(False)
    plt.show()
    return None


def pdf_distribution_plots(df, features, target):
    """Plot per-class kernel density estimates for several features.

    One subplot is drawn per feature on a three-column grid; each subplot
    overlays the density of the rows where ``target == 0`` and of the rows
    where ``target == 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing ``features`` and ``target``.
    features : sequence of str
        Columns to plot.
    target : str
        Column compared against 0 and 1 to split the rows.

    Returns
    -------
    None
    """
    ncols = 3
    # Ceiling division. The earlier formula
    # int(len/3 + len % 3) allocated one row too many whenever
    # len(features) % 3 == 2, leaving an empty band in the figure.
    nrow = -(-len(features) // ncols)

    t0 = df.loc[df[target] == 0]
    t1 = df.loc[df[target] == 1]

    sns.set_style('whitegrid')
    fig = plt.figure(figsize=(15,10))

    for position, feature in enumerate(features, start=1):
        ax = fig.add_subplot(nrow, ncols, position)
        # Bind both curves to this subplot explicitly instead of relying on
        # it being the current axes.
        sns.kdeplot(t0[feature], label="0", legend=True, ax=ax)
        sns.kdeplot(t1[feature], label="1", legend=True, ax=ax)
        ax.set_ylabel('Density', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=15)
        ax.legend(loc='best')

    plt.subplots_adjust(left=None, bottom=None, right=None, top=None,wspace= 0.3, hspace=0.5)
    plt.show()
    return None


def clipping(dataframe, num_cols):
    """Return a copy of ``dataframe`` with outliers in ``num_cols`` clipped.

    For every listed column the bounds are ``floor(Q1 - 1.5 * IQR)`` and
    ``ceil(Q3 + 1.5 * IQR)``; values outside that interval are replaced by
    the nearest bound. Missing values are ignored when computing the
    quartiles and are left untouched in the output.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    num_cols : iterable of str
        Names of the numeric columns to clip.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with the listed columns clipped.
    """
    df_copy = dataframe.copy()
    for col in num_cols:
        # nanpercentile skips missing values. np.percentile returns NaN as
        # soon as the column contains one, which turned both bounds into NaN
        # and left the column unclipped.
        p25, p75 = np.nanpercentile(df_copy[col], [25, 75])
        iqr = p75 - p25
        lower = np.floor(p25 - 1.5 * iqr)
        upper = np.ceil(p75 + 1.5 * iqr)
        df_copy[col] = np.clip(df_copy[col], a_min=lower, a_max=upper)
    return df_copy


def get_category(df, col, binsnum, labels, qcut = False):
    """Return ``df`` with an extra ``<col>_CAT`` column of binned labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    col : str
        Numeric column to bin.
    binsnum : int or sequence
        Passed as ``q`` to ``pd.qcut`` or as ``bins`` to ``pd.cut``.
    labels : sequence
        Labels assigned to the resulting bins.
    qcut : bool, optional
        True for quantile-based bins, False (default) for ``pd.cut`` bins.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined with the new ``<col>_CAT`` column, stored as object dtype.
    """
    source = df[col]
    if qcut:
        # Quantile cut
        binned = pd.qcut(source, q=binsnum, labels=labels)
    else:
        # Equal-length cut
        binned = pd.cut(source, bins=binsnum, labels=labels)

    name = col + '_CAT'
    # Index-aligned join of the renamed bin labels onto the input frame.
    result = df.join(binned.rename(name))
    result[name] = result[name].astype(object)
    return result

### Configurations

In [None]:
# Let pandas render up to 100 rows and 50 columns before truncating output
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

### Load Data

In [None]:
# Load the application records from the local data folder
application_df = pd.read_csv('./data/application_record.csv')

# Preview the first rows
application_df.head()

In [None]:
# Load the credit records from the local data folder
credit_df = pd.read_csv('./data/credit_record.csv')

# Preview the first rows
credit_df.head()

### Data Pre-processing

In [None]:
# Drop duplicate applications: keep the first row seen for each ID
is_repeat_id = application_df.duplicated(subset='ID')
application_df = application_df[~is_repeat_id]

In [None]:
# Target label creation - target=1 (high risk) iff there is at least one month where user is late on payments by 30 days or more
# STATUS codes '0', 'C' and 'X' are labelled 0; every other code is labelled 1.
credit_df['target_status'] = np.where(credit_df['STATUS'].isin(['0', 'C', 'X']), 0, 1)

# Per-ID maximum of the monthly labels, i.e. 1 as soon as one month is labelled 1.
# The groupby .max() reduction replaces agg(max): passing the Python builtin
# to agg() is deprecated in pandas >= 2.1.
target_df = credit_df.groupby('ID')['target_status'].max().reset_index()

# Merge target label to application dataset (only IDs present in both tables)
merged_df = pd.merge(application_df, target_df, how='inner', on='ID')
merged_df['target_status'] = merged_df['target_status'].astype("category")

merged_df.head()

In [None]:
# Fill missing values
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form operates on a temporary object
# under pandas copy-on-write and would leave merged_df unchanged.
merged_df['OCCUPATION_TYPE'] = merged_df['OCCUPATION_TYPE'].fillna(value='Other')

In [None]:
# Derive new features
# AMT_INCOME_TOTAL_CAT: income split into three quantile-based bins
merged_df = get_category(
    merged_df,
    col='AMT_INCOME_TOTAL',
    binsnum=3,
    labels=["low", "medium", "high"],
    qcut=True,
)

In [None]:
# Data transformation
# Negate the day offsets
merged_df['DAYS_BIRTH'] = -merged_df['DAYS_BIRTH']
# Negate as well, then replace every non-positive result with 0
merged_df['DAYS_EMPLOYED'] = (-merged_df['DAYS_EMPLOYED']).clip(lower=0)

In [None]:
# Categorical columns used as model inputs
cat_features = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'FLAG_WORK_PHONE',
    'FLAG_PHONE',
    'FLAG_EMAIL',
    'OCCUPATION_TYPE',
    'AMT_INCOME_TOTAL_CAT',
]

# Categorical columns left out of the feature set
dropped_cat_features = ['FLAG_MOBIL']

# Numerical columns used as model inputs
num_features = [
    'AMT_INCOME_TOTAL',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'CNT_FAM_MEMBERS',
]

# Numerical columns left out of the feature set
dropped_num_features = ['CNT_CHILDREN']

# Numerical features first, then categorical ones
all_features = [*num_features, *cat_features]

In [None]:
# Handle outliers: clip every numerical feature to its IQR-based bounds
merged_df = clipping(merged_df, num_cols=num_features)

In [None]:
# Summary statistics of the numerical features, formatted with thousands
# separators and two decimals. Each column is formatted with Series.map
# because DataFrame.applymap is deprecated since pandas 2.1.
merged_df[num_features].describe().apply(lambda stats: stats.map('{:,.2f}'.format))

In [None]:
# Store every categorical feature with the pandas "category" dtype
merged_df = merged_df.astype({cat_col: "category" for cat_col in cat_features})
# Count / unique / top / freq summary of the categorical features
merged_df[cat_features].describe()

In [None]:
# Label encode categorical features
to_label_encode = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']

for feature in to_label_encode:
    # A fresh encoder is fitted per column, then used to replace the column
    # with its integer codes.
    labelencoder = LabelEncoder().fit(merged_df[feature])
    merged_df[feature] = labelencoder.transform(merged_df[feature])

In [None]:
# Ordinal encode CO features - NAME_EDUCATION_TYPE, AMT_INCOME_TOTAL_CAT
# Each value is replaced by its position in the explicit category list.
edu_type_oe = OrdinalEncoder(
    categories=[[
        'Lower secondary',
        'Secondary / secondary special',
        'Incomplete higher',
        'Higher education',
        'Academic degree',
    ]],
    dtype='int32',
)
# The encoder expects a 2-D input, hence the single-column reshape.
edu_values = np.asarray(merged_df['NAME_EDUCATION_TYPE']).reshape(-1, 1)
merged_df['NAME_EDUCATION_TYPE'] = edu_type_oe.fit_transform(edu_values)

incm_cat_oe = OrdinalEncoder(categories=[["low", "medium", "high"]], dtype='int32')
incm_values = np.asarray(merged_df['AMT_INCOME_TOTAL_CAT']).reshape(-1, 1)
merged_df["AMT_INCOME_TOTAL_CAT"] = incm_cat_oe.fit_transform(incm_values)

In [None]:
# (rows, columns) of the dataset at this point
merged_df.shape

In [None]:
# One-hot encode nominal features
to_onehot_encode = ['NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']

# Prefixes are matched to ``to_onehot_encode`` by position.
# dtype is pinned to uint8: pandas >= 2.0 would otherwise emit bool indicator
# columns, whereas earlier versions produced uint8.
merged_df = pd.get_dummies(
    data=merged_df,
    prefix=['inc_typ', 'fam_sta', 'hou_typ', 'occu_typ'],
    columns=to_onehot_encode,
    dtype='uint8',
)

print("Dataset shape: {}".format(merged_df.shape))

merged_df.head()

In [None]:
# Print the full per-column summary of the dataset
merged_df.info(verbose=True)

In [None]:
# Transform data types
# Cast the flag columns and the target to unsigned 8-bit integers
for uint8_col in ('FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'target_status'):
    merged_df[uint8_col] = merged_df[uint8_col].astype('uint8')

### Model Development

In [None]:
# Create independent and dependent variables
# Columns removed from the feature matrix: the identifier, two dropped
# features and the target itself.
non_feature_cols = ['ID', 'FLAG_MOBIL', 'CNT_CHILDREN', 'target_status']

y = merged_df['target_status']
X = merged_df.drop(columns=non_feature_cols)

X.shape

In [None]:
# Split data: 80/20 shuffled split, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1295,
    shuffle=True,
    stratify=y,
)

# Class balance of the training target
y_train.value_counts()

In [None]:
# Normalize numerical features
# The scaler is fitted on the training split only and then applied to both
# splits, so the test split does not influence the fitted statistics.
scaler = StandardScaler()
scaler.fit(X_train[num_features])

X_train[num_features] = scaler.transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

#### Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings (no hyper-parameter search)
gnb_clf = GaussianNB()

# Fit on the training split
gnb_clf.fit(X_train, y_train)

#### Logistic Regression

In [None]:
lr_clf = LogisticRegression(solver='liblinear')

# Both penalties are tried against 20 log-spaced values of C (1e-4 .. 1e4)
grid_values = dict(
    penalty=['l1', 'l2'],
    C=np.logspace(-4, 4, 20),
)

# 5-fold grid search that selects on recall
grid_lr_clf = GridSearchCV(
    estimator=lr_clf,
    param_grid=grid_values,
    scoring='recall',
    cv=5,
    n_jobs=4,
    verbose=10,
)

grid_lr_clf.fit(X_train, y_train)

In [None]:
# Report the selected parameters, their score and the corresponding estimator
for caption, value in (
    ('Grid best parameter: ', grid_lr_clf.best_params_),
    ('Grid best score (recall): ', grid_lr_clf.best_score_),
    ('\nBest Estimator: ', grid_lr_clf.best_estimator_),
):
    print(caption, value)

#### K Nearest Neighbours (kNN)

In [None]:
knn_clf = KNeighborsClassifier(weights='uniform', n_jobs=1)

# Candidate neighbourhood sizes
grid_values = dict(n_neighbors=[1, 2, 3, 5, 7, 9, 10])

# 5-fold grid search that selects on recall
grid_knn_clf = GridSearchCV(
    estimator=knn_clf,
    param_grid=grid_values,
    scoring='recall',
    cv=5,
    n_jobs=4,
    verbose=10,
)

grid_knn_clf.fit(X_train, y_train)

In [None]:
# Report the selected parameters, their score and the corresponding estimator
for caption, value in (
    ('Grid best parameter: ', grid_knn_clf.best_params_),
    ('Grid best score (recall): ', grid_knn_clf.best_score_),
    ('\nBest Estimator: ', grid_knn_clf.best_estimator_),
):
    print(caption, value)

#### Decision Tree

In [None]:
dt_clf = DecisionTreeClassifier(random_state=1295)

# Tree depth plus the two minimum-sample constraints are searched jointly
grid_values = dict(
    max_depth=[2, 3, 5, 7, 9],
    min_samples_split=list(range(2, 7)),
    min_samples_leaf=list(range(1, 6)),
)

# 5-fold grid search that selects on recall
grid_dt_clf = GridSearchCV(
    estimator=dt_clf,
    param_grid=grid_values,
    scoring='recall',
    cv=5,
    n_jobs=4,
    verbose=10,
)

grid_dt_clf.fit(X_train, y_train)

In [None]:
# Report the selected parameters, their score and the corresponding estimator
for caption, value in (
    ('Grid best parameter: ', grid_dt_clf.best_params_),
    ('Grid best score (recall): ', grid_dt_clf.best_score_),
    ('\nBest Estimator: ', grid_dt_clf.best_estimator_),
):
    print(caption, value)

In [None]:
# Disabled: the tree-plotting code below is wrapped in a string literal, so it
# is not executed (the cell only evaluates to that string).
# NOTE(review): feature_names uses all_features, which lists the column names
# from before one-hot encoding - confirm it matches the columns the tree was
# fitted on before re-enabling this.
'''
fig, _ = plt.subplots(nrows=1, ncols=1, figsize=(100,50), dpi=300)
tree.plot_tree(
    grid_dt_clf.best_estimator_,
    feature_names=all_features,
    filled=True
)

fig.savefig('./outputs/dt_clf.png', transparent=False)
'''

#### Support Vector Machine (SVM)

In [None]:
svm_clf = SVC(gamma='scale')

# Regularisation strength crossed with three kernel types
grid_values = dict(
    C=[0.01, 0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf'],
)

# 5-fold grid search that selects on recall, using all available cores
grid_svm_clf = GridSearchCV(
    estimator=svm_clf,
    param_grid=grid_values,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

grid_svm_clf.fit(X_train, y_train)

In [None]:
# Report the selected parameters, their score and the corresponding estimator
for caption, value in (
    ('Grid best parameter: ', grid_svm_clf.best_params_),
    ('Grid best score (recall): ', grid_svm_clf.best_score_),
    ('\nBest Estimator: ', grid_svm_clf.best_estimator_),
):
    print(caption, value)

### Model Evaluation

#### Naive Bayes

In [None]:
model = gnb_clf

# Predict once per split; every report below reuses these predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Training set
print("------------------------Training Set------------------------")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# Scalar metrics, printed in a fixed order with four decimals
for template, scorer in (
    ("Training Set Accuracy: {:.4f}", accuracy_score),
    ("Training Set Precision: {:.4f}", precision_score),
    ("Training Set Recall: {:.4f}", recall_score),
    ("Training Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_train, y_train_pred)))


# Test set
print("\n--------------------------Test Set--------------------------")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

for template, scorer in (
    ("Test Set Accuracy: {:.4f}", accuracy_score),
    ("Test Set Precision: {:.4f}", precision_score),
    ("Test Set Recall: {:.4f}", recall_score),
    ("Test Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_test, y_test_pred)))

#### Logistic Regression

In [None]:
model = grid_lr_clf.best_estimator_

# Predict once per split; every report below reuses these predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


# Training set
print("------------------------Training Set------------------------")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# Scalar metrics, printed in a fixed order with four decimals
for template, scorer in (
    ("Training Set Accuracy: {:.4f}", accuracy_score),
    ("Training Set Precision: {:.4f}", precision_score),
    ("Training Set Recall: {:.4f}", recall_score),
    ("Training Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_train, y_train_pred)))


# Test set
print("\n--------------------------Test Set--------------------------")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

for template, scorer in (
    ("Test Set Accuracy: {:.4f}", accuracy_score),
    ("Test Set Precision: {:.4f}", precision_score),
    ("Test Set Recall: {:.4f}", recall_score),
    ("Test Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_test, y_test_pred)))

#### kNN

In [None]:
model = grid_knn_clf.best_estimator_

# Predict once per split; every report below reuses these predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


# Training set
print("------------------------Training Set------------------------")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# Scalar metrics, printed in a fixed order with four decimals
for template, scorer in (
    ("Training Set Accuracy: {:.4f}", accuracy_score),
    ("Training Set Precision: {:.4f}", precision_score),
    ("Training Set Recall: {:.4f}", recall_score),
    ("Training Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_train, y_train_pred)))


# Test set
print("\n--------------------------Test Set--------------------------")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

for template, scorer in (
    ("Test Set Accuracy: {:.4f}", accuracy_score),
    ("Test Set Precision: {:.4f}", precision_score),
    ("Test Set Recall: {:.4f}", recall_score),
    ("Test Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_test, y_test_pred)))

#### Decision Tree

In [None]:
model = grid_dt_clf.best_estimator_

# Predict once per split; every report below reuses these predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


# Training set
print("------------------------Training Set------------------------")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# Scalar metrics, printed in a fixed order with four decimals
for template, scorer in (
    ("Training Set Accuracy: {:.4f}", accuracy_score),
    ("Training Set Precision: {:.4f}", precision_score),
    ("Training Set Recall: {:.4f}", recall_score),
    ("Training Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_train, y_train_pred)))


# Test set
print("\n--------------------------Test Set--------------------------")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

for template, scorer in (
    ("Test Set Accuracy: {:.4f}", accuracy_score),
    ("Test Set Precision: {:.4f}", precision_score),
    ("Test Set Recall: {:.4f}", recall_score),
    ("Test Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_test, y_test_pred)))

#### SVM

In [None]:
model = grid_svm_clf.best_estimator_

# Predict once per split; every report below reuses these predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


# Training set
print("------------------------Training Set------------------------")
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

# Scalar metrics, printed in a fixed order with four decimals
for template, scorer in (
    ("Training Set Accuracy: {:.4f}", accuracy_score),
    ("Training Set Precision: {:.4f}", precision_score),
    ("Training Set Recall: {:.4f}", recall_score),
    ("Training Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_train, y_train_pred)))


# Test set
print("\n--------------------------Test Set--------------------------")
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

for template, scorer in (
    ("Test Set Accuracy: {:.4f}", accuracy_score),
    ("Test Set Precision: {:.4f}", precision_score),
    ("Test Set Recall: {:.4f}", recall_score),
    ("Test Set f1: {:.4f}", f1_score),
):
    print(template.format(scorer(y_test, y_test_pred)))