In [None]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Read data 

df = pd.read_csv('data.csv')

In [None]:
df.head()

# Data discovery

In [None]:
df.info()

In [None]:
# Check the null value in each attributes

df.isnull().sum()

# Data pre-processing

In [None]:
# total_rev_hi_lim

# Missing total_rev_hi_lim entries are replaced by the column median.

imputer = SimpleImputer(strategy='median')
# SimpleImputer works on 2-D input, hence the single-column reshape
total_rev_hi_lim = df['total_rev_hi_lim'].to_numpy().reshape(-1, 1)
total_rev_hi_lim_imputed = imputer.fit_transform(total_rev_hi_lim)
# Flatten back to one value per row before storing the column
df['total_rev_hi_lim'] = total_rev_hi_lim_imputed.ravel()

In [None]:
# home_ownership

# Remove rows with value ANY.
# The explicit .copy() makes df an independent frame, so the column
# assignment at the end of this cell does not write into a slice of the
# unfiltered frame (which pandas reports as SettingWithCopyWarning).
df = df[df['home_ownership'] != 'ANY'].copy()

# Label encoding for home ownership

home_type = ['RENT', 'OWN', 'MORTGAGE', 'OTHER', 'NONE']  # Categories the encoder knows

# Create an instance of LabelEncoder
encoder = LabelEncoder()

# Fit on the fixed category list rather than on the data, so a value that is
# not in home_type makes transform() fail instead of silently getting a code
encoder.fit(home_type)

# Replace the home_ownership strings with their integer codes
df['home_ownership'] = encoder.transform(df['home_ownership'])


In [None]:
# Purpose

# Label encoding for purpose: every category string becomes an integer code

df['purpose'].unique()

purposes = [
    'credit_card', 'car', 'small_business', 'other', 'wedding',
    'debt_consolidation', 'home_improvement', 'major_purchase',
    'medical', 'moving', 'vacation', 'house', 'renewable_energy',
    'educational',
]

# fit() returns the fitted encoder, so creation and fitting are chained
encoder = LabelEncoder().fit(purposes)

# Replace the purpose strings with their integer codes
df['purpose'] = encoder.transform(df['purpose'])


In [None]:
# sub_grade

# Sort the order of subgrades and do label encoding

subgrades = ['B2', 'C4', 'C5', 'C1', 'B5', 'A4', 'E1', 'F2', 'C3', 'B1', 'D1',
       'A1', 'B3', 'B4', 'C2', 'D2', 'A3', 'A5', 'D5', 'A2', 'E4', 'D3',
       'D4', 'F3', 'E3', 'F4', 'F1', 'E5', 'G4', 'E2', 'G3', 'G2', 'G1',
       'F5', 'G5']

def custom_sort_key(subgrade):
    """Split a sub-grade such as 'B2' into a sortable (letter, number) pair.

    Parameters:
        subgrade (str): Leading letters followed by digits, e.g. 'C4'.

    Returns:
        tuple: (letters, int(digits)), so the numeric part sorts numerically.
    """
    letter, digits = re.match(r'([A-Za-z]+)(\d+)', subgrade).groups()
    return letter, int(digits)

# Order the sub-grades by (letter, number)
sorted_subgrades = sorted(subgrades, key=custom_sort_key)

# Label encoding for the sorted sub-grades.
# fit() returns the fitted encoder, so creation and fitting are chained.
encoder = LabelEncoder().fit(sorted_subgrades)

# Replace the sub_grade strings with their integer codes
df['sub_grade'] = encoder.transform(df['sub_grade'])

In [None]:
# employment_lengths

employment_lengths = ['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years', 'nan']

label_encoder = LabelEncoder()

# Fit the LabelEncoder with unique values
label_encoder.fit(employment_lengths)

# Missing emp_length values are float NaN in the frame, while the encoder
# only knows the string 'nan'. Fill them with that string first so that
# transform() does not fail on an unseen label.
# NOTE(review): LabelEncoder numbers its classes in sorted (alphabetical)
# order, so the codes do not follow the tenure order of the list above.
df['emp_length'] = label_encoder.transform(df['emp_length'].fillna('nan'))

In [None]:
# mths_since_last_delinq

# Missing values are replaced with the sentinel -1
df['mths_since_last_delinq'] = df['mths_since_last_delinq'].fillna(-1)

In [None]:
# mths_since_last_record

# Missing values are replaced with the sentinel -1
df['mths_since_last_record'] = df['mths_since_last_record'].fillna(-1)

In [None]:
# revol_util
# Handle missing values with an imputer: gaps are filled with the column mean

imputer = SimpleImputer(strategy='mean')

# SimpleImputer works on 2-D input, hence the single-column reshape
revol_util = df['revol_util'].to_numpy().reshape(-1, 1)
revol_util_imputed = imputer.fit_transform(revol_util)

# Flatten back to one value per row before storing the column
df['revol_util'] = revol_util_imputed.ravel()

In [None]:
# aggregate annual_inc and annual_inc_joint

# For JOINT applications the joint income overwrites annual_inc;
# the joint column is dropped afterwards.
is_joint = df['application_type'] == 'JOINT'
df.loc[is_joint, 'annual_inc'] = df.loc[is_joint, 'annual_inc_joint']
df = df.drop(columns='annual_inc_joint')

In [None]:
# dti_joint

# For JOINT applications the joint DTI overwrites dti;
# the joint column is dropped afterwards.
is_joint = df['application_type'] == 'JOINT'
df.loc[is_joint, 'dti'] = df.loc[is_joint, 'dti_joint']
df = df.drop(columns='dti_joint')

In [None]:
# verification_status_joint

# For JOINT applications the joint verification status overwrites
# verification_status; the joint column is dropped afterwards.
is_joint = df['application_type'] == 'JOINT'
df.loc[is_joint, 'verification_status'] = df.loc[is_joint, 'verification_status_joint']
df = df.drop(columns='verification_status_joint')

In [None]:
# term
# Label encoding for term

term = [' 36 months', ' 60 months']  # Known values (note the leading space)

# fit() returns the fitted encoder, so creation and fitting are chained
encoder = LabelEncoder().fit(term)

# Replace the term strings with their integer codes
df['term'] = encoder.transform(df['term'])

In [None]:
# verification_status

veri = ['Not Verified', 'Source Verified', 'Verified']  # Unique values for encoding

# fit() returns the fitted encoder, so creation and fitting are chained
encoder = LabelEncoder().fit(veri)

# Replace the verification_status strings with their integer codes
df['verification_status'] = encoder.transform(df['verification_status'])

In [None]:
# pymnt_plan

plan = ['n', 'y']  # Unique values for encoding

# fit() returns the fitted encoder, so creation and fitting are chained
encoder = LabelEncoder().fit(plan)

# Replace the pymnt_plan strings with their integer codes
df['pymnt_plan'] = encoder.transform(df['pymnt_plan'])

In [None]:
# application_type

# Named application_types rather than `type`, which would shadow the Python
# builtin for every later cell in the kernel session.
application_types = ['INDIVIDUAL', 'JOINT']  # Unique values for encoding

encoder = LabelEncoder()

encoder.fit(application_types)

# Replace the application_type strings with their integer codes
df['application_type'] = encoder.transform(df['application_type'])

In [None]:

# initial_list_status

status = ['f', 'w']  # Unique values for encoding

# fit() returns the fitted encoder, so creation and fitting are chained
encoder = LabelEncoder().fit(status)

# Replace the initial_list_status strings with their integer codes
df['initial_list_status'] = encoder.transform(df['initial_list_status'])

In [None]:
# Fill missing last credit pull
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a selected column acts on an intermediate object and is not guaranteed
# to update df itself.
df['last_credit_pull_d'] = df['last_credit_pull_d'].fillna("25-07-2023")

In [None]:
# Credit History Length:
# Calculated as: last_credit_pull - earliest_cr_line
def date_difference(date_str1, date_str2):
    """Return the signed number of days from date_str1 to date_str2.

    Parameters:
        date_str1 (str): Start date written as DD-MM-YYYY.
        date_str2 (str): End date written as DD-MM-YYYY.

    Returns:
        int: Days between the two dates; negative when date_str2 is earlier.
    """
    date_format = "%d-%m-%Y"
    start, end = (datetime.strptime(text, date_format) for text in (date_str1, date_str2))
    return (end - start).days

# Parse both date columns once (DD-MM-YYYY) and subtract them as whole
# columns instead of calling strptime row by row through DataFrame.apply.
earliest_cr_line_dt = pd.to_datetime(df['earliest_cr_line'], format="%d-%m-%Y")
last_credit_pull_dt = pd.to_datetime(df['last_credit_pull_d'], format="%d-%m-%Y")
df['credit_history_length'] = (last_credit_pull_dt - earliest_cr_line_dt).dt.days

# Swap the values and column names.
# The new feature was appended as the last column; exchanging the contents of
# the two columns and then their names leaves the data unchanged but puts
# 'credit_history_length' where 'default_ind' was and 'default_ind' last.
df['default_ind'], df['credit_history_length'] = df['credit_history_length'], df['default_ind']
df.rename(columns={'default_ind': 'credit_history_length', 'credit_history_length': 'default_ind'}, inplace=True)

In [None]:
# Drop not usable attributes

# Columns removed from df before modelling
remove_col = [
    'id', 'member_id', 'emp_title', 'issue_d', 'desc', 'title',
    'zip_code', 'addr_state', 'earliest_cr_line',
    'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
    'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code',
    'tot_coll_amt', 'tot_cur_bal',
    'open_acc_6m', 'open_il_6m', 'open_il_12m', 'open_il_24m',
    'mths_since_rcnt_il', 'total_bal_il', 'il_util',
    'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
    'inq_fi', 'total_cu_tl', 'inq_last_12m',
    'grade',
]

df = df.drop(columns=remove_col)

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
# Visualize default_ind attribute

# Bar chart with the number of rows per default_ind value
sns.countplot(x='default_ind', data=df)
plt.show()

## New feature using user defined transformer

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class CreditUtilizationRatioTransformer(BaseEstimator, TransformerMixin):
    """Add three ratio features derived from existing loan columns.

    Columns added by transform() when the matching flag is true:
        Rev_to_Inc_Ratio: revol_bal / annual_inc
        Loan_Payment_to_Income_Ratio: installment / (annual_inc / 12)
        Repayment_Progress: (total_rec_prncp / funded_amnt) * 100

    Parameters:
        include (bool): Initial value of all three include_* flags.
    """

    def __init__(self, include=True):
        # Keep the constructor argument under its own name: BaseEstimator's
        # get_params(), clone() and repr() read it back with getattr().
        self.include = include

        # Names of the input columns the ratios are built from
        self.revol_bal_col = 'revol_bal'
        self.annual_inc_col = 'annual_inc'
        self.installment_col = 'installment'
        self.total_rec_prncp_col = 'total_rec_prncp'
        self.funded_amnt_col = 'funded_amnt'

        # One switch per generated column, all initialised from `include`
        self.include_rev_to_inc_ratio = include
        self.include_loan_to_inc_ratio = include
        self.include_repayment_progress = include

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return a copy of X with the enabled ratio columns appended.

        Parameters:
            X (pandas.DataFrame): Frame containing the input columns named in
                __init__.
            y: Ignored.

        Returns:
            pandas.DataFrame: Copy of X plus the enabled ratio columns; the
            frame passed in is left unchanged.

        NOTE(review): rows with annual_inc or funded_amnt equal to 0 yield
        inf/NaN ratios - confirm how downstream models should treat them.
        """
        # Work on a copy so the caller's frame is not modified as a side effect
        X = X.copy()
        annual_inc = X[self.annual_inc_col]

        if self.include_rev_to_inc_ratio:
            # Revolving Credit Balance to Annual Income Ratio
            X['Rev_to_Inc_Ratio'] = X[self.revol_bal_col] / annual_inc

        if self.include_loan_to_inc_ratio:
            # Loan Payment-to-Income Ratio: installment over monthly income
            X['Loan_Payment_to_Income_Ratio'] = X[self.installment_col] / (annual_inc / 12)

        if self.include_repayment_progress:
            # Repayment Progress: principal received as a percentage of the funded amount
            X['Repayment_Progress'] = (X[self.total_rec_prncp_col] / X[self.funded_amnt_col]) * 100

        return X


In [None]:
# Visualize - Correlation matrix

# Pairwise correlations between the columns of df
corr_matrix = df.corr()

# Correlation of every column with the target 'default_ind'
target_corr = corr_matrix['default_ind']

# Draw the full matrix as a heatmap on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# Model selection and training

#### Feature selection

In [None]:
# Columns used as model inputs
selected_features = [
    'collection_recovery_fee',
    'acc_now_delinq',
    'funded_amnt',
    'funded_amnt_inv',
    'mths_since_last_record',
    'delinq_2yrs',
    'dti',
    'mths_since_last_delinq',
    'emp_length',
    'pub_rec',
    'revol_bal',
    'credit_history_length',
    'term',
    'home_ownership',
    'total_rev_hi_lim',
    'total_pymnt',
    'total_pymnt_inv',
    'purpose',
    'revol_util',
    'total_rec_int',
    'inq_last_6mths',
    'total_rec_prncp',
    'sub_grade',
    'total_rec_late_fee',
    'int_rate',
    'out_prncp_inv',
    'out_prncp',
    'recoveries',
]

# Feature matrix and target vector
X = df[selected_features]
y = df['default_ind']

#### Stratified sampling + undersampling method

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

def stratified_sampling_and_undersampling(X, y, test_size=0.2, random_state=42):
    """
    Perform a stratified train/test split and undersample the training set.

    Every class in the training split is reduced to the size of the smallest
    class by drawing rows without replacement, so the balanced training set
    contains no duplicated rows. The test split is returned untouched.

    Parameters:
        X (array-like): Input features.
        y (array-like): Target variable.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int or None): Random seed for reproducibility.

    Returns:
        X_train_undersampled (ndarray): Undersampled training features.
        X_test (array-like): Test features.
        y_train_undersampled (ndarray): Undersampled training target variable.
        y_test (array-like): Test target variable.
    """
    # Split the data into training and testing sets using stratified sampling
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Plain arrays let rows be selected by position
    X_train_values = np.asarray(X_train)
    y_train_values = np.asarray(y_train)

    # Size of the least represented class
    classes, class_counts = np.unique(y_train_values, return_counts=True)
    min_class_count = class_counts.min()

    # For each class pick min_class_count row positions. replace=False is
    # essential: resample() bootstraps (draws with replacement) by default.
    kept_positions = np.concatenate([
        resample(
            np.flatnonzero(y_train_values == label),
            replace=False,
            n_samples=min_class_count,
            random_state=random_state,
        )
        for label in classes
    ])

    X_train_undersampled = X_train_values[kept_positions]
    y_train_undersampled = y_train_values[kept_positions]

    return X_train_undersampled, X_test, y_train_undersampled, y_test

#### Stratified sampling + Oversampling method 

In [None]:
from imblearn.over_sampling import SMOTE

def oversampling(X, y, test_size=0.2, random_state=42):
    """
    Perform a stratified train/test split and balance the training set with SMOTE.

    Parameters:
        X (array-like): Input features.
        y (array-like): Target variable.
        test_size (float): Proportion of the dataset to include in the test split.
        random_state (int or None): Seed used for both the split and SMOTE.

    Returns:
        X_train_balanced: Training features after SMOTE.
        X_test: Test features (not resampled).
        y_train_balanced: Training target after SMOTE.
        y_test: Test target (not resampled).
    """
    # Honour the caller's test_size and random_state instead of fixed values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Instantiate the SMOTE object
    smote = SMOTE(random_state=random_state)

    # Perform SMOTE only on the training data
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    return X_train_balanced, X_test, y_train_balanced, y_test

#### NB Classifier with default setting

In [None]:
# Baseline: stratified 80/20 split of X/y with no class rebalancing
X0_train, X0_test, y0_train, y0_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# Gaussian Naive Bayes classifier with default settings
nb_classifier_default = GaussianNB()

# Train the classifier on the training data
nb_classifier_default.fit(X0_train, y0_train)

In [None]:
y_pred_def = nb_classifier_default.predict(X0_test)

#### NB classifier using undersampling

In [None]:
# Stratified split of X/y with the training part undersampled (default test_size and seed)
X_train, X_test, y_train, y_test = stratified_sampling_and_undersampling(X, y)
# Gaussian Naive Bayes classifier with default settings
nb_classifier = GaussianNB()

# Train the classifier on the undersampled training data
nb_classifier.fit(X_train, y_train)

In [None]:
y_pred = nb_classifier.predict(X_test)

#### NB classifier using oversampling

In [None]:
# Stratified split of X/y with the training part oversampled (default test_size and seed)
X2_train, X2_test, y2_train, y2_test = oversampling(X, y)
# Gaussian Naive Bayes classifier with default settings
nb_classifier_over = GaussianNB()

# Train the classifier on the oversampled training data
nb_classifier_over.fit(X2_train, y2_train)

In [None]:
y_pred_o = nb_classifier_over.predict(X2_test)

# Fine tune the model

#### Using user-defined transformer

In [None]:
# Decide to use new generated features
# include=True switches on all three include_* flags of the transformer

transformer_with_features = CreditUtilizationRatioTransformer(include=True)

# Apply the transformer to add the new features to df.
# fit() is not called first; the transformer's fit() only returns self.
df = transformer_with_features.transform(df)

#### Add new feature to selection list as hyperparameter

In [None]:
# Columns added to df by CreditUtilizationRatioTransformer
engineered_features = ['Repayment_Progress', 'Loan_Payment_to_Income_Ratio', 'Rev_to_Inc_Ratio']

# Extend the feature list, skipping names that are already present so that
# re-running this cell does not produce duplicated columns in X1
for feature in engineered_features:
    if feature not in selected_features:
        selected_features.append(feature)

# Feature matrix (original + engineered features) and target vector
X1 = df[selected_features]
y1 = df['default_ind']

#### NB classifier using only hyperparameter

In [None]:
# Same stratified 80/20 split as the baseline, but on X1/y1 so that this
# model actually sees the engineered features (splitting X/y here would
# just retrain the baseline model)
X4_train, X4_test, y4_train, y4_test = train_test_split(X1, y1, test_size=0.2, stratify=y1, random_state=42)
nb_classifier_hyper_def = GaussianNB()

# Train the classifier on the training data
nb_classifier_hyper_def.fit(X4_train, y4_train)

In [None]:
y_pred_hyperparam_def = nb_classifier_hyper_def.predict(X4_test)

#### NB classifier using hyperparameter and undersampling

In [None]:
# Stratified split of X1/y1 (features incl. the engineered ones) with the training part undersampled
X1_train, X1_test, y1_train, y1_test = stratified_sampling_and_undersampling(X1, y1)
# Gaussian Naive Bayes classifier with default settings
nb_classifier_hyper = GaussianNB()

# Train the classifier on the undersampled training data
nb_classifier_hyper.fit(X1_train, y1_train)

In [None]:
y_pred_hyperparam = nb_classifier_hyper.predict(X1_test)

#### NB classifier using hyperparameter and oversampling

In [None]:
# Stratified split of X1/y1 (features incl. the engineered ones) with the training part oversampled
X3_train, X3_test, y3_train, y3_test = oversampling(X1, y1)
# Gaussian Naive Bayes classifier with default settings
nb_classifier_hyper_o = GaussianNB()

# Train the classifier on the oversampled training data
nb_classifier_hyper_o.fit(X3_train, y3_train)

In [None]:
y_pred_hyperparam_o = nb_classifier_hyper_o.predict(X3_test)

# Evaluate the outcome

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Evaluate the model accuracy: each model is scored on its own test split

accuracy0 = accuracy_score(y0_test, y_pred_def)
accuracy = accuracy_score(y_test, y_pred)
accuracy_o = accuracy_score(y2_test, y_pred_o)
accuracy_hyperparam0 = accuracy_score(y4_test, y_pred_hyperparam_def)
accuracy_hyperparam = accuracy_score(y1_test, y_pred_hyperparam)
accuracy_hyperparam_o = accuracy_score(y3_test, y_pred_hyperparam_o)

# One (label, value) pair per model, printed in the order the models were trained
accuracy_report = [
    ("Accuracy of default NB:", accuracy0),
    ("Accuracy of default NB (Undersampling):", accuracy),
    ("Accuracy of default NB (Oversampling):", accuracy_o),
    ("Accuracy of default NB with hyper parameter:", accuracy_hyperparam0),
    ("Accuracy of default NB with hyper parameter (Undersampling):", accuracy_hyperparam),
    ("Accuracy of default NB with hyper parameter (Oversampling):", accuracy_hyperparam_o),
]
for label, value in accuracy_report:
    print(label, value)

In [None]:
# Confusion matrix for default NB
cm = confusion_matrix(y0_test, y_pred_def)

# Heatmap of the counts; rows are true classes, columns are predictions
class_names = ['Non-default', 'Default']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Gaussian Naive Bayes Confusion Matrix')
plt.show()

In [None]:
# Confusion matrix for default NB + undersampling
cm = confusion_matrix(y_test, y_pred)

# Heatmap of the counts; rows are true classes, columns are predictions
class_names = ['Non-default', 'Default']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Gaussian Naive Bayes Confusion Matrix (Undersampling)')
plt.show()

In [None]:
# Confusion matrix for default NB + oversampling
cm = confusion_matrix(y2_test, y_pred_o)

# Heatmap of the counts; rows are true classes, columns are predictions
class_names = ['Non-default', 'Default']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Gaussian Naive Bayes (Oversampling) Confusion Matrix')
plt.show()

In [None]:
# Confusion matrix for hyperparam NB
cm = confusion_matrix(y4_test, y_pred_hyperparam_def)

# Heatmap of the counts; rows are true classes, columns are predictions
class_names = ['Non-default', 'Default']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Gaussian Naive Bayes Confusion Matrix (Hyper parameter)')
plt.show()

In [None]:
# Generate the confusion matrix for hyperparam NB + undersampling.
# y_pred_hyperparam was predicted from X1_test, so it is compared with the
# labels of that same split (y1_test), not with another model's y_test.
cm = confusion_matrix(y1_test, y_pred_hyperparam)

# Display the confusion matrix as a heatmap
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-default', 'Default'], yticklabels=['Non-default', 'Default'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Gaussian Naive Bayes Confusion Matrix (Hyper parameter & Undersampling)')
plt.show()

In [None]:
# Confusion matrix for hyperparam NB + oversampling
cm = confusion_matrix(y3_test, y_pred_hyperparam_o)

# Heatmap of the counts; rows are true classes, columns are predictions
class_names = ['Non-default', 'Default']
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Gaussian Naive Bayes Confusion Matrix (Hyper parameter & Oversampling)')
plt.show()

# Conclusion