# Import libraries


In [None]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
import re


# Read data

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data.csv')

# Discover and visualise the data

#### remove irrelevant features

In [None]:
# Columns dropped from the frame before analysis, grouped by who reviewed them.
remove_col = [
    # Identifiers, free text, dates and other fields judged irrelevant
    'id', 'member_id',
    'emp_title', 'desc', 'title',
    'issue_d', 'earliest_cr_line',
    'zip_code', 'addr_state',
    'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
    'collections_12_mths_ex_med', 'mths_since_last_major_derog',
    'policy_code',

    # Vu Anh: home_ownership, purpose, sub_grade, tot_coll_amt, tot_cur_bal,
    # open_acc_6m and open_il_6m are intentionally NOT removed.

    # Darrel
    'open_il_12m', 'open_il_24m',
    'mths_since_rcnt_il', 'total_bal_il', 'il_util',
    'open_rv_12m', 'open_rv_24m',

    # Vanness
    'max_bal_bc', 'all_util', 'total_rev_hi_lim',
    'inq_fi', 'total_cu_tl', 'inq_last_12m',
]

## Handle home-ownership

In [None]:
# Remove rows whose home_ownership is 'ANY'.
# The explicit .copy() makes df an independent frame rather than a slice of
# the original, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
df = df[df['home_ownership'] != 'ANY'].copy()

# Show the categories that remain
df['home_ownership'].unique()

In [None]:
# Label encoding for home ownership

# Categories the encoder is fitted on
home_type = ['RENT', 'OWN', 'MORTGAGE', 'OTHER', 'NONE']

# LabelEncoder sorts its classes, so the integer codes follow alphabetical
# order (MORTGAGE=0, NONE=1, OTHER=2, OWN=3, RENT=4), not the list order above.
encoder = LabelEncoder().fit(home_type)

# Replace the text values of 'home_ownership' with their integer codes
df['home_ownership'] = encoder.transform(df['home_ownership'])

## Handle purpose

In [None]:
# Not the last expression of the cell, so this result is not displayed
df['purpose'].unique()

# Loan purposes the encoder is fitted on
purposes = [
    'credit_card', 'car', 'small_business', 'other', 'wedding',
    'debt_consolidation', 'home_improvement', 'major_purchase',
    'medical', 'moving', 'vacation', 'house', 'renewable_energy',
    'educational',
]

# Fit on the known purposes; codes are assigned in alphabetical order
encoder = LabelEncoder().fit(purposes)

# Replace the text values of 'purpose' with their integer codes
df['purpose'] = encoder.transform(df['purpose'])

In [None]:
# Count the occurrences of each encoded purpose label, ordered by label
label_counts = df['purpose'].value_counts().sort_index()

# Show the counts as the cell output. The print loop used to be disabled
# inside a string literal, which made the cell display that literal instead.
label_counts

## Handle sub_grade

The sub-grade is a more detailed classification that further divides borrowers within each grade. It typically includes additional factors such as credit history, employment stability, and debt-to-income ratio. This additional level of detail can help in distinguishing the risk profile of borrowers within the same grade.

This is why we should keep the subgrade

In [None]:
# Inspect the distinct sub_grade values present in the data
df['sub_grade'].unique()

# TODO: check if we should use data binning for this attribute


To make the best use of label encoding, we sort the sub-grades so that the encoded integers preserve the ordering between sub-grades.

In [None]:
# Sub-grades in the order they were observed in the data
subgrades = [
    'B2', 'C4', 'C5', 'C1', 'B5', 'A4', 'E1', 'F2', 'C3', 'B1', 'D1',
    'A1', 'B3', 'B4', 'C2', 'D2', 'A3', 'A5', 'D5', 'A2', 'E4', 'D3',
    'D4', 'F3', 'E3', 'F4', 'F1', 'E5', 'G4', 'E2', 'G3', 'G2', 'G1',
    'F5', 'G5',
]


def custom_sort_key(subgrade):
    """Return a ``(letter, number)`` sort key for a sub-grade such as ``'B2'``.

    The leading letters and the digits that follow them are split apart so
    the numeric part is compared as an integer rather than as text.
    """
    letter, digits = re.match(r'([A-Za-z]+)(\d+)', subgrade).groups()
    return letter, int(digits)


# Order by grade letter first, then by the number within the grade
sorted_subgrades = sorted(subgrades, key=custom_sort_key)

print(sorted_subgrades)

In [None]:
# Label encoding for the sorted sub-grades

# Fit on the ordered sub-grade list, so 'A1' receives the lowest code and
# 'G5' the highest
encoder = LabelEncoder().fit(sorted_subgrades)

# Replace the text values of 'sub_grade' with their integer codes
df['sub_grade'] = encoder.transform(df['sub_grade'])

In [None]:
# Inspect the distinct sub_grade values after encoding
df['sub_grade'].unique()

## Handle 'tot_coll_amt'

In [None]:
# Count the missing values in tot_coll_amt. A count is more informative than
# the boolean from .isna().any(), and one check replaces the duplicated one.
# (67313 missing values were noted for this column when the cell was written.)
# They are filled in by the imputer in the next cell.
df['tot_coll_amt'].isna().sum()

In [None]:
# Fill missing tot_coll_amt values with the column mean

imputer = SimpleImputer(strategy='mean')

# SimpleImputer expects 2-D input, so pass the column as an (n_rows, 1) array
tot_coll_amt = df[['tot_coll_amt']].to_numpy()

tot_coll_amt_imputed = imputer.fit_transform(tot_coll_amt)

# Write the imputed values back over the original column
df['tot_coll_amt'] = tot_coll_amt_imputed

## Handle 'tot_cur_bal'

In [None]:
# True when at least one tot_cur_bal value is missing
df['tot_cur_bal'].isna().any()

# Need to handle missing values using simple imputer

In [None]:
# Fill missing tot_cur_bal values with the column mean
imputer = SimpleImputer(strategy='mean')

# SimpleImputer expects 2-D input, so pass the column as an (n_rows, 1) array
tot_cur_bal = df[['tot_cur_bal']].to_numpy()

tot_cur_bal_imputed = imputer.fit_transform(tot_cur_bal)

# Write the imputed values back over the original column
df['tot_cur_bal'] = tot_cur_bal_imputed

## Handle open_acc_6m

Potential Risk of Overextension: A higher number of newly opened accounts may suggest that the borrower has taken on additional credit obligations in a relatively short period. This could indicate a higher risk of overextension, where the borrower may have difficulty managing multiple credit accounts simultaneously.

In [None]:
# List the distinct open_acc_6m values
df['open_acc_6m'].unique()

# Handle missing value using Simple Imputer


In [None]:
# Fill missing open_acc_6m values with the column mean
imputer = SimpleImputer(strategy='mean')

# SimpleImputer expects 2-D input, so pass the column as an (n_rows, 1) array
open_acc_6m = df[['open_acc_6m']].to_numpy()

open_acc_6m_imputed = imputer.fit_transform(open_acc_6m)

# Write the imputed values back over the original column
df['open_acc_6m'] = open_acc_6m_imputed

## Handle 'open_il_6m'

Recent Credit Activity: 'open_il_6m' provides insights into the borrower's recent credit behavior and the number of installment accounts they have opened within the last 6 months. A higher number of open installment accounts may indicate an increased demand for credit or a need for additional funds, which could impact the borrower's creditworthiness assessment.

In [None]:
# List the distinct open_il_6m values
df['open_il_6m'].unique()

# TODO: check if we should use data binning for this attribute

In [None]:
# Fill missing open_il_6m values with the column mean

imputer = SimpleImputer(strategy='mean')

# SimpleImputer expects 2-D input, so pass the column as an (n_rows, 1) array
open_il_6m = df[['open_il_6m']].to_numpy()

open_il_6m_imputed = imputer.fit_transform(open_il_6m)

# Write the imputed values back over the original column
df['open_il_6m'] = open_il_6m_imputed


In [None]:
# Drop every column listed in remove_col
df = df.drop(columns=remove_col)

In [None]:
# (rows, columns) remaining after the drop
df.shape

In [None]:
# Summarise column dtypes and non-null counts
df.info()

### Process missing value

In [None]:
# Show which attributes still contain missing values

# One boolean per column: True when the column has at least one null
has_null = df.isnull().any()

null_columns = df.columns[has_null]
null_columns_result = has_null[null_columns]
null_columns_result

#### Emp_length

In [None]:
# List the distinct emp_length values before encoding
df['emp_length'].unique()

In [None]:
def _emp_length_to_years(value):
    """Convert an emp_length label to a whole number of years.

    A LabelEncoder would order the labels alphabetically ('1 year',
    '10+ years', '2 years', ..., '< 1 year'), which scrambles the natural
    ordering of employment length. Parsing the number keeps the encoded
    values ordinal.

    Returns:
        0 for labels starting with '<' (e.g. '< 1 year'), the first integer
        found in the label otherwise (e.g. '10+ years' -> 10), and -1 for
        missing values or labels that contain no number.
    """
    if pd.isna(value):
        return -1
    text = str(value).strip()
    if text.startswith('<'):
        return 0
    found = re.search(r'\d+', text)
    return int(found.group()) if found else -1


# Replace the text labels with their ordinal year counts
df['emp_length'] = df['emp_length'].map(_emp_length_to_years)

In [None]:
# Inspect the distinct emp_length values after encoding
df['emp_length'].unique()

#### mths_since_last_delinq 

In [None]:
# Replace missing values with the sentinel -1
# NOTE(review): presumably a missing value means no delinquency on record — confirm
df['mths_since_last_delinq'] = df['mths_since_last_delinq'].fillna(-1)

#### mths_since_last_record

In [None]:
# Replace missing values with the sentinel -1
# NOTE(review): presumably a missing value means no public record — confirm
df['mths_since_last_record'] = df['mths_since_last_record'].fillna(-1)

#### revol_util

In [None]:
# Fill missing revol_util values with the column mean
imputer = SimpleImputer(strategy='mean')

# SimpleImputer expects 2-D input, so pass the column as an (n_rows, 1) array
revol_util = df[['revol_util']].to_numpy()

revol_util_imputed = imputer.fit_transform(revol_util)

# Write the imputed values back over the original column
df['revol_util'] = revol_util_imputed

#### annual_inc_joint

In [None]:
# Aggregate annual_inc and annual_inc_joint: joint applications use the joint
# income. Rows whose joint value is missing keep their individual income
# instead of being overwritten with NaN.
use_joint = (df['application_type'] == 'JOINT') & df['annual_inc_joint'].notna()
df.loc[use_joint, 'annual_inc'] = df.loc[use_joint, 'annual_inc_joint']

# The joint column is now redundant
df = df.drop(columns='annual_inc_joint')

#### dti_joint 

In [None]:
# Joint applications use the joint debt-to-income ratio. Rows whose joint
# value is missing keep their individual dti instead of being overwritten
# with NaN.
use_joint = (df['application_type'] == 'JOINT') & df['dti_joint'].notna()
df.loc[use_joint, 'dti'] = df.loc[use_joint, 'dti_joint']

# The joint column is now redundant
df = df.drop(columns='dti_joint')

#### verification_status_joint 

In [None]:
# Joint applications use the joint verification status. Rows whose joint
# value is missing keep their individual status; a NaN here would not be a
# label known to the verification_status encoder fitted later.
use_joint = (df['application_type'] == 'JOINT') & df['verification_status_joint'].notna()
df.loc[use_joint, 'verification_status'] = df.loc[use_joint, 'verification_status_joint']

# The joint column is now redundant
df = df.drop(columns='verification_status_joint')

### Data binning

#### term

In [None]:
# List the distinct term values before encoding
df['term'].unique()

In [None]:
# Loan terms the encoder is fitted on (the leading space is part of the label)
term = [' 36 months', ' 60 months']

# Codes follow sorted order: ' 36 months' -> 0, ' 60 months' -> 1
encoder = LabelEncoder().fit(term)

# Replace the text values of 'term' with their integer codes
df['term'] = encoder.transform(df['term'])

### grade

In [None]:
# Grades the encoder is fitted on, best to worst
grades = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

# Codes follow sorted order: 'A' -> 0 through 'G' -> 6
encoder = LabelEncoder().fit(grades)

# Replace the text values of 'grade' with their integer codes
df['grade'] = encoder.transform(df['grade'])

### verification_status

In [None]:
# Verification statuses the encoder is fitted on
veri = ['Not Verified', 'Source Verified', 'Verified']

# Codes follow sorted order: 'Not Verified' -> 0, 'Source Verified' -> 1, 'Verified' -> 2
encoder = LabelEncoder().fit(veri)

# Replace the text values of 'verification_status' with their integer codes
df['verification_status'] = encoder.transform(df['verification_status'])

### pymnt_plan

In [None]:
# Payment-plan flags the encoder is fitted on
plan = ['n', 'y']

# Codes follow sorted order: 'n' -> 0, 'y' -> 1
encoder = LabelEncoder().fit(plan)

# Replace the text values of 'pymnt_plan' with their integer codes
df['pymnt_plan'] = encoder.transform(df['pymnt_plan'])

### application_type

In [None]:
# Application types the encoder is fitted on.
# Named application_types rather than `type`, so the builtin type() is not
# shadowed for the rest of the kernel session.
application_types = ['INDIVIDUAL', 'JOINT']

encoder = LabelEncoder()

# Codes follow sorted order: 'INDIVIDUAL' -> 0, 'JOINT' -> 1
encoder.fit(application_types)

# Replace the text values of 'application_type' with their integer codes
df['application_type'] = encoder.transform(df['application_type'])

#### initial_list_status

In [None]:

# Initial listing statuses the encoder is fitted on
status = ['f', 'w']

# Codes follow sorted order: 'f' -> 0, 'w' -> 1
encoder = LabelEncoder().fit(status)

# Replace the text values of 'initial_list_status' with their integer codes
df['initial_list_status'] = encoder.transform(df['initial_list_status'])

### Generate new feature

In [None]:
# Generate the new feature: ratio of open accounts to total accounts.
# A zero total_acc is mapped to NaN first, so the ratio is missing instead of
# +/-inf (which would break the correlation analysis below).
df['open_acc_rate'] = df['open_acc'] / df['total_acc'].replace(0, np.nan)

# Put the new feature in the slot of the target column and move the target to
# where the new feature was appended. This gives the same layout as swapping
# the two columns' values and names, but the position guard means re-running
# the cell does not flip the columns back.
column_order = df.columns.tolist()
target_pos = column_order.index('default_ind')
rate_pos = column_order.index('open_acc_rate')
if rate_pos > target_pos:
    column_order[target_pos], column_order[rate_pos] = (
        column_order[rate_pos],
        column_order[target_pos],
    )
    df = df[column_order]

In [None]:
# Preview the first rows of the processed frame
df.head()

In [None]:
# (rows, columns) of the processed frame
df.shape

### Visualize

In [None]:
# Create a correlation matrix over the numeric and boolean columns only.
# Since pandas 2.0, DataFrame.corr() no longer skips non-numeric columns by
# default and raises if any column still holds text; selecting the dtypes
# explicitly works on both old and new pandas versions.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Select the correlation values with 'default_ind' (used for the ranking below)
target_corr = corr_matrix['default_ind']

# Plot the correlation matrix as a heatmap on a fixed [-1, 1] colour scale
plt.figure(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

### Attribute ranking

In [None]:
# Rank attributes by the strength of their correlation with the target,
# ignoring the sign
abs_corr = target_corr.abs()

# Strongest correlation first
sorted_corr = abs_corr.sort_values(ascending=False)

# Turn the ranking into a two-column table for display
attribute_ranking = (
    sorted_corr
    .rename_axis('Attribute')
    .reset_index(name='Absolute Correlation')
)
attribute_ranking