# Data Cleaning for PD Prediction

In [59]:
# import the necessary packages
import numpy as np
import os
import sys
import time

import gzip
import pandas as pd


csv_file_path = '../Data/Loan_status_2007-2020Q3.gzip'
df = pd.read_csv(csv_file_path, skiprows=range(1,2825493), nrows=100000)

In [70]:
# Step 1: Determine the total number of rows in the file
total_rows = sum(1 for row in open(csv_file_path)) - 1  # Minus 1 to exclude the header

# Step 2: Randomly select which rows to skip
n = 50000  # Number of rows to sample
skip_rows = sorted(np.random.choice(range(1, total_rows + 1), total_rows - n, replace=False))

# Step 3: Read the file with the skiprows parameter
df_sampled = pd.read_csv(csv_file_path, skiprows=skip_rows)


  df_sampled = pd.read_csv(csv_file_path, skiprows=skip_rows)


In [72]:
df_sampled.shape

(50000, 142)

In [74]:
df_sampled['loan_status'].value_counts() 

loan_status
Fully Paid                                             25498
Current                                                17746
Charged Off                                             6172
Late (31-120 days)                                       293
In Grace Period                                          155
Late (16-30 days)                                         46
Issued                                                    37
Does not meet the credit policy. Status:Fully Paid        30
Does not meet the credit policy. Status:Charged Off       15
Default                                                    8
Name: count, dtype: int64

In [76]:
# display the first 10 columns by percentage of missing data
missing_percentages = df_sampled.isnull().sum() * 100 / len(df)
missing_percentages.sort_values(ascending=False).head(10)

hardship_loan_status       47.545
hardship_reason            47.543
hardship_type              47.542
payment_plan_start_date    47.542
hardship_status            47.542
deferral_term              47.542
hardship_start_date        47.542
hardship_end_date          47.542
hardship_length            47.542
hardship_dpd               47.542
dtype: float64

In [79]:
df_sampled.head()

Unnamed: 0.1,Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag
0,101,1034693,16000,16000,16000.0,60 months,17.58%,402.65,D,D4,...,,,,,,,,,,N
1,195,1067038,12000,12000,12000.0,36 months,14.27%,411.71,C,C2,...,,,,,,,,,,N
2,234,1065929,3500,3500,3500.0,36 months,7.90%,109.52,A,A4,...,,,,,,,,,,N
3,263,1066364,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,,,,,,,,,,N
4,294,1063126,6000,6000,6000.0,36 months,10.65%,195.44,B,B2,...,,,,,,,,,,N


Columns that contain more than 50% missing data are not utilised in this study.

In [108]:
# drop the columns with more than 50% missing values
columns_to_drop = missing_percentages[missing_percentages > 50].index
df = df_sampled.drop(columns=columns_to_drop)
df.shape

(50000, 142)

Categorical columns are selected in this phase.

In [109]:
# treat categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns].head()

Unnamed: 0,term,int_rate,grade,sub_grade,emp_title,emp_length,home_ownership,verification_status,issue_d,loan_status,...,sec_app_earliest_cr_line,hardship_flag,hardship_type,hardship_reason,hardship_status,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_loan_status,debt_settlement_flag
0,60 months,17.58%,D,D4,Menards,7 years,RENT,Not Verified,Dec-2011,Fully Paid,...,,N,,,,,,,,N
1,36 months,14.27%,C,C2,Corning Inc.,8 years,MORTGAGE,Not Verified,Dec-2011,Charged Off,...,,N,,,,,,,,N
2,36 months,7.90%,A,A4,impax laboratory,7 years,MORTGAGE,Not Verified,Dec-2011,Fully Paid,...,,N,,,,,,,,N
3,36 months,13.49%,C,C1,St. Joseph Hospital,10+ years,MORTGAGE,Not Verified,Dec-2011,Fully Paid,...,,N,,,,,,,,N
4,36 months,10.65%,B,B2,Neil Nakai Inc.,10+ years,RENT,Not Verified,Dec-2011,Fully Paid,...,,N,,,,,,,,N


In [110]:
# check variance of categorical columns
df[categorical_columns].nunique().sort_values()

pymnt_plan                       1
term                             2
hardship_flag                    2
application_type                 2
initial_list_status              2
debt_settlement_flag             2
verification_status_joint        3
verification_status              3
hardship_type                    4
hardship_status                  4
home_ownership                   6
grade                            7
hardship_loan_status             9
loan_status                     10
emp_length                      11
purpose                         14
hardship_reason                 17
sub_grade                       35
hardship_start_date             40
payment_plan_start_date         40
hardship_end_date               41
next_pymnt_d                    46
addr_state                      50
last_credit_pull_d             125
last_pymnt_d                   142
issue_d                        157
sec_app_earliest_cr_line       457
int_rate                       534
earliest_cr_line    

The idea is to reduce model complexity by retaining the categorical variables that do not suffer from high cardinality (many categories).

Purpose and title covney similar information. Purpose however contains less categoris and it is thus preferred. Therefore, we keep only purpose. 

Grade and subgrade are hierarchically related. Grade contains less categories and is therefore perferable as it bears less complexity than Subgrade. Therefore, we will keep only the grade column.

Issue date will be kept as time reference and will later be translated into a date format. 

The categorical variables we keep comprise the grade, type of home ownership, the interest rate, loan status, title (type of loan), and term

In [111]:
cat_columns_to_keep = ['issue_d', 'grade', 'home_ownership', 'loan_status', 'int_rate', 'term', 'purpose']
cat_columns_to_drop = set(categorical_columns) - set(cat_columns_to_keep)
df.drop(columns=cat_columns_to_drop, inplace=True)
df.shape

(50000, 115)

In [112]:
# treat numeric columns
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

# check variance of numeric columns
df[numeric_columns].nunique().sort_values()

policy_code             1
num_tl_120dpd_2m        2
num_tl_30dpd            3
acc_now_delinq          3
hardship_length         3
                    ...  
tot_cur_bal         44142
total_rec_int       46339
total_pymnt_inv     47552
total_pymnt         48126
id                  50000
Length: 108, dtype: int64

Policy code is useless. We can drop it. Also id is useless and will be removed. 

Columns regarding settlement produce leakage of future information. Therefore they are not included. 

To drive down the number of predictors further, we base the set of numeric predictors on a previous study on Kaggle: https://www.kaggle.com/code/krishnaraj30/xgboost-loan-defaulters-prediction/notebook#%F0%9F%92%BE-Data-Description

Therefore, the numeric features used include:
- loan amount
- installment
- annual income
- debt to income ratio
- number of credit lines the borrower uses
- number of public record bankruptcies

In [113]:
numeric_columns_to_keep = ['loan_amnt', 'installment', 'annual_inc', 'dti', 'total_acc', 'pub_rec_bankruptcies']
numeric_columns_to_drop = set(numeric_columns) - set(numeric_columns_to_keep)
df.drop(columns=numeric_columns_to_drop, inplace=True)
df.shape

(50000, 13)

In [115]:
df['pub_rec_bankruptcies'].value_counts()

pub_rec_bankruptcies
0.0    44299
1.0     5446
2.0      188
3.0       29
4.0       11
5.0        1
Name: count, dtype: int64

In [116]:
df.to_csv('../Data/Loan_status_Cleaned.csv', index=False)