In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, 
                             precision_recall_curve, auc)
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import chi2_contingency

# Read the loan CSV from the working directory into a DataFrame.
# low_memory=False has pandas parse the file in a single pass rather than in
# chunks, so each column gets one consistently inferred dtype.
df = pd.read_csv(
    'loan_data_2015.csv',
    low_memory=False,
)

In [62]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(421094, 74)

In [63]:
# Preview the first five rows to sanity-check the parsed values.
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,60516983,64537751,20000,20000,20000,36 months,12.29,667.06,C,C1,...,,,,,,,41000,,,
1,60187139,64163931,11000,11000,11000,36 months,12.69,369.0,C,C2,...,,,,,,,13100,,,
2,60356453,64333218,7000,7000,7000,36 months,9.99,225.84,B,B3,...,,,,,,,16300,,,
3,59955769,63900496,10000,10000,10000,36 months,10.99,327.34,B,B4,...,,,,,,,34750,,,
4,58703693,62544456,9550,9550,9550,36 months,19.99,354.87,E,E4,...,,,,,,,14100,,,


In [64]:
# List every column name available in the frame.
print(df.columns)

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

In [65]:
# Per-column non-null counts and dtypes, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421094 entries, 0 to 421093
Data columns (total 74 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           421094 non-null  int64  
 1   member_id                    421094 non-null  int64  
 2   loan_amnt                    421094 non-null  int64  
 3   funded_amnt                  421094 non-null  int64  
 4   funded_amnt_inv              421094 non-null  int64  
 5   term                         421094 non-null  object 
 6   int_rate                     421094 non-null  float64
 7   installment                  421094 non-null  float64
 8   grade                        421094 non-null  object 
 9   sub_grade                    421094 non-null  object 
 10  emp_title                    397220 non-null  object 
 11  emp_length                   397277 non-null  object 
 12  home_ownership               421094 non-null  object 
 13 

In [66]:
# Fraction of missing values per column: the mean of the boolean null mask is
# (null count / row count). The result is a Series indexed by column name.
# A Series has no `.columns`, so assigning to that attribute labels nothing;
# the value label is set via the Series name and the index label via rename_axis.
missing_proportions = (
    df.isnull()
    .mean()
    .rename('Missing Proportion')
    .rename_axis('Column')
)

# Put the global row limit back to its default for the cells that follow.
pd.reset_option('display.max_rows')

# Print all values: lift the row limit only for this print, so no rows are
# elided with "..." and the global display option is left untouched afterwards.
with pd.option_context('display.max_rows', None):
    print(missing_proportions)

id                  0.000000
member_id           0.000000
loan_amnt           0.000000
funded_amnt         0.000000
funded_amnt_inv     0.000000
                      ...   
all_util            0.949246
total_rev_hi_lim    0.000000
inq_fi              0.949246
total_cu_tl         0.949246
inq_last_12m        0.949246
Length: 74, dtype: float64


In [67]:
# Share of loans in each loan_status category (normalize=True returns
# proportions rather than raw counts).
df['loan_status'].value_counts(normalize = True)

loan_status
Current               0.896600
Fully Paid            0.054582
Issued                0.020091
Late (31-120 days)    0.011140
In Grace Period       0.007378
Charged Off           0.006585
Late (16-30 days)     0.002705
Default               0.000919
Name: proportion, dtype: float64

In [68]:
# Loan statuses counted as a bad outcome (target = 1); any other status,
# including a missing one, maps to 0.
bad_loan_statuses = [
    'Charged Off',
    'Default',
    'Late (31-120 days)',
    'Does not meet the credit policy. Status:Charged Off',
]

# Guarded so the cell can be executed more than once: after loan_status has
# been dropped, rebuilding the target from it would raise KeyError.
if 'loan_status' in df.columns:
    df = (
        df
        # Vectorised membership test instead of a Python-level loop over rows;
        # the boolean mask is cast to 0/1 integers.
        .assign(target=df['loan_status'].isin(bad_loan_statuses).astype('int64'))
        # The raw status is dropped once the target has been derived from it.
        .drop(columns=['loan_status'])
    )
elif 'target' not in df.columns:
    # Neither the source column nor the derived one exists: fail loudly.
    raise KeyError("'loan_status' is missing and 'target' has not been created")

In [70]:
# Class balance of the target column, as proportions rather than counts.
df['target'].value_counts(normalize = True)

target
0    0.981356
1    0.018644
Name: proportion, dtype: float64

In [94]:
# Contingency table of home ownership (rows) against target (columns);
# margins=True appends the "All" row and column of totals.
ownership_by_target = pd.crosstab(
    index=df['home_ownership'],
    columns=df['target'],
    margins=True,
)

# Wrap the table in a Styler that shades cells with the reversed "summer"
# colormap, and leave it as the cell's last expression so it renders.
crosstab = ownership_by_target.style.background_gradient(cmap='summer_r')
crosstab

target,0,1,All
home_ownership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ANY,2,0,2
MORTGAGE,204579,3103,207682
OWN,44885,881,45766
RENT,163777,3867,167644
All,413243,7851,421094
