# Importing packages

In [1]:
import pandas as pd 
import numpy as np
import datetime

# Setting the option to display the dataframe 
pd.options.display.max_columns = None
pd.options.display.max_rows = 150

# Features in the analysis 

The main features that will be used in the modeling of credit risk are: 

* **earliest_cr_line** - The month the borrower's earliest reported credit line was opened.

* **emp_length** - Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years. 

* **term** - number of payments on the loan. Values can be either 36 or 60. 

* **issue_d** - the month which the loan was funded (issued)

* **grade** - loan companies assigned loan grade.

* **sub_grade** - loan companies assigned loan subgrade.

* **home_ownership** - The home ownership status provided by the borrower during registration or obtained from the credit report. The values are: RENT, OWN, MORTGAGE, OTHER.

* **verification_status** - Indicates if income was verified by LC, not verified, or if the income source was verified.

* **purpose** - the purpose of the loan; a category provided by the borrower for the loan request.

* **addr_state** - the state provided by the borrower in the loan application.

* **initial_list_status** - the initial listing status of the loan. Possible values are – W, F.

* **loan_amnt** - the amount of $ applied by the borrower.

* **funded_amnt** - the amount of requested loan that was funded in $.

* **annual_inc** - the self-reported annual income by the borrower.

* **acc_now_delinq** - the amount of accounts which the borrower is a delinquet.

* **total_acc** - The total number of credit lines currently in the borrower's credit file.

* **pub_rec** - Number of derogatory public records.

* **open_acc** - The number of open credit lines in the borrower's credit file.

The feature that we will use for the creation of the $Y$ (also known as the dependant variable) is the loan_status variable. 

* **loan_status** - current status of the loan (as of creation of the data).

In [2]:
columns = [
    'earliest_cr_line',
    'emp_length',
    'term',
    'issue_d',
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'loan_status',
    'purpose',
    'addr_state',
    'initial_list_status',
    'loan_amnt',
    'funded_amnt',
    'annual_inc',
    'acc_now_delinq',
    'total_acc',
    'pub_rec',
    'open_acc'
]

# Reading the data 

In [3]:
# Reading the data
d = pd.read_csv(
    'data/appl_accepted_20072019Q3.csv', 
    low_memory=False,
    usecols=columns
)

In [4]:
print(f'Shape of the data: {d.shape}')

Shape of the data: (2650550, 19)


In [5]:
# Eyeballing the data
d.head()

Unnamed: 0,loan_amnt,funded_amnt,term,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,addr_state,earliest_cr_line,open_acc,pub_rec,total_acc,initial_list_status,acc_now_delinq
0,3600.0,3600.0,36 months,C,C4,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,debt_consolidation,PA,Aug-2003,7.0,0.0,13.0,w,0.0
1,24700.0,24700.0,36 months,C,C1,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,small_business,SD,Dec-1999,22.0,0.0,38.0,w,0.0
2,20000.0,20000.0,60 months,B,B4,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,home_improvement,IL,Aug-2000,6.0,0.0,18.0,w,0.0
3,35000.0,35000.0,60 months,C,C5,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,debt_consolidation,NJ,Sep-2008,13.0,0.0,17.0,w,0.0
4,10400.0,10400.0,60 months,F,F1,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,major_purchase,PA,Jun-1998,12.0,0.0,35.0,w,0.0


In [6]:
# Seeing the types of the columns to make sure numerics are numeris, 
# categorical variables are categorical, etc.
d.dtypes

loan_amnt              float64
funded_amnt            float64
term                    object
grade                   object
sub_grade               object
emp_length              object
home_ownership          object
annual_inc             float64
verification_status     object
issue_d                 object
loan_status             object
purpose                 object
addr_state              object
earliest_cr_line        object
open_acc               float64
pub_rec                float64
total_acc              float64
initial_list_status     object
acc_now_delinq         float64
dtype: object

# Filling missing data

In [7]:
# Getting the number of rows missing
d.isnull().sum()

loan_amnt                  33
funded_amnt                33
term                       33
grade                      33
sub_grade                  33
emp_length             181960
home_ownership             33
annual_inc                 37
verification_status        33
issue_d                    33
loan_status                33
purpose                    33
addr_state                 33
earliest_cr_line           62
open_acc                   62
pub_rec                    62
total_acc                  62
initial_list_status        33
acc_now_delinq             62
dtype: int64

According to the documentation in the data, if the emp_length is missing, it means that there is no employment info. The suggested procedure is to treat this value the same as "< 1 year" value.

In [8]:
d['emp_length'] = d['emp_length'].fillna('< 1 year')

All the other missing rows will be dropped. The amount of rows dropped is very minimal when compared to the total number of rows. 

In [9]:
d.dropna(inplace=True)
d.reset_index(inplace=True, drop=True)

print(f"Shape of data after dropping rows with missing values: {d.shape}")

Shape of data after dropping rows with missing values: (2650488, 19)


# General data preprocesing 

## Categories to numerics

The **term** and **emp_length** features are stored as categorical. We need to convert them first to numeric. 

In [10]:
d['term'].unique()

array([' 36 months', ' 60 months'], dtype=object)

In [11]:
d['emp_length'].unique()

array(['10+ years', '3 years', '4 years', '6 years', '1 year', '7 years',
       '8 years', '5 years', '2 years', '9 years', '< 1 year'],
      dtype=object)

In [12]:
# Cleaning employment length 
d['emp_length_int'] = d['emp_length'].str.replace('\+ years| years| year', '')
d['emp_length_int'] = d['emp_length_int'].str.replace('< 1', str(0))

# Converting the column type to numeric 
d['emp_length_int'] = pd.to_numeric(d['emp_length_int'])

In [13]:
# Cleaning the term column
d['term_int'] = d['term'].str.replace(' months', '')
d['term_int'] = d['term_int'].str.replace(' ', '')

# Converting the column type to numeric 
d['term_int'] = pd.to_numeric(d['term_int'])

## Categories to dates

In [14]:
def convert_to_2digit(x: str) -> datetime.datetime:
    """
    A function to convert 4 digit last part of the date string to 2 digit
    """
    try:
        x = x.split('-')
        month_part = x[0]
        year_part = x[1][-2:]

        return datetime.datetime.strptime(f'{month_part}-{year_part}', '%b-%y')
    except:
        return np.nan

In [15]:
# Inspecting the structure of dates

# Head
d['earliest_cr_line'].unique().tolist()[0:10]

['Aug-2003',
 'Dec-1999',
 'Aug-2000',
 'Sep-2008',
 'Jun-1998',
 'Oct-1987',
 'Jun-1990',
 'Feb-1999',
 'Apr-2002',
 'Nov-1994']

In [16]:
# Tail 
d['earliest_cr_line'].unique().tolist()[-10:]

['Feb-64',
 'Apr-61',
 'Jan-56',
 'Aug-62',
 'Sep-60',
 'Aug-59',
 'Jun-56',
 'Aug-50',
 'Jun-59',
 'Aug-61']

In [17]:
# Converting to 2 digit years
d['earliest_cr_line_date'] = [convert_to_2digit(x) for x in d['earliest_cr_line']]

# The exact same logic applies to the column issue_d
d['issue_d_date'] = [convert_to_2digit(x) for x in d['issue_d']]

In [18]:
# Calculating the dates between loan issue and first credit line issued 
d['issue_cr_diff'] = d['issue_d_date'] - d['earliest_cr_line_date']

In [19]:
d['issue_cr_diff'].describe()

count                         2650488
mean     5902 days 06:11:26.248268288
std      2965 days 08:25:30.837980128
min             -21915 days +00:00:00
25%                4109 days 00:00:00
50%                5388 days 00:00:00
75%                7366 days 00:00:00
max               18505 days 00:00:00
Name: issue_cr_diff, dtype: object

In [20]:
print(f"Very old credit lines make of {round(d[d['issue_cr_diff'].dt.days<0].shape[0] * 100 / d.shape[0], 2)} % of the data")

Very old credit lines make of 0.21 % of the data


For now, we will drop the very old users. The droping of these rows will have very little influence to the final coefficients.

In [21]:
d = d[d['issue_cr_diff'].dt.days>0]

d.reset_index(inplace=True, drop=True)

By convention, in credit risk modeling, months are prefered vs days. 

In [22]:
d['issue_cr_diff_mnth'] = [int(x) for x in d['issue_cr_diff'] / np.timedelta64(1, 'M')]

Let us imagine that the current date when we are calculating the risk is 2020-01-01.

In [23]:
cur_date = datetime.datetime(2020, 1, 1)

# Creating an additional column to see days past since the loan issue 
d['days_since_issue'] = cur_date - d['issue_d_date']
d['mnth_since_issue'] = [int(x) for x in d['days_since_issue'] / np.timedelta64(1, 'M')]

In [24]:
# Inspecting the results of engineering
d[[
    'earliest_cr_line', 
    'earliest_cr_line_date', 
    'issue_d', 
    'issue_d_date', 
    'issue_cr_diff',
    'issue_cr_diff_mnth',
    'mnth_since_issue'
]].sample(10)

Unnamed: 0,earliest_cr_line,earliest_cr_line_date,issue_d,issue_d_date,issue_cr_diff,issue_cr_diff_mnth,mnth_since_issue
214873,Feb-1975,1975-02-01,Jul-2015,2015-07-01,14760 days,484,54
2167471,Oct-1991,1991-10-01,Dec-2016,2016-12-01,9193 days,302,36
1195606,Jan-2001,2001-01-01,Sep-2014,2014-09-01,4991 days,163,64
1547283,Jun-2004,2004-06-01,May-2018,2018-05-01,5082 days,166,20
1060557,Apr-2008,2008-04-01,Feb-2016,2016-02-01,2862 days,94,46
1666056,Jun-1999,1999-06-01,Mar-2017,2017-03-01,6483 days,212,34
1600890,Aug-2006,2006-08-01,Apr-2018,2018-04-01,4261 days,139,21
1095651,Nov-2006,2006-11-01,Jan-2016,2016-01-01,3348 days,109,48
767281,Sep-2007,2007-09-01,Sep-2018,2018-09-01,4018 days,132,16
789753,Dec-2005,2005-12-01,Aug-2018,2018-08-01,4626 days,151,17


## Preprocesing the categorical variables 

The aim of categorical variable preprocesing is to convert them into dummy variables. 

In [25]:
categorical_vars = [
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'purpose',
    'addr_state',
    'initial_list_status'
]

dummylist = []

for categorical_var in categorical_vars:
    dummylist.append(pd.get_dummies(d[categorical_var], prefix=f'{categorical_var}'))
    
# Creating the dummy dataframe for the categorical variables
dummydf = pd.concat(dummylist, axis=1)

In [26]:
# Listing all the columns
dummydf.columns.values

array(['grade_A', 'grade_B', 'grade_C', 'grade_D', 'grade_E', 'grade_F',
       'grade_G', 'sub_grade_A1', 'sub_grade_A2', 'sub_grade_A3',
       'sub_grade_A4', 'sub_grade_A5', 'sub_grade_B1', 'sub_grade_B2',
       'sub_grade_B3', 'sub_grade_B4', 'sub_grade_B5', 'sub_grade_C1',
       'sub_grade_C2', 'sub_grade_C3', 'sub_grade_C4', 'sub_grade_C5',
       'sub_grade_D1', 'sub_grade_D2', 'sub_grade_D3', 'sub_grade_D4',
       'sub_grade_D5', 'sub_grade_E1', 'sub_grade_E2', 'sub_grade_E3',
       'sub_grade_E4', 'sub_grade_E5', 'sub_grade_F1', 'sub_grade_F2',
       'sub_grade_F3', 'sub_grade_F4', 'sub_grade_F5', 'sub_grade_G1',
       'sub_grade_G2', 'sub_grade_G3', 'sub_grade_G4', 'sub_grade_G5',
       'home_ownership_ANY', 'home_ownership_MORTGAGE',
       'home_ownership_NONE', 'home_ownership_OTHER',
       'home_ownership_OWN', 'home_ownership_RENT',
       'verification_status_Not Verified',
       'verification_status_Source Verified',
       'verification_status_Verified', 'pu

In [27]:
# Getting the numeric feature list 
numeric_features = [
    'loan_amnt',
    'funded_amnt',
    'term',
    'emp_length',
    'annual_inc',
    'mnth_since_issue',
    'term_int',
    # Adding load_status for Y creation
    'loan_status'
]

In [28]:
# Creating the dataframe with numeric and dummy variables 
d = pd.concat([d, dummydf], axis=1)

# Probability of default (PD) modeling

The main objective in PD modeling is to create a model to infer a probability for a default given a set of features $X$.

It is established by various auditing firms that the interpretation of the model needs to be easy even for not statistical savy people. One highly interpretable machine learning model is logistic regression. 

Another agreed upon part of PD modeling is that the $X$ matrix needs to be comprised of dummy variable data. Thus, even continues numeric variables need to be converted to categorical features. Additionally, when creating the $Y$ variable, positive coefficients in logistic regression need to lead to a "positive" class, meaning, the higher a given value, the less chance of default.

# Creation of the dependant variable 

One of the widespread default definition is that if a person is not able to pay their obligations in 90 days, it is considered default. In the PD modeling space, the Y encoding to non defaul and default is often termed "good bad" encoding.

In [29]:
d['loan_status'].value_counts() / d['loan_status'].count()

Current                                                0.459636
Fully Paid                                             0.419460
Charged Off                                            0.103261
Late (31-120 days)                                     0.010211
In Grace Period                                        0.004294
Late (16-30 days)                                      0.002094
Does not meet the credit policy. Status:Fully Paid     0.000740
Does not meet the credit policy. Status:Charged Off    0.000286
Default                                                0.000017
Name: loan_status, dtype: float64

In [30]:
# Creating the Y (good_bad) variable for the probability of default analysis
d['good_bad'] = np.where(d['loan_status'].isin([
    'Charged Off', 
    'Default',
    'Late (31-120 days)',
    'Does not meet the credit policy. Status:Charged Off'
]), 0, 1)

In [31]:
# Distribution of classes
d['good_bad'].value_counts() / d['good_bad'].count()

1    0.886225
0    0.113775
Name: good_bad, dtype: float64

# Spliting the data into train, validation and test sets

The amount of data for training will be 80 percent of the whole dataset, validation (used for hyperparameter tuning) will be 10 percent and the test set (which we will measure some accuracy metrics) will be 10 percent. 

In [32]:
train, val, test = np.split(d.sample(frac=1, random_state=42), [int(.8*len(d)), int(.9*len(d))])

In [33]:
print(f"Shape of training data: {train.shape}")
print(f"Shape of training data: {val.shape}")
print(f"Shape of training data: {test.shape}")

Shape of training data: (2115933, 146)
Shape of training data: (264492, 146)
Shape of training data: (264492, 146)


# Fine classing, weight of evidence and coarse classing

Fine classing is a technique that groups a variable's values into a number of fine bins. Using these bins, a measure of the variable's predictive power, known as information value (IV), can be computed. Also from these fine bins, further grouping can be carried out to result in coarse classing.

One of the fundamental concepts in information value is the weight of evidence concept. The weight of evidence tells the predictive power of an independent variable in relation to the dependent variable.

For a given feature $i$, the weight of evidence, or, $WoE_{i}$ is calculated by: 

$$ WoE_{i} = log\left(\dfrac{\% of y = 1 | x = i}{\% of y = 0 | x = i}\right)$$

Thus, the woe is calculated for each feature in our dataset.

## Caluclating the WoE for home ownership

Lets go step by step and calculate the WoE statistic for all the levels of the home ownership categorical value. 

In [56]:
# Subseting the data
woe = d[['home_ownership', 'good_bad']]

# Lets calculate the number of borrowers for each status 
grouped = woe.groupby('home_ownership', as_index=False)['good_bad'].count()

In [57]:
# Inspecting the total number of people in each group 
grouped

Unnamed: 0,home_ownership,good_bad
0,ANY,3344
1,MORTGAGE,1301433
2,NONE,51
3,OTHER,181
4,OWN,297331
5,RENT,1042577


In [58]:
# Getting the total good borrowers in each group 
groupedgood = woe.groupby('home_ownership', as_index=False)['good_bad'].sum().rename(columns={'good_bad': 'good_borrowers'})

# Merging with the initial dataframe
grouped = pd.merge(grouped, groupedgood, on='home_ownership')

# Getting the number of bad borrowers
grouped['bad_borrowers'] = grouped['good_bad'] - grouped['good_borrowers']

# Calculating the share of good borrowers
grouped['share_good'] = grouped['good_borrowers'] / grouped['good_bad']

# Calculating the share of bad borrowers
grouped['share_bad'] = 1 - grouped['share_good']

# Seeing the results
grouped

Unnamed: 0,home_ownership,good_bad,good_borrowers,bad_borrowers,share_good,share_bad
0,ANY,3344,3166,178,0.94677,0.05323
1,MORTGAGE,1301433,1172969,128464,0.90129,0.09871
2,NONE,51,43,8,0.843137,0.156863
3,OTHER,181,143,38,0.790055,0.209945
4,OWN,297331,263485,33846,0.886167,0.113833
5,RENT,1042577,904185,138392,0.86726,0.13274


In [62]:
# Calculating the total proportion of good/bad borrowers in each group 
grouped['prop_n_good'] = grouped['good_borrowers'] / grouped['good_borrowers'].sum()
grouped['prop_n_bad'] = grouped['bad_borrowers'] / grouped['bad_borrowers'].sum()

grouped

Unnamed: 0,home_ownership,good_bad,good_borrowers,bad_borrowers,share_good,share_bad,prop_n_good,prop_n_bad
0,ANY,3344,3166,178,0.94677,0.05323,0.001351,0.000592
1,MORTGAGE,1301433,1172969,128464,0.90129,0.09871,0.500415,0.426896
2,NONE,51,43,8,0.843137,0.156863,1.8e-05,2.7e-05
3,OTHER,181,143,38,0.790055,0.209945,6.1e-05,0.000126
4,OWN,297331,263485,33846,0.886167,0.113833,0.112409,0.112473
5,RENT,1042577,904185,138392,0.86726,0.13274,0.385746,0.459887


In the above dataset we have everything we need to calculate the weight of evidence for each categorical variable level. 

In [63]:
grouped['woe'] = np.log(grouped['prop_n_good'] / grouped['prop_n_bad'])

In [64]:
grouped

Unnamed: 0,home_ownership,good_bad,good_borrowers,bad_borrowers,share_good,share_bad,prop_n_good,prop_n_bad,woe
0,ANY,3344,3166,178,0.94677,0.05323,0.001351,0.000592,0.825695
1,MORTGAGE,1301433,1172969,128464,0.90129,0.09871,0.500415,0.426896,0.158899
2,NONE,51,43,8,0.843137,0.156863,1.8e-05,2.7e-05,-0.370987
3,OTHER,181,143,38,0.790055,0.209945,6.1e-05,0.000126,-0.727487
4,OWN,297331,263485,33846,0.886167,0.113833,0.112409,0.112473,-0.00057
5,RENT,1042577,904185,138392,0.86726,0.13274,0.385746,0.459887,-0.175802


In [69]:
# Lets calculate the IV for the overall variable home_ownership
IV = np.sum((grouped['prop_n_good'] - grouped['prop_n_bad']) * grouped['woe'])

print(f'The information value is :{IV}')

The information value is :0.025393800387701254


## Defining a function that calculates the woe and IV 

In [80]:
def get_woe(d: pd.DataFrame, cat_var: str, good_bad_var: str):
    # Subseting the dataframe 
    d = d[[cat_var, good_bad_var]]
    
    # Calculating the number of borrowers for each status 
    grouped = d.groupby(cat_var, as_index=False)[good_bad_var].count().rename(columns={good_bad_var: "n"})
    
    groupedgood = d.groupby(cat_var, as_index=False)[good_bad_var].sum().rename(columns={good_bad_var: 'good_borrowers'})

    # Merging with the initial dataframe
    grouped = pd.merge(grouped, groupedgood, on=cat_var)

    # Getting the number of bad borrowers
    grouped['bad_borrowers'] = grouped['n'] - grouped['good_borrowers']

    # Calculating the share of good borrowers
    grouped['share_good'] = grouped['good_borrowers'] / grouped['n']

    # Calculating the share of bad borrowers
    grouped['share_bad'] = 1 - grouped['n']
    
    # Calculating the total proportion of good/bad borrowers in each group 
    grouped['prop_n_good'] = grouped['good_borrowers'] / grouped['good_borrowers'].sum()
    grouped['prop_n_bad'] = grouped['bad_borrowers'] / grouped['bad_borrowers'].sum()
    
    # Calculating the WoE statistic
    grouped['woe'] = np.log(grouped['prop_n_good'] / grouped['prop_n_bad'])
    
    # Sorting by woe
    grouped.sort_values('woe', inplace=True, ascending=False)
    grouped.reset_index(inplace=True, drop=True)
    
    # Calculating IV
    grouped['IV'] = np.sum((grouped['prop_n_good'] - grouped['prop_n_bad']) * grouped['woe'])
    
    # Returning the aggregated dataframe
    return grouped

In [81]:
# Printing out the frame for all the categorical variables
for var in categorical_vars:
    print('-----------')
    print(f'{var}:')
    print(get_woe(d, var, 'good_bad'))
    print('-----------')

-----------
grade:
  grade       n  good_borrowers  bad_borrowers  share_good  share_bad  \
0     A  557886          541095          16791    0.969902    -557885   
1     B  775434          715839          59595    0.923146    -775433   
2     C  740301          643741          96560    0.869567    -740300   
3     D  378537          309355          69182    0.817238    -378536   
4     E  138808          100074          38734    0.720953    -138807   
5     F   41781           26577          15204    0.636103     -41780   
6     G   12170            7310           4860    0.600657     -12169   

   prop_n_good  prop_n_bad       woe       IV  
0     0.230843    0.055798  1.420006  0.53152  
1     0.305393    0.198039  0.433138  0.53152  
2     0.274635    0.320876 -0.155614  0.53152  
3     0.131978    0.229897 -0.554997  0.53152  
4     0.042694    0.128716 -1.103554  0.53152  
5     0.011338    0.050524 -1.494258  0.53152  
6     0.003119    0.016150 -1.644541  0.53152  
-----------


   addr_state       n  good_borrowers  bad_borrowers  share_good  share_bad  \
0          ME    6321            5989            332    0.947477      -6320   
1          ID    5671            5277            394    0.930524      -5670   
2          VT    5842            5420            422    0.927764      -5841   
3          WV   10594            9757            837    0.920993     -10593   
4          NH   13137           12050           1087    0.917257     -13136   
5          OR   31713           29057           2656    0.916249     -31712   
6          DC    6141            5622            519    0.915486      -6140   
7          ND    4418            4027            391    0.911498      -4417   
8          SC   33054           30113           2941    0.911024     -33053   
9          CO   56017           50920           5097    0.909010     -56016   
10         WA   55275           50125           5150    0.906829     -55274   
11         CT   42151           38204           3947

Based on the WoE statistic we need to figure out how to organize and lump together the original discrete features.