# Business Objective: Identify the Probability of a Retail Loan Default

We will begin by importing the relevant libraries and dataset

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import gc 

In [2]:
# Import dataset
df_backup = pd.read_csv(r'C:\Users\Ahmed\Desktop\lending-club-loan-data\loan.csv')

In [3]:
df = df_backup.copy()

Let's take a quick glance at our dataset.

In [4]:
pd.options.display.max_columns = 15
pd.options.display.max_rows = 15
df.shape

(2260668, 145)

In [5]:
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,...,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,2500,2500,2500.0,36 months,13.56,...,N,,,,,,
1,,,30000,30000,30000.0,60 months,18.94,...,N,,,,,,
2,,,5000,5000,5000.0,36 months,17.97,...,N,,,,,,
3,,,4000,4000,4000.0,36 months,18.94,...,N,,,,,,
4,,,30000,30000,30000.0,60 months,16.14,...,N,,,,,,


In [6]:
null_count = df.isnull().sum()
null_pct = ((df.isnull().sum())/(df.isnull().count()))*100
null_type = df.dtypes
missing_data = pd.concat([null_count, null_pct, null_type], axis = 1, keys = ['Null Values', 'Percent of Total', 'Data Type'])
missing_data = missing_data.sort_values(by = 'Percent of Total', ascending = False).round(2)
missing_data

Unnamed: 0,Null Values,Percent of Total,Data Type
id,2260668,100.00,float64
url,2260668,100.00,float64
member_id,2260668,100.00,float64
orig_projected_additional_accrued_interest,2252242,99.63,float64
hardship_length,2250055,99.53,float64
hardship_reason,2250055,99.53,object
hardship_status,2250055,99.53,object
...,...,...,...
total_pymnt_inv,0,0.00,float64
total_pymnt,0,0.00,float64


Looks like we need to change some data types and treat missing data. 

# Data Preprocessing

We will start the preprocessing by transforming some data types.

In [7]:
# Convert objects to datetime where appropriate
date_cols = ['earliest_cr_line', 'sec_app_earliest_cr_line', 'hardship_start_date', 'payment_plan_start_date',
              'debt_settlement_flag_date', 'settlement_date', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d', 'issue_d']
for col in date_cols:
     df[col] = pd.to_datetime(df[col], format = '%b-%Y')

In [8]:
# Convert objects to integers where appropriate
df['emp_length'] = df['emp_length'].str.replace('\+ years', '')
df['emp_length'] = df['emp_length'].str.replace('< 1 year', str(0))
df['emp_length'] = df['emp_length'].str.replace(' years', '')
df['emp_length'] = df['emp_length'].str.replace(' year', '')
# We will also replace null values with 0 
df['emp_length'] = df['emp_length'].fillna(0).astype(int)
df['emp_length'].dtypes

dtype('int32')

In [9]:
# Convert objects to integers where appropriate
df['term'] = df['term'].str.replace(' months', '').astype(int)
df['term'].dtypes

dtype('int32')

Several of our datetime features appear to be a beginning and end point for a particualar predictor. Capturing the difference between the start and end points will tell us more than the datetime features by themselves.

In [10]:
print('Ordinarily, we would use the difference between the start date and current date to calculate duration where appropriate.') 
print('However, this dataset ends at {}, so we will use this value as the end point.'.format(df['issue_d'].max()))

Ordinarily, we would use the difference between the start date and current date to calculate duration where appropriate.
However, this dataset ends at 2018-12-01 00:00:00, so we will use this value as the end point.


In [11]:
# Get the duration for some datetime variables
df['oldest_cr_line_age'] = round(pd.to_numeric((pd.to_datetime('2018-12-01') - df['earliest_cr_line']) / np.timedelta64(1, 'M')))
df['oldest_cr_line_age_sec_app'] = round(pd.to_numeric((df['last_pymnt_d'] - df['sec_app_earliest_cr_line']) / np.timedelta64(1, 'M')))
df['next_last_payment_difference'] = round(pd.to_numeric((df['next_pymnt_d'] - df['last_pymnt_d']) / np.timedelta64(1, 'M')))
df['mths_since_issue_d'] = round(pd.to_numeric((pd.to_datetime('2018-12-01') - df['issue_d']) / np.timedelta64(1, 'M')))
print('Hardship, settlement plan and time since most recent inquiry durations are already given so there is no need to derive them.')

Hardship, settlement plan and time since most recent inquiry durations are already given so there is no need to derive them.


Now that we have changed the data types of the continuous features, where appropriate, let's do the same for continuous features. Here we will take 2 approaches: 
i)  We will label encode ordinal categorical features
ii) We will get dummy variables for nomincal categorical features

In [12]:
# First we will assign a score to each grade
grade_list = sorted(list(df['grade'].unique()))

grade_list = grade_list[::-1]

grade_dict = {}
for letter,c in zip(grade_list, list(range(1,len(grade_list)+1))):
     grade_dict[letter] = c

# And the same for sub grade
sub_grade_list = sorted(list(df['sub_grade'].unique()))

sub_grade_list = sub_grade_list[::-1]

sub_grade_dict = {}
for letter,c in zip(sub_grade_list, list(range(1,len(sub_grade_list)+1))):
     sub_grade_dict[letter] = c
        
print(grade_dict, sub_grade_dict, sep="\n")

{'G': 1, 'F': 2, 'E': 3, 'D': 4, 'C': 5, 'B': 6, 'A': 7}
{'G5': 1, 'G4': 2, 'G3': 3, 'G2': 4, 'G1': 5, 'F5': 6, 'F4': 7, 'F3': 8, 'F2': 9, 'F1': 10, 'E5': 11, 'E4': 12, 'E3': 13, 'E2': 14, 'E1': 15, 'D5': 16, 'D4': 17, 'D3': 18, 'D2': 19, 'D1': 20, 'C5': 21, 'C4': 22, 'C3': 23, 'C2': 24, 'C1': 25, 'B5': 26, 'B4': 27, 'B3': 28, 'B2': 29, 'B1': 30, 'A5': 31, 'A4': 32, 'A3': 33, 'A2': 34, 'A1': 35}


In [13]:
# Encode ordinal categorical features
df['grade'] = df.grade.map(grade_dict)
df['sub_grade'] = df.sub_grade.map(sub_grade_dict)

In [14]:
df['pymnt_plan'] = np.where(df['pymnt_plan']=='Y', 1, 0).astype(int)
df['hardship_flag'] = np.where(df['hardship_flag']=='Y', 1, 0).astype(int)
df['debt_settlement_flag'] = np.where(df['debt_settlement_flag']=='Y', 1, 0).astype(int)
df['application_type'] = np.where(df['application_type']=='Joint App', 1, 0).astype(int)
df['settlement_plan_flag'] = np.where(df['settlement_date'].notnull(), 1, 0).astype(int)
print(df['pymnt_plan'].dtypes, df['hardship_flag'].dtypes, df['debt_settlement_flag'].dtypes, df['settlement_plan_flag'].dtype)

int32 int32 int32 int32


Let's a take detour and treat missing data. We will add dummy variables once we are free of null values. 

In [15]:
# Treat missing values for continuous features
from sklearn.impute import SimpleImputer
imputer_annual_inc = SimpleImputer(missing_values = np.nan, strategy = 'median')
imputer_annual_inc = imputer_annual_inc.fit(df[['annual_inc']])
df['annual_inc'] = imputer_annual_inc.transform(df[['annual_inc']])

In [16]:
# Treat missing values for continuous features
cols_null_to_zero = df.columns[df.dtypes==float]
df[cols_null_to_zero]=df[cols_null_to_zero].fillna(0)

# Treat missing values for categorical features
cols_null_to_none = df.columns[df.dtypes==object]
df[cols_null_to_none]=df[cols_null_to_none].fillna('none')

In [17]:
null_count = df.isnull().sum()
null_pct = ((df.isnull().sum())/(df.isnull().count()))*100
null_type = df.dtypes
missing_data = pd.concat([null_count, null_pct, null_type], axis = 1, keys = ['Null Values', 'Percent of Total', 'Data Type'])
missing_data = missing_data.sort_values(by = 'Percent of Total', ascending = False).round(2)
missing_data.head(25)

Unnamed: 0,Null Values,Percent of Total,Data Type
payment_plan_start_date,2250055,99.53,datetime64[ns]
hardship_start_date,2250055,99.53,datetime64[ns]
settlement_date,2227612,98.54,datetime64[ns]
debt_settlement_flag_date,2227612,98.54,datetime64[ns]
sec_app_earliest_cr_line,2152647,95.22,datetime64[ns]
next_pymnt_d,1303607,57.66,datetime64[ns]
last_pymnt_d,2426,0.11,datetime64[ns]
...,...,...,...
tax_liens,0,0.00,float64
tot_hi_cred_lim,0,0.00,float64


That takes care of the missing values. We still have some nulls in the DateTime features but we will drop those columns soon as we have already extracted the necessary features from them. Let's get back to encoding features. 

In [18]:
# dummy_cols = ['home_ownership', 'verification_status', 'verification_status_joint', 'purpose', 'addr_state', 'initial_list_status',
#         'hardship_type', 'hardship_reason', 'hardship_status', 'hardship_loan_status', 'disbursement_method',
#         'settlement_status', 'sec_app_earliest_cr_line', 'earliest_cr_line', 'title', 'zip_code']
# for col in dummy_cols:
#     df = pd.concat([df, pd.get_dummies(df[col], prefix = col, prefix_sep = ' ')])
#     df.drop(col, axis=1, inplace=True)
print('This would be a good way to add dummy variables and delete the redundant predecessors. However it requires significant memory.')

This would be a good way to add dummy variables and delete the redundant predecessors. However it requires significant memory.


In [19]:
# Encode nominal categorical features
df_dummies = [pd.get_dummies(df['home_ownership'], prefix = 'home_ownership', prefix_sep = ' '),
            pd.get_dummies(df['verification_status'], prefix = 'verification_status', prefix_sep = ' '),
            pd.get_dummies(df['verification_status_joint'], prefix = 'verification_status_joint', prefix_sep = ' '),
            pd.get_dummies(df['purpose'], prefix = 'purpose', prefix_sep = ' '),
            pd.get_dummies(df['addr_state'], prefix = 'addr_state', prefix_sep = ' '),
            pd.get_dummies(df['hardship_type'], prefix = 'hardship_type', prefix_sep = ' '),
            pd.get_dummies(df['hardship_reason'], prefix = 'hardship_reason', prefix_sep = ' '),
            pd.get_dummies(df['hardship_status'], prefix = 'hardship_status', prefix_sep = ' '),
            pd.get_dummies(df['hardship_loan_status'], prefix = 'hardship_loan_status', prefix_sep = ' '),
            pd.get_dummies(df['disbursement_method'], prefix = 'disbursement_method', prefix_sep = ' '),
            pd.get_dummies(df['settlement_status'], prefix = 'settlement_status', prefix_sep = ' '),
            pd.get_dummies(df['initial_list_status'], prefix = 'initial_list_status', prefix_sep = ' '),
            pd.get_dummies(df['zip_code'], prefix = 'zip_code', prefix_sep = ' ')]
df_dummies = pd.concat(df_dummies, axis = 1)
df = pd.concat([df, df_dummies], axis = 1)

In [20]:
# Drop redundant features
df.drop(['home_ownership', 'verification_status', 'verification_status_joint', 'purpose', 'addr_state', 'initial_list_status',
         'hardship_type', 'hardship_reason', 'hardship_status', 'hardship_loan_status', 'disbursement_method',
         'settlement_status', 'sec_app_earliest_cr_line', 'earliest_cr_line', 'title', 'zip_code'], axis = 1, inplace = True)

In [21]:
# We will also drop features that are empty columns or of no value
df.drop(['id', 'url', 'member_id', 'emp_title', 'hardship_start_date', 'payment_plan_start_date',
        'debt_settlement_flag_date', 'settlement_date', 'next_pymnt_d', 'last_pymnt_d', 
        'last_credit_pull_d', 'desc', 'issue_d', 'hardship_end_date'], axis = 1, inplace = True)


In [22]:
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,...,zip_code 994xx,zip_code 995xx,zip_code 996xx,zip_code 997xx,zip_code 998xx,zip_code 999xx,zip_code none
0,2500,2500,2500.0,36,13.56,84.92,5,...,0,0,0,0,0,0,0
1,30000,30000,30000.0,60,18.94,777.23,4,...,0,0,0,0,0,0,0
2,5000,5000,5000.0,36,17.97,180.69,4,...,0,0,0,0,0,0,0
3,4000,4000,4000.0,36,18.94,146.51,4,...,0,0,0,0,0,0,0
4,30000,30000,30000.0,60,16.14,731.78,5,...,0,0,0,0,0,0,0


## PD Model Preparation

### Define Target Variable

Until now we have simply preprocessed our features- we have not yet defined our target variable. We will begin building our PD (Prediction Default) model by first defining our target variable.

In [23]:
# Let's look at the target variable
print(df['loan_status'].value_counts())
print()
print((df['loan_status'].value_counts()/df['loan_status'].count())*100)

Fully Paid                                             1041952
Current                                                 919695
Charged Off                                             261655
Late (31-120 days)                                       21897
In Grace Period                                           8952
Late (16-30 days)                                         3737
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     31
Name: loan_status, dtype: int64

Fully Paid                                             46.090448
Current                                                40.682444
Charged Off                                            11.574234
Late (31-120 days)                                      0.968608
In Grace Period                                         0.395989
Late (16-30 days)                                       0.165305
Does not m

Here is the <b>'Good/ Bad'</b> definition which represents: <br>- customers who <b>will not</b> default on their loan as <b>1</b> <br>- customers who <b>will</b> default on their loan as <b>0</b>

In [24]:
df['loan_status'] = np.where(df['loan_status'].isin(['Charged Off', 
                                                  'Does not meet the credit policy. Status:Charged Off',
                                                  'Late (31-120 days)']), 0, 1)
df['loan_status'].value_counts()

1    1976355
0     284313
Name: loan_status, dtype: int64

Let's make our dataset more readible by reordering it so our target variable is the last column.

In [25]:
# Reordering the columns
loan_status = df['loan_status']
df.drop(['loan_status'], axis = 1, inplace = True)
df['loan_status'] = loan_status

Our dataset is nearly ready. But we should have an ID column to use before we start training and testing models. 

In [26]:
# Create ID column
Id = df.index + 1
cols = list(df.columns)
cols = [cols[-1]] + cols[:-1]
df = df[cols]

In [27]:
df.head()

Unnamed: 0,loan_status,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,...,zip_code 994xx,zip_code 995xx,zip_code 996xx,zip_code 997xx,zip_code 998xx,zip_code 999xx,zip_code none
0,1,2500,2500,2500.0,36,13.56,84.92,...,0,0,0,0,0,0,0
1,1,30000,30000,30000.0,60,18.94,777.23,...,0,0,0,0,0,0,0
2,1,5000,5000,5000.0,36,17.97,180.69,...,0,0,0,0,0,0,0
3,1,4000,4000,4000.0,36,18.94,146.51,...,0,0,0,0,0,0,0
4,1,30000,30000,30000.0,60,16.14,731.78,...,0,0,0,0,0,0,0


In [28]:
df.tail()

Unnamed: 0,loan_status,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,...,zip_code 994xx,zip_code 995xx,zip_code 996xx,zip_code 997xx,zip_code 998xx,zip_code 999xx,zip_code none
2260663,1,12000,12000,12000.0,60,14.08,279.72,...,0,0,0,0,0,0,0
2260664,1,12000,12000,12000.0,60,25.82,358.01,...,0,0,0,0,0,0,0
2260665,1,10000,10000,10000.0,36,11.99,332.1,...,0,0,0,0,0,0,0
2260666,1,12000,12000,12000.0,60,21.45,327.69,...,0,0,1,0,0,0,0
2260667,1,16550,16550,16550.0,60,21.45,451.94,...,0,0,0,0,0,0,0


# Estimate PD Model

Let's split our data into a train and test set. We will use a 70/30 split. 

In [29]:
df['loan_status'].value_counts(dropna=False)

1    1976355
0     284313
Name: loan_status, dtype: int64

In [30]:
# Train Test Split; 70:30
X = df.drop(['loan_status'], axis = 1)
y = df[['loan_status']].astype(float)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1582467, 1184) (1582467, 1) (678201, 1184) (678201, 1)


In [171]:
gc.collect()

20

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_acc, y, test_size = 0.30, random_state = 42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

### Time for some exploratory data analysis

In [None]:
pd.concat([X,y]).corr(method='pearson')

In [None]:
# Focused view of features correlated with target variable
corrmat_focused = pd.concat([X,y]).corr(method='pearson')
plt.figure(figsize = (10, 10))
plt.title('Focused View of Features Correlated with Target Variable')
sns.heatmap(train[corrmat_focused].corr(),cmap = 'RdYlGn', annot = True)

### Random Forest

In [33]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 50, max_depth = 2, n_jobs = 99)
rf.fit(X_train, y_train)
rf_train = rf.predict_proba(X_train)
rf_test = rf.predict_proba(X_test)


In [34]:
rf_train = rf_train[:,[1]]
rf_test = rf_test[:,[1]]
print('Performance on train set: %.2f' % roc_auc_score(y_train, rf_train),
      'Performance on test set: %.2f' % roc_auc_score(y_test, rf_test),sep="\n")

Performance on train set: 0.93
Performance on test set: 0.93


### Boosted Model

In [37]:
# XG Boost
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators = 50, max_depth = 2, n_jobs = 99)
xgb = xgb.fit(X_train, y_train)
xgb_train = xgb.predict_proba(X_train)
xgb_test = xgb.predict_proba(X_test)


In [38]:
xgb_train = xgb_train[:,[1]]
xgb_test = xgb_test[:,[1]]
print('Performance on train set: %.2f' % roc_auc_score(y_train, xgb_train),
      'Performance on test set: %.2f' % roc_auc_score(y_test, xgb_test),sep="\n")

Performance on train set: 0.99
Performance on test set: 0.99


### Multi-Layer Perceptron

In [35]:
#Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(hidden_layer_sizes=(500,500))
nn = nn.fit(X_train,y_train)
nn_train = nn.predict_proba(X_train)
nn_test = nn.predict_proba(X_test)

In [36]:
nn_train = nn_train[:,[1]]
nn_test = nn_test[:,[1]]
print('Performance on train set: %.2f' % roc_auc_score(y_train, nn_train),
      'Performance on test set: %.2f' % roc_auc_score(y_test, nn_test),sep="\n")

Performance on train set: 0.96
Performance on test set: 0.96


### Let's see what features drive whether someone will default

In [39]:
# Feature Importance
feat_importances = pd.DataFrame(zip(list(X_train.columns),rf.feature_importances_))
feat_importances.columns = ['feature','f_score']
feat_importances = feat_importances.sort_values('f_score',ascending=False)
feat_importances = feat_importances.reset_index(drop=True)

pd.options.display.max_columns = 10000
pd.options.display.max_rows = 10000
feat_importances

Unnamed: 0,feature,f_score
0,settlement_plan_flag,0.09774556
1,out_prncp,0.07382966
2,settlement_term,0.07089805
3,settlement_status none,0.06781226
4,recoveries,0.05863894
5,settlement_status COMPLETE,0.0494309
6,settlement_amount,0.04722371
7,total_rec_prncp,0.04687773
8,out_prncp_inv,0.04436436
9,next_last_payment_difference,0.04094659


### We are getting some very high performing models. But our results seem dubious... We expect a greater amount of uncertainty when modeling consumer behavior. We have a leakage problem!

Let's assign a new set of features which reflects the information in the order we would receive it. Meaning let's not use features that we would have after the time of default. 

In [54]:
X_acc = X[['out_prncp', 'total_rec_prncp', 'out_prncp_inv', 'next_last_payment_difference', 'mths_since_rcnt_il',
          'total_pymnt_inv', 'int_rate', 'total_rec_late_fee', 'open_act_il', 'mths_since_issue_d', 'hardship_type INTEREST ONLY-3 MONTHS DEFERRAL',
          'deferral_term', 'hardship_reason MEDICAL', 'settlement_status BROKEN', 'hardship_status BROKEN', 'total_pymnt',
          'avg_cur_bal', 'hardship_dpd', 'percent_bc_gt_75', 'num_tl_op_past_12m', 'sec_app_inq_last_6mths', 'home_ownership RENT',
          'max_bal_bc', 'sub_grade']]

Time for a new train/test split

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X_acc, y, test_size = 0.30, random_state = 42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1582467, 24) (1582467, 1) (678201, 24) (678201, 1)


## Updated Random Forest 

In [56]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 50, max_depth = 2, n_jobs = 99)
rf.fit(X_train, y_train)
rf_train = rf.predict_proba(X_train)
rf_test = rf.predict_proba(X_test)

In [57]:
rf_train = rf_train[:,[1]]
rf_test = rf_test[:,[1]]
print('Performance on train set: %.2f' % roc_auc_score(y_train, rf_train),
      'Performance on test set: %.2f' % roc_auc_score(y_test, rf_test),sep="\n")

Performance on train set: 0.92
Performance on test set: 0.92


## Updated Boosted Model

In [58]:
# XG Boost
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators = 50, max_depth = 2, n_jobs = 99)
xgb = xgb.fit(X_train, y_train)
xgb_train = xgb.predict_proba(X_train)
xgb_test = xgb.predict_proba(X_test)


In [59]:
xgb_train = xgb_train[:,[1]]
xgb_test = xgb_test[:,[1]]
print('Performance on train set: %.2f' % roc_auc_score(y_train, xgb_train),
      'Performance on test set: %.2f' % roc_auc_score(y_test, xgb_test),sep="\n")

Performance on train set: 0.96
Performance on test set: 0.96


## Multi Layer Perceptron

In [172]:
#Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(hidden_layer_sizes=(500,500))
nn = nn.fit(X_train,y_train)
nn_train = nn.predict_proba(X_train)
nn_test = nn.predict_proba(X_test)

In [173]:
nn_train = nn_train[:,[1]]
nn_test = nn_test[:,[1]]
print('Performance on train set: %.2f' % roc_auc_score(y_train, nn_train),
      'Performance on test set: %.2f' % roc_auc_score(y_test, nn_test),sep="\n")

Performance on train set: 0.99
Performance on test set: 0.99


## Let's look at the updated feature importance

In [174]:
# Feature Importance
feat_importances = pd.DataFrame(zip(list(X_train.columns),rf.feature_importances_))
feat_importances.columns = ['feature','f_score']
feat_importances = feat_importances.sort_values('f_score',ascending=False)
feat_importances = feat_importances.reset_index(drop=True)

pd.options.display.max_columns = 10000
pd.options.display.max_rows = 10000
feat_importances

Unnamed: 0,feature,f_score
0,next_last_payment_difference,0.233349
1,out_prncp,0.209989
2,total_rec_prncp,0.123268
3,int_rate,0.075535
4,out_prncp_inv,0.057881
5,total_pymnt,0.055696
6,sub_grade,0.05567
7,mths_since_issue_d,0.05216
8,max_bal_bc,0.048003
9,total_pymnt_inv,0.028527
