In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   12180 non-null  float64
 1   int_rate                    12180 non-null  float64
 2   installment                 12180 non-null  float64
 3   home_ownership              12180 non-null  object 
 4   annual_inc                  12180 non-null  float64
 5   verification_status         12180 non-null  object 
 6   pymnt_plan                  12180 non-null  object 
 7   dti                         12180 non-null  float64
 8   delinq_2yrs                 12180 non-null  float64
 9   inq_last_6mths              12180 non-null  float64
 10  open_acc                    12180 non-null  float64
 11  pub_rec                     12180 non-null  float64
 12  revol_bal                   12180 non-null  float64
 13  total_acc                   121

In [4]:
train_df.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,...,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0,12180.0
mean,17296.672824,0.141039,505.758204,88183.01,22.782614,0.249589,0.565928,12.639737,0.1133,17488.418966,...,0.058867,2.343432,94.640681,33.397685,0.1133,0.0,198175.8,62760.313875,27957.233169,56829.34
std,10208.296576,0.05259,293.253172,125760.4,25.305738,0.776071,0.815326,6.044768,0.325156,22978.078793,...,0.361183,1.971186,8.75185,34.196938,0.325156,0.0,190984.4,60404.20343,25467.53458,53977.71
min,1000.0,0.06,30.89,500.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,20.0,0.0,0.0,0.0,3300.0,960.0,200.0,486.0
25%,10000.0,0.1033,285.41,50000.0,14.34,0.0,0.0,8.0,0.0,6235.5,...,0.0,1.0,92.3,0.0,0.0,0.0,62064.75,26880.75,10800.0,23054.5
50%,15000.0,0.1308,437.685,73000.0,20.45,0.0,0.0,11.0,0.0,12022.5,...,0.0,2.0,100.0,25.0,0.0,0.0,128262.5,45830.5,20700.0,42576.0
75%,24000.0,0.1774,673.42,102000.0,27.27,0.0,1.0,16.0,0.0,21204.5,...,0.0,3.0,100.0,57.1,0.0,0.0,285419.2,78041.25,36300.0,73454.5
max,40000.0,0.2897,1671.88,9500000.0,999.0,14.0,5.0,65.0,4.0,517103.0,...,11.0,20.0,100.0,100.0,4.0,0.0,3129332.0,917986.0,284800.0,1319104.0


In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   4702 non-null   float64
 1   int_rate                    4702 non-null   float64
 2   installment                 4702 non-null   float64
 3   home_ownership              4702 non-null   object 
 4   annual_inc                  4702 non-null   float64
 5   verification_status         4702 non-null   object 
 6   pymnt_plan                  4702 non-null   object 
 7   dti                         4702 non-null   float64
 8   delinq_2yrs                 4702 non-null   float64
 9   inq_last_6mths              4702 non-null   float64
 10  open_acc                    4702 non-null   float64
 11  pub_rec                     4702 non-null   float64
 12  revol_bal                   4702 non-null   float64
 13  total_acc                   4702 

In [6]:
test_df.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,...,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0,4702.0
mean,16959.410889,0.148018,504.334028,89862.94,23.033847,0.207571,0.686729,12.729902,0.108039,18313.320927,...,0.057848,2.561038,95.053998,32.822671,0.107826,0.0,197988.0,63715.605487,30728.753722,56833.438537
std,10155.556866,0.058096,295.725642,125378.1,21.33624,0.668909,0.897366,6.090486,0.314547,22514.741749,...,0.351332,2.040537,8.283599,33.108024,0.314282,0.0,188495.8,56956.32168,28254.276304,50636.661039
min,1000.0,0.0646,31.43,100.0,0.26,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,27.3,0.0,0.0,0.0,3000.0,1408.0,300.0,526.0
25%,9062.5,0.1033,279.13,52000.0,14.8,0.0,0.0,8.0,0.0,6658.25,...,0.0,1.0,92.9,0.0,0.0,0.0,62558.0,28043.0,11500.0,23648.75
50%,15000.0,0.143,446.01,75000.0,21.205,0.0,0.0,12.0,0.0,12636.0,...,0.0,2.0,100.0,25.0,0.0,0.0,128818.0,48205.0,22800.0,43881.5
75%,24000.0,0.1862,689.3875,105000.0,28.22,0.0,1.0,16.0,0.0,22439.0,...,0.0,4.0,100.0,50.0,0.0,0.0,285577.5,80131.5,41175.0,74320.75
max,40000.0,0.288,1604.18,6503700.0,999.0,9.0,5.0,57.0,2.0,512728.0,...,9.0,14.0,100.0,100.0,2.0,0.0,2450518.0,670055.0,307200.0,618528.0


In [7]:
train_df['target'].value_counts()


low_risk     6090
high_risk    6090
Name: target, dtype: int64

In [8]:
test_df['target'].value_counts()

low_risk     2351
high_risk    2351
Name: target, dtype: int64

In [9]:
# Convert categorical data to numeric and separate target feature for training data
X_train = train_df.drop('target', axis=1)
X_train = pd.get_dummies(X_train)
X_train.head(10)
#92 columns 

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0
5,6000.0,0.0756,186.81,90500.0,23.48,0.0,0.0,9.0,0.0,7092.0,...,0,1,0,1,1,0,1,0,1,0
6,10500.0,0.225,293.0,32000.0,16.09,0.0,1.0,10.0,0.0,2269.0,...,1,1,0,1,1,0,1,0,1,0
7,22000.0,0.0819,448.09,92000.0,22.87,0.0,0.0,17.0,0.0,26020.0,...,0,1,0,1,1,0,1,0,1,0
8,10625.0,0.2055,284.76,54100.0,15.53,0.0,2.0,8.0,0.0,9371.0,...,0,1,0,1,0,1,1,0,1,0
9,20000.0,0.2534,591.02,38000.0,19.68,0.0,0.0,4.0,0.0,20770.0,...,0,1,0,1,1,0,1,0,1,0


In [10]:
y_train = train_df['target']
y_train.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: target, dtype: object

In [11]:
# Convert categorical data to numeric and separate target feature for testing data

X_test = test_df.drop('target', axis=1)
X_test = pd.get_dummies(X_test)
y_test = test_df['target'] 

X_test.head()


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [12]:
# Get missing columns in the training test
missing_cols = set(X_train.columns ) - set(X_test.columns )

# Add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_test[c] = 0

# Ensure the order of column in the test set is in the same order than in train set
test = X_test[X_train.columns]

# Prediction 

## Random Forest will perform better than logistic regression because RF tends to be robust against overfitting and outliers.  



In [13]:
# Train the Logistic Regression model on the unscaled data and print the model score
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state=1)


In [14]:
model = LogisticRegression(tol=1e-1,max_iter=10000, solver='lbfgs')
model

LogisticRegression(max_iter=10000, tol=0.1)

In [15]:
model.fit(X_train, y_train)
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.6981937602627257
Testing Data Score: 0.7037766830870279


In [16]:
# Train a Random Forest Classifier model and print the model score
randomF_model = RandomForestClassifier(random_state=1, n_estimators=1000).fit(X_train, y_train)
print(f'Training Score: {randomF_model.score(X_train, y_train)}')
print(f'Testing Score: {randomF_model.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.7825944170771757


In [17]:
# Scale the data and fit the trained data 
X_scaler = StandardScaler().fit(X_train)

In [18]:
#scale X_train data 
X_train_scaled = X_scaler.transform(X_train)
X_train_scaled 

array([[-1.21025177,  0.42766313, -1.12546751, ..., -0.16284911,
         0.01812499, -0.01812499],
       [-0.22388831, -1.35596931, -0.14514751, ..., -0.16284911,
         0.01812499, -0.01812499],
       [-1.21025177,  0.68246776, -1.11403979, ..., -0.16284911,
         0.01812499, -0.01812499],
       ...,
       [-0.04634289, -1.46245483,  0.03037397, ..., -0.16284911,
         0.01812499, -0.01812499],
       [-0.22388831,  0.68246776,  0.11844265, ..., -0.16284911,
         0.01812499, -0.01812499],
       [-0.65542232,  1.21679688, -0.75601702, ..., -0.16284911,
         0.01812499, -0.01812499]])

In [19]:
#Scale X_test data 
X_test_scaled = X_scaler.transform(X_test)
X_test_scaled

array([[ 0.26929342,  1.11221289,  0.08265406, ..., -0.16284911,
         0.01812499, -0.01812499],
       [ 0.76247515,  0.37442037,  0.35524269, ..., -0.16284911,
         0.01812499, -0.01812499],
       [-0.63816096,  2.78555676, -0.18582881, ..., -0.16284911,
         0.01812499, -0.01812499],
       ...,
       [-0.51979735,  2.78555676, -0.0142078 , ..., -0.16284911,
         0.01812499, -0.01812499],
       [ 0.26929342,  2.78555676,  1.1298296 , ..., -0.16284911,
         0.01812499, -0.01812499],
       [-0.71707004,  0.68246776, -0.49779857, ..., -0.16284911,
         0.01812499, -0.01812499]])

In [20]:
#Train the Logistic Regression model on the scaled data and print the model score
model_scaled = LogisticRegression(tol=1e-1,max_iter=1000, solver='lbfgs')
model_scaled.fit(X_train_scaled, y_train)
print(f"Training Data Score: {model_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model_scaled.score(X_test_scaled, y_test)}")

Training Data Score: 0.7082649151614668
Testing Data Score: 0.7205254515599343


In [21]:
# Train a Random Forest Classifier model and print the model score
randomF_model_scaled = RandomForestClassifier(random_state=1, n_estimators=1000).fit(X_train_scaled, y_train)
print(f'Training Score: {randomF_model_scaled.score(X_train_scaled, y_train)}')
print(f'Testing Score: {randomF_model_scaled.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.7832512315270936


# Results/Conclusion: 

# RF with Scaled Data peformed best. This was in line with my prediction. # Scaling the data just made the model slightly more accurate. 


## LR: performed better with scaled data 
## RF: performed better with scaled data 





### Non-Scaled LR: 
##### Training Data Score: 0.6981937602627257
##### Testing Data Score: 0.7037766830870279

### Non-Scaled RF : 
##### Training Score: 1.0
##### Testing Score: 0.7825944170771757