In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the lending dataset (path is relative to the notebook's working directory)
lending_df = pd.read_csv(Path('Resources', 'lending_data.csv'))


In [3]:
# Preview the first five rows of the loaded frame
lending_df.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [4]:
# (rows, columns) of the loaded frame
lending_df.shape

(77536, 8)

In [5]:
# Separate the features from the target. No encoding step is performed here:
# the columns previewed above are already numeric.
x = lending_df.drop(columns="loan_status")
y = lending_df.loc[:, "loan_status"]

In [6]:
# Hold out a test set; the seed is fixed so the split is reproducible.
# test_size is left at its default.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
)

In [7]:
# Preview the first five rows of the training features
x_train.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
29175,8600.0,6.792,44500,0.325843,3,0,14500
23020,7800.0,6.419,41000,0.268293,2,0,11000
31269,10000.0,7.386,50100,0.401198,4,1,20100
35479,9300.0,7.093,47300,0.365751,3,0,17300
13470,9200.0,7.045,46900,0.360341,3,0,16900


In [8]:
# Number of rows in the training split
x_train.shape[0]

58152

In [9]:
# Number of rows in the test split
x_test.shape[0]

19384

# Pre-Modeling Prediction

I expect the random forest classifier to perform better than the logistic regression model on this dataset.

In [10]:
# Baseline: fit logistic regression on the raw (unscaled) features and
# display its mean accuracy on the held-out test split.
reg = LogisticRegression(solver='lbfgs', max_iter=100, random_state=1).fit(x_train, y_train)
reg.score(x_test, y_test)

0.9918489475856377

In [11]:
# Standardize the features. The scaler is fitted on the training split only,
# so no statistics from the test split leak into the transformation; the
# test split is then transformed with the training mean/variance.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [12]:
# Refit logistic regression on the standardized features and print its test accuracy.
# random_state is set to 1 to match the seed used for the unscaled model, so the two
# runs differ only in the input scaling. Per the scikit-learn docs the seed is only
# consumed by the 'sag', 'saga' and 'liblinear' solvers, so the lbfgs fit is unaffected.
logisticRegr_scaled = LogisticRegression(
    solver='lbfgs',
    max_iter=100,
    random_state=1
)
logisticRegr_scaled.fit(x_train_scaled, y_train)
print("LogisticRegressin scaled score: ", logisticRegr_scaled.score(x_test_scaled, y_test))

LogisticRegressin scaled score:  0.9936545604622369


In [13]:
# Fit a random forest (fixed seed for reproducibility) on the raw, unscaled
# features and print its mean accuracy on the test split.
randomForestClass = RandomForestClassifier(random_state=1)
randomForestClass.fit(x_train, y_train)
print("RandomForestClassifier score: ", randomForestClass.score(x_test,y_test))

RandomForestClassifier score:  0.9914878250103177


In [14]:
# Same forest configuration and seed, this time fitted on the standardized
# features; print its mean accuracy on the standardized test split.
randomForestClass_scaled = RandomForestClassifier(random_state=1).fit(x_train_scaled, y_train)
print(
    "RandomForestClassifier scaled score: ",
    randomForestClass_scaled.score(x_test_scaled, y_test),
)

RandomForestClassifier scaled score:  0.9914878250103177


### Conclusion and Results

Results

LogisticRegression unscaled score:      0.9918489475856377

LogisticRegression scaled score:        0.9936545604622369

RandomForestClassifier unscaled score:  0.9914878250103177

RandomForestClassifier scaled score:    0.9914878250103177

Conclusion

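Scaling the data using StandardScaler improved test prediction accuracy by nearly 0.2 percentage points (about 18 basis points) for the logistic regression model (0.9918489475856377 to 0.9936545604622369), but there was no noticeable improvement for the random forest classifier when scaling data with StandardScaler: its score was identical (0.9914878250103177) with and without scaling. This is expected, because decision trees split on per-feature thresholds, so rescaling a feature does not change which splits are available.
It's surprising, given the pre-modeling prediction, but LogisticRegression with data scaled using StandardScaler performed better than any of the other 3 models, with an accuracy of 0.9936545604622369.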