In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [40]:
# Read the lending dataset and discard incomplete rows in a single chained
# expression, so the frame is never mutated in place after it is created.
df = pd.read_csv("./lending_data.csv").dropna()

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Prediction

#### After reading in the data, I quickly noticed that the "loan_status" column is binary, which signals logistic regression. Since we are determining whether a loan will be approved or not, we will be using "loan_status" as our target. I'm actually going to predict that the Random Forest Classifier is the better predictor in this case, because a contextual lecture I participated in described Random Forest as more accurate than logistic regression, but also more complex and less interpretable.

In [41]:
# Target: the binary loan_status column. Features: every remaining column.
y = df['loan_status']
X = df.drop(columns='loan_status')

# Partition into training and testing sets using train_test_split's default
# proportions; the fixed seed keeps the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [42]:
# The scaler is fitted on the training partition only; the same fitted
# transformation is then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_test)
)

In [43]:
# Logistic regression with the library's default settings, trained on the
# standardized training features.
clf = LogisticRegression().fit(
    X=X_train_scaled,
    y=y_train,
)

In [44]:
# Report the fitted classifier's score on the training and testing partitions,
# one result per line.
print(
    f'Training Score: {clf.score(X_train_scaled, y_train)}',
    f'Testing Score: {clf.score(X_test_scaled, y_test)}',
    sep='\n',
)

Training Score: 0.9942908240473243
Testing Score: 0.9936545604622369


In [45]:
# Random forest of 100 trees with a fixed seed, trained on the same
# standardized features used for the logistic regression above.
clf = RandomForestClassifier(n_estimators=100, random_state=1)
clf.fit(X_train_scaled, y_train)

# Report the forest's score on both partitions, one result per line.
print(
    f'Training Score: {clf.score(X_train_scaled, y_train)}',
    f'Testing Score: {clf.score(X_test_scaled, y_test)}',
    sep='\n',
)

Training Score: 0.9975409272252029
Testing Score: 0.9914878250103177


### Results & Thoughts

#### As we can see from the results, both models performed very well. Of course, scores this high for both models could suggest over-fitting. I wasn't able to see a difference in the random forest score after changing "n_estimators", even after setting it to 500. Since a lower value for this hyper-parameter reduces the time needed to run the cell, and since the additional trees didn't improve our score, I decided to limit this hyper-parameter to 100.