In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [48]:
from sklearn.metrics import confusion_matrix

In [49]:
# Prediction: I predict that the logistic regression will do better compared to random forest because this data output seems fairly linear to me, therefore logistic regression seems very fitting with our objective

In [50]:
# Turning csv's into DataFrames
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [51]:
# Dropping the column that I want to predict on the training data
X_train = train_df.drop('loan_status',axis = 1)

In [52]:
# Creating training data X and Y
X_dummies = pd.get_dummies(X_train)
y_train_label = LabelEncoder().fit_transform(train_df['loan_status'])

In [53]:
# Creating testing data X and Y
X_test = test_df.drop('loan_status',axis = 1)
X_dummies_test = pd.get_dummies(X_test).reindex(columns=X_dummies.columns,fill_value=0)
y_test_label = LabelEncoder().fit_transform(test_df['loan_status'])

In [54]:
# Performing Logistic Regression on unscaled data
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_dummies, y_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [55]:
# Printing our Training and Testing Score
print(f"Training Data Score: {classifier.score(X_dummies, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_dummies_test, y_test_label)}")

Training Data Score: 0.6485221674876848
Testing Data Score: 0.5253083794130158


In [56]:
predictions = classifier.predict(X_dummies_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test_label})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,1
4,1,1
...,...,...
4697,1,0
4698,1,0
4699,1,0
4700,1,0


In [57]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = classifier.predict(X_dummies_test)
confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.22      0.32      2351
           1       0.52      0.83      0.64      2351

    accuracy                           0.53      4702
   macro avg       0.54      0.53      0.48      4702
weighted avg       0.54      0.53      0.48      4702



In [58]:
# Finding our accuracy on unscaled logistic regression
print(f"Accuracy: {accuracy}")

Accuracy: 0.5253083794130158


In [59]:
# Train a Random Forest Classifier model and print the model score
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_dummies, y_train_label)
print(f'Training Score: {clf.score(X_dummies, y_train_label)}')
print(f'Testing Score: {clf.score(X_dummies_test, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6180348787749894


In [60]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = clf.predict(X_dummies_test)
confusion_matrix(y_true, y_pred)

array([[ 780, 1571],
       [ 225, 2126]], dtype=int64)

In [61]:
pd.DataFrame({"Prediction": y_pred, "Actual": y_test_label})
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn)

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.33      0.46      2351
           1       0.58      0.90      0.70      2351

    accuracy                           0.62      4702
   macro avg       0.68      0.62      0.58      4702
weighted avg       0.68      0.62      0.58      4702



In [63]:
# Finding our accuracy on unscaled random forest classifier
print(f"Accuracy: {accuracy}")

Accuracy: 0.6180348787749894


In [None]:
# With a near 62% accuracy from random forest classifier compared to a 52% accuracy from logistic regression, random forest classifier is noticably more accurate on unscaled data

In [64]:
# Scale the data
scaler = StandardScaler().fit(X_dummies)
X_train_scaled = scaler.transform(X_dummies)
X_test_scaled = scaler.transform(X_dummies_test)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test_label)}")

Training Data Score: 0.6386699507389163
Testing Data Score: 0.5070182900893236


In [None]:
# With scaled data, I would still maintain that logistic regression will do better. This time, it's because knowing that logistic regression does better with data scaling compared to random forest, which does not respond well to scaling. 

In [72]:
# Train the Logistic Regression model on the scaled data
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = classifier.predict(X_test_scaled)
confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.24      0.33      2351
           1       0.50      0.77      0.61      2351

    accuracy                           0.51      4702
   macro avg       0.51      0.51      0.47      4702
weighted avg       0.51      0.51      0.47      4702



In [73]:
# Train a Random Forest Classifier model on the scaled data
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train_label)
print(f'Training Score: {clf.score(X_train_scaled, y_train_label)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6193109315185028


In [74]:
# Finding our accuracy on scaled logistic regression
print(f"Accuracy: {accuracy}")

Accuracy: 0.5070182900893236


In [75]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = clf.predict(X_dummies_test)
confusion_matrix(y_true, y_pred)

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.44      0.67      0.53      2351
           1       0.33      0.16      0.22      2351

    accuracy                           0.42      4702
   macro avg       0.39      0.42      0.37      4702
weighted avg       0.39      0.42      0.37      4702



In [76]:
# Finding our accuracy on scaled logistic regression
print(f"Accuracy: {accuracy}")

Accuracy: 0.415142492556359


In [None]:
# For scaled data, logistic regression yieled an accuracy score of 50% compared to random forest's accuracy score of 41%