In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the two loan files: 2019 loans are used for training, 2020 Q1 loans for testing.
train_df = pd.read_csv(Path('Resources', '2019loans.csv'))
test_df = pd.read_csv(Path('Resources', '2020Q1loans.csv'))

In [3]:
# Training data: integer-encode the 'loan_status' target, then one-hot encode
# the remaining (feature) columns.
y = LabelEncoder().fit_transform(train_df['loan_status'])
X = train_df.drop(columns='loan_status')
X_train = pd.get_dummies(X)

In [4]:
# Convert categorical data to numeric and separate target feature for testing data
X2 = test_df.drop('loan_status', axis=1)
X_test = pd.get_dummies(X2)
# Encode the test labels with an encoder fitted on the TRAINING labels so that a
# given loan_status maps to the same integer in both y and y2. Fitting a second,
# independent encoder on the test labels only produces the same codes when both
# files happen to contain the identical set of labels. With this form, a label
# that appears only in the test file raises a ValueError instead of being
# silently assigned a shifted code.
y2 = LabelEncoder().fit(train_df['loan_status']).transform(test_df['loan_status'])

In [5]:
# Align the testing features with the training features.
# reindex() does three things at once:
#   * adds any dummy column present in X_train but absent from X_test, filled with 0
#   * drops any dummy column that only exists in X_test (the models never saw it)
#   * puts the columns in exactly the same order as X_train
# Appending missing columns one by one would leave them at the end of X_test,
# so the column order could differ from X_train; the estimators expect the
# features in the same order they were fitted with.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Unscaled Data Prediction
Logistic Regression vs. Random Forest

I predict that the random forest classifier will perform better than the logistic regression model, because there are so many features in the data. The random forest classifier will provide greater opportunity to predict the outcome correctly, based on the features, compared to the logistic regression. Also, since the data is not scaled, the logistic regression model might not perform well.

In [6]:
# LOGISTIC REGRESSION on the unscaled features: fit on the training set, then
# report the model score on the training set and on the testing set.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y)

lr_train_score = classifier.score(X_train, y)
lr_test_score = classifier.score(X_test, y2)
print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}")

Training Data Score: 0.6811986863711001
Testing Data Score: 0.5542322415993194


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [7]:
# RANDOM FOREST classifier on the unscaled features.
# No random train/test split is made here: the model is fitted on the training
# set (X_train, y) and evaluated on the separately loaded testing set (X_test, y2).
clf = RandomForestClassifier(n_estimators=200, random_state=1)
clf.fit(X_train, y)

rf_train_score = clf.score(X_train, y)
rf_test_score = clf.score(X_test, y2)
print(f'Training Score: {rf_train_score}')
print(f'Testing Score: {rf_test_score}')

Training Score: 1.0
Testing Score: 0.6210123351765207


# Unscaled Data Result

The random forest classifier performed better than the logistic regression as I had anticipated. However, the random forest classifier model is overfitted to the training data and may not be the best representation for the 2020 test data.


# Scaled Data Prediction
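I predict that scaling the data will improve the score for the logistic regression model, because all features will then be on a comparable scale instead of a few large-valued features dominating the fit. Scaling the data likely won't affect the random forest classifier model, since decision-tree splits depend only on the ordering of values within each feature, not on their magnitude.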

In [8]:
# Standardize the features: the scaler is fitted on the training set only, and
# the same fitted transform is then applied to the testing set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Logistic Regression on the scaled features: fit on the scaled training set,
# then report the model score on the scaled training and testing sets.
classifier_scaled = LogisticRegression(max_iter=1000)
classifier_scaled.fit(X_train_scaled, y)

lr_scaled_train_score = classifier_scaled.score(X_train_scaled, y)
lr_scaled_test_score = classifier_scaled.score(X_test_scaled, y2)
print(f"Training Data Score: {lr_scaled_train_score}")
print(f"Testing Data Score: {lr_scaled_test_score}")

Training Data Score: 0.7127257799671592
Testing Data Score: 0.7201190982560612


In [10]:
# Random Forest classifier on the scaled features, using the same
# hyperparameters as the unscaled run so the two scores are comparable.
clf_scaled = RandomForestClassifier(n_estimators=200, random_state=1)
clf_scaled.fit(X_train_scaled, y)

rf_scaled_train_score = clf_scaled.score(X_train_scaled, y)
rf_scaled_test_score = clf_scaled.score(X_test_scaled, y2)
print(f'Training Score: {rf_scaled_train_score}')
print(f'Testing Score: {rf_scaled_test_score}')

Training Score: 1.0
Testing Score: 0.6214376860910251


# Findings

As anticipated, scaling the data improved the results for the logistic regression, and the random forest classifier model does not seem to be affected. Additionally, on the scaled data the logistic regression performs better on the test set than the random forest classifier model.

Therefore, scaling the data and using a logistic regression model would be the best approach out of the 4 attempted here. The model did not seem to be overfitted (its training and testing scores are close), and this approach gave the highest accuracy, approximately 0.720119, on the test data.