In [None]:
# Import Dependencies
import numpy as np
import pandas as pd
from pathlib import Path

# Processing Libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the 2019 loans as the training frame and the 2020 Q1 loans as the test frame.
train_df, test_df = (
    pd.read_csv(Path(csv_path))
    for csv_path in ('Resources/2019loans.csv', 'Resources/2020Q1loans.csv')
)

In [None]:
# Exploratory check: one-hot encode the whole training frame (target column
# included) to see which dummy columns pandas generates.
X_train_dummies = train_df.pipe(pd.get_dummies)
print(X_train_dummies.columns)
X_train_dummies

In [None]:
# Preview the first five rows of the training data.
train_df.head(5)

In [None]:
# Preview the first rows of the testing data (bounded preview, matching the
# training-data preview, instead of dumping the whole frame).
test_df.head()

In [None]:
# Build the training inputs: one-hot encode every feature column and
# integer-encode the "loan_status" target.

y_train = LabelEncoder().fit_transform(train_df["loan_status"])
X_train = pd.get_dummies(train_df.drop(columns=["loan_status"]))
print(X_train.columns)
y_train

In [None]:
# Convert categorical data to numeric and separate target feature for testing data

X_test = test_df.drop(["loan_status"], axis=1)
X_test = pd.get_dummies(X_test)
# Encode the test labels with an encoder fitted on the TRAINING labels so both
# splits share a single class-to-integer mapping. Fitting a fresh encoder on the
# test labels alone could number the classes differently if the label sets ever
# diverge; with this form an unseen test label raises instead of being
# silently mis-encoded.
y_test = LabelEncoder().fit(train_df["loan_status"]).transform(test_df["loan_status"])
print(X_test.columns)
y_test

In [None]:
# Align the testing features with the training features.
#
# A category that never occurs in the test data produces no dummy column there.
# Reindexing against the training columns adds every such column filled with 0
# (the category is absent for all test rows), drops any test-only columns the
# models were never trained on, and puts the columns in the training order,
# which scikit-learn expects when predicting from a DataFrame.
# This replaces a hard-coded, row-wise computation of a single column and is
# safe to re-run.

X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test

In [None]:
# Fit a logistic regression on the raw (unscaled) features, then report its
# score on the training and testing splits.

classifier = LogisticRegression(max_iter=10000).fit(X_train, y_train)
train_score, test_score = (
    classifier.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
print(f"Train Score: {train_score:.3f}")
print(f"Test Score: {test_score:.3f}")

In [None]:
# Fit a random forest (fixed seed for repeatable results) on the raw features
# and report its score on both splits.

rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train)
train_score = rfc.score(X_train, y_train)
print(f"Train Score: {train_score:.3f}")
test_score = rfc.score(X_test, y_test)
print(f"Test Score: {test_score:.3f}")

In [None]:
# Standardize the features: the scaler is fitted on the training features only,
# and those same fitted statistics are then applied to the testing features.

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Repeat the logistic regression on the standardized features and report its
# score on the training and testing splits.

classifier = LogisticRegression(max_iter=10000)
classifier = classifier.fit(X_train_scaled, y_train)
train_score, test_score = (
    classifier.score(X_train_scaled, y_train),
    classifier.score(X_test_scaled, y_test),
)
print(f"Train Score: {train_score:.3f}")
print(f"Test Score: {test_score:.3f}")

In [None]:
# Repeat the random forest (same fixed seed) on the standardized features and
# report its score on both splits.

rfc = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)
train_score, test_score = (
    rfc.score(scaled_features, labels)
    for scaled_features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test))
)
print(f"Train Score: {train_score:.3f}")
print(f"Test Score: {test_score:.3f}")