In [13]:
# Importing initial libraries
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# Import the data
path = "Resources/lending_data.csv"
ld = pd.read_csv(path)
ld.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [14]:
# setting up the multidimentional variable x and one dimentional y variables
x = ld[[
    "loan_size",
    "interest_rate",
    "borrower_income",
    "debt_to_income",
    "num_of_accounts",
    "derogatory_marks",
    "total_debt"]]
y = ld["loan_status"]

### Prediction: I beleive that the Random Forest Classifier will give better results that the Logistic Regression model:
##### RFC is suitable in our case as it does not focus on interpretability.
##### In addition, the main focus of the project would benefit more from trained decision trees that lead to the prediction.
##### Finally, RFC adds additional randomness to the model while growing the decision trees, which results in a wide diversity that results a better model.

In [15]:
# Split the data into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [16]:
#Scaling data based on the values provided
from sklearn.preprocessing import StandardScaler
# Train data
scaler_train = StandardScaler().fit(X_train)
X_train_scaled = scaler_train.transform(X_train)
# Test data
scaler_test = StandardScaler().fit(X_test)
X_test_scaled = scaler_test.transform(X_test)

In [17]:
# Train a Logistic Regression model print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver = 'saga',penalty = 'elasticnet', l1_ratio = 0.1, class_weight = 'balanced')
classifier

LogisticRegression(class_weight='balanced', l1_ratio=0.1, penalty='elasticnet',
                   solver='saga')

In [18]:
# Training the classifier
classifier.fit(X_train_scaled, y_train)



LogisticRegression(class_weight='balanced', l1_ratio=0.1, penalty='elasticnet',
                   solver='saga')

In [19]:
# Checking validity against test data
print(f"Training data model score: {classifier.score(X_train_scaled, y_train):.5f}")
print(f"Testing data model score: {classifier.score(X_test_scaled, y_test):.5f}")

Training data model score: 0.99398
Testing data model score: 0.99381


In [20]:
# Compare predictions with actual results
predictions = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Predictions": predictions, "Actuals": y_test})
results.head(10)

Unnamed: 0,Predictions,Actuals
67504,0,0
55887,0,0
14010,0,0
43768,0,0
24257,0,0
36048,0,0
17518,0,0
34544,0,0
77433,1,1
18838,0,0


In [21]:
# Train a Random Forest Classifier model and print the model score
from sklearn.ensemble import RandomForestClassifier
rclassifier = RandomForestClassifier(max_depth=3, random_state=42, n_estimators = 50)
rclassifier.fit(X_train_scaled, y_train)

RandomForestClassifier(max_depth=3, n_estimators=50, random_state=42)

In [22]:
# Checking validity against test data
print(f"Training data model score: {rclassifier.score(X_train_scaled, y_train):.5f}")
print(f"Testing data model score: {rclassifier.score(X_test_scaled, y_test):.5f}")

Training data model score: 0.99445
Testing data model score: 0.99438


In [23]:
# Compare predictions with actual results
rpredictions = rclassifier.predict(X_test_scaled)
rcresults = pd.DataFrame({"Predictions": rpredictions, "Actuals": y_test})
rcresults.head(10)

Unnamed: 0,Predictions,Actuals
67504,0,0
55887,0,0
14010,0,0
43768,0,0
24257,0,0
36048,0,0
17518,0,0
34544,0,0
77433,1,1
18838,0,0


### Conclusion
##### The RFC testing model score of 0.99438 is higher than the Logistic Regression testing model score of 0.99381
##### The above numbers do explain my prediction. As the model is looking at credit risk evaluation model, the decision tree usage would be a great benefit.
##### The RFC forest had a total of 50 trees that the model averaged to get the model score.
##### Both models had a test score > train score which is not best practice. This is where i had to add hyper-parameters to both models to make it a more normal model where the train model score is higher than the test model score