In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


## Metrics 
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import learning_curve, train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import  StandardScaler

## For ML Models:
from sklearn.linear_model import LogisticRegression,LinearRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import random


In [9]:
df = pd.read_csv('cleaned.csv')

In [10]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=42)

In [11]:
y_test

21781    0
16129    1
12830    0
20940    1
18392    1
        ..
27097    0
6161     1
28543    0
3388     0
3310     1
Name: loan_status, Length: 8592, dtype: int64

In [12]:
print(f'X Test Shape: {X_test.shape}')
print(f'y Test Shape: {y_test.shape}')
print(f'X Train Shape: {X_train.shape}')
print(f'y Train Shape: {y_train.shape}')


X Test Shape: (8592, 11)
y Test Shape: (8592,)
X Train Shape: (20046, 11)
y Train Shape: (20046,)


## Creating the Pipeline will require:
* Preprocessing numerical features:
    1. StandardScaler - To balance scale among features 
*
* Applying SMOTE to handle imbalance in our Target variable

## Function for Model Results

In [13]:
def results(model, X, y):
    y_pred = model.predict(X)
    print(f'Accuracy Score: {accuracy_score(y,y_pred)}')
    print(f'Precision Score: {precision_score(y,y_pred)}')
    print(f'Recall Score: {recall_score(y,y_pred)}')
    print(f'F1 Score: {f1_score(y,y_pred)}')
    # ConfusionMatrixDisplay(model,X,y)

In [14]:
numericals = list(X.select_dtypes(['float64','int64']).columns)
numericals

['person_age',
 'person_income',
 'person_home_ownership',
 'person_emp_length',
 'loan_intent',
 'loan_grade',
 'loan_amnt',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_default_on_file',
 'cb_person_cred_hist_length']

In [15]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(),numericals),],remainder= 'passthrough')

# LinearRegression

In [16]:
lr = Pipeline([('preprocessor', preprocessor), ('SMOTE', SMOTE()), ('classifier', LinearRegression())])

In [17]:
lr.fit(X_train,y_train)
print("model score: %.3f" % lr.score(X_test, y_test))

model score: 0.079


### Linear Regression performs poorly with a model score of only 0.079, which suggests that linear regression is not suitable for this classification task. Linear regression is more suitable for regression tasks where the target variable is continuous.

# LogisticRegression 

In [18]:
logreg = Pipeline([('preprocessor', preprocessor), ('SMOTE', SMOTE()), ('classifier', LogisticRegression())])

In [19]:
logreg.fit(X_train,y_train)
print("model score: %.3f" % logreg.score(X_test, y_test))

model score: 0.781


In [20]:
results(logreg,X_train, y_train)

Accuracy Score: 0.7853935947321161
Precision Score: 0.5017926501344487
Recall Score: 0.7764678687008784
F1 Score: 0.6096188747731397


In [21]:
results(logreg,X_test, y_test)

Accuracy Score: 0.7810754189944135
Precision Score: 0.49929922915206726
Recall Score: 0.7591901971230687
F1 Score: 0.6024096385542168


In [22]:
results(logreg,X_test, y_test)

Accuracy Score: 0.7810754189944135
Precision Score: 0.49929922915206726
Recall Score: 0.7591901971230687
F1 Score: 0.6024096385542168


In [23]:
logreg_params = {'classifier__max_iter':[100,1000, 10000],
                 'classifier__solver':['liblinear', 'lbfgs','newton-cg']}

In [24]:
logreg_grid = GridSearchCV(logreg, logreg_params)
logreg_grid.fit(X_train, y_train)

In [25]:
logreg_grid.best_params_

{'classifier__max_iter': 1000, 'classifier__solver': 'lbfgs'}

In [26]:
logreg_grid.best_estimator_

In [27]:
best_logreg = logreg_grid.best_estimator_

In [28]:
results(best_logreg,X_train,y_train)

Accuracy Score: 0.7859922178988327
Precision Score: 0.5027043269230769
Recall Score: 0.7734627831715211
F1 Score: 0.6093607721726462


In [29]:
results(best_logreg, X_test, y_test)

Accuracy Score: 0.7811918063314711
Precision Score: 0.49947201689545934
Recall Score: 0.7559936068193926
F1 Score: 0.6015260703688003


These evaluation metrics suggest that the logistic regression model performs reasonably well in predicting loan status, but precision indicates that theres still room for improvement

# RidgeClassifier

In [30]:
rr = Pipeline([('preprocessor', preprocessor), ('SMOTE', SMOTE()), ('classifier', RidgeClassifier())])

In [31]:
rr.fit(X_train,y_train)
print("model score: %.3f" % rr.score(X_test, y_test))

model score: 0.784


In [32]:
results(rr,X_train,y_train)

Accuracy Score: 0.7886361368851641
Precision Score: 0.5067639458884329
Recall Score: 0.7706888580674989
F1 Score: 0.611462631820266


In [33]:
results(rr,X_test,y_test)

Accuracy Score: 0.784217877094972
Precision Score: 0.5041115480872363
Recall Score: 0.7511987213638786
F1 Score: 0.6033376123234917


In [38]:
rr_params = {
    'classifier__alpha': [0.1, 1.0, 10.0],
    'classifier__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg'],
    'classifier__max_iter': [1000, 5000, 10000]
}

In [39]:
rr_grid = GridSearchCV(rr, rr_params)
rr_grid.fit(X_train, y_train)

In [41]:
best_rr = rr_grid.best_estimator_

In [42]:
results(best_rr,X_train,y_train)

Accuracy Score: 0.7894343011074528
Precision Score: 0.5080091533180778
Recall Score: 0.7697642163661581
F1 Score: 0.6120760959470637


In [43]:
results(best_rr, X_test, y_test)

Accuracy Score: 0.785498137802607
Precision Score: 0.5060757684060043
Recall Score: 0.7543953116675546
F1 Score: 0.6057754010695188


### Ridge Classifier outperforms Logistic Regression in terms of accuracy and F1 score on both the training and test sets. However, the differences in performance between the two models are relatively small.

# Random Forest 

In [45]:
rf = Pipeline([('preprocessor', preprocessor), ('SMOTE', SMOTE()), ('classifier', RandomForestClassifier())])

In [47]:
rf.fit(X_train,y_train)
print("RandomForest Classifier score: %.3f" % rf.score(X_test, y_test))

model score: 0.925


In [48]:
results(rf,X_train,y_train)

Accuracy Score: 1.0
Precision Score: 1.0
Recall Score: 1.0
F1 Score: 1.0


In [49]:
results(rf,X_test,y_test)

Accuracy Score: 0.9250465549348231
Precision Score: 0.9219712525667351
Recall Score: 0.7176345231752796
F1 Score: 0.8070701018573996


In [53]:
rf_params =  {'classifier__criterion': ['gini', 'entropy'], 
               'classifier__max_depth':[15,20],
               'classifier__n_estimators':[15,20,25]
             
             }

In [55]:
rf_grid = GridSearchCV(rf,rf_params,cv=5)
rf_grid.fit(X_train, y_train)

In [60]:
best_rf_grid = rf_grid.best_estimator_
best_rf_grid.fit(X_train,y_train)
print("RandomForest Classifier Grid score: %.3f" % best_rf_grid.score(X_test, y_test))

RandomForest Classifier Grid score: 0.919


In [61]:
results(best_rf_grid,X_train,y_train)

Accuracy Score: 0.9882270777212412
Precision Score: 0.998780487804878
Recall Score: 0.9466019417475728
F1 Score: 0.9719914550201756


In [62]:
results(best_rf_grid,X_test, y_test)

Accuracy Score: 0.9187616387337058
Precision Score: 0.8891089108910891
Recall Score: 0.7176345231752796
F1 Score: 0.7942216981132075


# Support Vector Machine (SVM)

In [72]:
svc = Pipeline([('preprocessor', preprocessor), ('SMOTE',SMOTE()), ('classifier', SVC())])
svc.fit(X_train, y_train)
print("SVM score: %.3f" % svc.score(X_test, y_test))

SVM score: 0.852


In [81]:
svc_params = {'classifier__kernel':['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
              'classifier__degree':[3,10,15]}

In [82]:
svc_grid = GridSearchCV(svc,svc_params, cv=5)
svc_grid.fit(X_train,y_train)

15 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/imblearn/pipeline.py", line 297, in fit
    self._final_estimator.fit(Xt, yt, **fit_params_last_step)
  File "/usr/local/lib/python3.10/site-packages/sklearn/svm/_base.py", line 217, in fit
    raise ValueError(
ValueError: Precomputed matrix must be a square matrix. Input is a 25152x11 matrix.

 0.85862491 0.86166859 0.67669372        nan 0.78698985 0.8455547
 0.86101

In [83]:
best_svc_grid = svc_grid.best_estimator_
best_svc_grid.fit(X_train, y_train)
print("SVC Grid score: %.3f" % best_svc_grid.score(X_test, y_test))

SVC Grid score: 0.851


In [84]:
results(best_svc_grid,X_train, y_train) 

Accuracy Score: 0.8698992317669361
Precision Score: 0.6724899598393574
Recall Score: 0.7741562644475266
F1 Score: 0.7197506984741027


In [85]:
results(best_svc_grid,X_test, y_test)

Accuracy Score: 0.8505586592178771
Precision Score: 0.6373320981936081
Recall Score: 0.7330847096430474
F1 Score: 0.6818632309217046


# Testing Models on Random 

In [None]:
def test_random_row(models, df):
    random_index = random.randint(0, len(df) - 1)
    random_row = df.iloc[random_index]

    features = random_row.drop('loan_status')
    target = random_row['loan_status']

    results = {}
    for model_name, model in models.items():
        score = model.predict_proba([features])[0][target]
        results[model_name] = score

    return results

In [86]:
NEEDS TO BE FIXED 
# 
# models = {
#     'Logistic Regression': logistic_regression_model,
#     'Random Forest': random_forest_model,
#     'SVM': svm_model,
#     # Add more models as needed
# }

# results = test_random_row(models, df)
