In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 


from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


## Metrics 
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import learning_curve, train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import  StandardScaler

## For ML Models:
from sklearn.linear_model import LogisticRegression,LinearRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [8]:
df = pd.read_csv('cleaned.csv')

In [9]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state=42)

In [39]:
y_test

21781    0
16129    1
12830    0
20940    1
18392    1
        ..
27097    0
6161     1
28543    0
3388     0
3310     1
Name: loan_status, Length: 8592, dtype: int64

In [14]:
print(f'X Test Shape: {X_test.shape}')
print(f'y Test Shape: {y_test.shape}')
print(f'X Train Shape: {X_train.shape}')
print(f'y Train Shape: {y_train.shape}')


X Test Shape: (8592, 11)
y Test Shape: (8592,)
X Train Shape: (20046, 11)
y Train Shape: (20046,)


## Creating the Pipeline will require:
* Preprocessing numerical features:
    1. StandardScaler - To balance scale among features 
*
* Applying SMOTE to handle imbalance in our Target variable

## Function for Model Results

In [68]:
def results(model, X, y):
    y_pred = model.predict(X)
    print(f'Accuracy Score: {accuracy_score(y,y_pred)}')
    print(f'Precision Score: {precision_score(y,y_pred)}')
    print(f'Recall Score: {recall_score(y,y_pred)}')
    print(f'F1 Score: {f1_score(y,y_pred)}')
    # ConfusionMatrixDisplay(model,X,y)

In [20]:
numericals = list(X.select_dtypes(['float64','int64']).columns)
numericals

['person_age',
 'person_income',
 'person_home_ownership',
 'person_emp_length',
 'loan_intent',
 'loan_grade',
 'loan_amnt',
 'loan_int_rate',
 'loan_percent_income',
 'cb_person_default_on_file',
 'cb_person_cred_hist_length']

In [19]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(),numericals),],remainder= 'passthrough')

# LinearRegression

In [25]:
lr = Pipeline([('preprocessor', preprocessor), ('SMOTE', SMOTE()), ('classifier', LinearRegression())])

In [26]:
lr.fit(X_train,y_train)
print("model score: %.3f" % lr.score(X_test, y_test))

model score: 0.083


# LogisticRegression 

In [52]:
logreg = Pipeline([('preprocessor', preprocessor), ('SMOTE', SMOTE()), ('classifier', LogisticRegression())])

In [56]:
logreg.fit(X_train,y_train)
print("model score: %.3f" % logreg.score(X_test, y_test))

model score: 0.781


In [83]:
results(logreg,X_train, y_train)

Accuracy Score: 0.7856430210515813
Precision Score: 0.5021683864214147
Recall Score: 0.7762367082755433
F1 Score: 0.6098247525651502


In [69]:
results(logreg,X_test, y_test)

Accuracy Score: 0.7814245810055865
Precision Score: 0.49982449982449983
Recall Score: 0.7586574320724561
F1 Score: 0.6026237833262802


In [None]:
results(logreg,X_test, y_test)

In [85]:
logreg_params = {'classifier__max_iter':[100,1000, 10000],
                 'classifier__solver':['liblinear', 'lbfgs','newton-cg']}

In [86]:
logreg_grid = GridSearchCV(logreg, logreg_params)
logreg_grid.fit(X_train, y_train)

In [88]:
logreg_grid.best_params_

{'classifier__max_iter': 100, 'classifier__solver': 'liblinear'}

In [89]:
logreg_grid.best_estimator_

In [81]:
best_logreg = logreg_grid.best_estimator_

In [90]:
results(best_logreg,X_train,y_train)

Accuracy Score: 0.7856430210515813
Precision Score: 0.5021696842735298
Recall Score: 0.7757743874248728
F1 Score: 0.6096829866472886


In [91]:
results(best_logreg, X_test, y_test)

Accuracy Score: 0.7811918063314711
Precision Score: 0.49947349947349945
Recall Score: 0.7581246670218433
F1 Score: 0.6022005924672027


These evaluation metrics suggest that the logistic regression model performs reasonably well in predicting loan status, but precision indicates that theres still room for improvement

# RidgeClassifier

In [95]:
rr = Pipeline([('preprocessor', preprocessor), ('SMOTE', SMOTE()), ('classifier', RidgeClassifier())])

In [96]:
rr.fit(X_train,y_train)
print("model score: %.3f" % rr.score(X_test, y_test))

model score: 0.785


In [97]:
results(rr,X_train,y_train)

Accuracy Score: 0.7890352189963085
Precision Score: 0.507385411908025
Recall Score: 0.7702265372168284
F1 Score: 0.6117690259799872


In [98]:
results(rr,X_test,y_test)

Accuracy Score: 0.784683426443203
Precision Score: 0.5048162682839814
Recall Score: 0.753862546616942
F1 Score: 0.6047008547008547


In [99]:
rr_params = {'rr__criterion': ['gini', 'entropy'], 
               'rr__max_depth':[15,20],
               'rr__n_estimators':[15,20,25]}

In [101]:
rr_grid = GridSearchCV(rr, rr_params)
rr_grid.fit(X_train, y_train)

ValueError: Invalid parameter 'rf' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['person_age',
                                                   'person_income',
                                                   'person_home_ownership',
                                                   'person_emp_length',
                                                   'loan_intent', 'loan_grade',
                                                   'loan_amnt', 'loan_int_rate',
                                                   'loan_percent_income',
                                                   'cb_person_default_on_file',
                                                   'cb_person_cred_hist_length'])])),
                ('SMOTE', SMOTE()), ('classifier', RidgeClassifier())]). Valid parameters are: ['memory', 'steps', 'verbose'].