In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
#from imblearn.over_sampling import RandomOverSampler

In [2]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings; used as the 'model' step of the
# SMOTE pipeline that is cross-validated further down.
clf = LogisticRegression()

In [3]:
# Load the pre-scaled grade feature together with the binary loan-status target.
grade_csv_path = '../../Data/grade_feat_scaled.csv'
data = pd.read_csv(grade_csv_path)

In [4]:
# Peek at the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,grade,loan_status_bin
0,0.196276,0.0
1,0.196276,0.0
2,-0.575883,0.0
3,2.512753,0.0
4,0.196276,0.0


In [28]:
#Class balance of the target column
#0 = Loan Fully Paid
#1 = Loan Defaulted
data['loan_status_bin'].value_counts()

0.0    1076751
1.0     268559
Name: loan_status_bin, dtype: int64

In [5]:
# Feature matrix: every column of `data` except the target.
X = data.drop(columns='loan_status_bin')
X.shape

(1345310, 1)

In [6]:
# Target vector: binary loan status (0 = fully paid, 1 = defaulted).
y = data['loan_status_bin']

In [7]:
# Oversample the minority class with SMOTE, then fit the logistic regression.
# SMOTE draws its synthetic samples at random, so it is seeded here (same seed
# as the CV splitter below) to make the cross-validation scores reproducible
# on a fresh Restart & Run All.
steps = [('over', SMOTE(random_state=1)), ('model', clf)]
pipeline = Pipeline(steps=steps)

In [8]:
# Stratified 5-fold cross-validation (single repeat, fixed seed) of the SMOTE
# pipeline, scored by recall on each fold.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='recall',
    cv=cv,
    n_jobs=-1,
)
score = scores.mean()

# Per-fold scores first, then their mean.
print(scores)
print(score)

[0.74988364 0.75325812 0.75148943 0.75232723 0.74992553]
0.7513767868768115


In [9]:
# Same stratified 5-fold setup as the recall run, this time scored by accuracy.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
score = scores.mean()

# Per-fold scores first, then their mean.
print(scores)
print(score)

[0.56677271 0.56646424 0.56697713 0.56818131 0.56682475]
0.5670440270272279


In [23]:
# Distinct scaled grade values in ascending order, shaped as a single-feature
# column (n_values, 1) so they can be passed straight to predict().
unique_grades = sorted(data['grade'].unique())
scaled_grades = np.array(unique_grades).reshape(-1, 1)
scaled_grades

array([[-1.34804177],
       [-0.57588288],
       [ 0.19627602],
       [ 0.96843491],
       [ 1.7405938 ],
       [ 2.51275269],
       [ 3.28491158]])

In [16]:
# Baseline: a second default logistic regression, fit below on the raw
# (non-oversampled) data for comparison.
clf2 = LogisticRegression()

In [17]:
# Fit the baseline on the full data set (no resampling, no hold-out split).
clf2.fit(X,y)

LogisticRegression()

In [24]:
# Predicted class of the baseline model for each distinct scaled grade.
clf2.predict(scaled_grades)

array([0., 0., 0., 0., 0., 1., 1.])

In [29]:
# Class probabilities of the baseline model for each distinct scaled grade;
# columns are ordered as in clf2.classes_.
clf2.predict_proba(scaled_grades)

array([[0.91329787, 0.08670213],
       [0.86600632, 0.13399368],
       [0.79860776, 0.20139224],
       [0.7087109 , 0.2912891 ],
       [0.59884318, 0.40115682],
       [0.47805493, 0.52194507],
       [0.35977962, 0.64022038]])

In [31]:
#Basic accuracy score of roughly 80%
#Note: measured on the same data the model was fit on, not on a held-out set.
#For context, class 0 alone makes up about 80% of the rows
#(1076751 of 1345310 per the value counts above).
clf2.score(X,y)

0.7980785097858486

In [33]:
#Recall of the baseline model on the data it was fit on: about 7%
from sklearn.metrics import recall_score
y_pred = clf2.predict(X)
recall_score(y_true=y, y_pred=y_pred)

0.07093785723062716

In [34]:
# Confusion matrix of the baseline model: rows are true classes, columns are
# predicted classes.
from sklearn import metrics
cm = metrics.confusion_matrix(y_true=y, y_pred=y_pred)
print(cm)

[[1054612   22139]
 [ 249508   19051]]


In [36]:
# Alternative to SMOTE: class_weight='balanced' weights each class inversely
# to its frequency in y instead of resampling the data.
clf3 = LogisticRegression(class_weight='balanced')

In [38]:
# Fit the class-weighted model on the full data set (no hold-out split).
clf3.fit(X,y)

LogisticRegression(class_weight='balanced')

In [39]:
# Predicted class of the class-weighted model for each distinct scaled grade.
clf3.predict(scaled_grades)

array([0., 0., 1., 1., 1., 1., 1.])

In [40]:
# Class probabilities of the class-weighted model for each distinct scaled
# grade; columns are ordered as in clf3.classes_.
clf3.predict_proba(scaled_grades)

array([[0.73588536, 0.26411464],
       [0.62400313, 0.37599687],
       [0.49711455, 0.50288545],
       [0.37059656, 0.62940344],
       [0.2596526 , 0.7403474 ],
       [0.17280289, 0.82719711],
       [0.11066091, 0.88933909]])

In [41]:
# Accuracy of the class-weighted model on the data it was fit on
# (not a held-out estimate).
clf3.score(X,y)

0.5670440270272279

In [43]:
# Recall of the class-weighted model on the data it was fit on.
y_pred3 = clf3.predict(X)
recall_score(y_true=y, y_pred=y_pred3)

0.7513767924366713

In [44]:
# Confusion matrix of the class-weighted model: rows are true classes,
# columns are predicted classes.
cm3 = metrics.confusion_matrix(y_true=y, y_pred=y_pred3)
print(cm3)

[[561061 515690]
 [ 66770 201789]]
