In [3]:
# Import packages
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve, auc, average_precision_score

In [4]:
# Load the LendingClub train/test splits from their Excel workbooks.
train = pd.read_excel('lendingclub_traindata.xlsx')
test = pd.read_excel('lendingclub_testdata.xlsx')

# Give both frames the same column names.
# loan_status encoding: 1 = good, 0 = default
cols = ['home_ownership', 'income', 'dt1','fico_low','loan_status']
train.columns = cols
test.columns = cols

# Preview the first rows of each split, separated by a dashed rule.
print(train.head(), "---------------------", test.head(), sep="\n")

   home_ownership  income    dt1  fico_low  loan_status
0               1  44.304  18.47       690            0
1               0  38.500  33.73       660            0
2               1  54.000  19.00       660            0
3               1  60.000  33.98       695            0
4               0  39.354  10.85       685            0
---------------------
   home_ownership  income    dt1  fico_low  loan_status
0               1   127.0  10.94       675            0
1               1   197.0  15.64       710            0
2               1    25.5  28.75       670            0
3               1    80.0  20.16       660            0
4               0    57.0  30.60       675            0


In [5]:
# Split each frame into a feature matrix (X_*) and a target vector (y_*).
# The target is dropped with the `columns=` keyword: passing the axis
# positionally (`drop('loan_status', 1)`) was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0, while the keyword form works on both.
X_train = train.drop(columns='loan_status')
X_test = test.drop(columns='loan_status')

# store target column
y_train = train['loan_status']
y_test = test['loan_status']

# Prints the training feature frame followed by the shapes of the other three objects.
print(X_train, y_train.shape, X_test.shape, y_test.shape)
X_train.columns

      home_ownership   income    dt1  fico_low
0                  1   44.304  18.47       690
1                  0   38.500  33.73       660
2                  1   54.000  19.00       660
3                  1   60.000  33.98       695
4                  0   39.354  10.85       685
...              ...      ...    ...       ...
8690               0   53.000  36.16       685
8691               1  113.500  11.18       690
8692               0  118.000   1.85       785
8693               1   82.000  10.90       705
8694               0  150.000  19.02       745

[8695 rows x 4 columns] (8695,) (5916, 4) (5916,)


Index(['home_ownership', 'income', 'dt1', 'fico_low'], dtype='object')

In [7]:
# Baseline: always predict the most frequent class found in the training labels.
majority_class = y_train.mode().iloc[0]
prediction = np.full(len(y_train), majority_class)
# Accuracy of that constant prediction, measured on the training labels.
accuracy_score(y_true=y_train, y_pred=prediction)

0.8276020701552617

In [8]:
# Count how often each loan_status class occurs in the training labels,
# then express every count as a percentage of the total.
freq = y_train.value_counts()
freq.div(freq.sum()).mul(100)

1    82.760207
0    17.239793
Name: loan_status, dtype: float64

In [9]:
# Fit a logistic regression without a penalty term, using the Newton-CG solver.
# NOTE(review): newer scikit-learn releases expect `penalty=None` instead of the
# string "none" -- confirm against the installed version before upgrading.
lgstc_reg = LogisticRegression(solver='newton-cg', penalty="none").fit(X_train, y_train)

# Show the fitted intercept and the coefficient row (one value per feature column).
print(lgstc_reg.intercept_, lgstc_reg.coef_)

[-6.5652302] [[ 0.139496    0.00410667 -0.00112302  0.0112521 ]]


In [10]:
# Predict a loan_status class label for every row of the test feature set.
y_pred = lgstc_reg.predict(X=X_test)

In [None]:
# Evaluate the fitted logistic regression at several probability cut-offs.
# A loan is predicted "good" (1) when its predicted probability of class 1
# exceeds the threshold, and "default" (0) otherwise.
THRESHOLD = [.75, .80, .85]

# The model is already fitted in an earlier cell and the predicted
# probabilities do not depend on the threshold, so compute them once.
proba_good = lgstc_reg.predict_proba(X_test)[:, 1]

records = []
for thr in THRESHOLD:
    # Emit class labels (1/0), not the threshold value itself, so the
    # predictions are comparable with y_test.
    preds = np.where(proba_good > thr, 1, 0)

    # labels=[1, 0] puts the positive class first, giving [[TP, FN], [FP, TN]].
    # Counts are converted to percentages of the test-set size.
    cm = confusion_matrix(y_test, preds, labels=[1, 0]) / len(y_test) * 100

    print('Confusion matrix for threshold =', thr)
    print(cm)
    print(' ')

    # Only the actual-negative row is needed for the rates below.
    FP, TN = cm[1]

    records.append({
        'THRESHOLD': thr,
        'accuracy': accuracy_score(y_test, preds),
        'recall': recall_score(y_test, preds),
        'tnr': TN / (FP + TN),   # true negative rate (specificity)
        'fpr': FP / (FP + TN),   # false positive rate
        'precision': precision_score(y_test, preds),
        'f1_score': f1_score(y_test, preds),
    })

# Build the summary table in one go: one row per threshold.
results = pd.DataFrame(records, columns=['THRESHOLD', 'accuracy', 'recall', 'tnr',
                                         'fpr', 'precision', 'f1_score'])

print('ALL METRICS')
print(results.T)
