In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy as sc
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [2]:
# Read the gzip-compressed loan dataset into a DataFrame.
df_1 = pd.read_csv(
    'Data/loan_data.csv.gz',
    compression='gzip',
)

# Show the first rows as a quick sanity check of the loaded columns.
df_1.head()

Unnamed: 0.1,Unnamed: 0,status,loan_amnt,term,annual_inc,dti,payment_inc_ratio,revol_bal,revol_util,purpose,...,delinq_2yrs_zero,pub_rec_zero,open_acc,grade,outcome,emp_length,purpose_,home_,emp_len_,borrower_score
0,1,Charged Off,2500,60 months,30000,1.0,2.3932,1687,9.4,car,...,1,1,3,4.8,default,1,major_purchase,RENT,> 1 Year,0.65
1,2,Charged Off,5600,60 months,40000,5.55,4.5717,5210,32.6,small_business,...,1,1,11,1.4,default,5,small_business,OWN,> 1 Year,0.8
2,3,Charged Off,5375,60 months,15000,18.08,9.716,9279,36.5,other,...,1,1,2,6.0,default,1,other,RENT,> 1 Year,0.6
3,4,Charged Off,9000,36 months,30000,10.08,12.2152,10452,91.7,debt_consolidation,...,1,1,4,4.2,default,1,debt_consolidation,RENT,> 1 Year,0.5
4,5,Charged Off,10000,36 months,100000,7.06,3.90888,11997,55.5,other,...,1,1,14,5.4,default,4,other,RENT,> 1 Year,0.55


In [None]:
# Naive Bayes classifier on the categorical predictors only.
preds_1 = ['purpose_', 'home_', 'emp_len_']
out_1 = ['outcome']

# One-hot encode the categorical predictors; the empty prefix keeps the raw
# category labels as column names.
X_1 = pd.get_dummies(df_1[preds_1], prefix='', prefix_sep='')
y_1 = df_1[out_1]

mod_1 = MultinomialNB(alpha=0.01, fit_prior=True)
# scikit-learn expects a 1-D target; passing the one-column DataFrame itself
# triggers a DataConversionWarning ("column-vector y was passed").
mod_1.fit(X_1, y_1['outcome'])

# Per-class *log* conditional probabilities of each feature (rows follow
# mod_1.classes_, columns follow X_1.columns).
print(mod_1.feature_log_prob_)

# Single-row slice (label 146) kept as a DataFrame so it can be fed to predict().
new_loan = X_1.loc[146:146, :]

print('\npredicted class: ', mod_1.predict(new_loan)[0])

# Label the probability columns from the fitted class order instead of
# hard-coding it, so the headers can never be swapped relative to the values.
probs = pd.DataFrame(
    mod_1.predict_proba(new_loan),
    columns=[f'{label} %' for label in mod_1.classes_],
)

print('\nPredicted Probabilities', probs.mul(100).head())

In [None]:
# Linear discriminant analysis on two numeric predictors.
# NOTE: this converts the 'outcome' column of df_1 in place to a categorical dtype.
df_1['outcome'] = df_1['outcome'].astype('category')
preds_2 = ['borrower_score', 'payment_inc_ratio']
out_2 = ['outcome']

X2 = df_1[preds_2]
y2 = df_1[out_2]

LD_model = LinearDiscriminantAnalysis()

# Fit on a 1-D target: handing scikit-learn the one-column DataFrame raises a
# DataConversionWarning ("column-vector y was passed").
LD_model.fit(X2, y2['outcome'])

# Discriminant scalings, labelled by predictor name.
print(pd.DataFrame(LD_model.scalings_, index=X2.columns))

# Class-membership probabilities for the training rows; X2 is reused rather
# than re-slicing df_1 with the same column list.
pred = pd.DataFrame(LD_model.predict_proba(X2), columns=LD_model.classes_)
pred.head()

In [48]:
# Logistic regression on mixed numeric and categorical predictors.
preds_3 = ['payment_inc_ratio', 'purpose_', 'home_', 'emp_len_', 'borrower_score']
out_3 = ['outcome']

# One-hot encode the categorical predictors, dropping the first level of each
# so the dummies are not collinear with the intercept.
X3 = pd.get_dummies(df_1[preds_3], prefix='', prefix_sep='', drop_first=True)
# y3 stays a one-column DataFrame because later cells index it as y3['outcome'].
y3 = df_1[out_3]

# penalty and C control L1/L2 regularisation; C is the *inverse* regularisation
# strength, so the huge value used here effectively switches the penalty off.
log_model = LogisticRegression(penalty='l2', C=1e42, solver='liblinear')

# Pass a 1-D target: fitting on the one-column DataFrame is what produced the
# "y = column_or_1d(y, warn=True)" DataConversionWarning.
log_model.fit(X3, y3['outcome'])

print(log_model)

# Predict once and reuse the result for every report below.
y3_pred = log_model.predict(X3)
print(y3_pred)

print(log_model.predict_proba(X3))

print(confusion_matrix(y3['outcome'], y3_pred))
# The metrics accept string labels directly; pos_label names the class whose
# recall is reported, replacing the manual 1/0 re-encoding.
print(metrics.accuracy_score(y3['outcome'], y3_pred))
print(metrics.recall_score(y3['outcome'], y3_pred, pos_label='default'))

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1e+42, solver='liblinear')
['paid off' 'paid off' 'default' ... 'paid off' 'paid off' 'default']
[[0.24250183 0.75749817]
 [0.31440728 0.68559272]
 [0.51662729 0.48337271]
 ...
 [0.44768059 0.55231941]
 [0.27833029 0.72166971]
 [0.54263247 0.45736753]]
[[14336  8335]
 [ 8148 14523]]
0.6364739093996736
0.6323496978518812


In [24]:
# Generalized Linear Model (binomial family, i.e. logistic regression)
# Encode the response numerically: 1 = 'default', 0 = anything else.
y_numbers = (y3['outcome'] == 'default').astype(int).tolist()

# Design matrix with an explicit intercept column. Cast to float so that any
# boolean dummy columns do not turn the matrix into an object-dtype array.
glm_design = X3.assign(const=1).astype(float)

logit_glm_model = sm.GLM(y_numbers, glm_design, family=sm.families.Binomial())

logit_glm_result = logit_glm_model.fit()

# Fitted probability of the positive class (coded 1 = 'default').
pred = logit_glm_result.predict(glm_design)
# predict() returns probabilities, not labels: comparing them with the string
# 'default' was always False. Threshold at 0.5 to get the predicted class.
pred_y = pred > 0.5
true_y = y3['outcome'] == 'default'

In [19]:
# Generalized Additive Model: binomial GLM specified through a formula, with
# B-spline terms (4 degrees of freedom each) for the two numeric predictors.
import statsmodels.formula.api as smf

formula = 'outcome ~ ' + ' + '.join([
    'bs(payment_inc_ratio, df=4)',
    'purpose_',
    'home_',
    'emp_len_',
    'bs(borrower_score, df=4)',
])

model = smf.glm(
    formula=formula,
    data=df_1,
    family=sm.families.Binomial(),
)
results = model.fit()

In [7]:
# undersampling
# Logistic regression fitted on every row of df_1 (no rows are dropped or
# resampled in this cell), reporting the share of rows predicted as 'default'.
pred_4 = [
    'payment_inc_ratio', 'purpose_', 'home_', 'emp_len_',
    'dti', 'revol_bal', 'revol_util',
]
out_4 = 'outcome'

# One-hot encode the categorical predictors, dropping the first level of each.
X4 = pd.get_dummies(
    df_1[pred_4],
    prefix='',
    prefix_sep='',
    drop_first=True,
)
y4 = df_1[out_4]

# C is the inverse regularisation strength; 1e42 leaves the fit practically unpenalised.
undersample_model = LogisticRegression(
    penalty='l2',
    C=1e42,
    solver='liblinear',
)
undersample_model.fit(X4, y4)

print(
    'Percentage of loans predicted as default: ',
    (undersample_model.predict(X4) == 'default').mean()*100,
)

Percentage of loans predicted as default:  49.256759737109086


In [15]:
# oversampling
# Emulate oversampling through sample weights: each 'default' row is weighted
# by the inverse of the default rate, every other row keeps weight 1.
default_wt = 1 / (df_1[out_4] == 'default').mean()
wt = [
    default_wt if label == 'default' else 1
    for label in df_1[out_4]
]

oversample_model = LogisticRegression(
    penalty='l2',
    C=1e42,
    solver='liblinear',
)
oversample_model.fit(X4, y4, sample_weight=wt)

print(
    'Percentage of loans predicted as default(with weights): ',
    (oversample_model.predict(X4) == 'default').mean()*100,
)

Percentage of loans predicted as default(with weights):  99.80812491729523
