In [None]:
library(MASS)
library(ggplot2)
library(dplyr)
library(mgcv)


In [None]:
# Read the loan data set from the parent directory.
loan_data <- read.csv('../loan_data.csv')

# The level order decides which class is modelled: 'default' is listed second,
# so a binomial model treats it as the "success" (1) class and
# predict( , type="response") returns the probability of a 'default'.
loan_data$outcome <- ordered(
  loan_data$outcome,
  levels = c('paid off', 'default')
)

head(loan_data)

In [None]:
# List the distinct outcome values (and the factor's level order).
unique(loan_data[['outcome']])

In [None]:
# Fit a logistic regression (binomial GLM) of outcome on the predictors named
# in the formula, then print the fitted model.
logistic_model <- glm(
  outcome ~ payment_inc_ratio + purpose_ + home_ + emp_len_ + borrower_score,
  data = loan_data,
  family = binomial
)
logistic_model


In [None]:
# Show the detailed summary of the fitted logistic regression.
summary(logistic_model)

In [None]:
# In-sample predictions on the linear-predictor (log-odds) scale; 'link' is
# the default type for predict() on a glm, spelled out here for clarity.
pred <- predict(logistic_model, type = 'link')
summary(pred)

In [None]:
# The inverse logit (logistic function) maps the log-odds in `pred` back to
# the 0 to 1 probability scale. plogis(x) computes 1 / (1 + exp(-x)) and keeps
# the names of `pred`.
prob <- plogis(pred)
summary(prob)

In [None]:
# Element 147 of the log-odds predictions.
pred[147]

In [None]:
# Element 147 of the predictions after conversion to the 0 to 1 scale.
prob[147]

In [None]:
# First few log-odds predictions.
head(pred)

In [None]:
# First few predictions on the 0 to 1 scale, for comparison with head(pred).
head(prob)

In [None]:
# Predict for row 147 alone, passing only the model's predictor columns.
# No `type` is given, so the result is on the log-odds (link) scale.
guess <- predict(
  logistic_model,
  newdata = loan_data[
    147,
    c('payment_inc_ratio', 'purpose_', 'home_', 'emp_len_', 'borrower_score')
  ]
)

In [None]:
# Show the single-row prediction.
guess

In [None]:
# predict() has a special method for glm objects:
# type = "response" applies the inverse logit for us, so the result is a
# probability rather than log-odds.
predict(
  logistic_model,
  newdata = loan_data[
    147,
    c('payment_inc_ratio', 'purpose_', 'home_', 'emp_len_', 'borrower_score')
  ],
  type = "response"
)

In [None]:
# Inverse logit of the log-odds in `guess`. Despite the variable name, the
# result is a probability on the 0 to 1 scale, not odds; the name is kept
# because a later cell prints `odds`.
odds <- plogis(guess)

In [None]:
# Show the value derived from `guess` (a probability, despite the name).
odds

In [None]:
# Euler's number e, the base used by exp() and log().
exp(1)

In [None]:
# Grid of probabilities from 0.01 to 0.99 together with the matching odds,
# p / (1 - p), and logit, log(p / (1 - p)).
p <- seq(from = 0.01, to = 0.99, by = 0.01)
df <- data.frame(
  p = p,
  logit = log(p / (1 - p)),
  odds = p / (1 - p)
)
# The logit function maps a probability between 0 and 1 onto a much larger
# range.

Since we're trying to predict a category, not a linear range, the logit function is used because it fits more closely than a straight line. A good explanation: https://www.theanalysisfactor.com/what-is-logit-function/


In [None]:
# Probability p plotted against logit(p).
ggplot(df, aes(x = logit, y = p)) +
  geom_line() +
  labs(x = 'logit(p)', y = 'p') +
  theme_bw()

In [None]:
# Odds plotted against their logarithm, restricted to logit values in [0, 5]
# and odds in [1, 100].
ggplot(df, aes(x = logit, y = odds)) +
  geom_line() +
  labs(x = 'log(odds ratio) or logit(p)', y = 'odds ratio') +
  ylim(1, 100) +
  xlim(0, 5) +
  theme_bw()

In [None]:
# Outcome (coded 0 for 'default', 1 otherwise) against borrower_score, with
# logit(borrower_score) drawn on top. The original denominator
# `1/1-borrower_score` parses as `(1/1) - borrower_score`, i.e.
# `1 - borrower_score`.
# NOTE(review): this 0/1 coding is the reverse of `defaulted` used for the
# confusion matrix further down -- confirm that is intended.
ggplot(
  loan_data,
  aes(x = borrower_score, y = ifelse(outcome == 'default', 0, 1))
) +
  geom_point() +
  geom_line(
    aes(
      x = borrower_score,
      y = log(borrower_score / (1 - borrower_score))
    )
  )

We can use a spline in a 'Generalized Additive Model' if we think a predictor has a non-linear relationship with the outcome being predicted.


In [None]:
# Same model as the logistic regression, but payment_inc_ratio and
# borrower_score enter through smooth terms s() instead of linearly.
logistic_gam <- gam(
  outcome ~ s(payment_inc_ratio) + purpose_ + home_ + emp_len_ +
    s(borrower_score),
  data = loan_data,
  family = 'binomial'
)
logistic_gam

In [None]:
# GAM prediction for row 147 on the response (probability) scale.
guess2 <- predict(
  logistic_gam,
  newdata = loan_data[
    147,
    c('payment_inc_ratio', 'purpose_', 'home_', 'emp_len_', 'borrower_score')
  ],
  type = "response"
)
guess2

## We can build a binary confusion matrix from the gam model.

In [None]:
# GAM predictions for every row, on the link (log-odds) scale; 'link' is the
# default type for predict() on a gam and is spelled out here.
predictions <- predict(logistic_gam, newdata = loan_data, type = 'link')

In [None]:
# Log-odds above 0 correspond to a probability above 0.5, so those rows are
# classified as predicted defaults (1); all others are 0.
pred_default <- as.numeric(predictions > 0)
# 1 where the recorded outcome is 'default', 0 otherwise.
defaulted <- as.numeric(loan_data$outcome == 'default')

In [None]:
# Predicted default and actually defaulted.
correct_defaults <- pred_default == 1 & defaulted == 1

In [None]:
# Predicted paid off and actually paid off.
correct_paid <- pred_default == 0 & defaulted == 0

In [None]:
# Predicted default but actually paid off.
missed_def <- pred_default == 1 & defaulted == 0

In [None]:
# Predicted paid off but actually defaulted.
missed_paid <- pred_default == 0 & defaulted == 1

In [None]:
# Number of rows predicted as default that were actually paid off.
sum(missed_def)

In [None]:
# 2x2 confusion matrix with predicted classes in rows (default, paid) and
# actual classes in columns (default, paid). matrix() fills column by column,
# so the counts must be listed one actual class at a time:
#   column 1 (actually defaulted): correct_defaults, missed_paid
#   column 2 (actually paid off):  missed_def,       correct_paid
# missed_paid is "predicted paid, actually defaulted" and missed_def is
# "predicted default, actually paid"; listing them the other way round would
# transpose the matrix relative to its row/column labels.
confusion <- matrix(
  c(
    sum(correct_defaults), sum(missed_paid),
    sum(missed_def), sum(correct_paid)
  ),
  nrow = 2,
  ncol = 2
)

In [None]:
# Show the matrix before row and column labels are attached.
confusion

In [None]:
# Label rows ('def^', 'paid^') and columns ('defaults', 'paid') in one step.
dimnames(confusion) <- list(
  c('def^', 'paid^'),
  c('defaults', 'paid')
)

In [None]:
# Show the labelled confusion matrix.
confusion

In [None]:
# Total of the first ('defaults') column of the confusion matrix.
default_sum <- sum(confusion[, 1])

In [None]:
# Show the total of the first confusion-matrix column.
default_sum

In [None]:
# Total of the second ('paid') column of the confusion matrix.
paid_sum <- sum(confusion[, 2])

In [None]:
# Show the total of the second confusion-matrix column.
paid_sum

In [None]:
# Share of the first column's total that sits in cell [1, 1].
sensitivity <- confusion[1, 1] / default_sum
sensitivity

In [None]:
# Share of the second column's total that sits in cell [2, 2].
# (The variable name keeps the notebook's existing spelling.)
specifity <- confusion[2, 2] / paid_sum
specifity

In [None]:
# Share of the first row's total that sits in cell [1, 1].
precision <- confusion[1, 1] / sum(confusion[1, ])
precision

In [None]:
# Fraction of all rows whose predicted class matches the recorded outcome.
accuracy <- sum(correct_defaults, correct_paid) / nrow(loan_data)
accuracy

In [None]:
# Total count across all four cells of the confusion matrix.
sum(confusion)

In [None]:
# Row order from the highest to the lowest prediction.
idx <- order(-predictions)

In [None]:
# Walking down the ranked rows: running share of all defaults seen so far.
# This replaces the scalar `sensitivity` computed from the confusion matrix.
sensitivity <- cumsum(defaulted[idx] == 1) / sum(defaulted == 1)

In [None]:
# Walking down the ranked rows: share of all non-defaults not yet reached.
specificity <- (
  sum(defaulted == 0) - cumsum(defaulted[idx] == 0)
) / sum(defaulted == 0)

In [None]:
# Collect both curves; data.frame() names the columns after the variables.
roc <- data.frame(sensitivity, specificity)

In [None]:
# ROC curve: sensitivity against specificity on a reversed x axis, with a
# dotted red diagonal (y = 1 - x) as a reference line.
ggplot(roc, aes(x = specificity, y = sensitivity)) +
  geom_line(color = 'blue') +
  scale_x_reverse(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  geom_line(
    data = data.frame(x = (0:100) / 100),
    aes(x = x, y = 1 - x),
    linetype = 'dotted',
    color = 'red'
  ) +
  theme_bw()


## AUC area under curve

In [None]:
# Shade the region between 0 and the sensitivity curve to visualise the AUC.
ggplot(roc, aes(specificity)) +
  geom_ribbon(
    aes(ymin = 0, ymax = sensitivity),
    fill = 'blue',
    alpha = .3
  ) +
  scale_x_reverse(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(y = 'sensitivity') +
  theme_bw()


### AUC calculated with integration:

In [None]:
# Approximate the area under the ROC curve as a sum of rectangles: each step
# in (1 - specificity) times the sensitivity at the start of that step.
with(roc, sum(head(sensitivity, -1) * diff(1 - specificity)))

In [None]:
# Print the whole sensitivity column of the ROC table.
roc$sensitivity

In [None]:
# Print the whole ROC table.
roc

In [None]:
# NOTE(review): require() returns FALSE with a warning instead of stopping
# when the package is missing. No ROCR function is called in the visible part
# of this notebook -- confirm whether the package is still needed.
require(ROCR)


In [None]:
# Default plot of the fitted GAM.
plot(logistic_gam)

In [None]:
# Install 'gains' only when it is not already available, so re-running the
# notebook does not reinstall the package every time.
if (!requireNamespace('gains', quietly = TRUE)) {
  install.packages('gains')
}
library(gains)

In [None]:
# Gains table of actual defaults against predicted values. `predictions` is
# on the link (log-odds) scale, so it is converted to probabilities with
# plogis() first. The transform is monotonic, so the ranking of rows is the
# same, but the mean predicted response is now on the same 0 to 1 scale as
# the mean of the 0/1 response it is compared with in the lift chart.
loan_gains <- gains(defaulted, plogis(predictions))
loan_gains

In [None]:
# Set the notebook plot size options (width 7, height 5).
options(
  repr.plot.width = 7,
  repr.plot.height = 5
)

In [None]:
# Default plot of the gains table.
plot(loan_gains)

In [None]:
# Lift chart with explicit labels, colours and legend. The y axis spans the
# smallest to the largest value found in the mean response and mean
# prediction columns of the gains table.
plot(
  loan_gains,
  y = NULL,
  xlab = "Depth of File",
  ylab = "Mean Response",
  type = "b",
  col = c("red3", "bisque4", "blue4"),
  pch = c(1, 1, 1),
  lty = c(1, 1, 1),
  legend = c(
    "Mean Response",
    "Cumulative Mean Response",
    "Mean Predicted Response"
  ),
  ylim = range(c(loan_gains$mean.resp, loan_gains$mean.prediction)),
  main = "Lift Chart"
)
