# Chapter 10: Logistic Regression

## Import required packages

In [1]:
pip install mord

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from mord import LogisticIT
import matplotlib.pylab as plt
import seaborn as sns
from dmba import classificationSummary, gainsChart,  liftChart
from dmba.metric import AIC_score

## Table 10.2
Load the `UniversalBank.csv` dataset. The columns `ID` and `ZIP Code` are not relevant for model building and are therefore removed. Treat `Education` as categorical.

In [3]:
# Load the raw data; the path is relative to the notebook's working directory.
bank_df = pd.read_csv('UniversalBank.csv')

In [4]:
# Remove the columns that are not used for model building.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution no longer raises KeyError because
# the columns are already gone.
bank_df = bank_df.drop(columns=['ID', 'ZIP Code'], errors='ignore')

In [5]:
# Replace spaces in column names with underscores (e.g. 'Personal Loan' -> 'Personal_Loan').
bank_df = bank_df.rename(columns=lambda name: name.replace(' ', '_'))

In [6]:
# Preview the frame after dropping ID / ZIP Code and renaming the columns.
bank_df

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,3,0,0,0,0,1,0
4996,30,4,15,4,0.4,1,85,0,0,0,1,0
4997,63,39,24,2,0.3,3,0,0,0,0,0,0
4998,65,40,49,3,0.5,2,0,0,0,0,1,0


In [7]:
# Treat education as categorical, convert to dummy variables
new_categories = {1: 'Undergrad', 2: 'Graduate', 3: 'Advanced/Professional'}
# rename_categories(..., inplace=True) was deprecated in pandas 1.3 and removed
# in pandas 2.0, so the renamed categorical is assigned back instead.
bank_df['Education'] = (
    bank_df['Education']
    .astype('category')
    .cat.rename_categories(new_categories)
)
# drop_first=True omits the first level ('Undergrad'), which becomes the
# reference category. dtype=int keeps the indicator columns as 0/1 integers;
# pandas >= 2.0 would otherwise emit booleans.
bank_df = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True, dtype=int)

In [8]:
# Preview the frame after Education has been replaced by its dummy columns.
bank_df

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard,Education_Graduate,Education_Advanced/Professional
0,25,1,49,4,1.6,0,0,1,0,0,0,0,0
1,45,19,34,3,1.5,0,0,1,0,0,0,0,0
2,39,15,11,1,1.0,0,0,0,0,0,0,0,0
3,35,9,100,1,2.7,0,0,0,0,0,0,1,0
4,35,8,45,4,1.0,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,0,0,0,0,1,0,0,1
4996,30,4,15,4,0.4,85,0,0,0,1,0,0,0
4997,63,39,24,2,0.3,0,0,0,0,0,0,0,1
4998,65,40,49,3,0.5,0,0,0,0,1,0,1,0


In [9]:
# Outcome: the Personal_Loan column; predictors: every other column.
X = bank_df.loc[:, bank_df.columns != 'Personal_Loan']
y = bank_df.loc[:, 'Personal_Loan']

In [10]:
# partition data
# Hold out 40% of the rows for validation (the remaining 60% are used for
# training); random_state=1 makes the split reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(X, y, 
   test_size=0.4, random_state=1)

In [11]:
# Fit the logistic regression on the training partition.
# scikit-learn always applies a penalty here, so an extremely large C
# (inverse regularization strength) is used to make the L2 term negligible.
logit_reg = LogisticRegression(penalty='l2', C=1e42, solver='liblinear').fit(train_X, train_y)
# Echo the fitted estimator as the cell output.
logit_reg

LogisticRegression(C=1e+42, solver='liblinear')

In [12]:
# intercept_ is an array; element 0 is the fitted intercept of this model.
logit_reg.intercept_[0]

-12.61895521314035

In [13]:
# Report the fitted intercept and the coefficients as a single wide row.
print('intercept ', logit_reg.intercept_[0])
print(pd.DataFrame([logit_reg.coef_[0]], index=['coeff'], columns=X.columns))
# Degrees of freedom: one per predictor plus one for the intercept.
# NOTE(review): AIC_score is given the 0/1 class predictions on the validation
# set rather than predicted probabilities - confirm this is the intended input
# for a classification model.
print('AIC', AIC_score(valid_y, logit_reg.predict(valid_X),
                       df=len(train_X.columns) + 1))

intercept  -12.61895521314035
            Age  Experience    Income    Family     CCAvg  Mortgage  \
coeff -0.032549     0.03416  0.058824  0.614095  0.240534  0.001012   

       Securities_Account  CD_Account    Online  CreditCard  \
coeff           -1.026191    3.647933 -0.677862    -0.95598   

       Education_Graduate  Education_Advanced/Professional  
coeff            4.192204                         4.341697  
AIC -709.1524769205962


In [14]:
# Same coefficients as above, shown as one row per predictor.
print('intercept ', logit_reg.intercept_[0])
print(pd.Series(logit_reg.coef_[0], index=X.columns, name='coeff').to_frame())

intercept  -12.61895521314035
                                    coeff
Age                             -0.032549
Experience                       0.034160
Income                           0.058824
Family                           0.614095
CCAvg                            0.240534
Mortgage                         0.001012
Securities_Account              -1.026191
CD_Account                       3.647933
Online                          -0.677862
CreditCard                      -0.955980
Education_Graduate               4.192204
Education_Advanced/Professional  4.341697


In [15]:
# exp(coefficient) is the multiplicative change in the odds for a one-unit
# increase in the predictor (i.e. an odds ratio), holding the others fixed.
print(pd.DataFrame(np.exp(logit_reg.coef_[0]), index=X.columns, columns=['Odds']))

                                      Odds
Age                               0.967975
Experience                        1.034750
Income                            1.060589
Family                            1.847983
CCAvg                             1.271928
Mortgage                          1.001012
Securities_Account                0.358369
CD_Account                       38.395218
Online                            0.507701
CreditCard                        0.384435
Education_Graduate               66.168450
Education_Advanced/Professional  76.837854


In [16]:
# Coefficients and their exponentiated values side by side.
print('intercept ', logit_reg.intercept_[0])
# coef_ has shape (1, n_predictors); stacking it on top of exp(coef_) and
# transposing yields one row per predictor with two columns.
print(pd.DataFrame(np.vstack([logit_reg.coef_, np.exp(logit_reg.coef_)]).T,
                   index=X.columns, columns=['coeff', 'Odds']))

intercept  -12.61895521314035
                                    coeff       Odds
Age                             -0.032549   0.967975
Experience                       0.034160   1.034750
Income                           0.058824   1.060589
Family                           0.614095   1.847983
CCAvg                            0.240534   1.271928
Mortgage                         0.001012   1.001012
Securities_Account              -1.026191   0.358369
CD_Account                       3.647933  38.395218
Online                          -0.677862   0.507701
CreditCard                      -0.955980   0.384435
Education_Graduate               4.192204  66.168450
Education_Advanced/Professional  4.341697  76.837854


## Table 10.3 Predict to get the probabilities

In [17]:
# Score the validation partition: hard class labels and class probabilities.
logit_reg_pred = logit_reg.predict(valid_X)
logit_reg_proba = logit_reg.predict_proba(valid_X)
# predict_proba returns one column per class; column 0 and column 1 are taken
# directly as p(0) and p(1). Rows align with valid_y, whose index the frame keeps.
logit_result = pd.DataFrame({
    'actual': valid_y,
    'p(0)': logit_reg_proba[:, 0],
    'p(1)': logit_reg_proba[:, 1],
    'predicted': logit_reg_pred,
})

In [19]:
# Preview actual outcome, class probabilities and predicted class per validation record.
logit_result

Unnamed: 0,actual,p(0),p(1),predicted
2764,0,0.976162,0.023838,0
4767,0,0.999937,0.000063,0
3814,0,0.999899,0.000101,0
3499,0,0.998381,0.001619,0
2735,0,0.957473,0.042527,0
...,...,...,...,...
4372,0,0.996274,0.003726,0
3401,0,0.999648,0.000352,0
1239,0,0.999790,0.000210,0
437,0,0.998460,0.001540,0


In [18]:
# Pick four validation records by index label: each combination of
# actual 0/1 and predicted 0/1 appears once in the printed output.
interestingCases = [2764, 932, 2721, 702]
print(logit_result.loc[interestingCases, :])

      actual      p(0)      p(1)  predicted
2764       0  0.976162  0.023838          0
932        0  0.334706  0.665294          1
2721       1  0.031650  0.968350          1
702        1  0.985720  0.014280          0


## Table 10.4

In [20]:
# Confusion matrix and accuracy, first for the training partition and then
# for the validation partition.
for part_X, part_y in ((train_X, train_y), (valid_X, valid_y)):
    classificationSummary(part_y, logit_reg.predict(part_X))

Confusion Matrix (Accuracy 0.9603)

       Prediction
Actual    0    1
     0 2684   29
     1   90  197
Confusion Matrix (Accuracy 0.9595)

       Prediction
Actual    0    1
     0 1791   16
     1   65  128
