In [38]:
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, 
                        summarize, 
                        poly)
from sklearn.model_selection import train_test_split

from functools import partial
from sklearn.model_selection import (cross_validate, 
    KFold, 
    ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

from ISLP import confusion_table

import pandas as pd

### (a) Using the summarize() and sm.GLM() functions, determine the estimated standard errors for the coefficients associated with income and balance in a multiple logistic regression model that uses both predictors.

In [7]:
# Load the Default data set and build the model matrix from income and balance.
Default = load_data("Default")
X = MS(["income", "balance"]).fit_transform(Default)
# Boolean response: True where the "default" column equals "Yes".
y = Default["default"] == "Yes"
# Fit a GLM with a binomial family on (y, X); the trailing expression
# displays the coefficient table (coef, std err, z, p-value) for the fit.
fit = sm.GLM(y, 
             X, 
             family=sm.families.Binomial()).fit()
summarize(fit)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-11.5405,0.435,-26.544,0.0
income,2.1e-05,5e-06,4.174,0.0
balance,0.0056,0.0,24.835,0.0


### (b) Write a function, boot_fn(), that takes as input the Default data set as well as an index of the observations, and that outputs the coefficient estimates for income and balance in the multiple logistic regression model.

In [18]:
def boot_fn(model_matrix, response, D, idx):
    """Refit the binomial GLM on the rows of ``D`` picked out by ``idx``.

    Parameters
    ----------
    model_matrix
        Model-matrix specification; a clone of it is fit-transformed on the
        selected rows.
    response
        Column label of the response variable in ``D``.
    D
        Data frame holding the response and the predictors.
    idx
        Row *positions* to keep (they are consumed by ``D.iloc``); repeated
        positions are allowed.

    Returns
    -------
    The ``params`` attribute of the fitted GLM results object.
    """
    resample = D.iloc[idx]
    target = resample[response]
    design = clone(model_matrix).fit_transform(resample)
    model = sm.GLM(target, design, family=sm.families.Binomial())
    return model.fit().params

In [19]:
# Add a boolean response column so boot_fn can look the response up by name.
Default["is_default"] = Default["default"] == "Yes"
# Pre-bind the model specification and the response name; the resulting
# default_fn takes only (D, idx).
default_fn = partial(boot_fn, MS(["income", "balance"]), "is_default")
rng = np.random.default_rng(0)
# Ten refits, each on len(Default) row positions drawn with replacement;
# the array has one row of coefficient estimates per refit.
np.array([default_fn(Default, 
                     rng.choice(len(Default), 
                                len(Default), 
                                replace=True)) for _ in range(10)])

array([[-1.16416373e+01,  1.87775777e-05,  5.73877605e-03],
       [-1.27619965e+01,  3.20594655e-05,  6.16200434e-03],
       [-1.12850364e+01,  1.59221870e-05,  5.61832222e-03],
       [-1.09975828e+01,  1.40723398e-05,  5.41168597e-03],
       [-1.13173469e+01,  1.12728778e-05,  5.70216361e-03],
       [-1.17516107e+01,  1.85974460e-05,  5.83443562e-03],
       [-1.12884834e+01,  1.52822182e-05,  5.53172383e-03],
       [-1.13883312e+01,  1.73720495e-05,  5.70192972e-03],
       [-1.11098351e+01,  2.33921172e-05,  5.28010522e-03],
       [-1.10505563e+01,  1.50937413e-05,  5.46083916e-03]])

In [142]:
# the function for computing the bootstrap standard error for arbitrary functions
def boot_SE1(func,
             D,
             n=None,
             B=1000,
             seed=0):
    """Bootstrap standard error of ``func`` using running first/second moments.

    Parameters
    ----------
    func
        Callable ``func(D, idx)`` returning a number, array or Series.
        ``idx`` is an array of row *positions* (suitable for ``D.iloc``).
    D
        Data set to resample; only ``D.shape[0]`` is read here.
    n
        Size of each resample. A falsy value (``None`` or 0) falls back to
        the number of rows of ``D``.
    B
        Number of bootstrap replications.
    seed
        Seed for ``np.random.default_rng``.

    Returns
    -------
    ``sqrt(mean(value**2) - mean(value)**2)`` over the ``B`` replications,
    i.e. the population (ddof=0) standard deviation, with the same shape
    as the values returned by ``func``.
    """
    rng = np.random.default_rng(seed)
    n_rows = D.shape[0]
    n = n or n_rows  # default: resample as many rows as D has
    first_, second_ = 0, 0

    for _ in range(B):
        # Sample positions, not index labels: the statistic functions in
        # this notebook select rows with .iloc, so labels would only work
        # by accident on a 0..N-1 index.
        idx = rng.choice(n_rows,
                         n,
                         replace=True)
        value = func(D, idx)
        first_ += value
        second_ += value ** 2

    variance = second_ / B - (first_ / B) ** 2
    # Rounding can push a (near-)zero variance slightly below zero, which
    # would turn into NaN under the square root.
    return np.sqrt(np.maximum(variance, 0))

In [143]:
# self-made
def boot_SE2(func,
             D,
             n=None,
             B=1000,
             seed=0):
    """Bootstrap standard error of ``func`` from the stored replications.

    Parameters
    ----------
    func
        Callable ``func(D, idx)`` returning a pandas Series of estimates.
        ``idx`` is an array of row *positions* (suitable for ``D.iloc``).
    D
        Data set to resample; only ``D.shape[0]`` is read here.
    n
        Size of each resample. A falsy value (``None`` or 0) falls back to
        the number of rows of ``D``.
    B
        Number of bootstrap replications.
    seed
        Seed for ``np.random.default_rng``.

    Returns
    -------
    Series with the population (ddof=0) standard deviation of each
    estimate across the ``B`` replications.
    """
    rng = np.random.default_rng(seed)
    n_rows = D.shape[0]
    n = n or n_rows  # default: resample as many rows as D has

    # Collect the draws in a list and build the frame once; concatenating
    # inside the loop copies the whole frame on every iteration.
    draws = []
    for _ in range(B):
        # Sample positions, not index labels: the statistic functions in
        # this notebook select rows with .iloc.
        idx = rng.choice(n_rows,
                         n,
                         replace=True)
        draws.append(func(D, idx))

    # Explicit axis/ddof instead of np.std(df), which forwards axis=None
    # to DataFrame.std.
    return pd.DataFrame(draws).std(axis=0, ddof=0)

In [144]:
import time

In [145]:
# Time the running-moments bootstrap. perf_counter() is a monotonic,
# high-resolution clock intended for measuring elapsed intervals, unlike
# the wall-clock time.time(), which can jump if the system clock changes.
start_t = time.perf_counter()
SE = boot_SE1(default_fn, Default, B=1000, seed=0)
end_t = time.perf_counter()
print(SE)
print("lab func runtime: %dms" %((end_t - start_t) * 1000))

intercept    0.435692
income       0.000005
balance      0.000230
dtype: float64
lab func runtime: 49692ms


In [146]:
# Time the stored-replications bootstrap. perf_counter() is a monotonic,
# high-resolution clock intended for measuring elapsed intervals, unlike
# the wall-clock time.time(), which can jump if the system clock changes.
start_t = time.perf_counter()
SE = boot_SE2(default_fn, Default, B=1000, seed=0)
end_t = time.perf_counter()
print(SE)
print("self-made func runtime: %dms" %((end_t - start_t) * 1000))

intercept    0.435692
income       0.000005
balance      0.000230
dtype: float64
self-made func runtime: 55877ms
