In [55]:
import numpy as np
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, 
                        summarize, 
                        poly)
from sklearn.model_selection import train_test_split

from functools import partial
from sklearn.model_selection import (cross_validate, 
    KFold, 
    ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

from ISLP import confusion_table

### (a) Fit a logistic regression model that uses income and balance to predict default.

In [56]:
Default = load_data("Default")
X = MS(["income", "balance"]).fit_transform(Default)
y = Default["default"] == "Yes"
fit = sm.GLM(y, 
             X, 
             family=sm.families.Binomial()).fit()
summarize(fit)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-11.5405,0.435,-26.544,0.0
income,2.1e-05,5e-06,4.174,0.0
balance,0.0056,0.0,24.835,0.0


### (b) Using the validation set approach, estimate the test error of this model.

In [57]:
Default_train, Default_valid = train_test_split(Default, 
                                                test_size=len(Default) // 2, 
                                                random_state=0)
X_train = MS(["income", "balance"]).fit_transform(Default_train)
y_train = Default_train["default"] == "Yes"
model = sm.GLM(y_train, 
               X_train, 
               family=sm.families.Binomial())
results = model.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-11.3896,0.635,-17.935,0.0
income,1.6e-05,7e-06,2.151,0.031
balance,0.0056,0.0,16.792,0.0


In [58]:
X_valid = MS(["income", "balance"]).fit_transform(Default_valid)
y_valid = Default_valid["default"]
valid_prob_list = results.predict(X_valid)
valid_pred = np.array(["No"] * len(y_valid))
valid_pred[valid_prob_list > .5] = "Yes"
np.mean(valid_pred != y_valid)

0.0398

### (c) Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Comment on the results obtained.

In [59]:
for i in range(1, 4): 
    Default_train, Default_valid = train_test_split(Default, 
                                                test_size=len(Default) // 2, 
                                                random_state=i)
    X_train = MS(["income", "balance"]).fit_transform(Default_train)
    y_train = Default_train["default"] == "Yes"
    model = sm.GLM(y_train, 
                   X_train, 
                   family=sm.families.Binomial())
    results = model.fit()
    X_valid = MS(["income", "balance"]).fit_transform(Default_valid)
    y_valid = Default_valid["default"]
    valid_prob_list = results.predict(X_valid)
    valid_pred = np.array(["No"] * len(y_valid))
    valid_pred[valid_prob_list > .5] = "Yes"
    validation_set_error = np.mean(valid_pred != y_valid)
    print("validation set error for seed[%d]: %.4f" %(i, validation_set_error))

validation set error for seed[1]: 0.0366
validation set error for seed[2]: 0.0352
validation set error for seed[3]: 0.0366


### (d) Now consider a logistic regression model that predicts the probability of default using income, balance, and a dummy variable for student. Estimate the test error for this model using the validation set approach.

In [60]:
for i in range(0, 4): 
    Default["is_student"] = Default["student"] == "Yes"
    Default_train, Default_valid = train_test_split(Default, 
                                                    test_size=len(Default) // 2, 
                                                    random_state=i)
    X_train = MS(["income", "balance", "is_student"]).fit_transform(Default_train)
    y_train = Default_train["default"] == "Yes"
    model = sm.GLM(y_train, 
                   X_train, 
                   family=sm.families.Binomial())
    results = model.fit()
    
    X_valid = MS(["income", "balance", "is_student"]).fit_transform(Default_valid)
    y_valid = Default_valid["default"]
    valid_prob_list = results.predict(X_valid)
    valid_pred = np.array(["No"] * len(y_valid))
    validation_set_error = np.mean(valid_pred != y_valid)
    print("validation set error for seed[%d]: %.4f" %(i, validation_set_error))

validation set error for seed[0]: 0.0372
validation set error for seed[1]: 0.0318
validation set error for seed[2]: 0.0308
validation set error for seed[3]: 0.0338


no significant improvement after adding the student variable into the logistic regression model