In [21]:
import numpy as np
import pandas as pd


In [3]:
from ISLP import load_data
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, accuracy_score


In [4]:
# Seed NumPy's legacy global RNG so any call that draws from it is repeatable on a fresh run.
np.random.seed(seed=1)

In [5]:
# Load the Default data and append a 0/1 indicator (1 when `default` == "Yes")
# so the response can be passed to a numeric model.
default_df = load_data("Default").assign(
    default_res=lambda frame: np.where(frame["default"] == "Yes", 1, 0)
)
default_df


Unnamed: 0,default,student,balance,income,default_res
0,No,No,729.526495,44361.625074,0
1,No,Yes,817.180407,12106.134700,0
2,No,No,1073.549164,31767.138947,0
3,No,No,529.250605,35704.493935,0
4,No,No,785.655883,38463.495879,0
...,...,...,...,...,...
9995,No,No,711.555020,52992.378914,0
9996,No,No,757.962918,19660.721768,0
9997,No,No,845.411989,58636.156984,0
9998,No,No,1569.009053,36669.112365,0


In [6]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
default_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   default      10000 non-null  object 
 1   student      10000 non-null  object 
 2   balance      10000 non-null  float64
 3   income       10000 non-null  float64
 4   default_res  10000 non-null  int32  
dtypes: float64(2), int32(1), object(2)
memory usage: 351.7+ KB


## a) Logistic regression of `default` on `income` and `balance` (all observations)

In [7]:
# Design matrix: an intercept column plus the two numeric predictors.
predictors = default_df[['income', 'balance']]
X = sm.add_constant(predictors)
# Response: the 0/1 default indicator.
y = default_df['default_res']

# Fit the logistic regression on every observation and print the coefficient table.
model = sm.Logit(endog=y, exog=X).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.078948
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:            default_res   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.4594
Time:                        14:14:26   Log-Likelihood:                -789.48
converged:                       True   LL-Null:                       -1460.3
Covariance Type:            nonrobust   LLR p-value:                4.541e-292
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -11.5405      0.435    -26.544      0.000     -12.393     -10.688
income      2.081e-05   4.99

## b) Validation-set approach: fit on a training split, evaluate on the held-out split

In [8]:
# Validation-set approach: hold out 25% of the rows for testing.
# random_state is pinned so this cell produces the same split every time it runs;
# without it the split depends on how much of NumPy's global RNG stream has
# already been consumed, so re-running the cell silently changes the results.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Fit the logistic regression on the training rows only.
result = sm.Logit(y_train, X_train).fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.078638
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:            default_res   No. Observations:                 7500
Model:                          Logit   Df Residuals:                     7497
Method:                           MLE   Df Model:                            2
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.4684
Time:                        14:14:26   Log-Likelihood:                -589.78
converged:                       True   LL-Null:                       -1109.5
Covariance Type:            nonrobust   LLR p-value:                1.908e-226
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -11.7572      0.507    -23.185      0.000     -12.751     -10.763
income      2.259e-05    5.8

In [9]:
# Predicted probability of default for each held-out row.
probs = result.predict(X_test)
# Classify as default (1) when the predicted probability exceeds 0.5.
preds = (probs > 0.5).astype(int)
# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, preds)

array([[2410,   11],
       [  52,   27]], dtype=int64)

In [10]:
# Share of held-out rows classified correctly (validation error = 1 - this value).
accuracy_score(y_true=y_test, y_pred=preds)

0.9748

## c) Repeat the validation-set estimate with three different splits

### First split (40% held out, `random_state=2`)

In [11]:
# Hold out 40% of the rows; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=2,
)
# Refit the logistic regression on the training portion only.
result = sm.Logit(endog=y_train, exog=X_train).fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.082573
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:            default_res   No. Observations:                 6000
Model:                          Logit   Df Residuals:                     5997
Method:                           MLE   Df Model:                            2
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.4597
Time:                        14:14:26   Log-Likelihood:                -495.44
converged:                       True   LL-Null:                       -916.91
Covariance Type:            nonrobust   LLR p-value:                9.080e-184
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -11.2324      0.533    -21.071      0.000     -12.277     -10.188
income      1.962e-05   6.33

In [12]:
# Score the held-out rows, then threshold the probabilities at 0.5 to get class labels.
probs = result.predict(X_test)
preds = (probs > 0.5).astype(int)
# Rows of the matrix are true classes; columns are predicted classes.
confusion_matrix(y_test, preds)

array([[3863,   16],
       [  82,   39]], dtype=int64)

In [13]:
# Held-out accuracy for this split.
accuracy_score(y_true=y_test, y_pred=preds)

0.9755

### Second split (30% held out, `random_state=3`)

In [14]:
# Hold out 30% of the rows; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)
# Refit the logistic regression on the training portion only.
result = sm.Logit(endog=y_train, exog=X_train).fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.079409
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:            default_res   No. Observations:                 7000
Model:                          Logit   Df Residuals:                     6997
Method:                           MLE   Df Model:                            2
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.4450
Time:                        14:14:26   Log-Likelihood:                -555.86
converged:                       True   LL-Null:                       -1001.6
Covariance Type:            nonrobust   LLR p-value:                2.635e-194
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -11.3107      0.513    -22.039      0.000     -12.317     -10.305
income       1.92e-05   5.92

In [15]:
# Score the held-out rows, then threshold the probabilities at 0.5 to get class labels.
probs = result.predict(X_test)
preds = (probs > 0.5).astype(int)
# Rows of the matrix are true classes; columns are predicted classes.
confusion_matrix(y_test, preds)

array([[2888,    6],
       [  69,   37]], dtype=int64)

In [16]:
# Held-out accuracy for this split.
accuracy_score(y_true=y_test, y_pred=preds)

0.975

### Third split (45% held out, `random_state=16`)

In [17]:
# Hold out 45% of the rows; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.45,
    random_state=16,
)
# Refit the logistic regression on the training portion only.
result = sm.Logit(endog=y_train, exog=X_train).fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.079625
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:            default_res   No. Observations:                 5500
Model:                          Logit   Df Residuals:                     5497
Method:                           MLE   Df Model:                            2
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.4803
Time:                        14:14:26   Log-Likelihood:                -437.94
converged:                       True   LL-Null:                       -842.70
Covariance Type:            nonrobust   LLR p-value:                1.627e-176
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -11.6423      0.582    -20.008      0.000     -12.783     -10.502
income      1.699e-05   6.56

In [18]:
# Score the held-out rows, then threshold the probabilities at 0.5 to get class labels.
probs = result.predict(X_test)
preds = (probs > 0.5).astype(int)
# Rows of the matrix are true classes; columns are predicted classes.
confusion_matrix(y_test, preds)

array([[4342,   20],
       [  94,   44]], dtype=int64)

In [19]:
# Held-out accuracy for this split.
accuracy_score(y_true=y_test, y_pred=preds)

0.9746666666666667

## d) Add a `student` dummy variable and re-estimate the validation-set accuracy

In [25]:


# One-hot encode `student`, keeping a single 0/1 indicator column (student_Yes).
# drop_first=True is what removes the baseline column; pandas' default is drop_first=False.
# dtype=int forces numeric dummies: recent pandas versions emit bool columns, and a
# design matrix mixing bool with float predictors is cast to object dtype, which
# statsmodels rejects.
df_encoded = pd.get_dummies(default_df, columns=['student'], drop_first=True, dtype=int)

# Rebuild the design matrix: intercept, the two numeric predictors, and the student dummy.
student_cols = list(df_encoded.filter(like='student'))
X = sm.add_constant(df_encoded[['income', 'balance'] + student_cols])

X

Unnamed: 0,const,income,balance,student_Yes
0,1.0,44361.625074,729.526495,0
1,1.0,12106.134700,817.180407,1
2,1.0,31767.138947,1073.549164,0
3,1.0,35704.493935,529.250605,0
4,1.0,38463.495879,785.655883,0
...,...,...,...,...
9995,1.0,52992.378914,711.555020,0
9996,1.0,19660.721768,757.962918,0
9997,1.0,58636.156984,845.411989,0
9998,1.0,36669.112365,1569.009053,0


In [26]:
# Hold out 45% of the rows with a seeded split, using the current design matrix X.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.45,
    random_state=16,
)
# Refit the logistic regression on the training portion only.
result = sm.Logit(endog=y_train, exog=X_train).fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.079478
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:            default_res   No. Observations:                 5500
Model:                          Logit   Df Residuals:                     5496
Method:                           MLE   Df Model:                            3
Date:                Mon, 18 Mar 2024   Pseudo R-squ.:                  0.4813
Time:                        14:18:31   Log-Likelihood:                -437.13
converged:                       True   LL-Null:                       -842.70
Covariance Type:            nonrobust   LLR p-value:                1.651e-175
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const         -11.2067      0.668    -16.770      0.000     -12.516      -9.897
income       5.771e-06   

In [27]:
# Score the held-out rows, then threshold the probabilities at 0.5 to get class labels.
probs = result.predict(X_test)
preds = (probs > 0.5).astype(int)
# Rows of the matrix are true classes; columns are predicted classes.
confusion_matrix(y_test, preds)

array([[4342,   20],
       [  93,   45]], dtype=int64)

In [28]:
# Held-out accuracy for the model that includes the student dummy.
accuracy_score(y_true=y_test, y_pred=preds)

0.9748888888888889