In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression

default_ = pd.read_csv("Default.csv")
default = default_.copy() 

In [2]:
#a) Première approche
X = default[['income', 'balance']]
y = default['default']
clf = LogisticRegression().fit(X,y)


predProbs = clf.predict_proba(X)
X_design = np.hstack([np.ones((X.shape[0],1)), X])
V = np.diagflat(np.product(predProbs, axis=1))
covLogit = np.linalg.inv(np.dot(np.dot(X_design.T,V), X_design))

standard_errors = np.sqrt(np.diag(covLogit))
print("Coefficients : " , clf.coef_ , clf.intercept_)
print("Standard errors : ", standard_errors)

Coefficients :  [[2.08091984e-05 5.64710797e-03]] [-11.54047811]
Standard errors :  [4.34772356e-01 4.98523984e-06 2.27381314e-04]


In [3]:
#a) Deuxième approche
default.loc[default['default'] == 'No', 'default'] = int(1)
default.loc[default['default'] == 'Yes', 'default'] = int(0)

f = 'default ~ income + balance'
model = smf.glm(formula = f, data = default, family = sm.families.Binomial()).fit()
print(model.summary())

                      Generalized Linear Model Regression Results                       
Dep. Variable:     ['default[0]', 'default[1]']   No. Observations:                10000
Model:                                      GLM   Df Residuals:                     9997
Model Family:                          Binomial   Df Model:                            2
Link Function:                            logit   Scale:                          1.0000
Method:                                    IRLS   Log-Likelihood:                -789.48
Date:                          Wed, 16 Dec 2020   Deviance:                       1579.0
Time:                                  11:50:04   Pearson chi2:                 6.95e+03
No. Iterations:                               9                                         
Covariance Type:                      nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------

In [4]:
#b)
def boot_fn(dataFrame, index):
    df = dataFrame.iloc[index, :]
    X = df[['income', 'balance']]
    y = df['default']
    clf = LogisticRegression().fit(X,y)
    return(clf.coef_, clf.intercept_)

In [11]:
def boot_se(dataFrame, index):
    df = dataFrame.iloc[index,:]
    X = df[['income', 'balance']]
    y = df['default']
    y = y.astype('int')
    clf = LogisticRegression().fit(X,y)
    predProbs = clf.predict_proba(X)
    X_design = np.hstack([np.ones((X.shape[0],1)), X])
    V = np.diagflat(np.product(predProbs, axis=1))
    covLogit = np.linalg.inv(np.dot(np.dot(X_design.T,V), X_design))
    return(np.sqrt(np.diag(covLogit)))

def boot(dataFrame, n):
    standard_errors = []
    for i in range(n):
        index = np.random.choice([i for i in range(len(dataFrame))], len(dataFrame))
        standard_errors.append(boot_se(dataFrame, index))
    return pd.DataFrame(standard_errors).mean()

print(boot(default,100))

0    0.183895
1    0.000006
2    0.000106
dtype: float64
