# Actividad 8
## Bootstrap
### Ana Sofía Hinojosa Bale

In [91]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
import statsmodels.api as sm
from sklearn.linear_model import Ridge
import optuna

In [92]:
advertising = pd.read_csv("Advertising.csv")
advertising = advertising.drop(columns=["Unnamed: 0"])
default = pd.read_csv("Default.csv")
default["default"] = (default["default"] == "Yes").astype(int)
default["student"] = (default["student"] == "Yes").astype(int)

In [93]:
x_advertising = advertising.drop(columns=["sales"])
y_advertising = advertising["sales"]
model_advertising = sm.OLS(y_advertising, sm.add_constant(x_advertising)).fit()
x_default = default.drop(columns=["default"])
y_default = default["default"]
model_default = sm.Logit(y_default, sm.add_constant(x_default)).fit()

Optimization terminated successfully.
         Current function value: 0.078577
         Iterations 10


In [94]:
params_advertising = model_advertising.params
params_default = model_default.params
bse_advertising = model_advertising.bse
bse_default = model_default.bse
model_advertising.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Thu, 20 Nov 2025",Prob (F-statistic):,1.58e-96
Time:,17:28:21,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.9389,0.312,9.422,0.000,2.324,3.554
TV,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [95]:
model_default.summary()

0,1,2,3
Dep. Variable:,default,No. Observations:,10000.0
Model:,Logit,Df Residuals:,9996.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 20 Nov 2025",Pseudo R-squ.:,0.4619
Time:,17:28:21,Log-Likelihood:,-785.77
converged:,True,LL-Null:,-1460.3
Covariance Type:,nonrobust,LLR p-value:,3.257e-292

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-10.8690,0.492,-22.079,0.000,-11.834,-9.904
student,-0.6468,0.236,-2.738,0.006,-1.110,-0.184
balance,0.0057,0.000,24.737,0.000,0.005,0.006
income,3.033e-06,8.2e-06,0.370,0.712,-1.3e-05,1.91e-05


In [96]:
n_bootstraps = 1000
boot_params_advertising = np.zeros((n_bootstraps, len(params_advertising)))
boot_params_default = np.zeros((n_bootstraps, len(params_default)))
for i in range(n_bootstraps):
    boot_sample_advertising = resample(advertising, replace=True, n_samples=len(advertising))
    x_boot_advertising = boot_sample_advertising.drop(columns=["sales"])
    y_boot_advertising = boot_sample_advertising["sales"]
    model_boot_advertising = sm.OLS(y_boot_advertising, sm.add_constant(x_boot_advertising)).fit()
    boot_params_advertising[i, :] = model_boot_advertising.params

    boot_sample_default = resample(default, replace=True, n_samples=len(default))
    x_boot_default = boot_sample_default.drop(columns=["default"])
    y_boot_default = boot_sample_default["default"]
    model_boot_default = sm.Logit(y_boot_default, sm.add_constant(x_boot_default)).fit(disp=0)
    boot_params_default[i, :] = model_boot_default.params

boot_bse_advertising = np.std(boot_params_advertising, axis=0)
boot_bse_default = np.std(boot_params_default, axis=0)
boot_beta_advertising = np.mean(boot_params_advertising, axis=0)
boot_beta_default = np.mean(boot_params_default, axis=0)

### Comparación resultados Advertising

In [97]:
pd.DataFrame({
    "Parameter": params_advertising.index,
    "Original Beta": params_advertising.values,
    "Bootstrap Beta": boot_beta_advertising,
    "Original BSE": bse_advertising.values,
    "Bootstrap BSE": boot_bse_advertising
})

Unnamed: 0,Parameter,Original Beta,Bootstrap Beta,Original BSE,Bootstrap BSE
0,const,2.938889,2.959053,0.311908,0.318871
1,TV,0.045765,0.045732,0.001395,0.001891
2,radio,0.18853,0.188409,0.008611,0.011009
3,newspaper,-0.001037,-0.001201,0.005871,0.006291


Los resultados obtenidos antes y después del bootstrap son similares, aunque los valores de los errores del bootsrap tienden a ser mayores.

### Comparación resultados Default

In [98]:
pd.DataFrame({
    "Parameter": params_default.index,
    "Original Beta": params_default.values,
    "Bootstrap Beta": boot_beta_default,
    "Original BSE": bse_default.values,
    "Bootstrap BSE": boot_bse_default
})

Unnamed: 0,Parameter,Original Beta,Bootstrap Beta,Original BSE,Bootstrap BSE
0,const,-10.869045,-10.899208,0.492273,0.496834
1,student,-0.646776,-0.641034,0.236257,0.241389
2,balance,0.005737,0.005747,0.000232,0.000238
3,income,3e-06,3e-06,8e-06,8e-06


Los resultados obtenidos antes y después del bootstrap son similares, aunque los valores del bootsrap tienden a ser mayores, tanto en las betas como en los errores.

In [99]:
alpha = 0.1
ridge = Ridge(alpha=alpha)

ridge.fit(x_advertising, y_advertising)
ridge.intercept_, ridge.coef_

(np.float64(2.9388971784979194),
 array([ 0.04576465,  0.18852952, -0.00103737]))

In [100]:
def objective(trial):
    alpha = trial.suggest_float("alpha", 1e-2, 3.0)
    ridge = Ridge(alpha=alpha)
    ridge.fit(x_advertising, y_advertising)
    y_pred = ridge.predict(x_advertising)
    mse = np.mean((y_advertising - y_pred) ** 2)
    return mse

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best alpha:", study.best_params["alpha"])

[I 2025-11-20 17:28:24,967] A new study created in memory with name: no-name-6a1ad663-27b8-4958-a106-d311b88ca03e
[I 2025-11-20 17:28:24,969] Trial 0 finished with value: 2.7841263145304778 and parameters: {'alpha': 0.06486422980063461}. Best is trial 0 with value: 2.7841263145304778.
[I 2025-11-20 17:28:24,971] Trial 1 finished with value: 2.784126314724458 and parameters: {'alpha': 0.21441003999658229}. Best is trial 0 with value: 2.7841263145304778.
[I 2025-11-20 17:28:24,972] Trial 2 finished with value: 2.7841263214777014 and parameters: {'alpha': 1.224761339120289}. Best is trial 0 with value: 2.7841263145304778.
[I 2025-11-20 17:28:24,973] Trial 3 finished with value: 2.784126349079478 and parameters: {'alpha': 2.728314498743506}. Best is trial 0 with value: 2.7841263145304778.
[I 2025-11-20 17:28:24,974] Trial 4 finished with value: 2.7841263223904122 and parameters: {'alpha': 1.3025232315759}. Best is trial 0 with value: 2.7841263145304778.
[I 2025-11-20 17:28:24,975] Trial 5 

Best alpha: 0.013152740024445948


In [101]:
params_ridge = np.concatenate(([ridge.intercept_], ridge.coef_))
n_bootstraps_ridge = 1000
alpha = study.best_params["alpha"]
boot_params_ridge = np.zeros((n_bootstraps_ridge, x_advertising.shape[1] + 1))
for i in range(n_bootstraps_ridge):
    boot_sample = resample(advertising, replace=True, n_samples=len(advertising))
    x_boot = boot_sample.drop(columns=["sales"])
    y_boot = boot_sample["sales"]
    ridge_boot = Ridge(alpha=alpha)
    ridge_boot.fit(x_boot, y_boot)
    boot_params_ridge[i, 0] = ridge_boot.intercept_
    boot_params_ridge[i, 1:] = ridge_boot.coef_
boot_bse_ridge = np.std(boot_params_ridge, axis=0)
boot_beta_ridge = np.mean(boot_params_ridge, axis=0)    

### Comparación resultados regularización

In [102]:
pd.DataFrame({
    "Parameter": params_advertising.index,
    "Original Beta": params_ridge,
    "Bootstrap Beta": boot_beta_ridge,
    "Bootstrap BSE": boot_bse_ridge
})

Unnamed: 0,Parameter,Original Beta,Bootstrap Beta,Bootstrap BSE
0,const,2.938897,2.947891,0.342475
1,TV,0.045765,0.045723,0.00193
2,radio,0.18853,0.188502,0.010882
3,newspaper,-0.001037,-0.001016,0.006467


Los resultados obtenidos antes y después del bootstrap son similares, aunque los valores de los errores del bootsrap tienden a ser mayores.

### Análisis de resultados

Los errores estándar obtenidos por el método sin bootstrap y por bootstrap son similares, lo que indica que los coeficientes de los modelos son estables. Sin embargo, los valores del bootstrap tienden a ser ligeramente mayores porque este método no depende de los supuestos teóricos del modelo lineal y refleja mejor la variabilidad real en los datos. En contraste, el método analítico suele ser un poco más optimista al basarse en supuestos ideales sobre los errores.

El bootstrap estima la variabilidad remuestreando directamente de los datos observados. Esto lo hace más sensible a características reales del dataset. Por eso puede producir errores estándar ligeramente diferentes (y como se vio en los resultados obtenidos, generalmente mayores).