In [7]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, 
                        summarize, 
                        poly)
from sklearn.model_selection import train_test_split

from functools import partial
from sklearn.model_selection import (cross_validate, 
    KFold, 
    ShuffleSplit)
from sklearn.base import clone
from ISLP.models import sklearn_sm

from ISLP import confusion_table

### (a) Based on this data set, provide an estimate for the population mean of medv. Call this estimate µ ˆ.

In [8]:
Boston = load_data("Boston")

In [9]:
medv_mean_est = np.mean(Boston["medv"])
print("estimate mean of medv: %.4f" %medv_mean_est)

estimate mean of medv: 22.5328


### (b) Provide an estimate of the standard error of µ ˆ. Interpret this result. 
Hint: We can compute the standard error of the sample mean by dividing the sample standard deviation by the square root of the number of observations.

In [10]:
medv_mean_SE_est = np.std(Boston["medv"]) / np.sqrt(len(Boston))
print("estimate SE of mean of medv: %.4f" %medv_mean_SE_est)

estimate SE of mean of medv: 0.4085


### (c) Now estimate the standard error of µ ˆ using the bootstrap. How does this compare to your answer from (b)?

In [11]:
def boot_SE(func, 
            D, 
            n=None, 
            B=1000, 
            seed=0): 
    rng = np.random.default_rng(seed)
    first_, second_ = 0, 0
    n = n or D.shape[0] # use the bigger one as n
    
    for _ in range(B): # repeat sampling for B times
        idx = rng.choice(D.index, 
                         n, 
                         replace=True)
        value = func(D, idx)
        first_ += value
        second_ += value ** 2

    return np.sqrt(second_ / B - (first_ / B) ** 2)

In [12]:
def calculate_mean(D, idx): 
    return np.mean(D.iloc[idx])

In [13]:
medv_mean_SE_boot = boot_SE(calculate_mean, 
                            Boston["medv"], 
                            B=1000, 
                            seed=0)

In [14]:
print("estimate SE of mean of medv by bootstrap: %.4f" %medv_mean_SE_boot)

estimate SE of mean of medv by bootstrap: 0.4125


### (d) Based on your bootstrap estimate from (c), provide a 95 % confidence interval for the mean of medv. Compare it to the results obtained by using Boston['medv'].std() and the two standard error rule (3.9).

In [15]:
print("Boston[\"medv\"].std(): %.4f" %Boston["medv"].std())

Boston["medv"].std(): 9.1971


In [18]:
import statsmodels.stats.api as sms
confidence_interval = sms.DescrStatsW(Boston["medv"]).tconfint_mean()
print(f"95% Confidence Interval: {confidence_interval}")

95% Confidence Interval: (21.72952801457859, 23.336084633642756)


In [20]:
print("95%% CI of the mean of medv using bootstrap: [%.4f, %.4f]"
     %(
         np.mean(Boston["medv"]) - 2 * medv_mean_SE_boot, 
         np.mean(Boston["medv"]) + 2 * medv_mean_SE_boot, 
     ))

95% CI of the mean of medv using bootstrap: [21.7077, 23.3579]


### (e) Based on this data set, provide an estimate, µ med , for the median value of medv in the population.

In [22]:
medv_med_est = np.median(Boston["medv"])
print("estimate median of medv: %.4f" %medv_med_est)

estimate median of medv: 21.2000


### (f) We now would like to estimate the standard error of µ med ˆ . Unfortunately, there is no simple formula for computing the standard error of the median. Instead, estimate the standard error of the median using the bootstrap. Comment on your findings.

In [23]:
def calculate_med(D, idx): 
    return np.median(D.iloc[idx])

In [25]:
medv_med_SE_boot = boot_SE(calculate_med, 
                           Boston["medv"], 
                           B=1000, 
                           seed=0)
print("estimate SE of median of medv using bootstrap: %.4f"
     % medv_med_SE_boot)

estimate SE of median of medv using bootstrap: 0.3694


### (g) Based on this data set, provide an estimate for the tenth percentile of medv in Boston census tracts. Call this quantity µ 0.1 ˆ . (You can use the np.percentile() function.)

In [28]:
medv_10_perc = np.percentile(Boston["medv"], 10)
print("estimate 10th percentile of medv: %.4f" %medv_10_perc)

estimate 10th percentile of medv: 12.7500


In [31]:
def calculate_10_perc(D, idx): 
    return np.percentile(D.iloc[idx], 10)

In [32]:
medv_10_perc_SE_boot = boot_SE(calculate_10_perc, 
                               Boston["medv"], 
                               B=1000, 
                               seed=0)
print("estimate SE of 10th percentile of medv using bootstrap: %.4f"
     % medv_10_perc_SE_boot)

estimate SE of 10th percentile of medv using bootstrap: 0.5035
