In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

In [3]:
boston = sm.datasets.get_rdataset('Boston', 'MASS')

In [5]:
print(boston.__doc__)

Boston R Documentation

Housing Values in Suburbs of Boston
-----------------------------------

Description
~~~~~~~~~~~

The ``Boston`` data frame has 506 rows and 14 columns.

Usage
~~~~~

::

   Boston

Format
~~~~~~

This data frame contains the following columns:

``crim``
   per capita crime rate by town.

``zn``
   proportion of residential land zoned for lots over 25,000 sq.ft.

``indus``
   proportion of non-retail business acres per town.

``chas``
   Charles River dummy variable (= 1 if tract bounds river; 0
   otherwise).

``nox``
   nitrogen oxides concentration (parts per 10 million).

``rm``
   average number of rooms per dwelling.

``age``
   proportion of owner-occupied units built prior to 1940.

``dis``
   weighted mean of distances to five Boston employment centres.

``rad``
   index of accessibility to radial highways.

``tax``
   full-value property-tax rate per \\$10,000.

``ptratio``
   pupil-teacher ratio by town.

``black``
   *1000(Bk - 0.63)^2* where *Bk* is

In [6]:
dboston = boston.data

In [7]:
dboston.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


### (a) Based on this data set, provide an estimate for the population mean of medv. Call this estimate ˆμ.

In [19]:
medv_mean = np.mean(dboston['medv'])
print(medv_mean)

22.532806324110677


### (b) Provide an estimate of the standard error of ˆμ. Interpret this result.
Hint: We can compute the standard error of the sample mean by
dividing the sample standard deviation by the square root of the
number of observations.

In [20]:
medv_std = np.std(dboston['medv'])/np.sqrt(len(dboston['medv']))
print(medv_std)

0.4084569346972866


### (c) Now estimate the standard error of ˆμ using the bootstrap. How does this compare to your answer from (b)?

In [41]:
def boot(data,func,R):
    estimates = []
    for i in range(R):
        estimates.append(func(data,resample(np.arange(0,len(data)), n_samples=len(data), replace=True)))
    df = pd.DataFrame(estimates)
    bootstrap_statistics = {'estimated_value':np.mean(np.array(df.values),axis=0),'std_error':np.std(np.array(df.values),axis=0)}   
    return bootstrap_statistics

In [42]:
def boot_fn(data, index):
    aux_mean = np.mean(data.loc[index])
    return aux_mean

In [43]:
boot(dboston['medv'],boot_fn,1000)

{'estimated_value': array([22.54558399]), 'std_error': array([0.40149605])}

### (d) Based on your bootstrap estimate from (c), provide a 95% confidence interval for the mean of medv. 

In [46]:
(medv_mean-2*medv_std, medv_mean+2*medv_std)

(21.715892454716105, 23.34972019350525)

### (e) Based on this data set, provide an estimate, ˆμ_med, for the median value of medv in the population.

In [48]:
medv_median = np.median(dboston['medv'])
print(medv_median)

21.2


### (f) We now would like to estimate the standard error of ˆμmed. Unfortunately, there is no simple formula for computing the standard error of the median. Instead, estimate the standard error of the median using the bootstrap. Comment on your findings.

In [54]:
def boot_fn1(data, index):
    aux_median = np.median(data.loc[index])
    return aux_median

In [55]:
boot(dboston['medv'],boot_fn1,1000)

{'estimated_value': array([21.21015]), 'std_error': array([0.37487928])}

### (g) Based on this data set, provide an estimate for the tenth percentile of medv in Boston suburbs. Call this quantity ˆμ_0.1. 

In [58]:
# 10%
mdv_quantile = np.quantile(dboston['medv'],0.1)
print(mdv_quantile)

12.75


### (h) Use the bootstrap to estimate the standard error of ˆμ_0.1. Comment on your findings.

In [62]:
def boot_fn2(data, index):
    aux_quantile = np.quantile(data.loc[index],0.1)
    return aux_quantile

In [63]:
boot(dboston['medv'],boot_fn2,1000)

{'estimated_value': array([12.7671]), 'std_error': array([0.49253689])}