In [None]:
import numpy as np
import pandas as pd
from scipy import stats, linspace
from statsmodels.stats import power
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12, 8)
%matplotlib inline
# jupyter lab configs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# STATISTICS Applied to data science

## Exercises PART 3: Bootstrapping


<img align="center" width="800"  src="../images/sampling.png">

## 1. Bootstrapping a variable to obtain a more reliable confidence interval

The logic of parameter estimation with bootstrapping is:

1. Draw n samples with replacement.

2. Calculate and save the mean (or another statistic) of the n resampled values.

3. Repeat steps 1–2 `R` times

* Use the new *population* of means to estimate the standard error and confidence intervals

* For the calculation of **Confidence intervals**:

1. Order the new `population` of means
2. Find the indices of the values corresponding to the lower and upper thresholds, which are given by `[R(1−α)/2, R(1+α)/2]`   
where `R` is the size of the new *population* (the number of bootstrapped means) and α is the desired confidence level (e.g. 0.9 for a 90% confidence interval)
3. Look at the respective values of these bounds, they represent your new confidence interval

In [None]:
# Load Boston house prices data
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2. When it
# is unavailable, rebuild the same arrays from the original StatLib file using
# the replacement recipe given in scikit-learn's deprecation notice
# (this fallback path requires network access).
try:
    from sklearn.datasets import load_boston
except ImportError:
    # After a 22-line header, every record is split across two physical lines:
    # 11 values on the first, then 2 more features plus the target on the second.
    boston_raw = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston',
                             sep=r'\s+', skiprows=22, header=None)
    # Keep the name `dt` with the same keys the sklearn loader's result exposes
    dt = {
        'data': np.hstack([boston_raw.values[::2, :], boston_raw.values[1::2, :2]]),
        'target': boston_raw.values[1::2, 2],
        'feature_names': np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                   'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
    }
else:
    dt = load_boston(return_X_y=False)

# Features and target side by side; the target becomes the last column
df = pd.DataFrame(data=np.c_[dt['data'], dt['target']],
                  columns=np.append(dt['feature_names'], 'MED_VALUE'))
# Reassign instead of mutating in place
df = df.drop(columns=['B', 'LSTAT'])
df.describe()

### Exercise
Calculate the bootstrapped 90% confidence interval for the variable `INDUS`

In [None]:
# Histogram of the INDUS column
df['INDUS'].plot(kind='hist')

---