In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error

### Overview

We are going to illustrate the use of bootstrapping to evaluate model performance. 

In [None]:
# Load the scikit-learn diabetes dataset, unpacked into a feature array and a target array.
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)

Split the data into a train and test set:

In [None]:
# Hold out the final `test_size` rows for evaluation; every row before them is used for training.
test_size = 100
diabetes_X_train, diabetes_X_test = diabetes_X[:-test_size], diabetes_X[-test_size:]
diabetes_y_train, diabetes_y_test = diabetes_y[:-test_size], diabetes_y[-test_size:]

Let's train a simple ML model:

In [None]:
# Fit a linear regression on the training split and score it on the held-out rows.
model = linear_model.LinearRegression()
model.fit(diabetes_X_train, diabetes_y_train)
diabetes_y_pred = model.predict(diabetes_X_test)
# Take the square root of the MSE explicitly: the `squared=False` flag of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new releases.
rmse = np.sqrt(mean_squared_error(diabetes_y_test, diabetes_y_pred))
print(f"RMSE: {rmse:.2f}")

Great, we have an error (RMSE), but how much could it vary if we had seen a slightly different test set of the same size?

### Bootstrap!

We can use the bootstrap to answer this question!

First, let's put the test set into a dataframe for convenience.

In [None]:
# Gather the held-out features, the model's predictions on them, and the true
# targets into a single frame so that rows can be resampled together.
test_df = pd.DataFrame(diabetes_X_test).assign(
    preds=model.predict(diabetes_X_test),
    y=diabetes_y_test,
)

Now let's sample (with replacement) from our test set, predict on this sampled test set and calculate the error.

**TODO:** finish the for loop by bootstrapping the test set.

In [None]:
# Exercise placeholder: as written the loop body does nothing, so `errors`
# stays empty and the plotting cell is skipped by its `len(errors) > 0` guard.
num_samples = 10_000  # number of iterations the loop runs
errors = []  # intended to collect one error value per bootstrap iteration
for i in range(num_samples):
    pass  # put your code here

Plot the distribution of errors and notice the large variation!

In [None]:
# Only draw the histogram once the bootstrap loop has produced some errors.
if len(errors) > 0:
    f, ax = plt.subplots(figsize=(12, 6))
    ax.hist(errors, label="Bootstrap errors", bins=25)
    # Mark the RMSE computed on the original test set for comparison with the spread.
    ax.axvline(rmse, label="original RMSE", color="red")
    # Label both axes so the figure is readable on its own.
    ax.set_xlabel("RMSE")
    ax.set_ylabel("Number of bootstrap samples")
    ax.legend(fontsize=14)
    ax.set_title(f"RMSE: {rmse:.2f} with std dev: {np.std(errors):.2f}");

**Extra:** vary the test set size and see what happens to the plot above - do you understand why?