# ISLR Sec. 5-3-4

## Estimating the Accuracy of a Linear Regression Model

In [None]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.utils import resample
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline

In [None]:
# Read the cleaned Auto data set and narrow it to the two columns used
# below, in this order: the predictor (horsepower), then the response (mpg).
df = pd.read_csv('../Data/Auto-cleaned.csv').loc[:, ['horsepower', 'mpg']]
# Preview the first three rows.
df.head(3)

In [None]:
# Scatter plot of the response (mpg) against the predictor (horsepower).
df.plot(kind='scatter', x='horsepower', y='mpg')

In [None]:
# Underlying NumPy array of the data frame; its columns follow the order of
# df's columns.
data = df.values
# Linear regression estimator created with scikit-learn's default settings.
LR = LinearRegression()

In [None]:
# Bootstrap estimate of the variability of the polynomial regression
# coefficients: refit the model on resampled data sets and summarise the
# spread of the fitted coefficients.
num_boot = 1000  # number of bootstrap replicates
deg = 3          # degree of the polynomial in the predictor
poly = PolynomialFeatures(degree=deg)
# Row k stores the fit from replicate k as [intercept, b_1, ..., b_deg].
coef = np.zeros((num_boot, deg + 1))
for k in range(num_boot):
    # Resample the rows of `data` (resample's defaults draw with replacement).
    bootstrap = resample(data)
    # Predictor as an (n, 1) column, which is the 2-D input the transformer
    # and the estimator expect.
    x = bootstrap[:, [0]]
    # Response kept 1-D so that coef_ comes back 1-D and intercept_ is a
    # plain scalar; no implicit array-to-scalar conversion is needed below.
    y = bootstrap[:, 1]
    X = poly.fit_transform(x)
    LR.fit(X, y)
    coef[k, :] = LR.coef_
    # With PolynomialFeatures' default settings the first column of X is the
    # constant bias term, so its coefficient carries no information; that
    # slot is overwritten with the fitted intercept instead.
    coef[k, 0] = LR.intercept_
print('coefficients')
print('')
print('mean')
print(coef.mean(axis=0))
print('')
print('standard errors')
# The bootstrap standard error is the *sample* standard deviation of the
# replicate estimates (divisor num_boot - 1), hence ddof=1.
print(coef.std(axis=0, ddof=1))

### Confidence Intervals (not in textbook)

Next, we compute 95% confidence intervals for each of the regression coefficients from the bootstrap estimates. If zero is contained in one of these confidence intervals, then the corresponding coefficient is not statistically different from zero at the 95% confidence level, and the corresponding term should not be used in the model. (You can use higher or lower confidence levels depending on the particular application.)

In [None]:
# 95% bootstrap percentile intervals: the central 95% of each coefficient's
# bootstrap distribution lies between its 2.5th and 97.5th percentiles.
# (Using the 5th and 95th percentiles would give only a 90% interval.)
# Row 0 of CI holds the lower bounds and row 1 the upper bounds, with one
# column per coefficient.
CI = np.percentile(coef, [2.5, 97.5], axis=0)
print(CI)

The last confidence interval corresponds to the cubic term. It contains 0. The other coefficients are statistically different from zero at the 95% confidence level (i.e., a 5% significance level). This suggests we should use only a quadratic model.