In [1]:
%matplotlib notebook

# Generate some synthetic regression data

In [2]:
from sklearn.datasets import make_regression
from matplotlib import pyplot
import numpy as np

# Generate a one-feature synthetic regression problem.
# A fixed seed makes the data -- and every statistic computed from it in the
# cells below -- identical on each "Restart & Run All".
SEED = 0
bias = 10
X, y, coeff = make_regression(n_samples=100, n_features=1, noise=10.,
                              coef=True, bias=bias, random_state=SEED)

# Scatter the samples and overlay the noise-free generating line.
# Both artists are drawn *before* show() so they end up on the same figure
# regardless of the matplotlib backend in use.
pyplot.scatter(X, y, label='data')
pyplot.plot(X, coeff*X + bias, 'r', label='true line')
pyplot.xlabel('X')
pyplot.ylabel('y')
pyplot.legend()
pyplot.show()

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fc94dbd2160>]

# Now let's fit the data using ordinary least squares (no standardization)

In [3]:
import statsmodels.api as sm

  from pandas.core import datetools


### For ordinary least squares without any regularization, scaling is not required (the results are scale-independent). Standardization is necessary if regularization is present.

In [4]:
# Build the design matrix with a constant column added to X, fit y on it by
# ordinary least squares, and print the statsmodels summary table.
Xols = sm.add_constant(X)
model = sm.OLS(endog=y, exog=Xols)
results = model.fit()

ols_summary = results.summary()
print(ols_summary)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.101
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     11.04
Date:                Tue, 26 Feb 2019   Prob (F-statistic):            0.00125
Time:                        22:54:24   Log-Likelihood:                -366.85
No. Observations:                 100   AIC:                             737.7
Df Residuals:                      98   BIC:                             742.9
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         11.2864      0.971     11.626      0.0

In [5]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8,6))

# Raw samples, the fitted OLS line and the noise-free generating line.
ax.plot(X, y, '.', label="data")
ax.plot(X, results.fittedvalues, 'r-.', label="OLS")
ax.plot(X, coeff*X + bias, 'g-.', label="True")

# The label= arguments above are only rendered once a legend is requested.
ax.set_xlabel("X")
ax.set_ylabel("y")
ax.set_title("OLS fit vs. true generating line")
ax.legend();  # trailing ';' keeps the Legend repr out of the cell output

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fc9453aa908>]

### ESL: equation 3.8 to compute $\hat{\sigma}^2$

In [6]:
# Residual-variance estimate (ESL eq. 3.8): sigma_hat^2 = RSS / (N - p - 1),
# with N the number of observations and p the number of predictors
# (the intercept is accounted for by the extra "- 1").
p = 1
N = y.shape[0]

residuals = y - results.fittedvalues
sigma_hat_2 = np.sum(np.square(residuals)) / float(N - p - 1)

### ESL: equation 3.12 to compute Z-score

In [7]:
from numpy.linalg import inv
# Z-score of coefficient 0 (ESL eq. 3.12):
#   z_j = beta_hat_j / (sigma_hat * sqrt(v_j)),
# where v_j is the j-th diagonal entry of (X^T X)^{-1}.
XTX = np.dot(Xols.T, Xols)
XTX_inv = inv(XTX)
v0 = XTX_inv[0, 0]

std_err0 = np.sqrt(sigma_hat_2 * v0)
z0 = results.params[0] / std_err0
print('z0 =', z0)

z0 = 11.6260519163


In [8]:
from numpy.linalg import inv
# Z-score of coefficient 1 (ESL eq. 3.12), using the second diagonal entry
# of (X^T X)^{-1} as v_1.
XTX = np.dot(Xols.T, Xols)
XTX_inv = inv(XTX)
v1 = XTX_inv[1, 1]

std_err1 = np.sqrt(sigma_hat_2 * v1)
z1 = results.params[1] / std_err1
print('z1 =', z1)

z1 = 3.32315679819


### ESL: equation 3.13 to compute the F statistic, $F = \frac{(RSS_0-RSS_1)/(p_1-p_0)}{RSS_1/(N-p_1-1)}$

#### Full model y = $\beta_0 + \beta_1*X$
#### Reduced model y = $\beta_0$

In [9]:
## Full model: y regressed on the constant column plus X
design_full = sm.add_constant(X)
full_model = sm.OLS(y, design_full)
full_model_res = full_model.fit()

## Reduced model: y regressed on column 0 of the augmented design matrix only
design_reduced = design_full[:, 0]
reduced_model = sm.OLS(y, design_reduced)
reduced_model_res = reduced_model.fit()

In [10]:
# Residual sums of squares of the two nested fits; only the difference
# p1 - p0 (one dropped parameter) enters the F statistic's numerator.
p1, p0 = 1, 0

resid_full = y - full_model_res.fittedvalues
resid_reduced = y - reduced_model_res.fittedvalues  ## Just the intercept model

RSS1 = np.sum(np.square(resid_full))
RSS0 = np.sum(np.square(resid_reduced))

In [11]:
# F statistic (ESL eq. 3.13). sigma_hat_2 was computed from the same
# constant-plus-X fit, so it serves as the denominator RSS1 / (N - p1 - 1).
den = sigma_hat_2
num = (RSS0 - RSS1) / (p1 - p0)
F = num / den

# Dropping a single coefficient: F should match that coefficient's z-score squared.
z1_squared = z1**2
print('F statistic (', F, ')', ' == ', 'Z^2 (', z1_squared, ')')

F statistic ( 11.0433711053 )  ==  Z^2 ( 11.0433711053 )


#### Full model y = $\beta_0 + \beta_1*X$
#### Reduced model y = $\beta_1*X$

In [12]:
## Full model: y regressed on the constant column plus X
design_full = sm.add_constant(X)
full_model = sm.OLS(y, design_full)
full_model_res = full_model.fit()

## Reduced model: y regressed on X alone (no constant column)
design_reduced = X
reduced_model = sm.OLS(y, design_reduced)
reduced_model_res = reduced_model.fit()

In [13]:
# Residual sums of squares of the two nested fits; only the difference
# p1 - p0 (one dropped parameter) enters the F statistic's numerator.
p1, p0 = 1, 0

resid_full = y - full_model_res.fittedvalues
resid_reduced = y - reduced_model_res.fittedvalues  ## Slope-only model (fitted on X without a constant)

RSS1 = np.sum(np.square(resid_full))
RSS0 = np.sum(np.square(resid_reduced))

In [14]:
# F statistic (ESL eq. 3.13). sigma_hat_2 was computed from the same
# constant-plus-X fit, so it serves as the denominator RSS1 / (N - p1 - 1).
den = sigma_hat_2
num = (RSS0 - RSS1) / (p1 - p0)
F = num / den

# Dropping a single coefficient: F should match that coefficient's z-score squared.
z0_squared = z0**2
print('F statistic (', F, ')', ' == ', 'Z^2 (', z0_squared, ')')

F statistic ( 135.16508316 )  ==  Z^2 ( 135.16508316 )


### ESL: equation 3.14 to compute confidence intervals