# Synthetic data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib notebook

# Synthetic sample: N inputs drawn uniformly between -3 and 3, and a cubic
# response 10 + x - 2x^2 + 0.5x^3 perturbed by Gaussian noise (mean 0, scale 10).
N = 500
np.random.seed(0)  # fixed seed so the same sample is drawn on every run
X = np.random.uniform(-1, 1, N) * 3
noise_free = 10. + X - 2 * (X ** 2) + 0.5 * (X ** 3)
y = noise_free + np.random.normal(0, 10, N)

# Scatter plot of the raw sample.
fig, ax = plt.subplots()
ax.scatter(X, y, s=10)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7fb708052eb8>

# Use OLS to fit this data

In [2]:
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm

# Expand the single input into degree-3 polynomial features; the expansion
# supplies the constant column that the summary reports as `const`.
poly = PolynomialFeatures(degree=3)
Xols = poly.fit_transform(X.reshape(N, 1))

# Ordinary least squares of y on the expanded design matrix.
model = sm.OLS(y, Xols)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.903
Model:                            OLS   Adj. R-squared:                  0.903
Method:                 Least Squares   F-statistic:                     1543.
Date:                Wed, 27 Feb 2019   Prob (F-statistic):          5.09e-251
Time:                        22:52:30   Log-Likelihood:                -1852.0
No. Observations:                 500   AIC:                             3712.
Df Residuals:                     496   BIC:                             3729.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.3622      0.662     12.626      0.0

  from pandas.core import datetools


In [3]:
# Data with the fitted curve overlaid in red. The points are ordered by X
# first so that the line is drawn left to right instead of zig-zagging.
fig, ax = plt.subplots()
ax.scatter(X, y, s=10)
sorted_order = X.argsort()
x_sorted = X[sorted_order]
fit_sorted = results.fittedvalues[sorted_order]
ax.plot(x_sorted, fit_sorted, 'r')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fb6eb623ef0>]

# Ex 3.2(b): Confidence band from the coefficient confidence intervals

In [4]:
# 95% confidence limits of the fitted coefficients; indexed below as
# param_conf[i, :] = (lower, upper) for coefficient i.
param_conf = results.conf_int(alpha=0.05, cols=None)
# 100 evenly spaced evaluation points spanning the observed range of X.
x0 = np.linspace(X.min(), X.max(), num=100)

In [5]:
# Evaluate the cubic on the grid x0 for every combination of confidence limits
# of the four coefficients (a: constant, b: linear, c: quadratic, d: cubic).
# The iteration order matches nested loops with `a` outermost and `d` innermost.
fx0 = [
    a + b*x0 + c*x0**2 + d*x0**3
    for a in param_conf[0, :]
    for b in param_conf[1, :]
    for c in param_conf[2, :]
    for d in param_conf[3, :]
]

In [6]:
# Data and fitted curve (red), as in the earlier figure.
fig, ax = plt.subplots()
ax.scatter(X, y, s=10)
sorted_order = X.argsort()
ax.plot(X[sorted_order], results.fittedvalues[sorted_order], 'r')

# Black lines: pointwise maximum and minimum over all the curves in fx0.
corner_curves = np.asarray(fx0)
ax.plot(x0, corner_curves.max(axis=0), 'k')
ax.plot(x0, corner_curves.min(axis=0), 'k')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fb6eb5eb5f8>]

# Ex 3.2(a): Pointwise confidence interval for the fitted value at x0

$\sigma_{0}^{2}=\mathrm{Var}(\hat{f}(x_0)\mid x_0)=\mathrm{Var}(x_0^T\hat{\beta}\mid x_0)=x_0^T\,\mathrm{Var}(\hat{\beta})\,x_0=\sigma^2x_0^T(X^TX)^{-1}x_0$
 


In [7]:
from numpy.linalg import inv

# Compute (X^T X)^{-1}
XTX = Xols.T.dot(Xols)
XTX_inv = inv(XTX)

# Compute sigma_hat^2, the unbiased estimate of the noise variance: RSS / (N - p).
# p = len(results.params) already counts the intercept column of Xols, so the
# residual degrees of freedom are N - p (the "Df Residuals" value in the OLS
# summary), not N - p - 1. The "- 1" form applies only when p excludes the
# intercept.
p = len(results.params)
residuals = y - results.fittedvalues
sigma_hat_2 = np.sum(residuals ** 2) / float(N - p)

In [8]:
# Design matrix for the evaluation grid x0: the same degree-3 polynomial
# expansion that was applied to the training inputs.
poly = PolynomialFeatures(degree=3)
x0_ols = poly.fit_transform(x0.reshape(len(x0), 1))

In [9]:
from scipy import stats

# Pointwise 95% confidence interval for the fitted value at each row x0 of x0_ols:
#   Var(f_hat(x0)) = sigma_hat^2 * x0^T (X^T X)^{-1} x0
# NOTE: despite its name, sigma_fx0 holds this *variance*; the square root is
# taken when the interval half-width is formed below.
# The row-wise quadratic form is computed for all grid points at once.
sigma_fx0 = sigma_hat_2 * np.sum(x0_ols.dot(XTX_inv) * x0_ols, axis=1)

# Fitted mean at each grid point.
mean_val = x0_ols.dot(results.params)

# The t quantile must use the residual degrees of freedom of the fit
# (observations minus estimated coefficients), not the number of evaluation
# points len(x0), which is unrelated to the estimation.
df_resid = N - len(results.params)
t_crit = stats.t.ppf(1 - 0.025, df_resid)

half_width = t_crit * np.sqrt(sigma_fx0)
up_bnd = mean_val + half_width
low_bnd = mean_val - half_width

In [10]:
# Data, fitted mean (red) and the upper/lower confidence bounds (black).
fig, ax = plt.subplots()
ax.scatter(X, y, s=10)

# Mean curve first, then the two bounds, all evaluated on the grid x0.
ax.plot(x0, mean_val, 'r')
ax.plot(x0, up_bnd, 'k')
ax.plot(x0, low_bnd, 'k')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fb6eb59ab70>]