In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

In [6]:
# Copied from the ISLP package
def summarize(results,
              conf_int=False):
    """
    Return the coefficient table of a fitted model summary as a DataFrame.

    Adapted from the ISLP package:
    https://github.com/intro-stat-learning/ISLP/blob/main/ISLP/models/__init__.py

    Parameters
    ----------
    results : object
        Any object whose ``summary()`` returns something with a ``tables``
        sequence, where ``tables[1]`` offers ``as_html()`` (for example the
        result of ``sm.OLS(...).fit()``).
    conf_int : bool, default False
        If False, the last two columns of the parsed table are dropped
        (for the OLS summary these are the confidence-interval bounds).
        If True, the full table is returned.

    Returns
    -------
    pandas.DataFrame
        The parsed table, indexed by its first column.
    """
    coef_html = results.summary().tables[1].as_html()
    # read_html expects a path/URL or file-like object; passing literal HTML
    # is deprecated in recent pandas, so wrap the markup in StringIO.
    results_table = pd.read_html(StringIO(coef_html),
                                 index_col=0,
                                 header=0)[0]
    if not conf_int:
        return results_table[results_table.columns[:-2]]
    return results_table

# converted from R's poly()
# ref: https://stackoverflow.com/questions/39031172/how-poly-generates-orthogonal-polynomials-how-to-understand-the-coefs-ret
# compute orthogonal polynomials
def poly(x, degree):
    """
    Build an orthonormal polynomial basis for ``x`` (port of R's ``poly()``).

    Parameters
    ----------
    x : array-like, 1-D
        Sample points. Converted to a float ndarray, so lists and pandas
        Series are accepted as well as NumPy arrays.
    degree : int
        Highest polynomial degree to generate.

    Returns
    -------
    Z : ndarray, shape (len(x), degree)
        Basis columns for degrees 1..degree, each scaled to unit sum of
        squares (the constant column is dropped).
    alpha : ndarray, shape (degree,)
        Centering constants consumed by the recurrence in ``poly_predict``.
    norm2 : ndarray, shape (degree + 2,)
        A leading 1.0 followed by the squared norms of the unscaled basis
        columns for degrees 0..degree.
    """
    x = np.asarray(x, dtype=float)
    x_mean = x.mean(axis=0)
    x = x - x_mean
    Z = np.power.outer(x, np.arange(0, degree + 1))

    # orthogonalize
    x = x.reshape(-1, 1)
    # Reduced QR is sufficient: only the leading columns of Q are ever used.
    # mode='complete' would allocate an n-by-n Q, which is quadratic in the
    # number of observations.
    q, r = np.linalg.qr(Z, mode='reduced')
    # Keep only R's diagonal so every column of Q is rescaled to the length
    # (and sign) of the corresponding orthogonalised power of x.
    r_diag = np.zeros_like(r)
    np.fill_diagonal(r_diag, np.diag(r))
    Z = q @ r_diag
    norm2 = (Z ** 2.0).sum(axis=0)
    alpha = (x * Z ** 2.0).sum(axis=0) / norm2 + x_mean
    Z = Z / np.sqrt(norm2)
    norm2 = np.insert(norm2, 0, 1.0, axis=0)

    return Z[:, 1:], alpha[:-1], norm2


def poly_predict(x_new, degree, alpha, norm2):
    """
    Evaluate a previously fitted orthogonal polynomial basis at new points.

    Parameters
    ----------
    x_new : ndarray, 1-D
        Points at which to evaluate the basis.
    degree : int
        Degree of the basis; at least 1, since column 1 is always written.
    alpha : sequence of float
        Centering constants, indexed 0..degree-1.
    norm2 : ndarray
        Squared norms, indexed 0..degree+1, used both in the recurrence
        and for the final column scaling.

    Returns
    -------
    ndarray, shape (len(x_new), degree)
        Scaled basis columns for degrees 1..degree.
    """
    n_obs = x_new.shape[0]
    basis = np.ones((n_obs, degree + 1))
    basis[:, 1] = x_new - alpha[0]

    # Three-term recurrence: column k is built from columns k-1 and k-2.
    for k in range(2, degree + 1):
        leading = (x_new - alpha[k - 1]) * basis[:, k - 1]
        basis[:, k] = leading - (norm2[k] / norm2[k - 1]) * basis[:, k - 2]

    scaled = basis / np.sqrt(norm2[1:])
    return scaled[:, 1:]

In [7]:
# Remote CSV holding the `dis` / `nox` columns used throughout this notebook.
NOX_DATA_URL = 'http://liangfgithub.github.io/Data/noxData.csv'
data = pd.read_csv(NOX_DATA_URL)

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
data.describe()

Unnamed: 0,dis,nox
count,506.0,506.0
mean,3.795043,0.554695
std,2.10571,0.115878
min,1.1296,0.385
25%,2.100175,0.449
50%,3.20745,0.538
75%,5.188425,0.624
max,12.1265,0.871


In [13]:
# Predictor as a plain NumPy array (needed by np.power.outer below);
# the response stays a pandas Series.
X = data['dis'].to_numpy()
y = data['nox']

In [28]:
# Raw (non-orthogonal) polynomial design: columns are X**1 .. X**4.
powers = np.arange(1, 5)
X1 = np.power.outer(X, powers)

# OLS on the raw powers plus an explicit intercept column.
design_with_const = sm.add_constant(X1)
M1 = sm.OLS(y, design_with_const).fit()
M1.summary()

0,1,2,3
Dep. Variable:,nox,R-squared:,0.715
Model:,OLS,Adj. R-squared:,0.713
Method:,Least Squares,F-statistic:,314.1
Date:,"Fri, 24 Nov 2023",Prob (F-statistic):,5.2099999999999997e-135
Time:,21:15:47,Log-Likelihood:,690.59
No. Observations:,506,AIC:,-1371.0
Df Residuals:,501,BIC:,-1350.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.9522,0.039,24.172,0.000,0.875,1.030
x1,-0.2008,0.038,-5.333,0.000,-0.275,-0.127
x2,0.0280,0.012,2.395,0.017,0.005,0.051
x3,-0.0017,0.001,-1.151,0.250,-0.004,0.001
x4,3.244e-05,6.01e-05,0.540,0.589,-8.56e-05,0.000

0,1,2,3
Omnibus:,63.922,Durbin-Watson:,0.284
Prob(Omnibus):,0.0,Jarque-Bera (JB):,87.059
Skew:,0.913,Prob(JB):,1.25e-19
Kurtosis:,3.894,Cond. No.,41700.0


In [21]:
# NOTE(review): the saved output below has three columns, while X1 as built
# earlier (powers 1..4) has four, and this cell's execution count (21)
# precedes that cell's (28) — the output looks stale; re-run to confirm.
X1

array([[  4.09      ,  16.7281    ,  68.417929  ],
       [  4.9671    ,  24.67208241, 122.54870054],
       [  4.9671    ,  24.67208241, 122.54870054],
       ...,
       [  2.1675    ,   4.69805625,  10.18303692],
       [  2.3889    ,   5.70684321,  13.63307774],
       [  2.505     ,   6.275025  ,  15.71893762]])

In [32]:
# Same raw-polynomial regression via scikit-learn (intercept fitted by default).
fit1 = LinearRegression()
fit1.fit(X1, y)
(fit1.intercept_, fit1.coef_)

(0.9522223280181946,
 array([-2.00803588e-01,  2.80498397e-02, -1.65643734e-03,  3.24433828e-05]))

In [33]:
# Residual sum of squares of fit1 on the training data.
residuals = fit1.predict(X1) - y
np.sum(residuals ** 2)

1.9329813272985945

In [34]:
# Single query point dis = 6, expanded to the same powers (1..4) as X1.
X2 = np.power.outer(np.array([6]), np.arange(1, 5))
fit1.predict(X2)

array([0.44145119])