# Ch6 exercises: Linear model selection and regularisation

In [45]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import statsmodels.api as sm
import statsmodels.formula.api as smf
import itertools
from sklearn.preprocessing import PolynomialFeatures

<b> 8a) Use the rnorm() function to generate a predictor X of length n = 100, as well as a noise vector ε of length n = 100.

In [8]:
# setting n to 100 for use later
n = 100

# generating a predictor, X, using numpy's random.normal function
X = np.random.normal(size=n)

# generating a noise vector, E, using numpy's random.normal function
E = np.random.normal(size=n)

<b> 8b) Generate a response vector Y of length n = 100 according to the model:

$$
Y = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \epsilon
$$

<b> where β0, β1, β2, and β3 are constants of your choice.

In [9]:
# defining constant values
constant_0 = 1.5 
constant_1 = 2.0
constant_2 = 0.5
constant_3 = -1.0

In [10]:
# defining Y
Y = constant_0 + constant_1*X + constant_2*X**2 + constant_2*X**2 + constant_2*X**3 + E

In [12]:
# seeing what Y looks like
print(Y)

[-1.17632977e+00  7.54289411e-01  2.36540559e+01  5.32939209e+00
  3.98861242e-01  1.00561144e+00  1.74896431e+00  2.09077944e+01
 -1.89421903e+00 -2.20231267e+00  1.26711024e+00  1.32366835e+00
  4.21104450e+00  1.04447124e+00  1.08640319e+00  2.33767880e+00
  2.83565255e+00 -1.93748408e-01  1.27433916e-01  1.90952988e+01
  3.92050205e+00  4.95091697e-02  1.39940553e+01 -1.58364965e-01
  2.63748073e+00  4.41160730e+00  3.03183412e+00  1.90768614e+00
 -3.07171128e+00 -2.74201834e-01  5.91896664e+00 -4.68120109e-01
  3.29509279e+00  7.65453356e-01  3.61426434e+00  1.24663761e+01
  2.37783112e+00  4.69151487e+00  3.86870271e+00  8.08448404e+00
  1.59301766e+00 -4.04399209e+00  3.01101593e+00  2.67119307e+00
  3.47390570e-01 -1.57391215e+00 -1.65188471e+00  7.57614489e+00
  1.48295117e+00  1.16559769e+00  6.52148167e-01 -3.17848387e+00
  2.44489176e+00 -3.79224949e-01  1.37849358e+00  1.39814149e+00
  2.46532658e+00 -2.16935710e+00  1.45822083e+00 -1.10849511e-01
  3.32679275e+00  8.51891

<b>8c) Perform best subset selection in order to choose the best model containing the predictors X,X2,...,X10. What is the best model obtained according to Cp, BIC, and adjusted R2? Show some plots to provide evidence for your answer, and report the coefficients of the best model obtained. <u>Note you will need to create a single data set containing both X and Y.

In [19]:
# creating a dataframe with X and Y
df = pd.DataFrame({'X': X, 'Y': Y})

In [20]:
print(df)

           X          Y
0  -1.266366  -1.176330
1  -0.306700   0.754289
2   2.672383  23.654056
3   0.973636   5.329392
4   0.046181   0.398861
..       ...        ...
95 -1.325153  -1.650597
96 -0.640027  -0.951770
97 -0.091344   2.418354
98 -0.485894   0.641327
99 -1.004008   0.836269

[100 rows x 2 columns]


In [21]:
# iterating from 2 to 10 and generating powers of X from X^2 to X^10 by adding new columns to df
for i in range(2, 11):
    df[f'X^{i}'] = df['X'] ** i

In [22]:
# initialising variables to store the best models according to Cp, BIC, and adjusted R^2
best_model_cp = None
best_model_bic = None
best_model_r2adj = None

In [25]:
# creating a loop to iterate over all possible subset sizes k (from 1 to the number of predictors) and generate all combinations of predictors of size k
for k in range(1, len(predictors) + 1):
    subsets = itertools.combinations(predictors, k)
    
    for subset in subsets:
        subset = list(subset)
        
        model = sm.GLS(df['Y'], sm.add_constant(df[subset])).fit()
        
        if best_model_cp is None or model.aic < best_model_cp.aic:
            best_model_cp = model
        if best_model_bic is None or model.bic < best_model_bic.bic:
            best_model_bic = model
        if best_model_r2adj is None or model.rsquared_adj > best_model_r2adj.rsquared_adj:
            best_model_r2adj = model

In [28]:
# printing summaries of the best models
print("Best model according to Cp:")
print(best_model_cp.summary())

Best model according to Cp:
                            GLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.961
Model:                            GLS   Adj. R-squared:                  0.959
Method:                 Least Squares   F-statistic:                     781.5
Date:                Fri, 26 May 2023   Prob (F-statistic):           2.75e-67
Time:                        15:14:04   Log-Likelihood:                -134.34
No. Observations:                 100   AIC:                             276.7
Df Residuals:                      96   BIC:                             287.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.5669   

In [29]:
print("Best model according to BIC:")
print(best_model_bic.summary())

Best model according to BIC:
                            GLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.961
Model:                            GLS   Adj. R-squared:                  0.959
Method:                 Least Squares   F-statistic:                     781.5
Date:                Fri, 26 May 2023   Prob (F-statistic):           2.75e-67
Time:                        15:14:14   Log-Likelihood:                -134.34
No. Observations:                 100   AIC:                             276.7
Df Residuals:                      96   BIC:                             287.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.5669  

In [31]:
print("Best model according to adjusted R^2:")
print(best_model_r2adj.summary())

Best model according to adjusted R^2:
                            GLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.963
Model:                            GLS   Adj. R-squared:                  0.960
Method:                 Least Squares   F-statistic:                     341.6
Date:                Fri, 26 May 2023   Prob (F-statistic):           6.29e-63
Time:                        15:14:22   Log-Likelihood:                -131.34
No. Observations:                 100   AIC:                             278.7
Df Residuals:                      92   BIC:                             299.5
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         

In [32]:
# extracting the coefficients of the best models
best_model_cp_coeffs = best_model_cp.params
best_model_bic_coeffs = best_model_bic.params
best_model_r2adj_coeffs = best_model_r2adj.params

In [38]:
# printing summaries of the coefficients according to the best models
print("Coefficients of the best model according to Cp:")
print(best_model_cp_coeffs)

print("Coefficients of the best model according to BIC:")
print(best_model_bic_coeffs)

print("Coefficients of the best model according to adjusted R^2:")
print(best_model_r2adj_coeffs)

Coefficients of the best model according to Cp:
const    1.566936
X        2.157003
X^2      0.966199
X^3      0.479067
dtype: float64
Coefficients of the best model according to BIC:
const    1.566936
X        2.157003
X^2      0.966199
X^3      0.479067
dtype: float64
Coefficients of the best model according to adjusted R^2:
const    1.551008
X        2.054153
X^2      1.714072
X^3      0.542002
X^4     -1.399001
X^6      0.651574
X^8     -0.108679
X^10     0.005917
dtype: float64
