In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

  import pandas.util.testing as tm


# Model selection based on p-values

In [53]:
# Pull the diabetes regression dataset that ships with scikit-learn.
diabetes = datasets.load_diabetes()
y = diabetes.target
# Keep columns 2 onward. Note that this slice drops the first TWO covariates
# (column indices 0 and 1), so 8 of the original predictors remain.
x = diabetes.data[:, 2:]

## Backward selection
**Loop backwards:** start using all predictors in your model, then look for the coefficients that are statistically significant (using p-values). Keep only those predictors and repeat. Stop when all the predictors are significant.

In [4]:
# Backward selection driven by p-values.
#
# Start from the full model (intercept + every column of `x`), refit, and keep
# only the coefficients whose p-value is <= `threshold`. Repeat until a fit has
# no insignificant predictor left. Every fitted model is stored in
# `trained_models`; `model` and `x_a` hold the final fit and its design matrix.
threshold = 0.1  # significance level a coefficient must meet to be kept
num_iter = 0
trained_models = []

# Design matrix: column 0 is the intercept, followed by all candidate predictors.
x_a = np.c_[np.ones((x.shape[0], 1)), x]

while True:
  model = sm.OLS(y, x_a).fit()
  trained_models.append(model)
  num_iter += 1

  # Boolean mask of the columns that survive this round.
  keep = np.asarray(model.pvalues) <= threshold
  # The intercept is part of the model specification, not a candidate for
  # selection: never drop it, otherwise the fit silently becomes a regression
  # through the origin (and could even end up with zero columns).
  keep[0] = True

  num_coef = int(keep.sum())
  num_features = x_a.shape[1]
  if num_coef == num_features:
    # Nothing left to remove: every remaining predictor is significant.
    break
  x_a = x_a[:, keep]

print('Total number of iterations: {}'.format(num_iter))
print('Model with insignificant coefficients:\n',trained_models[0].summary())
print('Model with only significant coefficients:\n',model.summary())

Total number of iterations: 2
Model with insignificant coefficients:
                             OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.500
Model:                            OLS   Adj. R-squared:                  0.491
Method:                 Least Squares   F-statistic:                     54.18
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           1.34e-60
Time:                        16:57:05   Log-Likelihood:                -2393.9
No. Observations:                 442   AIC:                             4806.
Df Residuals:                     433   BIC:                             4843.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------

**Loop forward:** start with the 1st predictor (together with an intercept), then check whether the coefficients are statistically significant (using p-values). Keep adding one predictor at a time and stop when the latest predictor is not significant.

In [14]:
# Forward selection driven by p-values.
#
# Fit intercept + the first `num_iter` columns of `x`, growing the model one
# column at a time. A candidate model is accepted only if ALL of its
# coefficients (intercept included) have a p-value <= `threshold`; the first
# rejected candidate ends the search. Accepted fits are kept in
# `trained_models`, so the last entry is the largest accepted model.
threshold = 0.1  # significance level every coefficient must meet
num_iter = 1     # number of leading predictors in the current candidate model
num_signif = 2   # coefficients expected to be significant: intercept + num_iter
trained_models = []
is_significant = True

# Bound the loop by the number of available predictors so termination does not
# depend on slice clamping and a redundant refit of the full model.
while is_significant and num_iter <= x.shape[1]:
  x_a = np.c_[np.ones((x.shape[0], 1)), x[:, :num_iter]]

  model = sm.OLS(y, x_a).fit()
  num_coef_signif = int(np.sum(np.asarray(model.pvalues) <= threshold))

  is_significant = num_coef_signif == num_signif
  if is_significant:
    trained_models.append(model)
    num_signif += 1
    num_iter += 1

# Guard against the case where even the first candidate model is rejected:
# indexing an empty list here would raise IndexError.
if trained_models:
  print('Model with only significant coefficients:\n',
        trained_models[-1].summary())
else:
  print('No model with only significant coefficients was found.')

Model with only significant coefficients:
                             OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.396
Model:                            OLS   Adj. R-squared:                  0.393
Method:                 Least Squares   F-statistic:                     143.9
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           8.68e-49
Time:                        17:06:50   Log-Likelihood:                -2435.7
No. Observations:                 442   AIC:                             4877.
Df Residuals:                     439   BIC:                             4890.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const    