## AccelerateAI Data Science Global Bootcamp

### Model Selection Methods

We will look at the following methods to select the value of alpha for LASSO regression:
 - AIC (Akaike Information Criterion)
 - BIC (Bayesian Information Criterion)
 - Cross-validation

In [1]:
# Required libraries for model selection
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoLarsIC             #Lasso model fit using BIC or AIC for model selection.
from sklearn.pipeline import make_pipeline

In [2]:
from sklearn.datasets import load_diabetes

# Fetch the diabetes regression dataset as pandas objects:
# X receives the feature columns (DataFrame) and y the target (Series).
X, y = load_diabetes(as_frame=True, return_X_y=True)
# Peek at the first rows of the feature matrix
X.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [3]:
# Preview the first five target values
y.head(n=5)

0    151.0
1     75.0
2    141.0
3    206.0
4    135.0
Name: target, dtype: float64

In [4]:
# Add a few random (standard-normal) features alongside the real ones.
# The generator is seeded so the noise columns are reproducible.
rng = np.random.RandomState(2021)
n_random_features = 10
X_random = pd.DataFrame(
    rng.randn(X.shape[0], n_random_features),
    columns=[f"r{i}" for i in range(n_random_features)],
)
# Drop any noise columns left over from a previous run of this cell before
# concatenating, so re-running the cell does not duplicate r0..r9 in X.
X = pd.concat(
    [X.drop(columns=X_random.columns, errors="ignore"), X_random],
    axis=1,
)
# Preview the first rows of the augmented feature matrix (all columns)
X.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,r0,r1,r2,r3,r4,r5,r6,r7,r8,r9
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,1.488609,0.676011,-0.418451,-0.806521,0.555876,-0.705504,1.130858,0.645002,0.106414,0.422155
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,0.124207,-0.837953,0.409016,0.102751,-1.907722,1.100224,-1.402325,-0.225081,-1.336206,0.303722
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,-0.720159,2.544915,1.317291,0.07263,-0.256108,0.13801,1.147236,1.376261,-0.472184,0.524085
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,1.485108,1.482432,0.728131,-0.389648,0.278894,0.0519,-1.044744,-0.161508,-2.793531,0.361648
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,0.240108,0.477812,0.208852,0.917912,-1.411778,1.224236,-0.545658,0.906748,-0.982617,-0.633162


In [5]:
# Standardize the features, then fit a LARS-based Lasso whose alpha is chosen
# by minimizing the AIC.
# NOTE(review): `normalize` has been deprecated and later removed in newer
# scikit-learn releases; if this raises a TypeError, drop the argument
# (the StandardScaler step already standardizes the inputs) -- confirm
# against the installed version.
aic_step = LassoLarsIC(criterion="aic", normalize=False)
lasso_ic = make_pipeline(StandardScaler(), aic_step).fit(X, y)

In [6]:
# Collect the AIC value computed for every alpha visited during the fit,
# indexed by alpha, and remember the alpha that the criterion selected.
fitted_lars = lasso_ic[-1]
results = pd.DataFrame(
    {"AIC criterion": fitted_lars.criterion_},
    index=pd.Index(fitted_lars.alphas_, name="alphas"),
)
alpha_aic = fitted_lars.alpha_

In [7]:
# Pair every feature name with its coefficient from the AIC-selected model
[(feature, weight) for feature, weight in zip(X.columns, lasso_ic[-1].coef_)]

[('age', 0.0),
 ('sex', -9.758922885057379),
 ('bmi', 25.2951042901323),
 ('bp', 13.996381369711937),
 ('s1', -4.507930670370241),
 ('s2', 0.0),
 ('s3', -10.281899442391454),
 ('s4', 0.0),
 ('s5', 24.33241009488148),
 ('s6', 3.5260874930876103),
 ('r0', -2.5724988331272103),
 ('r1', -3.751921833009503),
 ('r2', 0.0),
 ('r3', -0.7530297805111872),
 ('r4', 0.0),
 ('r5', -2.228655436183959),
 ('r6', -3.1531438055208008),
 ('r7', 2.1293949964865106),
 ('r8', -1.0203880701439172),
 ('r9', -2.120271508629494)]

In [8]:
# Perform the same analysis using the BIC criterion.
lasso_bic = lasso_ic.set_params(lassolarsic__criterion="bic").fit(X, y)

# Append the results in the same dataframe
results["BIC criterion"] = lasso_ic[-1].criterion_
alpha_bic = lasso_ic[-1].alpha_

In [9]:
# Model coefficients with BIC 
list(zip(X.columns, lasso_ic[-1].coef_))

[('age', 0.0),
 ('sex', -3.431182149852974),
 ('bmi', 24.312306753262586),
 ('bp', 11.061904669850477),
 ('s1', 0.0),
 ('s2', 0.0),
 ('s3', -7.974191414317996),
 ('s4', 0.0),
 ('s5', 21.416644296316978),
 ('s6', 0.0),
 ('r0', 0.0),
 ('r1', 0.0),
 ('r2', 0.0),
 ('r3', 0.0),
 ('r4', 0.0),
 ('r5', 0.0),
 ('r6', 0.0),
 ('r7', 0.0),
 ('r8', 0.0),
 ('r9', 0.0)]

In [10]:
# Show AIC and BIC side by side for every alpha; the smallest value in each
# column is highlighted and all numbers are rendered with two decimals.
(
    results.style
    .highlight_min()
    .format("{:.2f}")
)

Unnamed: 0_level_0,AIC criterion,BIC criterion
alphas,Unnamed: 1_level_1,Unnamed: 2_level_1
45.16003,5237.11,5237.11
42.300448,5201.23,5205.32
21.542302,4925.62,4933.8
15.03411,4868.26,4880.54
6.189693,4814.94,4831.31
4.295932,4803.12,4823.57
4.2318,4804.64,4829.19
3.649706,4802.41,4831.05
3.565477,4803.75,4836.48
3.262927,4803.12,4839.94


In [11]:
# See the pandas Styler user guide for more styling options: https://pandas.pydata.org/docs/user_guide/style.html

In [12]:
from sklearn.linear_model import LassoCV

# Standardize the features, then let LassoCV choose alpha using 20
# cross-validation folds.
cv_step = LassoCV(cv=20)
lasso_cv = make_pipeline(StandardScaler(), cv_step).fit(X, y)

# Pair every feature name with its coefficient from the CV-selected model
[(feature, weight) for feature, weight in zip(X.columns, lasso_cv[-1].coef_)]

[('age', -0.0),
 ('sex', -9.536996678241735),
 ('bmi', 25.256439950451337),
 ('bp', 13.889188432092507),
 ('s1', -4.284857841530376),
 ('s2', -0.0),
 ('s3', -10.233736588336283),
 ('s4', 0.0),
 ('s5', 24.17423864547707),
 ('s6', 3.379955481287983),
 ('r0', -2.4587873275084653),
 ('r1', -3.622093300367794),
 ('r2', 0.0),
 ('r3', -0.6296500863571339),
 ('r4', -0.0),
 ('r5', -2.103889811312806),
 ('r6', -3.0112058597713647),
 ('r7', 1.987414914073564),
 ('r8', -0.9183424183510114),
 ('r9', -1.982232287027096)]

In [13]:
# Report the alpha that cross-validation selected
cv_model = lasso_cv[-1]
print(f'Best alpha from CV: {cv_model.alpha_}')

Best alpha from CV: 1.0432552660691845


***