In [1]:
! pip install ISLP

Collecting ISLP
  Downloading ISLP-0.3.21-py3-none-any.whl (3.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/3.6 MB[0m [31m16.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.6/3.6 MB[0m [31m56.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Collecting lifelines (from ISLP)
  Downloading lifelines-0.27.8-py3-none-any.whl (350 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m350.7/350.7 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pygam (from ISLP)
  Downloading pygam-0.9.0-py3-none-any.whl (522 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m522.2/522.2 kB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
Collecting p

# Lasso Regression

Lasso regression is a type of linear regression that uses a penalty for the absolute value of coefficients. This can lead to some coefficients being exactly zero, effectively performing feature selection.

Boston housing data dictionary [here](https://islp.readthedocs.io/en/latest/datasets/Boston.html).

Let's predict:

`medv`: median value of owner-occupied homes in $1000s.

In [2]:
from ISLP import load_data
# Load the Boston housing table that ships with ISLP.
boston = load_data("Boston")

# Report (rows, columns), then preview the first five records.
print(boston.shape)
boston.head(5)

(506, 13)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2


In [3]:
from sklearn.model_selection import train_test_split

# Predictors are every column except the response, `medv`.
X = boston.drop(columns="medv")
y = boston["medv"]

# Hold out 5% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, shuffle=True, random_state=597
)


In scikit-learn, the `alpha` argument is the penalty strength, i.e. the $\lambda$ in the Lasso objective.

In [4]:
from sklearn.linear_model import Lasso

# Fit an L1-penalised linear model on the training split; alpha is the
# penalty strength.
# NOTE(review): the predictors are passed in on their raw scales (no
# standardisation step is visible before this cell). The penalty acts on
# coefficient size, so it is sensitive to feature units — consider scaling.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

Let's predict.

In [5]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out rows with the fitted Lasso model.
y_pred = lasso.predict(X_test)

# Test-set mean squared error, reported to two decimals.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 14.43


Let's look at our coefficients.

In [6]:
# Fitted Lasso coefficients, one per predictor column of X_train (same order).
coef = lasso.coef_
print(f"Coefficients: {coef}")

Coefficients: [-0.11616889  0.0478916  -0.04469374  1.10610333 -0.          3.70203519
 -0.00904053 -1.14027149  0.26440401 -0.01490317 -0.7482806  -0.59459121]


In [10]:
# Label each Lasso coefficient with its feature name in a one-row table.
# The `nox` entry is exactly zero: the L1 penalty dropped that feature.
# nox: nitrogen oxides concentration (parts per 10 million).

from pandas import DataFrame
DataFrame(lasso.coef_.reshape(1, -1), columns=X_train.columns.tolist())

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,lstat
0,-0.116169,0.047892,-0.044694,1.106103,-0.0,3.702035,-0.009041,-1.140271,0.264404,-0.014903,-0.748281,-0.594591


# Comparing Lasso and Ridge

## Recall Ridge Regression

Ridge regression is another type of linear regression that introduces a penalty on the magnitude of the coefficients. Unlike Lasso, Ridge does not set coefficients exactly to zero; it only shrinks them towards zero. This makes Ridge regression useful when every feature is expected to carry some signal.

In [11]:
from sklearn.linear_model import Ridge

# Fit Ridge with the same penalty strength used for Lasso.
ridge = Ridge(alpha=0.1).fit(X_train, y_train)

# Evaluate on the same held-out rows.
y_pred_ridge = ridge.predict(X_test)
mse_ridge = mean_squared_error(y_true=y_test, y_pred=y_pred_ridge)
print(f"Ridge Mean Squared Error: {mse_ridge:.2f}")

Ridge Mean Squared Error: 16.64


In [12]:
# Fitted Ridge coefficients, one per predictor column of X_train (same order).
coef_ridge = ridge.coef_
print(f"Ridge Coefficients: {coef_ridge}")

Ridge Coefficients: [-1.25597782e-01  4.55057667e-02  1.36699602e-02  3.23403563e+00
 -1.85321061e+01  3.78228772e+00  2.53145698e-03 -1.46710518e+00
  3.00583256e-01 -1.27980104e-02 -9.41711457e-01 -5.41764832e-01]


In [15]:
# Side-by-side table of the Lasso and Ridge coefficients, one row per feature.
comparison_columns = {
    'Features': X_train.columns,
    'Lasso Coefficients': lasso.coef_,
    'Ridge Coefficients': ridge.coef_,
}
coef_comparison = DataFrame(comparison_columns)

coef_comparison


Unnamed: 0,Features,Lasso Coefficients,Ridge Coefficients
0,crim,-0.116169,-0.125598
1,zn,0.047892,0.045506
2,indus,-0.044694,0.01367
3,chas,1.106103,3.234036
4,nox,-0.0,-18.532106
5,rm,3.702035,3.782288
6,age,-0.009041,0.002531
7,dis,-1.140271,-1.467105
8,rad,0.264404,0.300583
9,tax,-0.014903,-0.012798


# Elastic Net Regression

Elastic Net regression is a type of linear regression that combines the penalties from both Lasso and Ridge regression. It balances the mix of the two penalties using a parameter $p$ (called `l1_ratio` in scikit-learn). When $p = 1$, Elastic Net is equivalent to Lasso, and when $p = 0$, it's equivalent to Ridge.

In [16]:
from sklearn.linear_model import ElasticNet

# l1_ratio=0.5 is an example setting for the Lasso/Ridge mix.
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X_train, y_train)

# Evaluate on the same held-out rows as the other two models.
y_pred_enet = elastic_net.predict(X_test)
mse_enet = mean_squared_error(y_true=y_test, y_pred=y_pred_enet)
print(f"Elastic Net Mean Squared Error: {mse_enet:.2f}")

Elastic Net Mean Squared Error: 13.87


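Elastic Net has the lowest test MSE here (13.87, versus 14.43 for Lasso and 16.64 for Ridge). Keep in mind that the test set is only 5% of the data, so these differences are noisy. Let's compare all the coefficients.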

In [17]:
# Fitted Elastic Net coefficients, one per predictor column of X_train (same order).
coef_enet = elastic_net.coef_
print(f"Elastic Net Coefficients: {coef_enet}")

Elastic Net Coefficients: [-0.11767945  0.04995411 -0.05197908  1.0730823  -0.1719499   3.25700645
 -0.0061344  -1.16138504  0.27907452 -0.01530809 -0.77528101 -0.62309709]


In [18]:
# Extend the Lasso/Ridge table with the Elastic Net estimates so all three
# models can be read side by side.
coef_comparison["Elastic Net Coefficients"] = elastic_net.coef_
coef_comparison

Unnamed: 0,Features,Lasso Coefficients,Ridge Coefficients,Elastic Net Coefficients
0,crim,-0.116169,-0.125598,-0.117679
1,zn,0.047892,0.045506,0.049954
2,indus,-0.044694,0.01367,-0.051979
3,chas,1.106103,3.234036,1.073082
4,nox,-0.0,-18.532106,-0.17195
5,rm,3.702035,3.782288,3.257006
6,age,-0.009041,0.002531,-0.006134
7,dis,-1.140271,-1.467105,-1.161385
8,rad,0.264404,0.300583,0.279075
9,tax,-0.014903,-0.012798,-0.015308


# Grid Search

Alright, we've shown that ElasticNet can perform better than Lasso or Ridge alone. But how do we choose the proper hyperparameters for the model?

# Grid Search for Optimal Parameters

[Grid Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) is a method to perform hyper-parameter tuning in a structured manner. It exhaustively tries every combination of the hyperparameters provided in a grid.

The primary advantage of using Grid Search over just k-fold cross-validation is that **Grid Search systematically works through every combination of hyperparameter values, cross-validating each one to determine which combination gives the best performance.**

In [25]:
import warnings
from numpy import linspace
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning

# Hide solver convergence warnings (this filter stays active for later cells).
warnings.simplefilter("ignore", category=ConvergenceWarning)

# Candidate values: 6 penalty strengths x 10 evenly spaced L1/L2 mixes.
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1, 10, 100],
    l1_ratio=linspace(0, 1, num=10),
)

enet_model = ElasticNet(max_iter=1000)

# Exhaustive 5-fold search; scores are negated MSE, so larger is better.
grid_search = GridSearchCV(
    estimator=enet_model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)


Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best parameters found:  {'alpha': 0.001, 'l1_ratio': 0.3333333333333333}


## A note on why we use "negative" scores in CV in scikit-learn

Using `neg_mean_squared_error` in CV is a bit counterintuitive at first glance, but there's a reason for its use in scikit-learn's cross-validation methods.

- **Maximization**: Scikit-learn's cross-validation methods are set up to maximize the score, so loss functions like Mean Squared Error (MSE), which are better when they're lower, need to be negated so they can be maximized. In other words, a negated MSE closer to zero (e.g. -10) is better than a more negative one (e.g. -20).

- **Uniformity in Evaluation**: By always framing performance metrics as something to be maximized, it simplifies the process of comparing models. Whether you're dealing with a metric that's traditionally seen as a loss (like MSE) or as a score (like accuracy), you're always aiming for higher values.

- **Easier to Understand in Grid Search or Random Search**: When you're using methods like GridSearchCV or RandomizedSearchCV, the `best_score_` attribute will always be the highest value, making it more straightforward to understand: the highest score is the best, irrespective of whether you're dealing with a loss or a benefit metric.

# Expensive GridSearch

GridSearchCV can be expensive since it fits a model for every hyperparameter combination on every fold. To count the combinations, multiply the lengths of the hyperparameter lists together: the grid above has 6 values of `alpha` × 10 values of `l1_ratio` = 60 combinations, and with 5-fold cross-validation that is 300 fits.


# Randomized Grid Search for Optimal Parameters

Unlike Grid Search which tries out every single combination of hyperparameters, Randomized Grid Search selects random combinations to try, which can be faster and more efficient.

In [26]:
from sklearn.model_selection import RandomizedSearchCV

# Same estimator construction as the exhaustive grid-search cell, so the two
# searches are directly comparable.
enet_model = ElasticNet(max_iter=1000)

# Evaluate 30 randomly drawn combinations from `param_grid` with 5-fold CV.
# `random_state` pins which candidates are drawn, so re-running the cell
# evaluates the same subset and prints the same result.
random_search = RandomizedSearchCV(enet_model,
                                   param_distributions=param_grid,
                                   n_iter=30,
                                   scoring='neg_mean_squared_error',
                                   cv=5,
                                   random_state=597,
                                   verbose=1)
random_search.fit(X_train, y_train)

# Best parameters from Randomized Search
print("Best parameters found (Randomized Search): ", random_search.best_params_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best parameters found (Randomized Search):  {'l1_ratio': 0.3333333333333333, 'alpha': 0.001}
