# Regularization and Cross Validation in practice

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# so any deprecation notice raised by the libraries used below will not be displayed.
warnings.filterwarnings('ignore')

In [2]:
# Loading the Boston Housing dataset (the call below is load_boston, not the
# California Housing loader).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell requires an older scikit-learn release.
dataset = load_boston()

# Features get their column names from the dataset bunch; the target is wrapped
# in a single-column DataFrame (its column label defaults to 0).
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(data=dataset.target)

In [3]:
# Preview the first five feature rows
X.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
# Preview the first five target values
y.head(n=5)

Unnamed: 0,0
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [5]:
# Feature-matrix shape on the first line, target shape on the second
print(X.shape, y.shape, sep="\n")

(506, 13)
(506, 1)


# Performing Linear Regression without Regularization/Cross validation

Since this notebook only aims to show Regularization and Cross Validation in action, I am skipping preprocessing and feature engineering and jumping straight into the model-fitting part.

In [6]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Report the dimensions of all four pieces of the split, one per line
print(
    f"X Training data shape: {X_train.shape}",
    f"X Test data shape: {X_test.shape}",
    f"y Training data shape: {y_train.shape}",
    f"y Test data shape: {y_test.shape}",
    sep="\n",
)

X Training data shape: (404, 13)
X Test data shape: (102, 13)
y Training data shape: (404, 1)
y Test data shape: (102, 1)


In [8]:
# Fit an unregularized linear regression on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
lr = LinearRegression().fit(X_train, y_train)

# Predictions on both splits, used by the error comparison further down
y_pred_test = lr.predict(X_test)
y_pred_train = lr.predict(X_train)

In [9]:
# R2 of the plain linear model on the held-out test split, rounded to 2 decimals
round(lr.score(X_test, y_test), ndigits=2)

0.67

In [10]:
# Mean squared error of the plain linear model on the training and test splits
print(
    f"Training error: {round(mean_squared_error(y_train, y_pred_train),2)}",
    f"Test error: {round(mean_squared_error(y_test, y_pred_test),2)}",
    sep="\n",
)

Training error: 21.64
Test error: 24.29


# L1 Regularization (Lasso)

In [11]:
# Fit an L1-penalized linear model (alpha = 0.01) on the training split
lasso = Lasso(alpha=0.01, random_state=42).fit(X_train, y_train)

# Overwrite the prediction variables with the Lasso model's predictions
y_pred_test = lasso.predict(X_test)
y_pred_train = lasso.predict(X_train)

In [12]:
# Pair every feature name with its fitted Lasso coefficient and show them as a table
pd.DataFrame(list(zip(lasso.feature_names_in_, lasso.coef_)))

Unnamed: 0,0,1
0,CRIM,-0.111312
1,ZN,0.030961
2,INDUS,0.026062
3,CHAS,2.570765
4,NOX,-13.839232
5,RM,4.438317
6,AGE,-0.008495
7,DIS,-1.396032
8,RAD,0.256384
9,TAX,-0.010996


In [13]:
# R2 of the Lasso model on the held-out test split, rounded to 2 decimals
round(lasso.score(X_test, y_test), ndigits=2)

0.67

In [14]:
# Mean squared error of the Lasso model on the training and test splits
print(
    f"Training error: {round(mean_squared_error(y_train, y_pred_train),2)}",
    f"Test error: {round(mean_squared_error(y_test, y_pred_test),2)}",
    sep="\n",
)

Training error: 21.68
Test error: 24.3


# L2 Regularization (Ridge)

In [15]:
# Fit an L2-penalized linear model (alpha = 0.01) on the training split
ridge = Ridge(alpha=0.01, random_state=42).fit(X_train, y_train)

# Overwrite the prediction variables with the Ridge model's predictions
y_pred_test = ridge.predict(X_test)
y_pred_train = ridge.predict(X_train)

In [16]:
# R2 of the Ridge model on the held-out test split, rounded to 2 decimals
round(ridge.score(X_test, y_test), ndigits=2)

0.67

In [17]:
# Mean squared error (MSE, no square root is taken) of the Ridge model on the
# training and test splits
print(
    f"Training error: {round(mean_squared_error(y_train, y_pred_train),2)}",
    f"Test error: {round(mean_squared_error(y_test, y_pred_test),2)}",
    sep="\n",
)

Training error: 21.64
Test error: 24.29


# K-Fold Cross validation

In [18]:
# Performing K-fold cross validation where k = 5
# Only the training split is cross-validated; X_test / y_test are not touched here.

kf = KFold(n_splits=5)

# Collects one validation MSE per fold
mses = []

for train_index, val_index in kf.split(X_train):
    # kf.split yields positional row indices, hence .iloc rather than .loc
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # NOTE(review): `lr` is rebound on every iteration, replacing the model fit earlier
    # on the full training split. After the loop it holds the model from the last
    # split only (fit on 4 of the 5 folds).
    lr = LinearRegression()
    lr.fit(X_train_fold, y_train_fold)

    y_val_pred = lr.predict(X_val_fold)
    
    # Error on the fold that was held out from fitting in this iteration
    mse = mean_squared_error(y_val_fold, y_val_pred)
    mses.append(mse)
    
# Alternate way of doing it (note: this scoring returns the negated MSE per fold)
# model = LinearRegression()
# cv_results = cross_val_score(model, X_train, y_train, cv=kf, scoring="neg_mean_squared_error")

In [19]:
# Average of the per-fold validation MSEs collected during cross validation
mean_mse = np.asarray(mses).mean()

In [20]:
# Getting the R2 value
# NOTE(review): `lr` at this point is the model left over from the last K-fold
# iteration (fit on 4 of the 5 folds), not a model refit on the full training split.

round(lr.score(X_test, y_test),2)

0.66

In [21]:
# Test-set predictions from the `lr` left by the last K-fold iteration
y_pred_test = lr.predict(X_test)

In [22]:
# Comparing the cross-validated error with the error on the held-out test set.
# mean_mse is the average MSE over the validation folds: each fold's score is
# computed on rows that fold's model was NOT fit on, so it is a validation
# estimate rather than a training error, and the label says so.

print(f"Cross-validation error (mean validation MSE): {round(mean_mse, 2)}")
print(f"Test error: {round(mean_squared_error(y_test, y_pred_test),2)}")

Training error: 23.65
Test error: 24.71
