# Ridge and Lasso Regression

Regularized linear models deliberately fit the training data a little less closely than ordinary least squares regression: they accept a small increase in bias in exchange for lower variance, which usually gives better generalization to new data. This is particularly useful when dealing with multicollinearity or when the number of predictors exceeds the number of observations.

- **Ridge (L2 Regularization)**: Adds a penalty proportional to the sum of the squared coefficients. Shrinks coefficients towards zero but rarely makes them exactly zero.
- **Lasso (L1 Regularization)**: Adds a penalty proportional to the sum of the absolute values of the coefficients. Can shrink coefficients to exactly zero, effectively performing feature selection.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

### 1️⃣ Load Data & Split

In [None]:
# Read the raw table and separate the regression target from the features.
df = pd.read_csv("data/dataset.csv")
target_column = 'median_house_value'
X, y = df.drop(columns=[target_column]), df[target_column]

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training set:", X_train.shape, "Testing set:", X_test.shape)

### 2️⃣ Preprocessing Pipeline

In [None]:
# Numeric branch: fill gaps with the column median, then standardize.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'missing' category, then
# one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route columns to a branch by dtype. Columns whose dtype matches neither
# selection appear in neither list.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols),
])

In [None]:
def regression_metrics(y_true, y_pred, model_name):
    """Print MSE, RMSE, MAE and R² for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label shown in the report header.

    Returns:
        None. The report is written to standard output only.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    report = (
        ("MSE", squared_error),
        # RMSE is derived from the MSE above rather than recomputed.
        ("RMSE", np.sqrt(squared_error)),
        ("MAE", mean_absolute_error(y_true, y_pred)),
        ("R²", r2_score(y_true, y_pred)),
    )

    print(f"--- {model_name} ---")
    for label, value in report:
        print(f"{label}: {value:.4f}")
    # Prints a newline plus print's own terminator, leaving a visual gap
    # between consecutive reports.
    print("\n")

### 3️⃣ Ridge Regression with Cross-Validation (RidgeCV)
By default (`cv=None`), `RidgeCV` uses an efficient form of leave-one-out cross-validation to pick the best `alpha` from the candidate values it is given.

In [None]:
# Candidate regularization strengths, spaced one decade apart.
alphas = [0.1, 1.0, 10.0, 100.0]

# Preprocessing and the cross-validated Ridge model live in one pipeline, so
# imputation/scaling statistics are learned from the training rows only.
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RidgeCV(alphas=alphas, scoring='neg_mean_squared_error')),
])
ridge_pipeline.fit(X_train, y_train)

# Regularization strength selected by cross-validation.
best_alpha_ridge = ridge_pipeline['regressor'].alpha_
print(f"Best Alpha for Ridge: {best_alpha_ridge}")

# Score the tuned model on the held-out test split.
y_pred_ridge = ridge_pipeline.predict(X_test)
regression_metrics(y_test, y_pred_ridge, "Ridge Regression (Tuned)")

### 4️⃣ Lasso Regression with Cross-Validation (LassoCV)
`LassoCV` fits the model over a range of candidate `alpha` values and keeps the one with the lowest cross-validated mean squared error.

In [None]:
# Same preprocessing as the Ridge model, followed by a Lasso whose alpha is
# chosen with 5-fold cross-validation.
lasso_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LassoCV(cv=5, random_state=42)),
])
lasso_pipeline.fit(X_train, y_train)

# Regularization strength selected by cross-validation.
best_alpha_lasso = lasso_pipeline['regressor'].alpha_
print(f"Best Alpha for Lasso: {best_alpha_lasso}")

# Score the tuned model on the held-out test split.
y_pred_lasso = lasso_pipeline.predict(X_test)
regression_metrics(y_test, y_pred_lasso, "Lasso Regression (Tuned)")

### 5️⃣ Visualization: Actual vs Predicted (Ridge)

In [None]:
# Scatter of held-out targets against the Ridge predictions. Points on the
# dashed diagonal are predicted exactly; vertical distance from it is the error.
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred_ridge, alpha=0.6, label='Ridge Predictions')
# The identity line is labelled so that plt.legend() explains it as well;
# without a label it is drawn but left out of the legend.
plt.plot(
    [y_test.min(), y_test.max()],
    [y_test.min(), y_test.max()],
    'r--',
    lw=2,
    label='Perfect Prediction',
)
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.title(f"Ridge Regression (Alpha={best_alpha_ridge})")
plt.legend()
plt.show()