# Biol 359A | Regularization
### Spring 2025, Week 4
Objectives:
- Understand the Bias-Variance Tradeoff: Learn how increasing model complexity impacts bias and variance.
- Gain intuition for Regularization Techniques: Implement Ridge and LASSO to prevent overfitting.
- Implement Model Evaluation and Selection: Employ train-test splits for optimal model selection.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import ipywidgets as widgets
from ipywidgets import interact, fixed
from IPython.display import display


### Generate synthetic data

Today we will start by working with in-silico data. The code below will generate the data.

In [None]:
def generate_data(n_samples=100, n_features=10, noise_level=0.5, random_state=42, add_n_extreme=0):
    """
    Generate synthetic data with controlled noise and multicollinearity.
    
    Features 5, 6 and 7 are overwritten with noisy copies of features 0, 1
    and 2, so they are correlated with informative features while having a
    true coefficient of zero.
    
    Parameters:
    -----------
    n_samples : int
        Number of samples to generate
    n_features : int
        Number of features to generate. Must be at least 10 because the
        correlated columns and the true coefficients use indices 0-9.
    noise_level : float
        Standard deviation of the Gaussian noise
    random_state : int
        Random seed for reproducibility (seeds NumPy's global generator)
    add_n_extreme : int
        Number of trailing samples whose target is shifted upwards by a
        random amount in [0, 50 * noise_level) to act as outliers.
        0 (the default) adds no outliers.
    
    Returns:
    --------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix
    y : ndarray of shape (n_samples,)
        Target vector
    beta : ndarray of shape (n_features,)
        True coefficients
    
    Raises:
    -------
    ValueError
        If n_features is smaller than 10, or add_n_extreme is negative or
        larger than n_samples.
    """
    if n_features < 10:
        raise ValueError(
            f"n_features must be at least 10 (got {n_features}); "
            "feature indices 0-9 are used to build the dataset"
        )
    if add_n_extreme < 0 or add_n_extreme > n_samples:
        raise ValueError(
            f"add_n_extreme must be between 0 and n_samples={n_samples} (got {add_n_extreme})"
        )
    
    # Set random seed
    np.random.seed(random_state)
    
    # Generate random feature matrix
    X = np.random.randn(n_samples, n_features)
    
    # Add multicollinearity: Make some features correlated
    X[:, 5] = 0.7 * X[:, 0] + 0.3 * np.random.randn(n_samples)
    X[:, 6] = 0.8 * X[:, 1] + 0.2 * np.random.randn(n_samples)
    X[:, 7] = 0.9 * X[:, 2] + 0.1 * np.random.randn(n_samples)
    
    # Generate true coefficients with sparsity (some are zero)
    beta = np.zeros(n_features)
    beta[0] = 1.5
    beta[1] = -2.0
    beta[2] = 3.0
    beta[3] = -0.5
    beta[4] = 1.0
    # Features 5, 6, 7 have zero coefficients (but are correlated with others)
    beta[8] = -1.0
    beta[9] = 2.0
    
    # Generate target variable with noise
    y = np.dot(X, beta) + noise_level * np.random.randn(n_samples)
    
    # Turn the last add_n_extreme targets into outliers if requested.
    # The offset is drawn from a uniform distribution, so it is always >= 0.
    if add_n_extreme:
        y[-add_n_extreme:] += noise_level * (np.random.rand(add_n_extreme)) * 50
    
    return X, y, beta

In [None]:
# Build the working dataset: 100 samples, 10 features, and 10 outlier targets
n_samples, n_features, noise_level = 100, 10, 0.5
X, y, true_beta = generate_data(
    n_samples=n_samples,
    n_features=n_features,
    noise_level=noise_level,
    add_n_extreme=10,
)
# Leave the true coefficients as the last expression so the notebook displays them
true_beta

### Multiple linear regression

We want to build a linear regression model that will help us learn about the system that generated these data. To make the coefficients of our model more comparable to each other, we need to start by standardizing our data.

In [None]:
# Hold out 20% of the samples as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics on the training split only, then apply them
# to both splits so no information from the test set leaks into the model
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the overall mean and spread of the training features before and after scaling
for heading, matrix in (("Before standarization:", X_train),
                        ("After standarization:", X_train_scaled)):
    print(heading)
    print(f"mean: {np.mean(matrix): .3f}, std: {np.std(matrix):.3f}")

In [None]:
def fit_mlr(X_train, y_train, X_test, y_test):
    """
    Fit an ordinary least-squares model and score it on both splits.
    
    Returns a 5-tuple: (coefficients, train MSE, test MSE, train R², test R²).
    Only the slope coefficients are returned; the fitted intercept is not.
    """
    fitted = LinearRegression().fit(X_train, y_train)
    
    # Predict once per split and reuse the predictions for both metrics
    pred_train = fitted.predict(X_train)
    pred_test = fitted.predict(X_test)
    
    return (
        fitted.coef_,
        mean_squared_error(y_train, pred_train),
        mean_squared_error(y_test, pred_test),
        r2_score(y_train, pred_train),
        r2_score(y_test, pred_test),
    )

def plot_mlr_results(noise_level=0.5, n_samples=100, add_n_extreme=0):
    """
    Generate a dataset, fit multiple linear regression, and plot the results.
    
    The left panel compares the true and estimated coefficients; the right
    panel shows predicted vs. actual targets for the train and test splits,
    annotated with MSE and R².
    
    Parameters:
    -----------
    noise_level : float
        Standard deviation of the noise passed to generate_data
    n_samples : int
        Number of samples passed to generate_data
    add_n_extreme : int
        Number of outlier targets passed to generate_data
    """
    # Generate new data with the specified parameters
    X, y, true_beta = generate_data(n_samples=n_samples, noise_level=noise_level, add_n_extreme=add_n_extreme)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Fit MLR
    coef, train_mse, test_mse, train_r2, test_r2 = fit_mlr(X_train_scaled, y_train, X_test_scaled, y_test)
    
    # fit_mlr returns only the slopes. Rebuild the intercept of the
    # least-squares fit (mean(y) - mean(X) @ coef) so the plotted predictions
    # are the same ones the reported MSE / R² were computed from.
    intercept = np.mean(y_train) - np.mean(X_train_scaled, axis=0) @ coef
    y_train_pred = X_train_scaled @ coef + intercept
    y_test_pred = X_test_scaled @ coef + intercept
    
    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    
    # Plot 1: Coefficient values comparison (True vs Estimated)
    feature_indices = np.arange(len(true_beta))
    bar_width = 0.35
    
    ax1.bar(feature_indices - bar_width/2, true_beta, bar_width, label='True Coefficients', color='blue', alpha=0.7)
    ax1.bar(feature_indices + bar_width/2, coef, bar_width, label='Estimated Coefficients', color='red', alpha=0.7)
    
    ax1.set_xlabel('Feature Index')
    ax1.set_ylabel('Coefficient Value')
    ax1.set_title('True vs. Estimated Coefficients in MLR')
    ax1.set_xticks(feature_indices)
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # Plot 2: Predicted vs Actual values
    ax2.scatter(y_train, y_train_pred, color='k', alpha=0.7, label="Train")
    ax2.scatter(y_test, y_test_pred, color='green', alpha=0.7, label="Test")
    
    # Add a diagonal line (perfect predictions) spanning every plotted value,
    # train and test alike
    plotted_values = np.concatenate([y_train, y_test, y_train_pred, y_test_pred])
    min_val = plotted_values.min()
    max_val = plotted_values.max()
    ax2.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2)
    
    ax2.set_xlabel('Actual Values')
    ax2.set_ylabel('Predicted Values')
    ax2.set_title('Predicted vs. Actual Values in MLR')
    ax2.grid(True, alpha=0.3)
    ax2.legend()
    # Add text with performance metrics
    textstr = '\n'.join((
        f'Training MSE: {train_mse:.2f}',
        f'Test MSE: {test_mse:.2f}',
        f'Training R²: {train_r2:.2f}',
        f'Test R²: {test_r2:.2f}'))
    
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    ax2.text(0.05, 0.95, textstr, transform=ax2.transAxes, fontsize=10,
            verticalalignment='top', bbox=props)
    
    plt.tight_layout()
    plt.show()

# Interactive MLR visualization
print("Multiple Linear Regression Interactive Visualization")
print("===================================================")
print("Adjust the noise level and sample size to see how they affect MLR performance.")

# One slider per parameter of plot_mlr_results; every keyword here must match
# a parameter name of that function.
interact(plot_mlr_results,
         noise_level=widgets.FloatSlider(min=0.1, max=2.0, step=0.1, value=0.5, description='Noise Level:'),
         n_samples=widgets.IntSlider(min=50, max=500, step=50, value=100, description='Sample Size:'),
         add_n_extreme=widgets.IntSlider(min=0, max=10, step=1, value=0, description='N Outliers:'));

### Ridge regression

The interactive visualization below performs Ridge regression on these data. Use the sliders to adjust the lambda value (called `alpha` in scikit-learn), which controls the strength of the regularization, together with the noise level, sample size, and number of outliers:

- Polynomial Degree: Not adjustable in this visualization (the model here is linear in the ten features); it is explored in the Bias-Variance Tradeoff section below. In general it influences model complexity: lower degrees might not capture all the data variability (underfitting), while higher degrees might model the noise as well (overfitting).
- Lambda Value: Adjusts the Ridge regularization strength. Increasing lambda penalizes large coefficients more heavily, which helps prevent overfitting and can improve the model's generalization.

Observe how adjusting the lambda values impacts the coefficients.

In [None]:
def fit_ridge(X_train, y_train, X_test, y_test, alpha):
    """
    Fit a Ridge (L2-penalized) regression with strength `alpha` and score it.
    
    Returns a 5-tuple: (coefficients, train MSE, test MSE, train R², test R²).
    Only the slope coefficients are returned; the fitted intercept is not.
    """
    fitted = Ridge(alpha=alpha).fit(X_train, y_train)
    
    # Predict once per split and reuse the predictions for both metrics
    pred_train = fitted.predict(X_train)
    pred_test = fitted.predict(X_test)
    
    return (
        fitted.coef_,
        mean_squared_error(y_train, pred_train),
        mean_squared_error(y_test, pred_test),
        r2_score(y_train, pred_train),
        r2_score(y_test, pred_test),
    )

def plot_ridge_results(alpha=1.0, noise_level=0.5, n_samples=100, add_n_extreme=0):
    """
    Generate a dataset, fit MLR and Ridge regression, and compare them.
    
    The left panel shows true, MLR and Ridge coefficients side by side; the
    right panel shows test-set predictions of both models against the actual
    targets, annotated with test MSE and R². A short summary of the squared
    coefficient norms is printed below the figure.
    
    Parameters:
    -----------
    alpha : float
        Ridge regularization strength
    noise_level : float
        Standard deviation of the noise passed to generate_data
    n_samples : int
        Number of samples passed to generate_data
    add_n_extreme : int
        Number of outlier targets passed to generate_data
    """
    # Generate new data with the specified parameters
    X, y, true_beta = generate_data(n_samples=n_samples, noise_level=noise_level, add_n_extreme=add_n_extreme)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # The fit helpers return only the slopes. For both models the fitted
    # intercept equals mean(y) - mean(X) @ coef on the training data, so
    # rebuild it here to plot the same predictions the metrics are based on.
    y_train_mean = np.mean(y_train)
    X_train_mean = np.mean(X_train_scaled, axis=0)
    
    # Fit MLR for comparison
    mlr_coef, mlr_train_mse, mlr_test_mse, mlr_train_r2, mlr_test_r2 = fit_mlr(X_train_scaled, y_train, X_test_scaled, y_test)
    mlr_intercept = y_train_mean - X_train_mean @ mlr_coef
    mlr_y_test_pred = X_test_scaled @ mlr_coef + mlr_intercept
    # Fit Ridge
    ridge_coef, ridge_train_mse, ridge_test_mse, ridge_train_r2, ridge_test_r2 = fit_ridge(X_train_scaled, y_train, X_test_scaled, y_test, alpha)
    ridge_intercept = y_train_mean - X_train_mean @ ridge_coef
    ridge_y_test_pred = X_test_scaled @ ridge_coef + ridge_intercept

    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    
    # Plot 1: Coefficient values comparison
    feature_indices = np.arange(len(true_beta))
    bar_width = 0.25
    
    ax1.bar(feature_indices - bar_width, true_beta, bar_width, label='True Coefficients', color='blue', alpha=0.7)
    ax1.bar(feature_indices, mlr_coef, bar_width, label='MLR Coefficients', color='red', alpha=0.7)
    ax1.bar(feature_indices + bar_width, ridge_coef, bar_width, label='Ridge Coefficients', color='green', alpha=0.7)
    
    ax1.set_xlabel('Feature Index')
    ax1.set_ylabel('Coefficient Value')
    ax1.set_title(f'Coefficient Comparison (Ridge with α={alpha:.2f})')
    ax1.set_xticks(feature_indices)
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # Plot 2: Regression results - Predicted vs Actual
    ax2.scatter(y_test, mlr_y_test_pred, color='red', alpha=0.7, label='MLR Predictions')
    ax2.scatter(y_test, ridge_y_test_pred, color='green', alpha=0.7, label='Ridge Predictions')
    
    # Add a diagonal line (perfect predictions) spanning every plotted value
    plotted_values = np.concatenate([y_test, mlr_y_test_pred, ridge_y_test_pred])
    min_val = plotted_values.min()
    max_val = plotted_values.max()
    ax2.plot([min_val, max_val], [min_val, max_val], 'k--', lw=2, label='Perfect Predictions')
    
    ax2.set_xlabel('Actual Values')
    ax2.set_ylabel('Predicted Values')
    ax2.set_title('Predicted vs. Actual Values: MLR vs Ridge')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    # Add text with performance metrics
    mlr_metrics = f'MLR: MSE={mlr_test_mse:.2f}, R²={mlr_test_r2:.2f}'
    ridge_metrics = f'Ridge: MSE={ridge_test_mse:.2f}, R²={ridge_test_r2:.2f}'
    
    ax2.text(0.05, 0.95, mlr_metrics, transform=ax2.transAxes, fontsize=10,
            verticalalignment='top', color='red', 
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.7))
    
    ax2.text(0.05, 0.87, ridge_metrics, transform=ax2.transAxes, fontsize=10,
            verticalalignment='top', color='green',
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.7))
    
    plt.tight_layout()
    plt.show()
    
    # Display regularization effect summary
    print(f"Ridge Regression (α={alpha:.2f}) Summary:")
    print(f"Sum of square of MLR parameters: {np.sum(mlr_coef**2):.4f}")
    print(f"Sum of square of Ridge parameters: {np.sum(ridge_coef**2):.4f}")


# Interactive Ridge Regression visualization: print a short banner, then
# attach one slider to each parameter of plot_ridge_results
print(
    "Ridge Regression Interactive Visualization",
    "==========================================",
    "Adjust the alpha parameter to control regularization strength,",
    "and other parameters to see their effect on Ridge Regression performance.",
    sep="\n",
)

ridge_sliders = dict(
    alpha=widgets.FloatLogSlider(min=-2, max=3, value=1.0, base=10, description='Alpha (λ):'),
    noise_level=widgets.FloatSlider(min=0.1, max=2.0, step=0.1, value=0.5, description='Noise Level:'),
    n_samples=widgets.IntSlider(min=50, max=500, step=50, value=100, description='Sample Size:'),
    add_n_extreme=widgets.IntSlider(min=0, max=10, step=1, value=0, description='N Outliers:'),
)
interact(plot_ridge_results, **ridge_sliders);

### Lasso

This interactive visualization showcases Lasso regression applied to the same linear model. Adjust the lambda value (called `alpha` in scikit-learn), the noise level, the sample size, and the number of outliers using the provided sliders to explore their effects:

- Polynomial Degree: Not adjustable in this visualization; it is explored in the Bias-Variance Tradeoff section below. In general it controls the complexity of the model: lower degrees might result in underfitting, where the model is too simplistic, while higher degrees might lead to overfitting, capturing noise instead of just the underlying data pattern.
- Lambda Value: Manages the Lasso regularization strength, promoting sparsity in the model coefficients. Higher lambda values can lead to more coefficients being reduced to zero, simplifying the model and potentially improving its generalizability.

Observe how changing the lambda value affects the number of parameters in the model.

In [None]:
def fit_lasso(X_train, y_train, X_test, y_test, alpha):
    """
    Fit a Lasso (L1-penalized) regression with strength `alpha` and score it.
    
    Returns a 5-tuple: (coefficients, train MSE, test MSE, train R², test R²).
    Only the slope coefficients are returned; the fitted intercept is not.
    """
    # A generous iteration budget is set explicitly for the solver
    fitted = Lasso(alpha=alpha, max_iter=10000, tol=0.0001).fit(X_train, y_train)
    
    # Predict once per split and reuse the predictions for both metrics
    pred_train = fitted.predict(X_train)
    pred_test = fitted.predict(X_test)
    
    return (
        fitted.coef_,
        mean_squared_error(y_train, pred_train),
        mean_squared_error(y_test, pred_test),
        r2_score(y_train, pred_train),
        r2_score(y_test, pred_test),
    )

def plot_lasso_results(alpha=0.1, noise_level=0.5, n_samples=100, add_n_extreme=0):
    """
    Generate a dataset, fit MLR and Lasso regression, and compare them.
    
    The left panel shows true, MLR and Lasso coefficients side by side; the
    right panel is a pie chart of how many Lasso coefficients are zero vs.
    non-zero. A metrics table and a sparsity summary follow the figure.
    """
    # Fresh data for the requested settings, split 80/20
    X, y, true_beta = generate_data(n_samples=n_samples, noise_level=noise_level, add_n_extreme=add_n_extreme)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Standardize with statistics from the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Unregularized baseline, then the Lasso fit
    mlr_coef, *mlr_scores = fit_mlr(X_train_scaled, y_train, X_test_scaled, y_test)
    lasso_coef, *lasso_scores = fit_lasso(X_train_scaled, y_train, X_test_scaled, y_test, alpha)
    
    fig, (coef_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 6))
    
    # Left panel: grouped bars, one group per feature
    positions = np.arange(len(true_beta))
    width = 0.25
    bar_groups = (
        (-width, true_beta, 'True Coefficients', 'blue'),
        (0, mlr_coef, 'MLR Coefficients', 'red'),
        (width, lasso_coef, 'Lasso Coefficients', 'purple'),
    )
    for shift, heights, legend_label, bar_color in bar_groups:
        coef_ax.bar(positions + shift, heights, width, label=legend_label, color=bar_color, alpha=0.7)
    
    coef_ax.set_xlabel('Feature Index')
    coef_ax.set_ylabel('Coefficient Value')
    coef_ax.set_title(f'Coefficient Comparison (Lasso with α={alpha:.4f})')
    coef_ax.set_xticks(positions)
    coef_ax.legend()
    coef_ax.grid(True, alpha=0.3)
    
    # Right panel: a coefficient counts as zero when its magnitude is below 1e-10
    is_zero = np.abs(lasso_coef) < 1e-10
    is_kept = ~is_zero
    n_kept = np.sum(is_kept)
    n_zero = np.sum(is_zero)
    
    pie_ax.pie(
        [n_kept, n_zero],
        labels=[f'Non-zero Coefficients ({n_kept})', f'Zero Coefficients ({n_zero})'],
        colors=['purple', 'gray'],
        autopct='%1.1f%%',
        startangle=90,
        explode=(0.1, 0),
    )
    pie_ax.set_title('Lasso Feature Selection')
    
    plt.tight_layout()
    plt.show()
    
    # Side-by-side metrics table (rows follow the order returned by the fit helpers)
    metrics_df = pd.DataFrame(
        {'MLR': list(mlr_scores), 'Lasso': list(lasso_scores)},
        index=['Train MSE', 'Test MSE', 'Train R²', 'Test R²'],
    )
    display(metrics_df)
    
    # Sparsity summary: which features survived vs. which are truly non-zero
    kept_indices = np.where(is_kept)[0].tolist()
    true_indices = np.where(np.abs(true_beta) > 0)[0].tolist()
    print(f"Lasso Regression (α={alpha:.4f}) Sparsity Analysis:")
    print(f"Number of non-zero coefficients: {n_kept} out of {len(lasso_coef)}")
    print(f"L1 Penalty Term: {np.sum(np.abs(lasso_coef)):.4f}")
    print(f"Non-zero coefficients indices: {kept_indices}")
    print(f"True non-zero coefficients indices: {true_indices}")

# Interactive Lasso Regression visualization: print a short banner, then
# attach one slider to each parameter of plot_lasso_results
print(
    "\nLasso Regression Interactive Visualization",
    "==========================================",
    "Adjust the alpha parameter to control sparsity,",
    "and other parameters to see their effect on Lasso Regression performance.",
    sep="\n",
)

lasso_sliders = dict(
    alpha=widgets.FloatLogSlider(min=-3, max=1, value=0.1, base=10, description='Alpha (λ):'),
    noise_level=widgets.FloatSlider(min=0.1, max=2.0, step=0.1, value=0.5, description='Noise Level:'),
    n_samples=widgets.IntSlider(min=50, max=500, step=50, value=100, description='Sample Size:'),
    add_n_extreme=widgets.IntSlider(min=0, max=10, step=1, value=0, description='N Outliers:'),
)
interact(plot_lasso_results, **lasso_sliders);

### The Bias-Variance Tradeoff

- Bias: Bias is the error introduced by approximating a real-world problem, which might be highly complex, with a simpler model. It reflects how well the model captures the underlying patterns of the data. High bias can cause the model to miss relevant relations between features and target outputs (underfitting).

- Variance: Variance measures how much the model's predictions would change if it were trained on a different sample of data. A model with high variance pays a lot of attention to the training data and learns noise as well as signal, leading to models that generalize poorly to new data (overfitting).

In [None]:
def generate_data(n_samples=100, degree=3, noise_level=0.5, x_range=(-3, 3)):
    """
    Draw noisy samples from a random polynomial of the given degree.
    
    The random seed is fixed at 42, so identical arguments always produce
    identical data.
    
    Parameters:
    -----------
    n_samples : int
        Number of samples to generate
    degree : int
        True polynomial degree of the data
    noise_level : float
        Standard deviation of the Gaussian noise added to the targets
    x_range : tuple
        Range of x values (min, max)
    
    Returns:
    --------
    X : ndarray of shape (n_samples,)
        Feature values, drawn uniformly from x_range
    y : ndarray of shape (n_samples,)
        Target values with noise
    true_coef : ndarray of shape (degree + 1,)
        True coefficients, lowest power first, rescaled so the largest
        magnitude is 3
    """
    np.random.seed(42)
    
    # Sample the inputs uniformly over the requested interval
    lower, upper = x_range[0], x_range[1]
    X = np.random.uniform(lower, upper, n_samples)
    
    # Random polynomial coefficients, normalized to a maximum magnitude of 3
    raw_coef = np.random.randn(degree + 1)
    true_coef = raw_coef / np.max(np.abs(raw_coef)) * 3
    
    # Evaluate the noise-free polynomial term by term (constant term first)
    y_clean = np.zeros(n_samples)
    for power, weight in enumerate(true_coef):
        y_clean += weight * X**power
    
    # Perturb the targets with Gaussian noise
    noise = noise_level * np.random.randn(n_samples)
    return X, y_clean + noise, true_coef

#### Fit polynomial regression model with various degrees

In [None]:
# Fit one polynomial regression per degree, optionally regularized
def fit_polynomial_regression(X, y, max_degree=10, if_lasso=False, if_ridge=False):
    """
    Fit polynomial regression models of degree 1 through max_degree.
    
    Parameters:
    -----------
    X : ndarray of shape (n_samples,)
        Feature values
    y : ndarray of shape (n_samples,)
        Target values
    max_degree : int
        Maximum polynomial degree to fit
    if_lasso : bool
        Use Lasso (alpha=1.0) instead of plain least squares. Takes
        precedence over if_ridge when both are set.
    if_ridge : bool
        Use Ridge (alpha=1.0) instead of plain least squares
    
    Returns:
    --------
    models : list of (model, poly_features) tuples
        For each degree in increasing order, the fitted estimator together
        with the PolynomialFeatures transformer needed to build its inputs
    """
    def new_estimator():
        # A fresh, unfitted estimator for each degree
        if if_lasso:
            return Lasso(alpha=1.0, max_iter=10000, tol=0.0001)
        if if_ridge:
            return Ridge(alpha=1.0, max_iter=10000, tol=0.0001)
        return LinearRegression()
    
    # sklearn expects a 2-D feature matrix with a single column here
    column = X.reshape(-1, 1)
    
    models = []
    for degree in range(1, max_degree + 1):
        expander = PolynomialFeatures(degree=degree, include_bias=True)
        design_matrix = expander.fit_transform(column)
        
        estimator = new_estimator()
        estimator.fit(design_matrix, y)
        
        # Keep the transformer alongside the model so new data can be expanded identically
        models.append((estimator, expander))
    
    return models

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence scikit-learn's ConvergenceWarning from this point on so repeated
# model fits in the cells below do not flood the notebook output.
# NOTE(review): presumably aimed at the Lasso fits on high-degree polynomial
# features — confirm. Other warning categories are unaffected.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Three-panel summary of the polynomial fits: curves, error curves, error components
def visualize_polynomial_regression(X_train, y_train, X_test, y_test, models, true_degree, true_coef):
    """
    Visualize polynomial regression models of different degrees.
    
    Panel 1 overlays selected fitted curves on the data and the true
    function. Panel 2 plots train and test MSE against polynomial degree.
    Panel 3 plots a simplified error breakdown in which training MSE stands
    in for bias and the train/test gap (floored at 0.01) stands in for
    variance; it is a heuristic illustration, not a formal decomposition.
    
    Parameters:
    -----------
    X_train, y_train : ndarray
        Training data
    X_test, y_test : ndarray
        Test data
    models : list
        List of (model, poly_features) tuples, one per degree starting at 1
    true_degree : int
        True polynomial degree of the data
    true_coef : ndarray
        True coefficients used to generate data (lowest power first)
    """
    # Dense grid across the full x-range of both splits for smooth curves
    x_lo = min(X_train.min(), X_test.min())
    x_hi = max(X_train.max(), X_test.max())
    X_curve = np.linspace(x_lo, x_hi, 1000).reshape(-1, 1)
    
    # Noise-free target on the grid, accumulated term by term
    y_true_curve = np.zeros(len(X_curve))
    for power in range(true_degree + 1):
        y_true_curve += true_coef[power] * X_curve.flatten()**power
    
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    fit_ax, error_ax, parts_ax = axes.flatten()
    
    # ---- Panel 1: data, true function and a few fitted curves ----
    fit_ax.scatter(X_train, y_train, color='blue', alpha=0.5, label='Training Data')
    fit_ax.scatter(X_test, y_test, color='green', alpha=0.5, label='Test Data')
    fit_ax.plot(X_curve, y_true_curve, 'k--', linewidth=2, label='True Function')
    
    # Show only the simplest, the true and the most complex degree; fall back
    # to degree 5 in the middle when the true degree coincides with an extreme
    if true_degree in (1, 10):
        degrees_to_plot = [1, 5, 10]
    else:
        degrees_to_plot = [1, true_degree, 10]
    
    for degree, line_color in zip(degrees_to_plot, ['red', 'purple', 'orange']):
        if degree > len(models):
            continue
        model, poly_features = models[degree - 1]
        curve_pred = model.predict(poly_features.transform(X_curve))
        fit_ax.plot(X_curve, curve_pred, color=line_color, linewidth=2, label=f'Degree {degree}')
    
    fit_ax.set_xlabel('X')
    fit_ax.set_ylabel('y')
    fit_ax.set_title('Polynomial Fits of Different Degrees')
    fit_ax.legend()
    fit_ax.grid(True, alpha=0.3)
    
    # ---- Panel 2: train / test MSE for every fitted degree ----
    degrees = list(range(1, len(models) + 1))
    
    # The column-shaped inputs are the same for every degree
    train_column = X_train.reshape(-1, 1)
    test_column = X_test.reshape(-1, 1)
    
    train_errors, test_errors = [], []
    for model, poly_features in models:
        train_pred = model.predict(poly_features.transform(train_column))
        test_pred = model.predict(poly_features.transform(test_column))
        train_errors.append(mean_squared_error(y_train, train_pred))
        test_errors.append(mean_squared_error(y_test, test_pred))
    
    error_ax.plot(degrees, train_errors, 'b-', marker='o', label='Training Error')
    error_ax.plot(degrees, test_errors, 'r-', marker='s', label='Test Error')
    error_ax.axvline(x=true_degree, color='k', linestyle='-.', alpha=0.7, label=f'True Degree = {true_degree}')
    
    # Mark the degree that generalizes best
    best_degree = degrees[np.argmin(test_errors)]
    error_ax.axvline(x=best_degree, color='g', linestyle='--', alpha=0.7, label=f'Best Degree = {best_degree}')
    
    error_ax.set_xlabel('Polynomial Degree')
    error_ax.set_ylabel('Mean Squared Error')
    error_ax.set_title('Error vs. Model Complexity')
    error_ax.legend()
    error_ax.grid(True, alpha=0.3)
    
    # ---- Panel 3: simplified bias / variance picture ----
    # Training error is used as the bias proxy
    biases = np.array(train_errors)
    
    # The generalization gap is used as the variance proxy, floored at 0.01
    gaps = [test_err - train_err for train_err, test_err in zip(train_errors, test_errors)]
    variances = np.array([max(0.01, gap) for gap in gaps])
    
    total_errors = biases + variances
    
    parts_ax.plot(degrees, biases, 'b-', marker='o', label='Bias')
    parts_ax.plot(degrees, variances, 'r-', marker='s', label='Variance')
    parts_ax.plot(degrees, total_errors, 'g-', marker='^', label='Total Error')
    parts_ax.axvline(x=true_degree, color='k', linestyle='--', alpha=0.7, label=f'True Degree = {true_degree}')
    
    parts_ax.set_xlabel('Polynomial Degree')
    parts_ax.set_ylabel('Error Component')
    parts_ax.set_title('Bias-Variance Decomposition')
    parts_ax.legend()
    parts_ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

# Build the widget panel for the polynomial bias-variance demo
def interactive_polynomial_regression():
    """
    Display sliders and a radio selector that regenerate the data, refit
    polynomial models of degree 1-10, and redraw the three-panel figure.
    """
    degree_slider = widgets.IntSlider(min=1, max=9, step=1, value=3, description='True Degree:')
    noise_slider = widgets.FloatSlider(min=0.1, max=20.0, step=0.1, value=0.5, description='Noise Level:')
    size_slider = widgets.IntSlider(min=20, max=200, step=10, value=100, description='Sample Size:')
    penalty_picker = widgets.RadioButtons(
        options=['None', 'Lasso', 'Ridge'],
        value='None',
        description='Regularization:',
        disabled=False
    )

    @interact(true_degree=degree_slider,
              noise_level=noise_slider,
              n_samples=size_slider,
              regularization=penalty_picker)
    def update(true_degree=3, noise_level=0.5, n_samples=100, regularization='None'):
        # Fresh polynomial data for the current widget settings
        X, y, true_coef = generate_data(n_samples=n_samples, degree=true_degree, noise_level=noise_level)
        
        # 70/30 train/test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        
        # Fit one model per degree up to 10 with the selected penalty
        models = fit_polynomial_regression(
            X_train,
            y_train,
            max_degree=10,
            if_lasso=(regularization == 'Lasso'),
            if_ridge=(regularization == 'Ridge'),
        )
        
        # Redraw the figure
        visualize_polynomial_regression(X_train, y_train, X_test, y_test, models, true_degree, true_coef)

# Explain the controls, then launch the interactive demo
print(
    "True Degree: The actual polynomial degree of the data-generating process",
    "Noise Level: The amount of random noise added to the data",
    "Sample Size: The number of data points generated",
    sep="\n",
)

interactive_polynomial_regression()