# Biol 359A | Parameter Estimation and Regularization
### Spring 2025, Week 6
Objectives:
- gain intuition for parameter estimation strategy
- gain intuition for cost function landscapes
- contextualize MLR parameters (coefficients)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from ipywidgets import interact, widgets
import pandas as pd
import seaborn as sns

### Generate synthetic data
Today we will start by working with in silico (simulated) data. The code below generates it: noisy samples drawn from a polynomial with randomly chosen coefficients.

In [None]:
def generate_data(n_samples=100, degree=3, noise_level=0.5, x_range=(-3, 3)):
    """
    Generate synthetic data with polynomial relationship and controlled noise.

    All random draws come from a private generator seeded with 42, so the
    same arguments always produce the same data and the global NumPy random
    state is left untouched.

    Parameters:
    -----------
    n_samples : int
        Number of samples to generate
    degree : int
        True polynomial degree of the data
    noise_level : float
        Standard deviation of the Gaussian noise
    x_range : tuple
        Range of x values (min, max)

    Returns:
    --------
    X : ndarray of shape (n_samples, 1)
        Feature values, as a single-column 2-D array
    y : ndarray of shape (n_samples,)
        Target values with noise
    true_coef : ndarray of shape (degree + 1,)
        True coefficients used to generate data, lowest order first
        (true_coef[i] multiplies x**i), rescaled so the largest magnitude is 3
    """
    # A local RandomState seeded with 42 yields the same stream as
    # np.random.seed(42) followed by the module-level functions, but does not
    # reset the global generator used elsewhere.
    rng = np.random.RandomState(42)

    # Draw order matters for reproducibility: x values, coefficients, noise.
    X = rng.uniform(x_range[0], x_range[1], n_samples)

    # Generate random coefficients for polynomial
    true_coef = rng.randn(degree + 1)
    true_coef = true_coef / np.max(np.abs(true_coef)) * 3  # Scale coefficients

    # Generate y values based on polynomial relationship
    y_true = np.zeros(n_samples)
    for power, coef in enumerate(true_coef):
        y_true += coef * X**power

    # Add noise
    y = y_true + noise_level * rng.randn(n_samples)

    # Column vector, the 2-D layout expected by the feature expansion step
    X = X.reshape(-1, 1)
    return X, y, true_coef

In [None]:
def get_model(model_type='None', alpha=1.0, l1_ratio=0.5):
    """
    Return a new, unfitted regressor for the requested regularization.

    'Lasso', 'Ridge' and 'Elastic' select the matching penalized model; any
    other value (including the default 'None') gives plain LinearRegression.
    `alpha` is passed to the penalized models and `l1_ratio` only to
    ElasticNet.
    """
    # Builders are lazy so only the selected estimator is ever constructed.
    builders = (
        ('Lasso', lambda: Lasso(alpha=alpha, max_iter=10000)),
        ('Ridge', lambda: Ridge(alpha=alpha)),
        ('Elastic', lambda: ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=10000)),
    )
    for name, build in builders:
        if model_type == name:
            return build()
    return LinearRegression()

def create_polynomial_features(X, degree):
    """
    Expand X into polynomial terms up to `degree`.

    The expansion is requested with include_bias=True, so the result also
    carries the constant (bias) feature alongside the powers of X.
    """
    expander = PolynomialFeatures(degree=degree, include_bias=True)
    expander.fit(X)
    return expander.transform(X)

In [None]:
def perform_cross_validation(X, y, test_degree, n_folds=5, model_type='None', alpha=1.0, l1_ratio=0.5):
    """
    Run shuffled k-fold cross-validation of a polynomial regression model.

    Parameters:
    -----------
    X : ndarray
        Input features
    y : ndarray
        Target values
    test_degree : int
        Degree of the polynomial expansion applied to X
    n_folds : int
        Number of cross-validation folds
    model_type : str
        Regularization choice forwarded to get_model
    alpha : float
        Regularization strength
    l1_ratio : float
        Mixing parameter for ElasticNet

    Returns:
    --------
    cv_results : dict
        'fold_train_losses' / 'fold_val_losses' : per-fold MSE lists
        'fold_coefs' / 'fold_intercepts' : per-fold fitted parameters
        'avg_train_loss' / 'avg_val_loss' : means of the per-fold losses
    """
    design = create_polynomial_features(X, test_degree)

    # Fixed random_state keeps the fold assignment identical between calls.
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    train_mse, val_mse = [], []
    coefs, intercepts = [], []

    for fit_rows, held_rows in splitter.split(design):
        X_fit, y_fit = design[fit_rows], y[fit_rows]
        X_held, y_held = design[held_rows], y[held_rows]

        # A fresh estimator per fold so no state carries over between folds.
        estimator = get_model(model_type, alpha, l1_ratio)
        estimator.fit(X_fit, y_fit)

        fit_pred = estimator.predict(X_fit)
        held_pred = estimator.predict(X_held)

        train_mse.append(mean_squared_error(y_fit, fit_pred))
        val_mse.append(mean_squared_error(y_held, held_pred))

        # None is recorded if the estimator does not expose the attribute.
        coefs.append(getattr(estimator, 'coef_', None))
        intercepts.append(getattr(estimator, 'intercept_', None))

    return {
        'fold_train_losses': train_mse,
        'fold_val_losses': val_mse,
        'fold_coefs': coefs,
        'fold_intercepts': intercepts,
        'avg_train_loss': np.mean(train_mse),
        'avg_val_loss': np.mean(val_mse),
    }

def train_test_model(X, y, test_degree, test_size=0.2, model_type='None', alpha=1.0, l1_ratio=0.5):
    """
    Fit a polynomial regression model on a train split and score it on a held-out split.

    Parameters:
    -----------
    X : ndarray
        Input features
    y : ndarray
        Target values
    test_degree : int
        Degree of the polynomial expansion applied to X
    test_size : float
        Proportion of the data held out for testing
    model_type : str
        Regularization choice forwarded to get_model
    alpha : float
        Regularization strength
    l1_ratio : float
        Mixing parameter for ElasticNet

    Returns:
    --------
    test_results : dict
        'train_loss' : MSE on the training split
        'test_loss' : MSE on the held-out split
        'model' : the fitted estimator
    """
    design = create_polynomial_features(X, test_degree)

    # Fixed random_state keeps the split identical between calls.
    X_fit, X_held, y_fit, y_held = train_test_split(
        design, y, test_size=test_size, random_state=42
    )

    estimator = get_model(model_type, alpha, l1_ratio)
    estimator.fit(X_fit, y_fit)

    fit_pred = estimator.predict(X_fit)
    held_pred = estimator.predict(X_held)

    return {
        'train_loss': mean_squared_error(y_fit, fit_pred),
        'test_loss': mean_squared_error(y_held, held_pred),
        'model': estimator,
    }

In [None]:
def interactive_polynomial_regression():
    """
    Build the interactive polynomial-regression explorer.

    Defines the widget controls (true degree, noise level, sample size, test
    degree, regularization type and strength, L1 ratio, CV folds) and the
    callback that, for the current settings:

    1. generates synthetic data,
    2. runs k-fold cross-validation and a single train/test split,
    3. draws a 2x2 figure (fit, per-fold losses, per-fold parameters,
       loss vs. polynomial degree),
    4. prints the numeric results and a short interpretation.

    Note that cross-validation is run on all samples, including the ones the
    train/test split holds out, so the two loss estimates are not independent.
    """
    def _param_labels(count):
        """Return ['Intercept', 'Coef 1', ...] for `count` parameters."""
        return [f'Coef {i}' if i > 0 else 'Intercept' for i in range(count)]

    @interact(
        true_degree=widgets.IntSlider(min=1, max=9, step=1, value=3, description='True Degree:'),
        noise_level=widgets.FloatSlider(min=0.1, max=20.0, step=0.1, value=0.5, description='Noise Level:'),
        n_samples=widgets.IntSlider(min=20, max=200, step=10, value=100, description='Sample Size:'),
        test_degree=widgets.IntSlider(min=1, max=15, step=1, value=3, description='Test Degree:'),
        regularization=widgets.RadioButtons(
            options=['None', 'Lasso', 'Ridge', 'Elastic'],
            value='None',
            description='Regularization:'
        ),
        alpha=widgets.FloatLogSlider(
            min=-5, max=1, step=0.1, value=0.1, base=10, description='Alpha (Reg. Strength):'
        ),
        l1_ratio=widgets.FloatSlider(
            min=0.0, max=1.0, step=0.05, value=0.5, description='L1 Ratio (Elastic):'
        ),
        n_folds=widgets.IntSlider(min=3, max=10, step=1, value=5, description='CV Folds:')
    )
    def cross_validate_and_visualize(true_degree, noise_level, n_samples, test_degree,
                                    regularization, alpha, l1_ratio, n_folds):
        # Generate synthetic data
        X, y, true_coef = generate_data(n_samples, true_degree, noise_level)

        # Perform cross-validation
        cv_results = perform_cross_validation(
            X, y, test_degree, n_folds, regularization, alpha, l1_ratio
        )

        # Fit on the 80% training split and score on the held-out 20%
        test_results = train_test_model(
            X, y, test_degree, 0.2, regularization, alpha, l1_ratio
        )

        # Create figure with a 2x2 grid of subplots
        fig, axes = plt.subplots(2, 2, figsize=(18, 14))

        # Plot 1: Data and model fit
        ax1 = axes[0, 0]

        # Plot original data
        X_flat = X.flatten()
        ax1.scatter(X_flat, y, alpha=0.6, label='Data points')

        # Plot true function
        X_line = np.linspace(min(X_flat), max(X_flat), 100).reshape(-1, 1)
        y_true = np.zeros(100)
        for i in range(true_degree + 1):
            y_true += true_coef[i] * X_line.flatten()**i
        ax1.plot(X_line, y_true, 'r-', linewidth=2, label='True function')

        # Plot model fit
        X_poly_line = create_polynomial_features(X_line, test_degree)
        y_pred = test_results['model'].predict(X_poly_line)
        ax1.plot(X_line, y_pred, 'g-', linewidth=2, label=f'Model fit (degree={test_degree})')

        ax1.set_title(f'Data and Model Fit\nTrue degree: {true_degree}, Test degree: {test_degree}')
        ax1.set_xlabel('X')
        ax1.set_ylabel('y')
        ax1.legend()

        # Plot 2: Train and validation loss for each fold
        ax2 = axes[0, 1]

        folds = list(range(1, n_folds + 1))
        ax2.bar(
            [f - 0.2 for f in folds],
            cv_results['fold_train_losses'],
            width=0.4,
            color='blue',
            alpha=0.6,
            label='Train Loss'
        )
        ax2.bar(
            [f + 0.2 for f in folds],
            cv_results['fold_val_losses'],
            width=0.4,
            color='red',
            alpha=0.6,
            label='Validation Loss'
        )

        ax2.axhline(
            cv_results['avg_train_loss'],
            color='blue',
            linestyle='--',
            alpha=0.8,
            label=f'Avg Train Loss: {cv_results["avg_train_loss"]:.4f}'
        )
        ax2.axhline(
            cv_results['avg_val_loss'],
            color='red',
            linestyle='--',
            alpha=0.8,
            label=f'Avg Val Loss: {cv_results["avg_val_loss"]:.4f}'
        )
        ax2.axhline(
            test_results['test_loss'],
            color='green',
            linestyle='--',
            alpha=0.8,
            label=f'Test Loss: {test_results["test_loss"]:.4f}'
        )

        ax2.set_title(f'Train and Validation Loss for Each Fold\n({n_folds}-fold Cross-Validation)')
        ax2.set_xlabel('Fold')
        ax2.set_ylabel('Mean Squared Error')
        ax2.set_xticks(folds)
        ax2.legend()

        # Plot 3: Model parameters for each fold
        ax3 = axes[1, 0]

        # Column 0 of each fold's coef_ is the weight on the constant feature
        # column, not the fitted intercept. Pair intercept_ with coef_[1:],
        # exactly as done for the final model below, so the "Intercept"
        # series shows the real intercept.
        fold_coefs = np.asarray(cv_results['fold_coefs'])
        fold_intercepts = np.asarray(cv_results['fold_intercepts'], dtype=float)
        coefs_array = np.column_stack((fold_intercepts, fold_coefs[:, 1:]))
        param_cols = _param_labels(coefs_array.shape[1])

        # Plot coefficients
        for i, label in enumerate(param_cols):
            ax3.plot(folds, coefs_array[:, i], 'o-', label=label)

        ax3.set_title('Model Parameters for Each Fold')
        ax3.set_xlabel('Fold')
        ax3.set_ylabel('Coefficient Value')
        ax3.set_xticks(folds)
        ax3.legend()

        # Plot 4: Train, validation, test loss comparison across model complexities
        ax4 = axes[1, 1]

        # Evaluate models with different degrees
        degrees = list(range(1, test_degree + 5))
        train_losses = []
        val_losses = []
        test_losses = []

        for degree in degrees:
            if degree == test_degree:
                # Splits and data are seeded, so refitting would reproduce
                # the results already computed above.
                cv_res, test_res = cv_results, test_results
            else:
                cv_res = perform_cross_validation(
                    X, y, degree, n_folds, regularization, alpha, l1_ratio
                )
                test_res = train_test_model(
                    X, y, degree, 0.2, regularization, alpha, l1_ratio
                )

            train_losses.append(cv_res['avg_train_loss'])
            val_losses.append(cv_res['avg_val_loss'])
            test_losses.append(test_res['test_loss'])

        ax4.plot(degrees, train_losses, 'o-', color='blue', label='Train Loss')
        ax4.plot(degrees, val_losses, 'o-', color='red', label='Validation Loss')
        ax4.plot(degrees, test_losses, 'o-', color='green', label='Test Loss')

        ax4.axvline(
            true_degree,
            color='black',
            linestyle='--',
            alpha=0.5,
            label=f'True Degree: {true_degree}'
        )

        ax4.set_title('Model Performance vs. Complexity')
        ax4.set_xlabel('Polynomial Degree')
        ax4.set_ylabel('Mean Squared Error')
        ax4.legend()

        plt.tight_layout()
        plt.show()

        # Print detailed results
        print("\n=== Cross-Validation Results ===\n")

        # Create a DataFrame for fold-by-fold results
        fold_results = pd.DataFrame({
            'Fold': folds,
            'Train Loss': cv_results['fold_train_losses'],
            'Validation Loss': cv_results['fold_val_losses']
        })

        print(fold_results)

        print(f"\nAverage Train Loss: {cv_results['avg_train_loss']:.4f}")
        print(f"Average Validation Loss: {cv_results['avg_val_loss']:.4f}")
        print(f"Test Loss: {test_results['test_loss']:.4f}")

        print("\n=== Model Parameters for Each Fold ===\n")

        # Create a DataFrame for model parameters
        param_data = pd.DataFrame(coefs_array, columns=param_cols)
        param_data.insert(0, 'Fold', folds)

        print(param_data)

        # Print true coefficients
        print("\n=== True Coefficients ===\n")
        true_coef_df = pd.DataFrame([true_coef], columns=_param_labels(len(true_coef)))
        print(true_coef_df)

        # Print final model coefficients
        print("\n=== Final Model Coefficients ===\n")
        final_model = test_results['model']
        # Skip coef_[0] (weight on the constant column) and report intercept_ instead.
        final_coef = np.concatenate(([final_model.intercept_], final_model.coef_[1:]))
        final_coef_df = pd.DataFrame([final_coef], columns=_param_labels(len(final_coef)))
        print(final_coef_df)

        # Explain findings
        print("\n=== Interpretation ===\n")

        # Compare true degree vs test degree
        if true_degree == test_degree:
            print(f"The test polynomial degree ({test_degree}) matches the true degree ({true_degree}).")
        elif test_degree < true_degree:
            print(f"The test polynomial degree ({test_degree}) is lower than the true degree ({true_degree}).")
            print("This suggests the model is underfit (high bias), which can be seen in the error plots.")
        else:
            print(f"The test polynomial degree ({test_degree}) is higher than the true degree ({true_degree}).")
            print("This suggests potential overfitting if regularization is not applied properly.")

        # Analyze validation and test loss: "close" means within 10% of the CV estimate
        val_test_diff = abs(cv_results['avg_val_loss'] - test_results['test_loss'])
        if val_test_diff < 0.1 * cv_results['avg_val_loss']:
            print("\nThe cross-validation estimate closely matches the test loss.")
            print("This suggests the model generalizes well to unseen data.")
        else:
            print("\nThere's a notable difference between cross-validation and test loss.")
            print("This may indicate high variance in the data or instability in the model.")

        # Analyze cross-validation stability via the coefficient of variation
        # (std / mean) of the per-fold validation losses
        cv_std = np.std(cv_results['fold_val_losses'])
        cv_mean = np.mean(cv_results['fold_val_losses'])
        cv_variation = cv_std / cv_mean

        if cv_variation < 0.1:
            print("\nThe cross-validation results are very stable across folds.")
        elif cv_variation < 0.3:
            print("\nThe cross-validation results show moderate variation across folds.")
        else:
            print("\nThe cross-validation results show high variation across folds.")
            print("This may suggest the model is sensitive to the specific data split.")

        # Regularization effects
        if regularization != 'None':
            print(f"\nRegularization ({regularization}) was applied with alpha={alpha:.6f}.")
            if regularization == 'Elastic':
                print(f"L1 ratio was set to {l1_ratio:.2f} (balance between Lasso and Ridge).")

            # Check if regularization is helping. degrees starts at 1, so
            # test_losses[d - 1] is the test loss for polynomial degree d.
            degree_diff = test_degree - true_degree
            if degree_diff > 2 and test_losses[test_degree-1] < 1.5 * test_losses[true_degree-1]:
                print("Regularization appears to be effectively preventing overfitting.")
            elif degree_diff > 2:
                print("Despite regularization, there may still be overfitting.")
                if alpha < 0.1:
                    print("Consider increasing the regularization strength (alpha).")

# Run the interactive application: this sets up the widget controls and the
# plotting/report callback defined in interactive_polynomial_regression.
interactive_polynomial_regression()