<a href="https://colab.research.google.com/github/Arthur-Grainger/MScDissertation/blob/main/Elastic_Net_and_Lasso.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import required libraries**

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LassoCV, Lasso, ElasticNetCV, ElasticNet
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

**Loading and Preparing Data**
1.  Reads data from an Excel file into a pandas DataFrame.
2.  Converts the 'Date' column to datetime objects.
3.  Sorts the DataFrame by the 'Date' column and resets the index.
4.  Prints the shape of the loaded dataset and the date range.
5.  Returns the prepared DataFrame.


In [None]:
def load_and_prepare_data(filename):
    """Read the Excel workbook at *filename* and return it in chronological order.

    The 'Date' column is converted to datetimes, the rows are sorted by it and
    the index is rebuilt from zero. The resulting shape and the covered date
    range are printed as a quick sanity check.
    """
    raw = pd.read_excel(filename)
    raw['Date'] = pd.to_datetime(raw['Date'])

    # Sort first, then discard the old index so positions follow time order.
    df = raw.sort_values('Date').reset_index(drop=True)

    print(f"Dataset loaded: {df.shape}")
    earliest, latest = df['Date'].min(), df['Date'].max()
    print(f"Date range: {earliest} to {latest}")

    return df

**Creating Rolling Windows**
1.  Prints the window length and step size for clarity.
2.  Calculates the total number of observations in the DataFrame.
3.  Calculates the number of windows needed to cover the data, ensuring the last possible data point is included (through the `+1`).
4.  It iterates through the calculated number of windows:
  - Determines the start and end indices based on the window length and step size.
  - Adjusts the `end_idx` to ensure it doesn't go beyond the total number of observations.
  - Creates a dictionary `window_info` containing details about the window, such as its ID, start and end indices, start and end dates, and the number of observations within that window.
  - This dictionary is appended to the windows list.
5.  Prints the total number of windows created, the date ranges of the first and last windows, and the length of the last window, then returns the list of windows.


In [None]:
def create_rolling_windows(df, window_length=62, step_size=6):
    """Compute the row/date bounds of rolling estimation windows over *df*.

    A window starts every ``step_size`` rows and spans ``window_length`` rows
    (defaults: 62 and 6; the progress messages label both as months). Windows
    that would run past the end of the data are truncated at the last row, so
    the final window can be shorter than ``window_length``.

    Args:
        df: DataFrame with a 'Date' column, already sorted chronologically.
        window_length: Number of rows in a full window.
        step_size: Number of rows between consecutive window starts.

    Returns:
        A list of dicts with keys 'window_id' (1-based), 'start_idx',
        'end_idx' (exclusive), 'start_date', 'end_date' and 'n_obs'. The list
        is empty when the data are too short to form any window.

    Raises:
        ValueError: If ``window_length`` or ``step_size`` is not positive.
    """
    if window_length <= 0 or step_size <= 0:
        raise ValueError("window_length and step_size must be positive")

    print("\nCreating rolling windows:")
    print(f"Window length: {window_length} months")
    print(f"Step size: {step_size} months")

    total_obs = len(df)
    # One more window than the number of full windows, so that observations
    # after the last full window are still covered by a (truncated) window.
    # NOTE(review): when (total_obs - window_length) is an exact multiple of
    # step_size, the last full window already ends on the final row and the
    # extra truncated window only re-uses rows of that window.
    num_windows = (total_obs - window_length + step_size) // step_size + 1

    dates = df['Date']
    windows = []
    for i in range(num_windows):
        start_idx = i * step_size
        # Clip the window at the end of the data.
        end_idx = min(start_idx + window_length, total_obs)
        n_obs = end_idx - start_idx
        is_last = i == num_windows - 1

        # Keep windows with at least step_size rows; the very last window is
        # kept even if shorter, as long as it contains some data.
        if n_obs >= step_size or (is_last and n_obs > 0):
            windows.append({
                'window_id': i + 1,
                'start_idx': start_idx,
                'end_idx': end_idx,
                'start_date': dates.iloc[start_idx],
                'end_date': dates.iloc[end_idx - 1],
                'n_obs': n_obs,
            })

    print(f"Created {len(windows)} rolling windows")
    if not windows:
        # Previously this case crashed with IndexError on windows[0].
        print("No windows could be created: too few observations for the requested window settings.")
        return windows

    first, last = windows[0], windows[-1]
    print(f"First window: {first['start_date'].strftime('%Y-%m')} to {first['end_date'].strftime('%Y-%m')}")
    print(f"Last window: {last['start_date'].strftime('%Y-%m')} to {last['end_date'].strftime('%Y-%m')}")
    print(f"Last window length: {last['n_obs']} months")

    return windows

**Prepare Variables**
1.  Identifies and prints the name of the identified target variable.
2.  Builds the list of `predictor_vars`: it first defines `exclude_cols` -- the columns that must not be used as predictors (the `Date` column and the identified `target_var`) -- and then keeps every other column in the DataFrame.
3. Prints the total number of predictor variables found.
4.  Categorizes Predictors -- it divides the predictor_vars into two categories:
  - Traditional_vars: `epu`, `ftse`, `retail`, `claimant`, `d_unemp_rate_l`.
  - gt_vars: variables that are in `predictor_vars` but not in `traditional_vars`.
5.  It prints the number of variables in each category.
6.  Finally, it returns the `target_var`, the full list of `predictor_vars`, the `traditional_vars`, and the `gt_vars`.

In [None]:
def prepare_variables(df):
    """Pick the target column and split the remaining columns into groups.

    The target is the first column whose lower-cased name contains
    'd_unemp_rate' and which is not a lag of it. Lags are recognised either
    by '_L' in the name (original rule) or, case-insensitively, by the
    'd_unemp_rate_l' marker that is also used to classify lags as traditional
    indicators, so a lower-case lag name can no longer be mistaken for the
    target.

    Args:
        df: DataFrame containing 'Date', the target and all predictors.

    Returns:
        Tuple ``(target_var, predictor_vars, traditional_vars, gt_vars)``
        where the last three are lists of column names in DataFrame order.

    Raises:
        ValueError: If no target column can be identified.
    """
    print("\nPreparing variables...")

    lag_marker = 'd_unemp_rate_l'
    target_vars = [
        col for col in df.columns
        if 'd_unemp_rate' in col.lower()
        and '_L' not in col
        and lag_marker not in col.lower()
    ]
    if not target_vars:
        raise ValueError("Could not find unemployment rate variable")

    target_var = target_vars[0]
    if len(target_vars) > 1:
        # Make the otherwise silent "first match wins" choice visible.
        print(f"  Note: {len(target_vars)} candidate target columns found {target_vars}; using the first")
    print(f"Target variable: {target_var}")

    exclude_cols = {'Date', target_var}
    predictor_vars = [col for col in df.columns if col not in exclude_cols]

    print(f"Total predictor variables: {len(predictor_vars)}")

    # Categorize: Traditional vs Google Trends (everything that is not traditional)
    traditional_markers = ('epu', 'ftse', 'retail', 'claimant', lag_marker)
    traditional_vars = [
        var for var in predictor_vars
        if any(marker in var.lower() for marker in traditional_markers)
    ]
    traditional_lookup = set(traditional_vars)
    gt_vars = [var for var in predictor_vars if var not in traditional_lookup]

    print(f"Traditional indicators: {len(traditional_vars)}")
    print(f"Google Trends variables: {len(gt_vars)}")

    return target_var, predictor_vars, traditional_vars, gt_vars

**Estimating Parameters Globally**
It prints a message indicating that it's estimating global parameters.
It separates the predictor variables (X) and the target variable (y) into separate DataFrames.
Removes Constant Variables: It identifies and removes any predictor variables that have a standard deviation of zero (meaning they have the same value for all observations). This is important because these variables don't provide any predictive information and can cause issues in some models.
Standardizes Data: It uses StandardScaler to standardize the predictor variables. Standardization is crucial for regularization techniques like LASSO and Elastic Net as it ensures that all variables are on the same scale, preventing variables with larger magnitudes from dominating the regularization process.
Time Series Cross-Validation: It sets up a TimeSeriesSplit object with 10 splits. This is a cross-validation strategy specifically designed for time series data, where the data is split into training and testing sets sequentially, preserving the temporal order.
LASSO Parameter Estimation:
It initializes a LassoCV object. LassoCV performs cross-validation to find the best alpha (lambda) value for the LASSO model.
It specifies a range of alphas to search over using np.logspace.
It uses the tscv object for cross-validation.
max_iter sets the maximum number of iterations for the optimization algorithm.
random_state ensures reproducibility.
n_jobs=-1 uses all available CPU cores for faster computation.
It fits the LassoCV model to the scaled data to find the optimal alpha.
Elastic Net Parameter Estimation:
It initializes an ElasticNetCV object. ElasticNetCV performs cross-validation to find the best alpha and l1_ratio for the Elastic Net model.
It specifies a range of alphas and a list of l1_ratio values to search over. The l1_ratio controls the mix between L1 (LASSO) and L2 (Ridge) regularization.
It uses the tscv object for cross-validation.
max_iter and random_state are set as before.
n_jobs=-1 uses all available CPU cores.
It fits the ElasticNetCV model to the scaled data.
It prints the optimal alpha found for LASSO, and the optimal alpha and l1_ratio found for Elastic Net, along with an interpretation of the Elastic Net mix.
Finally, it returns the optimal lasso_cv.alpha_, elastic_cv.alpha_, and elastic_cv.l1_ratio_.

In [None]:
def estimate_global_parameters(df, target_var, predictor_vars):
    """Cross-validate the LASSO and Elastic Net penalties on the whole sample.

    Zero-variance predictors are dropped, the remaining ones are standardised
    and both LassoCV and ElasticNetCV are fitted with a 10-split
    TimeSeriesSplit.

    Returns:
        Tuple ``(lasso_alpha, elastic_alpha, elastic_l1_ratio)`` holding the
        penalties chosen by cross-validation.
    """
    print("\nEstimating global parameters for both LASSO and Elastic Net...")

    features = df[predictor_vars].copy()
    response = df[target_var].copy()

    # Columns without any variation cannot be standardised meaningfully.
    zero_variance = features.columns[features.std() == 0]
    n_constant = len(zero_variance)
    if n_constant > 0:
        features = features.drop(columns=zero_variance)
        print(f"  Removed {n_constant} constant variables from full dataset")

    features_std = StandardScaler().fit_transform(features)

    # Settings shared by both cross-validated estimators.
    cv_settings = dict(
        cv=TimeSeriesSplit(n_splits=10),
        max_iter=10000,
        random_state=42,
        n_jobs=-1,
    )

    print("  Estimating LASSO parameters...")
    lasso_search = LassoCV(alphas=np.logspace(-5, 2, 250), **cv_settings)
    lasso_search.fit(features_std, response)

    print("  Estimating Elastic Net parameters...")
    enet_search = ElasticNetCV(
        alphas=np.logspace(-5, 2, 100),
        l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
        **cv_settings,
    )
    enet_search.fit(features_std, response)

    print(f"  LASSO optimal λ: {lasso_search.alpha_:.6f}")
    print(f"  Elastic Net optimal α: {enet_search.alpha_:.6f}")
    print(f"  Elastic Net optimal l1_ratio: {enet_search.l1_ratio_:.3f}")
    print(f"  Elastic Net interpretation: {enet_search.l1_ratio_*100:.1f}% LASSO, {(1-enet_search.l1_ratio_)*100:.1f}% Ridge")

    return lasso_search.alpha_, enet_search.alpha_, enet_search.l1_ratio_

Estimating elastic net and lasso and then OLS on both

In [None]:
def _fit_penalised_model(model, X_scaled, y):
    """Fit a penalised linear model and summarise its selection and in-sample fit."""
    model.fit(X_scaled, y)
    coef = model.coef_
    return {
        # A variable counts as selected when its coefficient is not shrunk to zero.
        'selected_vars': X_scaled.columns[coef != 0].tolist(),
        'coefficients': dict(zip(X_scaled.columns, coef)),
        'rmse': np.sqrt(mean_squared_error(y, model.predict(X_scaled))),
        'r2': model.score(X_scaled, y),
    }


def _post_selection_ols(window_data, y, selected_vars):
    """Refit an unpenalised OLS (with intercept) on the unscaled selected predictors.

    When nothing was selected, an intercept-only model is fitted so that the
    reported coefficient, t-value and p-value of 'const' are real estimates
    rather than placeholders; R² and adjusted R² are reported as 0.0.
    """
    if selected_vars:
        design = sm.add_constant(window_data[selected_vars])
    else:
        design = pd.DataFrame({'const': np.ones(len(y))}, index=y.index)

    fit = sm.OLS(y, design).fit()

    if selected_vars:
        r2, r2_adj = fit.rsquared, fit.rsquared_adj
    else:
        # A mean-only model explains no variation by definition.
        r2, r2_adj = 0.0, 0.0

    return {
        'coefficients': dict(zip(design.columns, fit.params)),
        'pvalues': dict(zip(design.columns, fit.pvalues)),
        'tvalues': dict(zip(design.columns, fit.tvalues)),
        'rmse': np.sqrt(mean_squared_error(y, fit.predict(design))),
        'r2': r2,
        'r2_adj': r2_adj,
    }


def estimate_window_combined_models(window_data, target_var, predictor_vars, window_id,
                                  lasso_lambda, elastic_alpha, elastic_l1_ratio):
    """Run LASSO and Elastic Net selection on one window, then OLS on each selection.

    Predictors that are constant within the window are dropped, the rest are
    standardised, and both penalised models are fitted with the supplied
    (fixed) penalties. For each model an OLS with intercept is then refitted
    on the unscaled selected variables.

    Args:
        window_data: Rows of the full DataFrame belonging to this window.
        target_var: Name of the dependent variable column.
        predictor_vars: Names of all candidate predictor columns.
        window_id: Identifier used in messages and stored in the result.
        lasso_lambda: LASSO penalty (sklearn ``alpha``).
        elastic_alpha: Elastic Net penalty (sklearn ``alpha``).
        elastic_l1_ratio: Elastic Net L1/L2 mixing parameter.

    Returns:
        A dict of selection, coefficient and fit statistics for both models,
        or ``None`` when no usable predictor remains in the window.
    """
    X = window_data[predictor_vars].copy()
    y = window_data[target_var].copy()

    # Remove predictors that do not vary inside this window.
    constant_vars = X.columns[X.std() == 0]
    if len(constant_vars) > 0:
        X = X.drop(columns=constant_vars)
        print(f"    Removed {len(constant_vars)} constant variables in Window {window_id}")

    # Skip window if no valid predictors remain
    if X.empty:
        print(f"    Window {window_id} skipped: No valid predictor variables after removing constants.")
        return None

    # Standardise so the penalty treats all predictors on the same scale;
    # keep column names so selections can be mapped back to variables.
    X_scaled = pd.DataFrame(
        StandardScaler().fit_transform(X), columns=X.columns, index=X.index
    )

    # ========== LASSO ANALYSIS ==========
    lasso = _fit_penalised_model(
        Lasso(alpha=lasso_lambda, max_iter=2000, random_state=42), X_scaled, y
    )
    lasso_ols = _post_selection_ols(window_data, y, lasso['selected_vars'])

    # ========== ELASTIC NET ANALYSIS ==========
    elastic = _fit_penalised_model(
        ElasticNet(alpha=elastic_alpha, l1_ratio=elastic_l1_ratio, max_iter=2000, random_state=42),
        X_scaled, y
    )
    elastic_ols = _post_selection_ols(window_data, y, elastic['selected_vars'])

    results = {
        'window_id': window_id,

        # Model parameters
        'lasso_lambda': lasso_lambda,
        'elastic_alpha': elastic_alpha,
        'elastic_l1_ratio': elastic_l1_ratio,

        # LASSO results
        'lasso_n_selected': len(lasso['selected_vars']),
        'lasso_selected_vars': lasso['selected_vars'],
        'lasso_coefficients': lasso['coefficients'],
        'rmse_lasso': lasso['rmse'],
        'r2_lasso': lasso['r2'],

        # Post-LASSO OLS results
        'lasso_ols_coefficients': lasso_ols['coefficients'],
        'lasso_ols_pvalues': lasso_ols['pvalues'],
        'lasso_ols_tvalues': lasso_ols['tvalues'],
        'rmse_lasso_ols': lasso_ols['rmse'],
        'r2_lasso_ols': lasso_ols['r2'],
        'r2_adj_lasso_ols': lasso_ols['r2_adj'],

        # Elastic Net results
        'elastic_n_selected': len(elastic['selected_vars']),
        'elastic_selected_vars': elastic['selected_vars'],
        'elastic_coefficients': elastic['coefficients'],
        'rmse_elastic': elastic['rmse'],
        'r2_elastic': elastic['r2'],

        # Post-Elastic Net OLS results
        'elastic_ols_coefficients': elastic_ols['coefficients'],
        'elastic_ols_pvalues': elastic_ols['pvalues'],
        'elastic_ols_tvalues': elastic_ols['tvalues'],
        'rmse_elastic_ols': elastic_ols['rmse'],
        'r2_elastic_ols': elastic_ols['r2'],
        'r2_adj_elastic_ols': elastic_ols['r2_adj'],

        'constant_vars_removed': list(constant_vars)
    }

    return results

Run the models on the rolling windows

In [None]:
def run_rolling_combined_analysis(df, windows, target_var, predictor_vars,
                                lasso_lambda, elastic_alpha, elastic_l1_ratio):
    """Estimate the LASSO / Elastic Net models on every rolling window.

    Each window is sliced out of *df* by position and passed to
    ``estimate_window_combined_models``. Windows that are skipped (``None``
    result) or that raise are reported and left out, so the returned list only
    contains successful estimations, each tagged with its start and end date.
    """
    print(f"\nRunning Combined LASSO and Elastic Net Analysis for {len(windows)} windows...")
    print(f"LASSO λ={lasso_lambda:.6f}, Elastic Net α={elastic_alpha:.6f}, l1_ratio={elastic_l1_ratio:.3f}")
    print("This may take a few minutes...")

    successful = []

    for win in windows:
        win_id = win['window_id']
        first_date, last_date = win['start_date'], win['end_date']
        print(f"Processing Window {win_id}: {first_date.strftime('%Y-%m')} to {last_date.strftime('%Y-%m')}")

        subset = df.iloc[win['start_idx']:win['end_idx']].copy()

        # Best effort: a failing window is reported and the loop moves on.
        try:
            outcome = estimate_window_combined_models(
                subset, target_var, predictor_vars, win_id,
                lasso_lambda, elastic_alpha, elastic_l1_ratio
            )

            if outcome is None:
                # The window was skipped by the estimator.
                continue

            outcome['start_date'] = first_date
            outcome['end_date'] = last_date
            successful.append(outcome)

            print(f"  ✓ LASSO: {outcome['lasso_n_selected']} vars (RMSE: {outcome['rmse_lasso_ols']:.4f})")
            print(f"  ✓ Elastic: {outcome['elastic_n_selected']} vars (RMSE: {outcome['rmse_elastic_ols']:.4f})")

        except Exception as e:
            print(f"  ✗ Error in window {win_id}: {str(e)}")

    print(f"\nCompleted {len(successful)} successful estimations")
    return successful

Analyse the Results side-by-side

In [None]:
def _tally_selection(results, prefix):
    """Count per variable the windows where it was selected / selected and significant (p < 0.05)."""
    selected_key = f'{prefix}_selected_vars'
    pvalue_key = f'{prefix}_ols_pvalues'
    selection_count, significance_count = {}, {}
    for result in results:
        for var in result[selected_key]:
            selection_count[var] = selection_count.get(var, 0) + 1
            pvalues = result[pvalue_key]
            if var in pvalues and pvalues[var] < 0.05:
                significance_count[var] = significance_count.get(var, 0) + 1
    return selection_count, significance_count


def _print_top_variables(top_vars, selection_freq, significance_freq, gt_lookup):
    """Print one ranked line per variable with its frequencies, score and category."""
    for i, (var, score) in enumerate(top_vars, 1):
        var_type = "GT" if var in gt_lookup else "Traditional"
        selection = selection_freq[var]
        significance = significance_freq.get(var, 0)
        print(f"  {i:2d}. {var:<35} Sel:{selection:>5.1%} Sig:{significance:>5.1%} Score:{score:>5.3f} [{var_type}]")


def _coverage(count, total):
    """Format 'count/total (pct%)', avoiding a division by zero for an empty category."""
    if total == 0:
        return f"{count}/{total} (n/a)"
    return f"{count}/{total} ({count/total*100:.1f}%)"


def analyze_combined_results(results, traditional_vars, gt_vars):
    """Summarise LASSO and Elastic Net results across windows, side by side.

    Prints average model size and post-selection OLS fit, ranks variables by
    selection frequency times significance frequency (both measured as a share
    of all windows), and reports how many traditional / Google Trends
    variables were ever selected.

    Args:
        results: List of per-window result dicts.
        traditional_vars: Names of the traditional indicator columns.
        gt_vars: Names of the Google Trends columns.

    Returns:
        Dict with per-model selection and significance frequencies
        ('*_var_selection_freq', '*_var_significance_freq') and the top-15
        ranked ``(variable, score)`` lists ('top_lasso_vars',
        'top_elastic_vars'). All entries are empty when *results* is empty.
    """
    print("\n" + "="*70)
    print("COMBINED LASSO AND ELASTIC NET RESULTS SUMMARY")
    print("="*70)

    if not results:
        # Nothing to average or rank; previously this crashed on results[0].
        print("\nNo successful window estimations to analyse.")
        return {
            'lasso_var_selection_freq': {},
            'lasso_var_significance_freq': {},
            'elastic_var_selection_freq': {},
            'elastic_var_significance_freq': {},
            'top_lasso_vars': [],
            'top_elastic_vars': []
        }

    # Extract performance metrics
    lasso_n_selected = [r['lasso_n_selected'] for r in results]
    elastic_n_selected = [r['elastic_n_selected'] for r in results]
    rmse_lasso_ols = [r['rmse_lasso_ols'] for r in results]
    rmse_elastic_ols = [r['rmse_elastic_ols'] for r in results]
    r2_lasso_ols = [r['r2_lasso_ols'] for r in results]
    r2_elastic_ols = [r['r2_elastic_ols'] for r in results]

    # The penalties are identical in every window, so read them from the first.
    params = results[0]
    print("\nPerformance Comparison:")
    print(f"  LASSO Parameters: λ = {params['lasso_lambda']:.6f}")
    print(f"  Elastic Net Parameters: α = {params['elastic_alpha']:.6f}, l1_ratio = {params['elastic_l1_ratio']:.3f}")
    print(f"  Elastic Net Mix: {params['elastic_l1_ratio']*100:.1f}% LASSO + {(1-params['elastic_l1_ratio'])*100:.1f}% Ridge")

    print(f"\nVariable Selection:")
    print(f"  LASSO - Average variables selected: {np.mean(lasso_n_selected):.1f} (±{np.std(lasso_n_selected):.1f})")
    print(f"  Elastic Net - Average variables selected: {np.mean(elastic_n_selected):.1f} (±{np.std(elastic_n_selected):.1f})")

    print(f"\nPredictive Performance (Post-Selection OLS):")
    print(f"  LASSO - Average RMSE: {np.mean(rmse_lasso_ols):.4f} (±{np.std(rmse_lasso_ols):.4f})")
    print(f"  Elastic Net - Average RMSE: {np.mean(rmse_elastic_ols):.4f} (±{np.std(rmse_elastic_ols):.4f})")
    print(f"  LASSO - Average R²: {np.mean(r2_lasso_ols):.4f} (±{np.std(r2_lasso_ols):.4f})")
    print(f"  Elastic Net - Average R²: {np.mean(r2_elastic_ols):.4f} (±{np.std(r2_elastic_ols):.4f})")

    # Variable selection analysis for both models
    lasso_selection_count, lasso_significance_count = _tally_selection(results, 'lasso')
    elastic_selection_count, elastic_significance_count = _tally_selection(results, 'elastic')

    # Convert to frequencies (share of all analysed windows)
    total_windows = len(results)
    lasso_var_selection_freq = {var: count/total_windows for var, count in lasso_selection_count.items()}
    elastic_var_selection_freq = {var: count/total_windows for var, count in elastic_selection_count.items()}

    lasso_var_significance_freq = {var: lasso_significance_count.get(var, 0)/total_windows
                                 for var in lasso_selection_count}
    elastic_var_significance_freq = {var: elastic_significance_count.get(var, 0)/total_windows
                                   for var in elastic_selection_count}

    # Combined scores: selection frequency × significance frequency
    lasso_combined_score = {var: freq * lasso_var_significance_freq.get(var, 0)
                          for var, freq in lasso_var_selection_freq.items()}
    elastic_combined_score = {var: freq * elastic_var_significance_freq.get(var, 0)
                            for var, freq in elastic_var_selection_freq.items()}

    # Top variables for each model
    top_lasso_vars = sorted(lasso_combined_score.items(), key=lambda x: x[1], reverse=True)[:15]
    top_elastic_vars = sorted(elastic_combined_score.items(), key=lambda x: x[1], reverse=True)[:15]

    # Sets give constant-time category lookups.
    traditional_lookup = set(traditional_vars)
    gt_lookup = set(gt_vars)

    print("\nTop 15 Variables - LASSO (Selection × Significance):")
    _print_top_variables(top_lasso_vars, lasso_var_selection_freq, lasso_var_significance_freq, gt_lookup)

    print("\nTop 15 Variables - Elastic Net (Selection × Significance):")
    _print_top_variables(top_elastic_vars, elastic_var_selection_freq, elastic_var_significance_freq, gt_lookup)

    # Model comparison analysis
    print("\nModel Comparison Analysis:")

    # Traditional vs GT analysis: how many variables of each kind were ever selected
    lasso_traditional_selections = sum(1 for var in lasso_selection_count if var in traditional_lookup)
    lasso_gt_selections = sum(1 for var in lasso_selection_count if var in gt_lookup)
    elastic_traditional_selections = sum(1 for var in elastic_selection_count if var in traditional_lookup)
    elastic_gt_selections = sum(1 for var in elastic_selection_count if var in gt_lookup)

    print(f"  LASSO - Traditional variables selected: {_coverage(lasso_traditional_selections, len(traditional_vars))}")
    print(f"  LASSO - Google Trends variables selected: {_coverage(lasso_gt_selections, len(gt_vars))}")
    print(f"  Elastic Net - Traditional variables selected: {_coverage(elastic_traditional_selections, len(traditional_vars))}")
    print(f"  Elastic Net - Google Trends variables selected: {_coverage(elastic_gt_selections, len(gt_vars))}")

    return {
        'lasso_var_selection_freq': lasso_var_selection_freq,
        'lasso_var_significance_freq': lasso_var_significance_freq,
        'elastic_var_selection_freq': elastic_var_selection_freq,
        'elastic_var_significance_freq': elastic_var_significance_freq,
        'top_lasso_vars': top_lasso_vars,
        'top_elastic_vars': top_elastic_vars
    }

Creating Plots

In [None]:
def create_visualization_plots(results, analysis_results):
    """Draw a 2x3 summary figure comparing LASSO and Elastic Net across windows.

    Top row: number of selected variables, post-selection OLS RMSE and R²
    plotted against each window's start date. Bottom row: histograms of model
    size and RMSE, and a bar chart of selection frequencies for the most
    frequently selected variables of either model.

    Args:
        results: List of per-window result dicts (with 'start_date').
        analysis_results: Dict containing 'lasso_var_selection_freq' and
            'elastic_var_selection_freq' (variable -> frequency).

    Returns:
        The matplotlib Figure, after it has been shown with ``plt.show()``.
    """
    # Set up the plotting style
    plt.style.use('default')
    sns.set_palette("husl")

    # Create a figure with multiple subplots
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Combined LASSO and Elastic Net Analysis Results', fontsize=16, fontweight='bold')

    # Extract data for plotting
    dates = [r['start_date'] for r in results]

    lasso_n_selected = [r['lasso_n_selected'] for r in results]
    elastic_n_selected = [r['elastic_n_selected'] for r in results]

    lasso_rmse = [r['rmse_lasso_ols'] for r in results]
    elastic_rmse = [r['rmse_elastic_ols'] for r in results]

    lasso_r2 = [r['r2_lasso_ols'] for r in results]
    elastic_r2 = [r['r2_elastic_ols'] for r in results]

    # Plots 1-3: one time-series panel per metric
    time_panels = [
        (axes[0, 0], lasso_n_selected, elastic_n_selected, 'Variables Selected Over Time', 'Number of Variables'),
        (axes[0, 1], lasso_rmse, elastic_rmse, 'RMSE Comparison Over Time', 'RMSE'),
        (axes[0, 2], lasso_r2, elastic_r2, 'R² Comparison Over Time', 'R²'),
    ]
    for ax, lasso_series, elastic_series, title, ylabel in time_panels:
        ax.plot(dates, lasso_series, label='LASSO', marker='o', alpha=0.7)
        ax.plot(dates, elastic_series, label='Elastic Net', marker='s', alpha=0.7)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Plots 4-5: distributions across windows
    hist_panels = [
        (axes[1, 0], lasso_n_selected, elastic_n_selected, 'Distribution of Variables Selected', 'Number of Variables'),
        (axes[1, 1], lasso_rmse, elastic_rmse, 'RMSE Distribution', 'RMSE'),
    ]
    for ax, lasso_series, elastic_series, title, xlabel in hist_panels:
        ax.hist([lasso_series, elastic_series], bins=15, alpha=0.7,
                label=['LASSO', 'Elastic Net'])
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.legend()

    # Plot 6: Variable Selection Frequency Comparison (Top 10 of each model)
    lasso_freq = analysis_results['lasso_var_selection_freq']
    elastic_freq = analysis_results['elastic_var_selection_freq']

    # Rank by frequency: the dicts are in order of first selection, so slicing
    # their keys directly would not yield the most frequent variables.
    top_lasso = sorted(lasso_freq, key=lasso_freq.get, reverse=True)[:10]
    top_elastic = sorted(elastic_freq, key=elastic_freq.get, reverse=True)[:10]
    # Union without duplicates, in a deterministic order (unlike a set).
    top_vars = list(dict.fromkeys(top_lasso + top_elastic))

    lasso_freqs = [lasso_freq.get(var, 0) for var in top_vars]
    elastic_freqs = [elastic_freq.get(var, 0) for var in top_vars]

    x = np.arange(len(top_vars))
    width = 0.35

    bar_ax = axes[1, 2]
    bar_ax.bar(x - width/2, lasso_freqs, width, label='LASSO', alpha=0.7)
    bar_ax.bar(x + width/2, elastic_freqs, width, label='Elastic Net', alpha=0.7)
    bar_ax.set_title('Variable Selection Frequency (Top Variables)')
    bar_ax.set_ylabel('Selection Frequency')
    bar_ax.set_xticks(x)
    # Shorten long variable names so the tick labels stay readable.
    bar_ax.set_xticklabels([var[:15] + '...' if len(var) > 15 else var for var in top_vars],
                           rotation=45, ha='right')
    bar_ax.legend()

    plt.tight_layout()
    plt.show()

    return fig

Save results

In [None]:
def _performance_frame(results):
    """Build one row per window with both models' fit metrics and parameters."""
    return pd.DataFrame([
        {
            'window_id': r['window_id'],
            'start_date': r['start_date'],
            'end_date': r['end_date'],

            # LASSO performance
            'lasso_n_selected': r['lasso_n_selected'],
            'lasso_rmse': r['rmse_lasso'],
            'lasso_ols_rmse': r['rmse_lasso_ols'],
            'lasso_ols_r2': r['r2_lasso_ols'],
            'lasso_ols_r2_adj': r['r2_adj_lasso_ols'],

            # Elastic Net performance
            'elastic_n_selected': r['elastic_n_selected'],
            'elastic_rmse': r['rmse_elastic'],
            'elastic_ols_rmse': r['rmse_elastic_ols'],
            'elastic_ols_r2': r['r2_elastic_ols'],
            'elastic_ols_r2_adj': r['r2_adj_elastic_ols'],

            # Model parameters
            'lasso_lambda': r['lasso_lambda'],
            'elastic_alpha': r['elastic_alpha'],
            'elastic_l1_ratio': r['elastic_l1_ratio'],
        }
        for r in results
    ])


def _frequency_frame(freq_by_var):
    """Tabulate a ``{variable: frequency}`` mapping, highest frequency first."""
    rows = [{'variable': var, 'selection_frequency': freq}
            for var, freq in freq_by_var.items()]
    # Explicit columns: an empty row list would otherwise give a column-less
    # frame and sort_values() would raise KeyError.
    frame = pd.DataFrame(rows, columns=['variable', 'selection_frequency'])
    return frame.sort_values('selection_frequency', ascending=False)


def _combined_ranking_frame(selection_freq, significance_freq):
    """Rank variables by selection frequency times significance frequency."""
    rows = []
    for var, selected in selection_freq.items():
        # Variables without a significance entry count as 0.
        significant = significance_freq.get(var, 0)
        rows.append({
            'variable': var,
            'selection_frequency': selected,
            'significance_frequency': significant,
            'combined_score': selected * significant,
        })
    frame = pd.DataFrame(rows, columns=['variable', 'selection_frequency',
                                        'significance_frequency', 'combined_score'])
    return frame.sort_values('combined_score', ascending=False)


def _mean_of(results, key):
    """Return the mean of ``r[key]`` over all window results."""
    return np.mean([r[key] for r in results])


def _comparison_frame(results, lasso_lambda, elastic_alpha, elastic_l1_ratio):
    """Build the metric/value sheet of parameters and cross-window averages."""
    return pd.DataFrame([
        {'metric': 'LASSO Lambda', 'value': lasso_lambda},
        {'metric': 'Elastic Net Alpha', 'value': elastic_alpha},
        {'metric': 'Elastic Net L1 Ratio', 'value': elastic_l1_ratio},
        {'metric': 'Elastic Net LASSO %', 'value': elastic_l1_ratio * 100},
        {'metric': 'Elastic Net Ridge %', 'value': (1 - elastic_l1_ratio) * 100},
        {'metric': 'Total Windows', 'value': len(results)},
        {'metric': 'Analysis Type', 'value': 'Combined LASSO and Elastic Net Regime Analysis'},
        {'metric': 'Date Range Start', 'value': results[0]['start_date'].strftime('%Y-%m')},
        {'metric': 'Date Range End', 'value': results[-1]['end_date'].strftime('%Y-%m')},
        {'metric': 'Average LASSO Variables Selected', 'value': _mean_of(results, 'lasso_n_selected')},
        {'metric': 'Average Elastic Net Variables Selected', 'value': _mean_of(results, 'elastic_n_selected')},
        {'metric': 'Average LASSO OLS RMSE', 'value': _mean_of(results, 'rmse_lasso_ols')},
        {'metric': 'Average Elastic Net OLS RMSE', 'value': _mean_of(results, 'rmse_elastic_ols')},
        {'metric': 'Average LASSO OLS R2', 'value': _mean_of(results, 'r2_lasso_ols')},
        {'metric': 'Average Elastic Net OLS R2', 'value': _mean_of(results, 'r2_elastic_ols')},
    ])


def _overlap_frame(analysis_results):
    """Classify each variable as chosen by both models or by only one of them."""
    lasso_sel = analysis_results['lasso_var_selection_freq']
    elastic_sel = analysis_results['elastic_var_selection_freq']
    lasso_sig = analysis_results['lasso_var_significance_freq']
    elastic_sig = analysis_results['elastic_var_significance_freq']

    # Rows are built in dictionary order rather than by iterating sets, so the
    # pre-sort order no longer depends on string-hash randomisation.
    rows = []
    for var, lasso_freq in lasso_sel.items():
        in_both = var in elastic_sel
        rows.append({
            'variable': var,
            'selection_type': 'Both Models' if in_both else 'LASSO Only',
            'lasso_freq': lasso_freq,
            'elastic_freq': elastic_sel[var] if in_both else 0,
            'lasso_sig_freq': lasso_sig.get(var, 0),
            'elastic_sig_freq': elastic_sig.get(var, 0) if in_both else 0,
        })
    for var, elastic_freq in elastic_sel.items():
        if var in lasso_sel:
            continue
        rows.append({
            'variable': var,
            'selection_type': 'Elastic Net Only',
            'lasso_freq': 0,
            'elastic_freq': elastic_freq,
            'lasso_sig_freq': 0,
            'elastic_sig_freq': elastic_sig.get(var, 0),
        })

    # Explicit columns keep the multi-key sort valid when no variable was selected.
    frame = pd.DataFrame(rows, columns=['variable', 'selection_type', 'lasso_freq',
                                        'elastic_freq', 'lasso_sig_freq', 'elastic_sig_freq'])
    return frame.sort_values(['selection_type', 'lasso_freq', 'elastic_freq'],
                             ascending=[True, False, False])


def _describe(metric, model, values):
    """Return one summary row (mean, std, min, max) for a list of values."""
    return {
        'Metric': metric,
        'Model': model,
        'Mean': np.mean(values),
        'Std': np.std(values),
        'Min': np.min(values),
        'Max': np.max(values),
    }


def _summary_statistics_frame(results):
    """Summarise variables selected, post-selection OLS RMSE and R² per model."""
    # (metric label, LASSO result key, Elastic Net result key)
    metric_keys = [
        ('Variables Selected', 'lasso_n_selected', 'elastic_n_selected'),
        ('RMSE', 'rmse_lasso_ols', 'rmse_elastic_ols'),
        ('R²', 'r2_lasso_ols', 'r2_elastic_ols'),
    ]
    rows = []
    for metric, lasso_key, elastic_key in metric_keys:
        rows.append(_describe(metric, 'LASSO', [r[lasso_key] for r in results]))
        rows.append(_describe(metric, 'Elastic Net', [r[elastic_key] for r in results]))
    return pd.DataFrame(rows)


def _coefficients_frame(results):
    """Flatten per-window OLS coefficients, p-values and t-values into long form."""
    rows = []
    for r in results:
        # LASSO rows precede Elastic Net rows within each window.
        for model_label, prefix in (('LASSO (OLS)', 'lasso'), ('Elastic Net (OLS)', 'elastic')):
            pvalues = r[f'{prefix}_ols_pvalues']
            tvalues = r[f'{prefix}_ols_tvalues']
            for var, coef in r[f'{prefix}_ols_coefficients'].items():
                rows.append({
                    'Window ID': r['window_id'],
                    'Start Date': r['start_date'],
                    'End Date': r['end_date'],
                    'Model': model_label,
                    'Variable': var,
                    'Coefficient': coef,
                    'P-value': pvalues.get(var),
                    'T-value': tvalues.get(var),
                })
    # Explicit columns so the sheet keeps its header even with no coefficients.
    return pd.DataFrame(rows, columns=['Window ID', 'Start Date', 'End Date', 'Model',
                                       'Variable', 'Coefficient', 'P-value', 'T-value'])


def save_combined_results(results, analysis_results, lasso_lambda, elastic_alpha, elastic_l1_ratio,
                         filename='combined_lasso_elastic_net_results.xlsx'):
    """Export combined LASSO and Elastic Net results to a nine-sheet Excel file.

    Parameters
    ----------
    results : sequence of dict
        One record per rolling window. Each record is read for its window
        identifiers, dates, fit metrics, model parameters and the
        ``*_ols_coefficients`` / ``*_ols_pvalues`` / ``*_ols_tvalues`` mappings.
    analysis_results : dict
        Must contain ``lasso_var_selection_freq``, ``elastic_var_selection_freq``,
        ``lasso_var_significance_freq`` and ``elastic_var_significance_freq``,
        each mapping a variable name to a frequency.
    lasso_lambda, elastic_alpha, elastic_l1_ratio : float
        Model parameters reported on the ``Model_Comparison`` sheet.
    filename : str
        Path of the workbook to write (via the ``openpyxl`` engine).

    Raises
    ------
    ValueError
        If ``results`` is empty; the date range and summary statistics cannot
        be computed without at least one window.
    """
    # len() rather than truthiness so any sequence type is accepted.
    if len(results) == 0:
        raise ValueError("Cannot save combined results: 'results' contains no windows")

    print("\nSaving combined results...")

    # Every sheet is built before the workbook is opened, so a malformed
    # record fails here instead of after the output file has been created.
    sheets = [
        ('Performance_Comparison', _performance_frame(results)),
        ('LASSO_Selection_Frequency',
         _frequency_frame(analysis_results['lasso_var_selection_freq'])),
        ('ElasticNet_Selection_Frequency',
         _frequency_frame(analysis_results['elastic_var_selection_freq'])),
        ('LASSO_Combined_Rankings',
         _combined_ranking_frame(analysis_results['lasso_var_selection_freq'],
                                 analysis_results['lasso_var_significance_freq'])),
        ('ElasticNet_Combined_Rankings',
         _combined_ranking_frame(analysis_results['elastic_var_selection_freq'],
                                 analysis_results['elastic_var_significance_freq'])),
        ('Model_Comparison',
         _comparison_frame(results, lasso_lambda, elastic_alpha, elastic_l1_ratio)),
        ('Variable_Overlap_Analysis', _overlap_frame(analysis_results)),
        ('Summary_Statistics', _summary_statistics_frame(results)),
        ('Detailed_Coefficients', _coefficients_frame(results)),
    ]

    with pd.ExcelWriter(filename, engine='openpyxl') as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"✓ Combined results saved to: {filename}")
    print("✓ File contains 9 comprehensive sheets with LASSO vs Elastic Net analysis")
    print("  - Performance_Comparison: Side-by-side model performance metrics")
    print("  - LASSO_Selection_Frequency: How often variables selected by LASSO")
    print("  - ElasticNet_Selection_Frequency: How often variables selected by Elastic Net")
    print("  - LASSO_Combined_Rankings: LASSO selection × significance scores")
    print("  - ElasticNet_Combined_Rankings: Elastic Net selection × significance scores")
    print("  - Model_Comparison: Parameters and average performance metrics")
    print("  - Variable_Overlap_Analysis: Which variables each model prefers")
    print("  - Summary_Statistics: Descriptive statistics for key metrics")
    print("  - Detailed_Coefficients: Coefficients for each variable in each window")

Execution function

In [None]:
def main(filename, *, window_length=62, step_size=6):
    """Execute combined rolling window LASSO and Elastic Net analysis.

    Runs the full pipeline in order: load the data, build the rolling
    windows, identify the variables, estimate the global penalty parameters,
    fit both models on every window, analyse the results, plot them and
    export them to Excel.

    Parameters
    ----------
    filename : str
        Path of the Excel dataset passed to ``load_and_prepare_data``.
    window_length : int, keyword-only
        Window length forwarded to ``create_rolling_windows`` (default 62).
    step_size : int, keyword-only
        Step between consecutive windows forwarded to
        ``create_rolling_windows`` (default 6).

    Returns
    -------
    tuple
        ``(results, analysis_results, lasso_lambda, elastic_alpha,
        elastic_l1_ratio)``.
    """
    print("COMBINED LASSO AND ELASTIC NET ROLLING WINDOW ANALYSIS")
    print("="*70)

    # Load preprocessed data
    df = load_and_prepare_data(filename)

    # Create temporal windows for regime analysis
    windows = create_rolling_windows(df, window_length=window_length, step_size=step_size)

    # Identify variables and categorize by type
    target_var, predictor_vars, traditional_vars, gt_vars = prepare_variables(df)

    # Estimate optimal parameters for both models
    lasso_lambda, elastic_alpha, elastic_l1_ratio = estimate_global_parameters(df, target_var, predictor_vars)

    # Apply combined analysis across all windows
    results = run_rolling_combined_analysis(df, windows, target_var, predictor_vars,
                                          lasso_lambda, elastic_alpha, elastic_l1_ratio)

    # Analyze and compare both models
    analysis_results = analyze_combined_results(results, traditional_vars, gt_vars)

    # Create visualizations (the returned figure is not needed here)
    print("\nCreating visualization plots...")
    create_visualization_plots(results, analysis_results)

    # Export comprehensive comparison
    save_combined_results(results, analysis_results, lasso_lambda, elastic_alpha, elastic_l1_ratio)

    print("\n" + "="*70)
    print("COMBINED LASSO AND ELASTIC NET ANALYSIS COMPLETE!")
    print("="*70)
    print(f"✓ LASSO: λ = {lasso_lambda:.6f}")
    print(f"✓ Elastic Net: α = {elastic_alpha:.6f}, l1_ratio = {elastic_l1_ratio:.3f}")
    print(f"✓ Elastic Net Mix: {elastic_l1_ratio*100:.1f}% LASSO + {(1-elastic_l1_ratio)*100:.1f}% Ridge")
    print(f"✓ Processed {len(results)} rolling windows with both models")
    print("✓ Combined variable selection with OLS coefficient estimation")
    print("✓ Comprehensive model comparison results saved")
    print("✓ Visualization plots created")
    print("✓ Ready for regime-specific interpretation and model comparison")

    return results, analysis_results, lasso_lambda, elastic_alpha, elastic_l1_ratio

# Execute the combined analysis on the prepared dataset. main() returns the
# per-window results, the analysis summary and the three fitted penalty
# parameters, which are kept at notebook level for further inspection.
results, analysis_results, lasso_lambda, elastic_alpha, elastic_l1_ratio = main('final_dataset_with_lags_stl.xlsx')