# Time Series Forecasting Results Visualization

This notebook automatically loads all experimental results and provides flexible visualization options.

## Workflow:
1. **Load all results** - Automatically scan and load all available results
2. **Configure filters** - Select specific datasets/models/settings to visualize
3. **Visualize** - Generate comparison tables and plots

In [None]:
import json
import os
import warnings
from glob import glob
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning for the rest of the session (presumably to keep the
# notebook output readable). NOTE(review): this also hides deprecation and
# numerical warnings raised by pandas/numpy/matplotlib in later cells.
warnings.filterwarnings('ignore')

# Set plotting style
# NOTE(review): the 'seaborn-v0_8-*' style names are tied to the installed
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("Libraries loaded successfully!")

## 1. Load All Results

Automatically scan the results directory and load all available experimental results.

In [None]:
# Order of the values stored in metrics.npy and in each row of
# per_channel_metrics.npy.
_METRIC_NAMES = ('MAE', 'MSE', 'RMSE', 'MAPE', 'MSPE')


def _sorted_subdirs(path):
    """Return the sub-directories of *path* in name order (deterministic scan)."""
    return sorted(child for child in path.iterdir() if child.is_dir())


def _load_channel_names(length_dir):
    """Return channel names from the first usable ``*_CD_results.json`` file.

    The names are the keys of the file's ``per_channel`` mapping. Returns
    ``None`` when no file in *length_dir* provides such a mapping.
    """
    for json_file in sorted(length_dir.glob('*_CD_results.json')):
        try:
            with open(json_file, 'r') as f:
                cd_results = json.load(f)
        except (OSError, ValueError) as e:
            # Best effort: a broken file must not abort the scan, but it
            # should not disappear silently either.
            print(f"Warning: could not read channel names from {json_file}: {e}")
            continue

        per_channel = cd_results.get('per_channel') if isinstance(cd_results, dict) else None
        if isinstance(per_channel, dict):
            return list(per_channel.keys())
    return None


def _parse_setting_name(setting_name):
    """Split a setting directory name into ``(mode, feature, repeat_num)``.

    The name is underscore-separated and must contain a ``CD`` or ``CI``
    token and a ``repeat<N>`` token. For CI, the feature is everything
    between the mode token and the first ``repeat`` token; for CD it is
    ``'all'``. Returns ``None`` when the name does not follow that layout.
    """
    parts = setting_name.split('_')
    if len(parts) < 3:
        return None

    mode_idx = next((i for i, part in enumerate(parts) if part in ('CD', 'CI')), None)
    if mode_idx is None:
        return None
    mode = parts[mode_idx]

    # The last "repeat..." token wins; the repeat id is kept as a string.
    repeat_num = next(
        (part[len('repeat'):] for part in reversed(parts) if part.startswith('repeat')),
        None,
    )
    if repeat_num is None:
        return None

    if mode != 'CI':
        return mode, 'all', repeat_num

    feature_parts = []
    for part in parts[mode_idx + 1:]:
        if part.startswith('repeat'):
            break
        feature_parts.append(part)
    feature = '_'.join(feature_parts) if feature_parts else 'unknown'
    return mode, feature, repeat_num


def _metric_row(identity, channel, repeat_num, values, scalability):
    """Build one metrics_df row; *values* is indexed in ``_METRIC_NAMES`` order."""
    row = {**identity, 'Channel': channel, 'Repeat': repeat_num}
    for idx, name in enumerate(_METRIC_NAMES):
        # Index explicitly so a too-short array raises instead of being
        # silently truncated.
        row[name] = values[idx]
    if scalability is not None:
        row['Inference_Time'] = scalability[0]
        row['Inference_Memory_GB'] = scalability[1]
    return row


def scan_and_load_all_results(base_results_path='./results'):
    """
    Scan the results directory and load all available results.

    Directory structure:
    results/{DATASET}/{seq_len}_{pred_len}/{MODEL}/{setting}/
        - metrics.npy              (required)
        - pred.npy                 (required)
        - true.npy                 (required)
        - per_channel_metrics.npy  (optional, used for CD mode)
        - scalability.npy          (optional)
        - input.npy                (optional)
    results/{DATASET}/{seq_len}_{pred_len}/*_CD_results.json
        - optional; the keys of its "per_channel" mapping name the channels

    Setting directories whose name cannot be parsed, or that miss one of the
    required files, are skipped. A setting that fails while loading is
    reported with a printed message and skipped. Directories are visited in
    name order so the row order of the returned DataFrame is reproducible.

    Args:
        base_results_path: Root directory of the results tree.

    Returns:
        results_dict: Nested dictionary
            {dataset: {length_config: {model: {mode: {feature: {repeat: data}}}}}}
            where data holds 'metrics', 'pred', 'true', 'input',
            'per_channel_metrics' and 'scalability' (optional ones are None
            when the file is absent).
        metrics_df: DataFrame with one 'overall' row per loaded setting plus,
            for CD settings with per-channel metrics and known channel names,
            one row per channel. The 'Inference_Time' and
            'Inference_Memory_GB' columns exist only if at least one setting
            provided scalability.npy. Empty (no columns) when the path does
            not exist or nothing could be loaded.
    """
    results_dict = {}
    metrics_list = []

    if not os.path.exists(base_results_path):
        print(f"Results path does not exist: {base_results_path}")
        return results_dict, pd.DataFrame()

    for dataset_dir in _sorted_subdirs(Path(base_results_path)):
        dataset_name = dataset_dir.name
        dataset_results = results_dict.setdefault(dataset_name, {})

        for length_dir in _sorted_subdirs(dataset_dir):
            length_config = length_dir.name
            try:
                seq_len, pred_len = map(int, length_config.split('_'))
            except ValueError:
                # Not a "{seq_len}_{pred_len}" directory.
                continue

            length_results = dataset_results.setdefault(length_config, {})
            channel_names = _load_channel_names(length_dir)

            for model_dir in _sorted_subdirs(length_dir):
                model_name = model_dir.name
                model_results = length_results.setdefault(model_name, {})

                for setting_dir in _sorted_subdirs(model_dir):
                    parsed = _parse_setting_name(setting_dir.name)
                    if parsed is None:
                        continue
                    mode, feature, repeat_num = parsed

                    feature_results = model_results.setdefault(mode, {}).setdefault(feature, {})

                    metrics_path = setting_dir / 'metrics.npy'
                    pred_path = setting_dir / 'pred.npy'
                    true_path = setting_dir / 'true.npy'
                    if not all(p.exists() for p in (metrics_path, pred_path, true_path)):
                        continue

                    input_path = setting_dir / 'input.npy'
                    per_channel_metrics_path = setting_dir / 'per_channel_metrics.npy'
                    scalability_path = setting_dir / 'scalability.npy'

                    # Boundary handler: one unreadable setting must not stop
                    # the scan of the remaining ones.
                    try:
                        metrics = np.load(str(metrics_path))
                        pred = np.load(str(pred_path))
                        true = np.load(str(true_path))
                        input_seq = np.load(str(input_path)) if input_path.exists() else None
                        per_channel_metrics = (
                            np.load(str(per_channel_metrics_path))
                            if per_channel_metrics_path.exists() else None
                        )
                        scalability = np.load(str(scalability_path)) if scalability_path.exists() else None

                        feature_results[repeat_num] = {
                            'metrics': metrics,
                            'pred': pred,
                            'true': true,
                            'input': input_seq,
                            'per_channel_metrics': per_channel_metrics,
                            'scalability': scalability
                        }

                        identity = {
                            'Dataset': dataset_name,
                            'Seq_Len': seq_len,
                            'Pred_Len': pred_len,
                            'Model': model_name,
                            'Mode': mode,
                            'Feature': feature,
                        }
                        metrics_list.append(
                            _metric_row(identity, 'overall', repeat_num, metrics, scalability)
                        )

                        # Per-channel rows need both the metrics and the names;
                        # scalability is repeated on every channel row.
                        if mode == 'CD' and per_channel_metrics is not None and channel_names is not None:
                            n_channels = min(per_channel_metrics.shape[0], len(channel_names))
                            for ch in range(n_channels):
                                metrics_list.append(
                                    _metric_row(
                                        identity, channel_names[ch], repeat_num,
                                        per_channel_metrics[ch], scalability,
                                    )
                                )
                    except Exception as e:
                        print(f"Error loading {setting_dir}: {e}")
                        continue

    metrics_df = pd.DataFrame(metrics_list)
    return results_dict, metrics_df

# Load all results once; the returned objects are reused by the later cells.
print("Scanning results directory...")
results_dict, all_metrics_df = scan_and_load_all_results('./results')

_banner_line = '=' * 80
print(f"\n{_banner_line}")
print(f"Loaded {len(all_metrics_df)} experimental results")
print(f"{_banner_line}\n")

if len(all_metrics_df) == 0:
    print("No results found! Please run experiments first.")
else:
    print("Available configurations:")
    print(f"\nDatasets: {all_metrics_df['Dataset'].unique().tolist()}")

    # One line per distinct (Seq_Len, Pred_Len) pair.
    print("\nLength configurations (Seq_Len, Pred_Len):")
    length_pairs = all_metrics_df[['Seq_Len', 'Pred_Len']].drop_duplicates()
    for _, pair in length_pairs.iterrows():
        print(f"  - {pair['Seq_Len']} → {pair['Pred_Len']}")

    for heading, column in (('Models', 'Model'), ('Modes', 'Mode')):
        print(f"\n{heading}: {all_metrics_df[column].unique().tolist()}")

    # 'all' is the placeholder feature used for CD rows, so leave it out here.
    ci_features = [name for name in all_metrics_df['Feature'].unique() if name != 'all']
    print(f"\nFeatures (CI mode): {ci_features}")

    print("\nTotal experiments by configuration:")
    count_keys = ['Dataset', 'Seq_Len', 'Pred_Len', 'Model', 'Mode', 'Feature', 'Channel']
    print(all_metrics_df.groupby(count_keys).size())

## 3. Metrics Summary Table

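Display summary statistics for all loaded results (this cell reads `all_metrics_df`; the filters from the configuration section are not applied here).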

**Note**: For CI mode, metrics are shown both per-feature and as an average across all features.

In [None]:
if len(all_metrics_df) == 0:
    print("\nNo filtered metrics to display.")
else:
    _RULE = "=" * 100
    _GROUP_KEYS = ['Dataset', 'Seq_Len', 'Pred_Len', 'Model', 'Mode', 'Feature']
    _METRICS = ('MAE', 'MSE', 'RMSE', 'MAPE', 'MSPE')
    # Same mean/std aggregation for every summary table below.
    _MEAN_STD = {name: ['mean', 'std'] for name in _METRICS}

    def _banner(title):
        """Print a section title framed by two rule lines."""
        print("\n" + _RULE)
        print(title)
        print(_RULE)

    def _mean_pm_std(values):
        """Format a Series as 'mean±std' with four decimals."""
        return f"{values.mean():.4f}±{values.std():.4f}"

    def _comparison_rows(frame, keys, channel=None):
        """Build comparison-table rows, one per group of *frame* over *keys*.

        When *channel* is None the channel label is read from the 7th group
        key; otherwise the given label is used for every row.
        """
        rows = []
        for key, members in frame.groupby(keys):
            rows.append({
                'Dataset': key[0],
                'Lengths': f"{key[1]}→{key[2]}",
                'Model': key[3],
                'Mode': key[4],
                'Feature': key[5],
                'Channel': key[6] if channel is None else channel,
                'MAE': _mean_pm_std(members['MAE']),
                'RMSE': _mean_pm_std(members['RMSE']),
                'MAPE': _mean_pm_std(members['MAPE'])
            })
        return rows

    _is_cd = all_metrics_df['Mode'] == 'CD'
    _is_overall = all_metrics_df['Channel'] == 'overall'

    # ========== CD Overall Summary ==========
    _banner("CD MODE: OVERALL METRICS (Mean ± Std across repeats)")
    cd_overall = all_metrics_df[_is_cd & _is_overall]
    if len(cd_overall) > 0:
        cd_overall_summary = cd_overall.groupby(_GROUP_KEYS).agg(_MEAN_STD).round(4)
        print(cd_overall_summary)
    else:
        print("No CD overall results found.")

    # ========== CD Per-Channel Summary ==========
    _banner("CD MODE: PER-CHANNEL METRICS (Mean ± Std across repeats)")
    cd_channels = all_metrics_df[_is_cd & ~_is_overall]
    if len(cd_channels) > 0:
        cd_channel_summary = cd_channels.groupby(_GROUP_KEYS + ['Channel']).agg(_MEAN_STD).round(4)
        print(cd_channel_summary)
    else:
        print("No CD per-channel results found.")

    # ========== CI Per-Feature Summary ==========
    _banner("CI MODE: PER-FEATURE METRICS (Mean ± Std across repeats)")
    ci_data = all_metrics_df[all_metrics_df['Mode'] == 'CI']
    if len(ci_data) > 0:
        ci_summary = ci_data.groupby(_GROUP_KEYS).agg(_MEAN_STD).round(4)
        print(ci_summary)
    else:
        print("No CI results found.")

    # ========== CI Average Calculation ==========
    if len(ci_data) > 0:
        _banner("CI MODE: AVERAGE ACROSS ALL FEATURES")

        # Pool every feature and every repeat of a model into one mean/std.
        ci_avg_list = []
        for (dataset, seq_len, pred_len, model), group in ci_data.groupby(['Dataset', 'Seq_Len', 'Pred_Len', 'Model']):
            entry = {
                'Dataset': dataset,
                'Seq_Len': seq_len,
                'Pred_Len': pred_len,
                'Model': model,
                'Mode': 'CI_avg',
                'Feature': 'all_features_avg',
            }
            for name in _METRICS:
                entry[f'{name}_mean'] = group[name].mean()
                entry[f'{name}_std'] = group[name].std()
            ci_avg_list.append(entry)

        ci_avg_df = pd.DataFrame(ci_avg_list)
        print(ci_avg_df.to_string(index=False))

    # ========== Simplified Comparison Table ==========
    _banner("SIMPLIFIED COMPARISON TABLE")

    comparison_data = []

    # CD overall rows, then CD per-channel rows (an empty frame adds nothing).
    comparison_data.extend(_comparison_rows(cd_overall, _GROUP_KEYS, channel='overall'))
    comparison_data.extend(_comparison_rows(cd_channels, _GROUP_KEYS + ['Channel']))

    if len(ci_data) > 0:
        # CI per-feature rows
        comparison_data.extend(_comparison_rows(ci_data, _GROUP_KEYS, channel='N/A'))

        # CI rows averaged across features
        for _, row in ci_avg_df.iterrows():
            comparison_data.append({
                'Dataset': row['Dataset'],
                'Lengths': f"{row['Seq_Len']}→{row['Pred_Len']}",
                'Model': row['Model'],
                'Mode': 'CI_avg',
                'Feature': 'all_features_avg',
                'Channel': 'N/A',
                'MAE': f"{row['MAE_mean']:.4f}±{row['MAE_std']:.4f}",
                'RMSE': f"{row['RMSE_mean']:.4f}±{row['RMSE_std']:.4f}",
                'MAPE': f"{row['MAPE_mean']:.4f}±{row['MAPE_std']:.4f}"
            })

    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.to_string(index=False))

## 2. Configuration for Visualization

Set filters to select which results to visualize. Set to `None` to include all.

In [None]:
# ========== VISUALIZATION CONFIGURATION ==========

# Filter by dataset (None for all datasets)
FILTER_DATASET = 'milano_6165'  # or None

# Filter by sequence lengths (None for all lengths)
FILTER_SEQ_LEN = 96  # or None
FILTER_PRED_LEN = 96  # or None

# Filter by models (None for all models)
FILTER_MODELS = ['Autoformer', 'SegRNN', 'TimeMixer', 'SCINet']  # or None

# Filter by modes (None for all modes)
FILTER_MODES = ['CD', 'CI']  # or None

# For CI mode, filter by features (None for all features)
FILTER_CI_FEATURES = None  # e.g., ['OT', 'smsin'] or None

# Number of samples to plot in detailed visualization
N_SAMPLES_TO_PLOT = 3

# Random seed for sample selection
RANDOM_SEED = 42

# ==================================================

# Seeds numpy's global generator, which the sample selection in the
# prediction plots draws from.
np.random.seed(RANDOM_SEED)

# Apply filters
filtered_df = all_metrics_df.copy()

# When nothing was loaded, all_metrics_df has no columns at all, so any
# column lookup below would raise KeyError; skip the filters in that case.
if not filtered_df.empty:
    if FILTER_DATASET is not None:
        filtered_df = filtered_df[filtered_df['Dataset'] == FILTER_DATASET]

    if FILTER_SEQ_LEN is not None:
        filtered_df = filtered_df[filtered_df['Seq_Len'] == FILTER_SEQ_LEN]

    if FILTER_PRED_LEN is not None:
        filtered_df = filtered_df[filtered_df['Pred_Len'] == FILTER_PRED_LEN]

    if FILTER_MODELS is not None:
        filtered_df = filtered_df[filtered_df['Model'].isin(FILTER_MODELS)]

    if FILTER_MODES is not None:
        filtered_df = filtered_df[filtered_df['Mode'].isin(FILTER_MODES)]

    if FILTER_CI_FEATURES is not None:
        # Keep all CD results and only specified CI features
        filtered_df = filtered_df[
            (filtered_df['Mode'] == 'CD') |
            ((filtered_df['Mode'] == 'CI') & (filtered_df['Feature'].isin(FILTER_CI_FEATURES)))
        ]

print(f"Filtered to {len(filtered_df)} results")
print(f"\nFiltered configurations:")
if len(filtered_df) > 0:
    print(filtered_df.groupby(['Dataset', 'Seq_Len', 'Pred_Len', 'Model', 'Mode', 'Feature']).size())
else:
    print("No results match the filters!")

## 6. Detailed Prediction Visualization with Input Sequence

Plot input sequence, ground truth, and predictions together.

In [None]:
def plot_predictions_with_input(results_dict, dataset, length_config, model, mode, feature='all', repeat='0', n_samples=3):
    """
    Plot input, predictions, and ground truth for a specific configuration.

    The arrays are read from
    ``results_dict[dataset][length_config][model][mode][feature][repeat]``
    and indexed as ``[sample, time step, feature]``. One subplot is drawn
    per (sample, feature) pair; the samples are drawn without replacement
    from numpy's global random generator.

    Args:
        results_dict: Nested results dictionary holding 'pred', 'true' and
            'input' arrays at the path above.
        dataset, length_config, model, mode, feature, repeat: Keys of the
            configuration to plot.
        n_samples: Maximum number of samples (subplot rows) to draw.

    Returns:
        None. Problems are reported with a printed message instead of an
        exception: a missing configuration, a missing input sequence, no
        samples to draw, or any error raised while plotting.
    """
    try:
        data = results_dict[dataset][length_config][model][mode][feature][repeat]
        pred = data['pred']
        true = data['true']
        input_seq = data['input']
    except KeyError as e:
        print(f"Error: Configuration not found - {e}")
        return

    if input_seq is None:
        print(f"Warning: No input sequence saved for {dataset}/{length_config}/{model}/{mode}/{feature}")
        return

    try:
        # Select random samples to plot
        n_total = pred.shape[0]
        n_samples = min(n_samples, n_total)
        if n_samples < 1:
            print(f"Warning: No samples to plot for {dataset}/{length_config}/{model}/{mode}/{feature}")
            return
        sample_indices = np.random.choice(n_total, n_samples, replace=False)

        # Determine number of features
        n_features = pred.shape[2]
        seq_len = input_seq.shape[1]
        pred_len = pred.shape[1]

        # squeeze=False always yields a 2-D array of axes, including the
        # 1x1 case where plt.subplots would otherwise return a bare Axes.
        fig, axes = plt.subplots(
            n_samples, n_features,
            figsize=(7*n_features, 4*n_samples),
            squeeze=False,
        )

        fig.suptitle(
            f'{dataset} - {model} - {mode} - {feature} (Repeat {repeat})\n'
            f'Input: {seq_len} steps → Prediction: {pred_len} steps',
            fontsize=16, y=0.998
        )

        # The time axes are the same for every subplot: the forecast starts
        # right after the last input step.
        input_time = np.arange(seq_len)
        output_time = np.arange(seq_len, seq_len + pred_len)

        for i, sample_idx in enumerate(sample_indices):
            for j in range(n_features):
                ax = axes[i, j]

                # Plot input sequence
                ax.plot(input_time, input_seq[sample_idx, :, j],
                       label='Input Sequence', color='gray', linewidth=2, alpha=0.7)

                # Plot ground truth
                ax.plot(output_time, true[sample_idx, :, j],
                       label='Ground Truth', color='blue', linewidth=2, alpha=0.7)

                # Plot prediction
                ax.plot(output_time, pred[sample_idx, :, j],
                       label='Prediction', color='red', linewidth=2, linestyle='--', alpha=0.7)

                # Add vertical line to separate input and output
                ax.axvline(x=seq_len, color='black', linestyle=':', linewidth=1.5, alpha=0.5)
                ax.text(seq_len, ax.get_ylim()[1], ' Forecast Start',
                       fontsize=9, verticalalignment='top')

                ax.set_xlabel('Time Step', fontsize=11)
                ax.set_ylabel('Value', fontsize=11)
                ax.set_title(f'Sample {sample_idx} - Feature {j}', fontsize=12)
                ax.legend(loc='best', fontsize=10)
                ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Best effort: a failed figure must not stop the calling loop.
        print(f"Error plotting: {e}")

# Plot predictions for a sample of filtered configurations
if len(filtered_df) == 0:
    print("No results to visualize.")
else:
    # One row per distinct configuration present in the filtered data
    config_keys = ['Dataset', 'Seq_Len', 'Pred_Len', 'Model', 'Mode', 'Feature']
    configs = filtered_df.groupby(config_keys).size().reset_index()

    # Only the first 3 configurations are plotted to keep the output short
    for _, config in configs.head(3).iterrows():
        dataset = config['Dataset']
        length_config = f"{config['Seq_Len']}_{config['Pred_Len']}"
        model = config['Model']
        mode = config['Mode']
        feature = config['Feature']

        try:
            # Walk down the nested dictionary; stop at the first missing level
            node = results_dict
            for level in (dataset, length_config, model, mode, feature):
                if level not in node:
                    node = None
                    break
                node = node[level]

            repeats = list(node.keys()) if node is not None else []
            if repeats:
                # The first stored repeat is the one that gets visualized
                repeat = repeats[0]
                print(f"\nPlotting: {dataset} - {length_config} - {model} - {mode} - {feature}")
                plot_predictions_with_input(
                    results_dict, dataset, length_config, model, mode, feature, repeat, N_SAMPLES_TO_PLOT
                )
        except Exception as e:
            print(f"Error accessing configuration: {e}")

## 7. Export Summary to CSV

Save the metrics summary to CSV files for further analysis. The export code is in the cell that follows the scalability analysis below.

## 8. Scalability Analysis

Analyze and compare scalability metrics (inference time and GPU memory usage during inference) across models and modes.

In [None]:
if len(filtered_df) > 0 and 'Inference_Time' in filtered_df.columns:
    _rule = "=" * 100

    # Only the 'overall' rows are used, for both the table and the figures
    plot_data = filtered_df[filtered_df['Channel'] == 'overall'].copy()

    print("\n" + _rule)
    print("SCALABILITY METRICS SUMMARY")
    print(_rule)

    # Mean/std per dataset, model and mode
    scalability_summary = plot_data.groupby(['Dataset', 'Model', 'Mode']).agg({
        'Inference_Time': ['mean', 'std'],
        'Inference_Memory_GB': ['mean', 'std']
    }).round(4)

    print(scalability_summary)

    # Visualization: bar charts on the top row, scatter plots on the bottom row
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    models = plot_data['Model'].unique()
    x = np.arange(len(models))
    width = 0.35

    def _mean_per_model(mode_name, column):
        """Mean of *column* per model for one mode; 0 where the model has no rows."""
        means = []
        for model_name in models:
            selected = plot_data[(plot_data['Model'] == model_name) & (plot_data['Mode'] == mode_name)][column]
            means.append(selected.mean() if len(selected) > 0 else 0)
        return means

    # 1./2. Grouped bars: CD on the left, CI on the right of each model tick
    _bar_panels = [
        (axes[0, 0], 'Inference_Time', 'Inference Time (s)', 'Inference Time Comparison'),
        (axes[0, 1], 'Inference_Memory_GB', 'GPU Memory (GB)', 'Inference Memory Comparison'),
    ]
    for ax, column, y_label, title in _bar_panels:
        ax.bar(x - width/2, _mean_per_model('CD', column), width, label='CD', alpha=0.8, color='skyblue')
        ax.bar(x + width/2, _mean_per_model('CI', column), width, label='CI', alpha=0.8, color='lightcoral')

        ax.set_ylabel(y_label, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(models, rotation=45, ha='right')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

    # 3./4. Cost vs MAE trade-off: one point per row, labelled with its model
    _scatter_panels = [
        (axes[1, 0], 'Inference_Time', 'Inference Time (s)', 'Inference Time vs MAE Trade-off'),
        (axes[1, 1], 'Inference_Memory_GB', 'GPU Memory (GB)', 'Memory vs MAE Trade-off'),
    ]
    for ax, column, x_label, title in _scatter_panels:
        for mode in ('CD', 'CI'):
            mode_data = plot_data[plot_data['Mode'] == mode]
            if len(mode_data) == 0:
                continue

            ax.scatter(mode_data[column], mode_data['MAE'],
                       label=mode, s=100, alpha=0.6)

            for _, row in mode_data.iterrows():
                ax.annotate(row['Model'],
                            (row[column], row['MAE']),
                            fontsize=8, alpha=0.7)

        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('MAE', fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Detailed comparison table
    print("\n" + _rule)
    print("SCALABILITY DETAILED COMPARISON")
    print(_rule)

    def _fmt_mean_std(values):
        """Format a Series as 'mean±std' with four decimals."""
        return f"{values.mean():.4f}±{values.std():.4f}"

    comparison_data = []
    for key, members in plot_data.groupby(['Dataset', 'Model', 'Mode']):
        comparison_data.append({
            'Dataset': key[0],
            'Model': key[1],
            'Mode': key[2],
            'Inference_Time(s)': _fmt_mean_std(members['Inference_Time']),
            'Inference_Memory(GB)': _fmt_mean_std(members['Inference_Memory_GB']),
            'MAE': _fmt_mean_std(members['MAE']),
            'RMSE': _fmt_mean_std(members['RMSE'])
        })

    comparison_df_scalability = pd.DataFrame(comparison_data)
    print(comparison_df_scalability.to_string(index=False))
else:
    print("No scalability metrics available or no filtered data.")

In [None]:
if len(filtered_df) > 0:
    # Create filename based on filters
    filename_parts = ['results_summary']
    if FILTER_DATASET:
        filename_parts.append(FILTER_DATASET)
    if FILTER_SEQ_LEN and FILTER_PRED_LEN:
        filename_parts.append(f"{FILTER_SEQ_LEN}_{FILTER_PRED_LEN}")

    base_filename = '_'.join(filename_parts)

    # Save detailed metrics (filtered rows only)
    detailed_output_path = f'./{base_filename}_detailed.csv'
    filtered_df.to_csv(detailed_output_path, index=False)
    print(f"Detailed metrics saved to: {detailed_output_path}")

    # Save comparison table, if the summary cell has been run
    if 'comparison_df' in locals() and len(comparison_df) > 0:
        comparison_output_path = f'./{base_filename}_comparison.csv'
        comparison_df.to_csv(comparison_output_path, index=False)
        print(f"Comparison table saved to: {comparison_output_path}")

    # Save overall summary (CD Overall + CI Average) for paper_viz.
    # This part reads all_metrics_df, i.e. it ignores the filters, and always
    # writes to the fixed path './results_summary.csv'.
    summary_keys = ['Dataset', 'Seq_Len', 'Pred_Len', 'Model']

    # The two scalability columns are only present when at least one loaded
    # run provided scalability data; aggregating a missing column would raise
    # KeyError, so only the columns that exist are averaged.
    summary_agg = {
        column: 'mean'
        for column in (
            'MAE', 'MSE', 'RMSE', 'MAPE', 'MSPE',
            'Inference_Time', 'Inference_Memory_GB',
        )
        if column in all_metrics_df.columns
    }

    overall_summary_list = []

    # Add CD overall results
    cd_overall = all_metrics_df[(all_metrics_df['Mode'] == 'CD') & (all_metrics_df['Channel'] == 'overall')]
    if len(cd_overall) > 0:
        cd_summary = cd_overall.groupby(summary_keys).agg(summary_agg).reset_index()
        cd_summary['Mode'] = 'CD'
        overall_summary_list.append(cd_summary)

    # Add CI average results (mean over every feature and repeat of a model)
    ci_data = all_metrics_df[all_metrics_df['Mode'] == 'CI']
    if len(ci_data) > 0:
        ci_avg = ci_data.groupby(summary_keys).agg(summary_agg).reset_index()
        ci_avg['Mode'] = 'CI'
        overall_summary_list.append(ci_avg)

    # Combine and save
    if overall_summary_list:
        overall_summary_df = pd.concat(overall_summary_list, ignore_index=True)
        overall_output_path = './results_summary.csv'
        overall_summary_df.to_csv(overall_output_path, index=False)
        print(f"Overall summary saved to: {overall_output_path}")
        print(f"  - Includes {len(overall_summary_df[overall_summary_df['Mode'] == 'CD'])} CD results")
        print(f"  - Includes {len(overall_summary_df[overall_summary_df['Mode'] == 'CI'])} CI results")
else:
    print("No metrics to export.")