In [17]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from typing import Dict, List, Tuple

def _epoch_number(epoch_file: Path) -> int:
    """Return the integer ``N`` encoded in an ``epoch_N.npy`` file name."""
    # Parse the file name only: searching the whole path string for "epoch"
    # picks the wrong offset when a parent directory name contains that word.
    return int(epoch_file.stem[len('epoch_'):])


def analyze_experiments(experiments_path: str = "experiments", skip_models: list = ['LightGRU']) -> pd.DataFrame:
    """
    Analyze all experiments and create comprehensive pandas table.

    The directory is walked as ``<experiments_path>/<model>/<test_idx>/``.
    Each test directory may hold ``epoch_<N>.npy`` prediction dumps and an
    optional ``times.json``. Hidden entries (names starting with ``.``) and
    plain files are ignored at both levels.

    Args:
        experiments_path: Path to experiments directory
        skip_models: Model directory names to leave out of the table. The
            default list is only read, never mutated.

    Returns:
        DataFrame with one row per experiment that has at least one epoch
        file: model name, test index, ``n_epochs`` (1-based position of the
        epoch with the lowest test MAPE among the sorted epoch files), the
        error metrics, the derived time metrics and the raw ``times.json``
        entries. Empty when nothing could be loaded.

    Raises:
        FileNotFoundError: if ``experiments_path`` does not exist.
    """
    experiments_path = Path(experiments_path)
    skipped = skip_models if skip_models is not None else ()

    # Collect data from all experiments
    all_data = []

    # Sorted traversal keeps the row order reproducible across file systems.
    for model_dir in sorted(experiments_path.iterdir()):
        if not model_dir.is_dir() or model_dir.name.startswith('.'):
            continue

        model_name = model_dir.name
        if model_name in skipped:
            continue

        for test_dir in sorted(model_dir.iterdir()):
            if not test_dir.is_dir() or test_dir.name.startswith('.'):
                continue

            try:
                # Parsed inside the try block so that a stray non-numeric
                # directory is reported and skipped instead of aborting the
                # whole scan.
                test_idx = int(test_dir.name)

                # Load time metrics
                time_file = test_dir / 'times.json'
                if time_file.exists():
                    with open(time_file, 'r') as f:
                        time_data = json.load(f)
                else:
                    time_data = {}

                # Find epoch files, ordered by their numeric suffix
                epoch_files = sorted(
                    (f for f in test_dir.iterdir()
                     if f.name.startswith('epoch_') and f.suffix == '.npy'),
                    key=_epoch_number,
                )
                if not epoch_files:
                    continue

                n_epochs = 1
                first_epoch_data = np.load(epoch_files[0])
                if first_epoch_data.ndim == 2:
                    # 2-D dump (the ARIMA case): column 0 holds predictions,
                    # column 1 holds targets; there is no per-epoch history.
                    test_target = first_epoch_data[:, 1]
                    test_pred_last = first_epoch_data[:, 0]
                    train_metrics = {
                        'test_mse_best': calculate_mse(test_pred_last, test_target),
                        'test_mae_best': calculate_mae(test_pred_last, test_target),
                        'test_mape_best': calculate_mape(test_pred_last, test_target),
                    }
                    time_metrics = {}
                else:
                    test_target = first_epoch_data[-1, :, 1]

                    # Start from the first epoch so that a best epoch always
                    # exists, even when every MAPE is NaN or extremely large.
                    best_mape = float('inf')
                    best_epoch_data = first_epoch_data
                    for i, epoch_file in enumerate(epoch_files):
                        # The first file is already in memory; keep the best
                        # array around instead of re-reading it afterwards.
                        epoch_data = first_epoch_data if i == 0 else np.load(epoch_file)
                        test_pred_last = epoch_data[-1, :, 0].flatten()

                        epoch_mape = calculate_mape(test_pred_last, test_target)
                        if epoch_mape < best_mape:
                            best_mape = epoch_mape
                            best_epoch_data = epoch_data
                            n_epochs = i + 1

                    train_metrics = calculate_epoch_metrics(first_epoch_data, best_epoch_data)
                    # Calculate time efficiency
                    time_metrics = calculate_time_metrics(time_data, n_epochs)

                # Combine all data; raw time data goes last, so its keys win
                # on a name clash with the derived metrics.
                all_data.append({
                    'model': model_name,
                    'test_idx': test_idx,
                    'n_epochs': n_epochs,
                    **train_metrics,
                    **time_metrics,
                    **time_data,
                })

            except Exception as e:
                # Best effort: report the broken experiment and keep scanning.
                print(f"Error processing {model_dir.name}/{test_dir.name}: {e}")
                continue

    # Create DataFrame
    df = pd.DataFrame(all_data)

    # Calculate derived metrics if DataFrame is not empty
    if not df.empty:
        df = calculate_derived_metrics(df)

    return df


def calculate_epoch_metrics(first_epoch_data: np.ndarray, last_epoch_data: np.ndarray) -> Dict:
    """
    Score the first epoch against a later epoch (the caller passes the best one).

    Both arrays are indexed as [row, step, channel], where channel 0 holds
    predictions and channel 1 holds targets. Every row except the last is
    scored as "train"; the last row is scored as "test". Test targets are
    always taken from ``first_epoch_data``.

    Returns:
        Dict with MSE / MAE / MAPE for train and test (``*_first`` for the
        first epoch, ``*_best`` for the later one) plus the relative MSE
        improvement for each split.
    """
    train_rows = slice(None, -1)
    test_row = -1

    # Train split: all rows but the last
    first_train_pred = first_epoch_data[train_rows, :, 0]
    first_train_true = first_epoch_data[train_rows, :, 1]
    later_train_pred = last_epoch_data[train_rows, :, 0]
    later_train_true = last_epoch_data[train_rows, :, 1]

    # Test split: the last row only, flattened to one dimension
    first_test_pred = first_epoch_data[test_row, :, 0].flatten()
    test_true = first_epoch_data[test_row, :, 1].flatten()
    later_test_pred = last_epoch_data[test_row, :, 0].flatten()

    scores = {}

    # Training (validation) metrics - improvement from first to last
    scores['train_mse_first'] = calculate_mse(first_train_pred, first_train_true)
    scores['train_mse_best'] = calculate_mse(later_train_pred, later_train_true)
    scores['train_mae_first'] = calculate_mae(first_train_pred, first_train_true)
    scores['train_mae_best'] = calculate_mae(later_train_pred, later_train_true)
    scores['train_mape_first'] = calculate_mape(first_train_pred, first_train_true)
    scores['train_mape_best'] = calculate_mape(later_train_pred, later_train_true)

    # Testing metrics
    scores['test_mse_first'] = calculate_mse(first_test_pred, test_true)
    scores['test_mse_best'] = calculate_mse(later_test_pred, test_true)
    scores['test_mae_first'] = calculate_mae(first_test_pred, test_true)
    scores['test_mae_best'] = calculate_mae(later_test_pred, test_true)
    scores['test_mape_first'] = calculate_mape(first_test_pred, test_true)
    scores['test_mape_best'] = calculate_mape(later_test_pred, test_true)

    # Improvement ratios
    scores['train_mse_improvement'] = calculate_improvement(
        first_train_pred, later_train_pred, first_train_true, later_train_true
    )
    scores['test_mse_improvement'] = calculate_improvement(
        first_test_pred, later_test_pred, test_true, None
    )

    return scores


def calculate_time_metrics(time_data: Dict, n_epochs: int) -> Dict:
    """
    Calculate time-related metrics.

    Args:
        time_data: Timing entries; the keys ``full_time``,
            ``epoch_train_time`` and ``epoch_test_time_one_step`` are read,
            each defaulting to 0 when missing. ``full_time`` is presumably
            in seconds (it is divided by 60 to get minutes) - confirm
            against the code that writes ``times.json``.
        n_epochs: Number of epochs the total time is spread over.

    Returns:
        Empty dict when ``time_data`` is empty; otherwise a dict with
        ``total_time_minutes``, ``epoch_train_time``, ``one_step_time``,
        ``epoch_total_time`` and ``speed_epochs_per_minute``. The last two
        are 0 when they cannot be computed (no epochs or no elapsed time).
    """
    if not time_data:
        return {}

    total_time = time_data.get('full_time', 0)
    epoch_train_time = time_data.get('epoch_train_time', 0)
    one_step_time = time_data.get('epoch_test_time_one_step', 0)

    # Guard the division the same way the speed below is guarded, so that
    # n_epochs == 0 yields 0 instead of raising ZeroDivisionError.
    epoch_total_time = total_time / n_epochs if n_epochs > 0 else 0

    return {
        'total_time_minutes': total_time / 60,
        'epoch_train_time': epoch_train_time,
        'one_step_time': one_step_time,
        'epoch_total_time': epoch_total_time,
        'speed_epochs_per_minute': 60 / epoch_total_time if epoch_total_time > 0 else 0,
    }


def calculate_derived_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate additional derived metrics.

    Adds ``train_mse_improvement_pct`` and ``test_mse_improvement_pct``:
    the percentage drop of ``<split>_mse_best`` relative to
    ``<split>_mse_first``.

    Args:
        df: Experiment table. It is modified in place.

    Returns:
        The same DataFrame object with the two columns added. A column is
        filled with NaN when its source columns are missing, which happens
        when every loaded experiment is a 2-D dump that only reports
        ``test_*_best`` values.
    """
    # Improvement percentages
    for split in ('train', 'test'):
        first_col = f'{split}_mse_first'
        best_col = f'{split}_mse_best'
        target_col = f'{split}_mse_improvement_pct'
        if first_col in df.columns and best_col in df.columns:
            df[target_col] = (1 - df[best_col] / df[first_col]) * 100
        else:
            # Nothing to compare against; keep the schema stable anyway.
            df[target_col] = np.nan

    return df


def calculate_mse(predictions: np.ndarray, targets: np.ndarray) -> float:
    """Calculate Mean Squared Error.

    Args:
        predictions: Predicted values (ndarray or any array-like).
        targets: Reference values, broadcastable against ``predictions``.

    Returns:
        Mean of the squared element-wise differences as a Python float.
    """
    # asarray lets plain lists/tuples through; ndarrays are used as they are
    # (no copy, same dtype), so results for array inputs do not change.
    errors = np.asarray(predictions) - np.asarray(targets)
    return float(np.mean(errors ** 2))


def calculate_mae(predictions: np.ndarray, targets: np.ndarray) -> float:
    """Calculate Mean Absolute Error.

    Args:
        predictions: Predicted values (ndarray or any array-like).
        targets: Reference values, broadcastable against ``predictions``.

    Returns:
        Mean of the absolute element-wise differences as a Python float.
    """
    # asarray lets plain lists/tuples through; ndarrays are used as they are
    # (no copy, same dtype), so results for array inputs do not change.
    errors = np.asarray(predictions) - np.asarray(targets)
    return float(np.mean(np.abs(errors)))


def calculate_mape(predictions: np.ndarray, targets: np.ndarray, eps: float = 1e-8) -> float:
    """Calculate Mean Absolute Percentage Error.

    Args:
        predictions: Predicted values (ndarray or any array-like).
        targets: Reference values, broadcastable against ``predictions``.
        eps: Added to ``|targets|`` in the denominator so that zero targets
            do not cause a division by zero.

    Returns:
        Mean of ``|predictions - targets| / (|targets| + eps)``, multiplied
        by 100, as a Python float.
    """
    # asarray lets plain lists/tuples through; ndarrays are used as they are
    # (no copy, same dtype), so results for array inputs do not change.
    predicted = np.asarray(predictions)
    actual = np.asarray(targets)
    relative_errors = np.abs((predicted - actual) / (np.abs(actual) + eps))
    return float(np.mean(relative_errors) * 100)


def calculate_improvement(pred_first: np.ndarray, pred_last: np.ndarray, target_first: np.ndarray, target_last: np.ndarray = None) -> float:
    """Relative MSE reduction between the first and the last epoch.

    Args:
        pred_first: Predictions of the first epoch.
        pred_last: Predictions of the last epoch.
        target_first: Targets matching ``pred_first``.
        target_last: Targets matching ``pred_last``; ``target_first`` is
            reused when this is None.

    Returns:
        ``(mse_first - mse_last) / mse_first``, or 0.0 when the first-epoch
        MSE is not strictly positive.
    """
    baseline_mse = calculate_mse(pred_first, target_first)
    later_targets = target_first if target_last is None else target_last
    later_mse = calculate_mse(pred_last, later_targets)

    # Guard clause: no meaningful ratio without a positive baseline.
    if not baseline_mse > 0:
        return 0.0
    return (baseline_mse - later_mse) / baseline_mse

In [18]:
# Scan the "experiments" directory into one row per experiment.
df = analyze_experiments(experiments_path="experiments")

# Show which metric columns were produced.
df.columns

Index(['model', 'test_idx', 'n_epochs', 'train_mse_first', 'train_mse_best',
       'train_mae_first', 'train_mae_best', 'train_mape_first',
       'train_mape_best', 'test_mse_first', 'test_mse_best', 'test_mae_first',
       'test_mae_best', 'test_mape_first', 'test_mape_best',
       'train_mse_improvement', 'test_mse_improvement', 'total_time_minutes',
       'epoch_train_time', 'one_step_time', 'epoch_total_time',
       'speed_epochs_per_minute', 'full_time', 'epoch_test_time_one_step',
       'last_loss', 'train_mse_improvement_pct', 'test_mse_improvement_pct'],
      dtype='object')

In [19]:
# Per-experiment summary, ordered by test index and model, rounded to one
# decimal and written without the row index.
(
    df.sort_values(by=['test_idx', 'model'])
      .loc[:, ['test_idx', 'model', 'train_mse_best', 'test_mape_best', 'n_epochs']]
      .round(decimals=1)
      .to_csv("full_results.csv", index=False)
)

In [20]:
# Average every metric per model. numeric_only=True keeps the aggregation
# working under pandas 2.x if a non-numeric column (e.g. a string entry
# coming from times.json) ever ends up in the table.
assessor = df.groupby('model').mean(numeric_only=True).drop(columns=['test_idx'])

# After reset_index() the row index is just a running number, so it is left
# out of the file (same convention as full_results.csv).
assessor.reset_index().round(3).to_csv("results.csv", index=False)

In [21]:
# Mean test MAPE of the best epoch, one value per model.
assessor.loc[:, 'test_mape_best']

model
SEM_laplace_2comp         16.655855
SEM_laplace_2comp_exp     15.265164
SEM_laplace_3comp          9.182456
SEM_logistic_2comp         6.495142
SEM_logistic_2comp_exp    16.199093
SEM_logistic_3comp         7.336591
SEM_normal_2comp           6.543650
SEM_normal_2comp_exp      15.107805
SEM_normal_3comp           5.558914
SEM_normal_3comp_exp      22.202337
SEM_student_2comp          4.661618
SEM_student_2comp_exp      4.584772
SEM_student_3comp         30.702066
SEM_student_3comp_exp     25.620490
Name: test_mape_best, dtype: float64

In [22]:
# Preview the per-model averages (first ten models).
assessor.iloc[:10]

Unnamed: 0_level_0,n_epochs,train_mse_first,train_mse_best,train_mae_first,train_mae_best,train_mape_first,train_mape_best,test_mse_first,test_mse_best,test_mae_first,...,total_time_minutes,epoch_train_time,one_step_time,epoch_total_time,speed_epochs_per_minute,full_time,epoch_test_time_one_step,last_loss,train_mse_improvement_pct,test_mse_improvement_pct
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SEM_laplace_2comp,5.0,148214.741078,15145.235218,249.28442,72.822086,404.475889,121.927928,48290.114878,153.700503,152.702807,...,1.342964,0.051903,0.010499,16.115562,3.723109,80.577811,0.010499,0.731954,89.781559,99.681714
SEM_laplace_2comp_exp,5.0,243.695473,320.592272,11.358747,14.05891,20.288203,24.716418,248.10484,140.295132,13.165772,...,1.427519,0.054269,0.011163,17.130227,3.502581,85.651133,0.011163,0.768642,-31.554464,43.453287
SEM_laplace_3comp,21.0,161.658296,257.838941,9.357643,12.002972,13.939887,20.887841,530.40721,51.681363,19.610118,...,1.268529,0.047975,0.00992,3.62437,16.554603,76.111764,0.00992,0.684984,-59.496264,90.256286
SEM_logistic_2comp,1.0,757.480161,757.480161,24.260605,24.260605,39.33731,39.33731,26.018573,26.018573,4.360494,...,1.092543,0.050508,0.008513,65.552575,0.915296,65.552575,0.008513,0.669944,0.0,0.0
SEM_logistic_2comp_exp,3.0,112.722596,163.541789,7.51729,8.932431,12.510962,15.404833,232.453644,155.119001,12.521109,...,1.243632,0.059117,0.009684,24.872641,2.412289,74.617923,0.009684,0.71609,-45.083412,33.268845
SEM_logistic_3comp,25.0,1116.973485,720.565796,30.21876,23.433479,50.069068,38.532697,64.155309,37.958692,6.552034,...,1.034243,0.047348,0.00806,2.482184,24.172263,62.054595,0.00806,0.794031,35.489445,40.833124
SEM_normal_2comp,23.0,506.999148,516.94983,18.876691,19.153892,30.054098,30.816767,37.32939,25.885198,5.099156,...,1.210877,0.061477,0.009417,3.158809,18.994499,72.652612,0.009417,0.742465,-1.962663,30.657324
SEM_normal_2comp_exp,3.0,82.142787,136.284884,5.970902,8.359532,9.64994,13.664767,193.034061,133.602123,11.514507,...,1.025445,0.049375,0.007984,20.508893,2.92556,61.52668,0.007984,0.775533,-65.912174,30.788317
SEM_normal_3comp,5.0,434952.71067,1498.563394,403.133692,31.788313,641.512488,55.554575,267695.515343,28.798008,331.644226,...,1.278983,0.062859,0.009954,15.347796,3.909356,76.738979,0.009954,0.713052,99.655465,99.989242
SEM_normal_3comp_exp,2.0,56.852166,60.190437,4.958262,4.684721,8.203621,7.753082,358.205061,265.342523,16.578499,...,1.124727,0.05395,0.008757,33.741816,1.778209,67.483633,0.008757,0.68152,-5.871844,25.924407


In [23]:
# Keep only the headline columns: accuracy, best-epoch position and run time.
res_df = assessor.loc[:, [
    'test_mape_best',
    'train_mape_best',
    'test_mse_best',
    'n_epochs',
    'total_time_minutes',
]]

res_df.head(n=5)

Unnamed: 0_level_0,test_mape_best,train_mape_best,test_mse_best,n_epochs,total_time_minutes
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SEM_laplace_2comp,16.655855,121.927928,153.700503,5.0,1.342964
SEM_laplace_2comp_exp,15.265164,24.716418,140.295132,5.0,1.427519
SEM_laplace_3comp,9.182456,20.887841,51.681363,21.0,1.268529
SEM_logistic_2comp,6.495142,39.33731,26.018573,1.0,1.092543
SEM_logistic_2comp_exp,16.199093,15.404833,155.119001,3.0,1.243632
