In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from typing import Dict, List, Tuple

def analyze_experiments(experiments_path: str = "experiments", skip_models: list = ['LightGRU']) -> pd.DataFrame:
    """
    Analyze all experiments and create comprehensive pandas table.
    
    Args:
        experiments_path: Path to experiments directory
        
    Returns:
        DataFrame with metrics for all experiments
    """
    experiments_path = Path(experiments_path)
    
    # Collect data from all experiments
    all_data = []
    
    for model_dir in experiments_path.iterdir():
        if not model_dir.is_dir() or model_dir.name.startswith('.'):
            continue
            
        model_name = model_dir.name
        if model_name in skip_models:
            continue
        
        for test_dir in model_dir.iterdir():
            if not test_dir.is_dir() or test_dir.name.startswith('.'):
                continue
                
            test_idx = int(test_dir.name)
            
            try:
                # Load time metrics
                time_file = test_dir / 'times.json'
                if time_file.exists():
                    with open(time_file, 'r') as f:
                        time_data = json.load(f)
                else:
                    time_data = {}
                
                # Find epoch files and calculate training metrics
                epoch_files = sorted([f for f in test_dir.iterdir() 
                                    if f.name.startswith('epoch_') and f.suffix == '.npy'])
                
                if epoch_files:
                    # Calculate convergence speed
                    n_epochs = len(epoch_files)
                    first_epoch_data = np.load(epoch_files[0])
                    test_target = first_epoch_data[-1, :, 1].flatten()

                    min_metrics = 1000
                    res_file = ''
                    for i, epoch_file in enumerate(epoch_files):
                        # Calculate metrics
                        last_epoch_data = np.load(epoch_file)
                        train_pred_last = last_epoch_data[:-1, :, 0]
                        train_target_last = last_epoch_data[:-1, :, 1]
                        test_pred_last = last_epoch_data[-1, :, 0].flatten()

                        test = 0
                        # test += calculate_mse(train_pred_last, train_target_last) / 200
                        # test += calculate_mae(test_pred_last, test_target)
                        test += calculate_mape(test_pred_last, test_target)
                        if test < min_metrics:
                            min_metrics = test
                            res_file = epoch_file
                            n_epochs = i + 1
                    
                    best_epoch_data = np.load(res_file)
                    train_metrics = calculate_epoch_metrics(first_epoch_data, best_epoch_data)
                    # Calculate time efficiency
                    time_metrics = calculate_time_metrics(time_data, n_epochs)
                    
                    # Combine all data
                    experiment_data = {
                        'model': model_name,
                        'test_idx': test_idx,
                        'n_epochs': n_epochs,
                        **train_metrics,
                        **time_metrics,
                        **time_data  # Add raw time data
                    }
                    
                    all_data.append(experiment_data)
                    
            except Exception as e:
                print(f"Error processing {model_dir.name}/{test_dir.name}: {e}")
                continue
    
    # Create DataFrame
    df = pd.DataFrame(all_data)
    
    # Calculate derived metrics if DataFrame is not empty
    if not df.empty:
        df = calculate_derived_metrics(df)
    
    return df


def calculate_epoch_metrics(first_epoch_data: np.ndarray, last_epoch_data: np.ndarray) -> Dict:
    """
    Calculate training metrics from first and last epoch.
    """
    # Extract predictions and targets
    # Shape: [n_tests+1, n_steps, 2] where last dim: 0=predictions, 1=targets
    
    train_pred_first = first_epoch_data[:-1, :, 0]
    train_target_first = first_epoch_data[:-1, :, 1]
    
    train_pred_last = last_epoch_data[:-1, :, 0]
    train_target_last = last_epoch_data[:-1, :, 1]

    test_pred_first = first_epoch_data[-1, :, 0].flatten()
    test_target = first_epoch_data[-1, :, 1].flatten()
    
    test_pred_last = last_epoch_data[-1, :, 0].flatten()
    
    # Calculate metrics
    metrics = {
        # Training (validation) metrics - improvement from first to last
        'train_mse_first': calculate_mse(train_pred_first, train_target_first),
        'train_mse_best': calculate_mse(train_pred_last, train_target_last),
        'train_mae_first': calculate_mae(train_pred_first, train_target_first),
        'train_mae_best': calculate_mae(train_pred_last, train_target_last),
        'train_mape_first': calculate_mape(train_pred_first, train_target_first),
        'train_mape_best': calculate_mape(train_pred_last, train_target_last),
        
        # Testing metrics
        'test_mse_first': calculate_mse(test_pred_first, test_target),
        'test_mse_best': calculate_mse(test_pred_last, test_target),
        'test_mae_first': calculate_mae(test_pred_first, test_target),
        'test_mae_best': calculate_mae(test_pred_last, test_target),
        'test_mape_first': calculate_mape(test_pred_first, test_target),
        'test_mape_best': calculate_mape(test_pred_last, test_target),
        
        # Improvement ratios
        'train_mse_improvement': calculate_improvement(train_pred_first, train_pred_last, train_target_first, train_target_last),
        'test_mse_improvement': calculate_improvement(test_pred_first, test_pred_last, test_target, None),
    }
    
    return metrics


def calculate_time_metrics(time_data: Dict, n_epochs: int) -> Dict:
    """
    Calculate time-related metrics.
    """
    metrics = {}
    
    if time_data:
        total_time = time_data.get('full_time', 0)
        epoch_train_time = time_data.get('epoch_train_time', 0)
        one_step_time = time_data.get('epoch_test_time_one_step', 0)
        epoch_total_time = total_time / n_epochs
        
        metrics.update({
            'total_time_minutes': total_time / 60,
            'epoch_train_time': epoch_train_time,
            'one_step_time': one_step_time,
            'epoch_total_time': epoch_total_time,
            'speed_epochs_per_minute': 60 / epoch_total_time if epoch_total_time > 0 else 0,
        })
    
    return metrics


def calculate_derived_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate additional derived metrics.
    """
    # Improvement percentages
    df['train_mse_improvement_pct'] = (1 - df['train_mse_best'] / df['train_mse_first']) * 100
    df['test_mse_improvement_pct'] = (1 - df['test_mse_best'] / df['test_mse_first']) * 100
    
    return df


def calculate_mse(predictions: np.ndarray, targets: np.ndarray) -> float:
    """Calculate Mean Squared Error."""
    return float(np.mean((predictions - targets) ** 2))


def calculate_mae(predictions: np.ndarray, targets: np.ndarray) -> float:
    """Calculate Mean Absolute Error."""
    return float(np.mean(np.abs(predictions - targets)))


def calculate_mape(predictions: np.ndarray, targets: np.ndarray, eps: float = 1e-8) -> float:
    """Calculate Mean Absolute Percentage Error."""
    return float(np.mean(np.abs((predictions - targets) / (np.abs(targets) + eps))) * 100)


def calculate_improvement(pred_first: np.ndarray, pred_last: np.ndarray, target_first: np.ndarray, target_last: np.ndarray = None) -> float:
    """Calculate MSE improvement from first to last epoch."""
    mse_first = calculate_mse(pred_first, target_first)
    if target_last is None:
        target_last = target_first
    mse_last = calculate_mse(pred_last, target_last)
    
    if mse_first > 0:
        return (mse_first - mse_last) / mse_first
    return 0.0

In [2]:
df = analyze_experiments("experiments")
df.columns

Index(['model', 'test_idx', 'n_epochs', 'train_mse_first', 'train_mse_best',
       'train_mae_first', 'train_mae_best', 'train_mape_first',
       'train_mape_best', 'test_mse_first', 'test_mse_best', 'test_mae_first',
       'test_mae_best', 'test_mape_first', 'test_mape_best',
       'train_mse_improvement', 'test_mse_improvement', 'total_time_minutes',
       'epoch_train_time', 'one_step_time', 'epoch_total_time',
       'speed_epochs_per_minute', 'full_time', 'epoch_test_time_one_step',
       'last_loss', 'train_mse_improvement_pct', 'test_mse_improvement_pct'],
      dtype='object')

In [3]:
assessor = df.groupby('model').mean().drop(['test_idx'], axis=1)

assessor.reset_index().to_csv("results.csv")

In [5]:
assessor.head()

Unnamed: 0_level_0,n_epochs,train_mse_first,train_mse_best,train_mae_first,train_mae_best,train_mape_first,train_mape_best,test_mse_first,test_mse_best,test_mae_first,...,total_time_minutes,epoch_train_time,one_step_time,epoch_total_time,speed_epochs_per_minute,full_time,epoch_test_time_one_step,last_loss,train_mse_improvement_pct,test_mse_improvement_pct
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EM_normal_diff,17.5,263.841557,296.333622,11.533698,11.464747,141.170152,132.271613,87.228729,53.400398,7.186141,...,10.065883,1.079006,0.004016,102.055537,1.733319,603.952957,0.004016,0.021236,-10.073764,34.314042
SEM_laplace,17.3,252.806636,295.040392,11.858149,12.016709,177.739101,159.600375,146.970982,62.641869,9.525366,...,15.86971,0.187181,0.008036,195.770936,1.081148,952.182573,0.008036,0.008744,-28.064181,55.085619
SEM_normal,18.1,293.397692,323.645966,13.008472,12.961322,198.553974,194.790236,243.340964,100.661437,11.844473,...,13.180856,0.18634,0.006597,75.430065,1.381772,790.851386,0.006597,0.008914,-8.43472,64.112877
SEM_student,26.0,267.470449,298.282945,12.205852,12.211025,190.064071,164.230948,200.080984,62.660456,10.5156,...,29.368902,0.187788,0.014196,107.223327,0.88789,1762.134106,0.014196,0.009106,-19.2546,58.500744


In [6]:
res_df = assessor[['test_mape_best', 'train_mape_best', 'test_mse_best', 'n_epochs', 'total_time_minutes']]

res_df.head()

Unnamed: 0_level_0,test_mape_best,train_mape_best,test_mse_best,n_epochs,total_time_minutes
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
EM_normal_diff,16.152443,132.271613,53.400398,17.5,10.065883
SEM_laplace,28.625259,159.600375,62.641869,17.3,15.86971
SEM_normal,14.782695,194.790236,100.661437,18.1,13.180856
SEM_student,12.58749,164.230948,62.660456,26.0,29.368902
