In [13]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from typing import Dict, List, Tuple

def analyze_experiments(experiments_path: str = "experiments", skip_models: list = ['LightGRU'], skip_series: list = [1, 2, 5, 8]) -> pd.DataFrame:
    """
    Analyze all experiments and create comprehensive pandas table.

    The directory tree is walked as ``<experiments_path>/<model>/<test_idx>/``.
    Each run directory may contain a ``times.json`` file and ``epoch_<k>.npy``
    dumps; runs without any epoch dump contribute no row.

    Args:
        experiments_path: Path to experiments directory.
        skip_models: Model directory names to ignore. Only read, never mutated.
        skip_series: Series indices to ignore; compared against the run
            directory name parsed as an integer. Only read, never mutated.

    Returns:
        DataFrame with metrics for all experiments (one row per processed
        run). Empty when no run produced data.
    """
    experiments_path = Path(experiments_path)

    # Collect data from all experiments
    all_data = []

    for model_dir in experiments_path.iterdir():
        if not model_dir.is_dir() or model_dir.name.startswith('.'):
            continue

        model_name = model_dir.name
        if model_name in skip_models:
            continue

        for test_dir in model_dir.iterdir():
            if not test_dir.is_dir() or test_dir.name.startswith('.'):
                continue

            # Parse the WHOLE directory name: looking only at the last
            # character would confuse series 12 with series 2, and a
            # non-numeric directory (e.g. "logs") must not abort the scan.
            try:
                test_idx = int(test_dir.name)
            except ValueError:
                continue
            if test_idx in skip_series:
                continue

            try:
                # Load time metrics
                time_file = test_dir / 'times.json'
                if time_file.exists():
                    with open(time_file, 'r') as f:
                        time_data = json.load(f)
                else:
                    time_data = {}

                # Find epoch files, ordered by the number in the file name
                # itself so that parent directory names cannot interfere.
                epoch_files = sorted(
                    (f for f in test_dir.iterdir()
                     if f.name.startswith('epoch_') and f.suffix == '.npy'),
                    key=lambda f: int(f.stem[len('epoch_'):]),
                )

                if epoch_files:
                    n_epochs = 1
                    first_epoch_data = np.load(epoch_files[0])
                    if first_epoch_data.ndim == 2:
                        # ARIMA: a single 2-D dump, column 0 = predictions,
                        # column 1 = targets; only test metrics are available.
                        test_target = first_epoch_data[:, 1]
                        test_pred_last = first_epoch_data[:, 0]
                        train_metrics = {
                            'test_mse_best': calculate_mse(test_pred_last, test_target),
                            'test_mae_best': calculate_mae(test_pred_last, test_target),
                            'test_mape_best': calculate_mape(test_pred_last, test_target),
                        }
                        time_metrics = {}
                    else:
                        # The last slice along axis 0 is used as the test
                        # series; targets are taken from the first epoch.
                        test_target = first_epoch_data[-1, :, 1]

                        # Select the epoch with the lowest test MAPE. Starting
                        # from +inf means no finite MAPE is ever rejected.
                        best_mape = float('inf')
                        best_epoch_data = None
                        for i, epoch_file in enumerate(epoch_files):
                            # The first dump is already in memory.
                            epoch_data = first_epoch_data if i == 0 else np.load(epoch_file)
                            test_pred_last = epoch_data[-1, :, 0].flatten()

                            epoch_mape = calculate_mape(test_pred_last, test_target)
                            if epoch_mape < best_mape:
                                best_mape = epoch_mape
                                best_epoch_data = epoch_data
                                n_epochs = i + 1  # 1-based position of the best epoch

                        if best_epoch_data is None:
                            # Every comparison failed (e.g. all MAPE values are
                            # NaN); report it via the handler below.
                            raise ValueError("no epoch produced a finite test MAPE")

                        train_metrics = calculate_epoch_metrics(first_epoch_data, best_epoch_data)
                        # Calculate time efficiency
                        time_metrics = calculate_time_metrics(time_data, n_epochs)

                    # Combine all data
                    experiment_data = {
                        'model': model_name,
                        'test_idx': test_idx,
                        'n_epochs': n_epochs,
                        **train_metrics,
                        **time_metrics,
                        **time_data  # Add raw time data
                    }

                    all_data.append(experiment_data)

            except Exception as e:
                # Best effort: one broken run must not stop the whole analysis.
                print(f"Error processing {model_dir.name}/{test_dir.name}: {e}")
                continue

    # Create DataFrame
    df = pd.DataFrame(all_data)

    # Calculate derived metrics if DataFrame is not empty
    if not df.empty:
        df = calculate_derived_metrics(df)

    return df


def calculate_epoch_metrics(first_epoch_data: np.ndarray, last_epoch_data: np.ndarray) -> Dict:
    """
    Compare two epoch dumps and report error metrics for both.

    Both arrays are indexed as ``[series, step, channel]`` with channel 0
    holding predictions and channel 1 holding targets. Every slice except the
    last along axis 0 is treated as training (validation) data; the last slice
    is the test series. Test targets are always read from the first dump.

    Returns:
        Dict with MSE/MAE/MAPE for the first and the compared ("best") epoch on
        both splits, plus the relative MSE improvement for each split.
    """
    # Training (validation) part: everything but the final slice.
    train_pred_first = first_epoch_data[:-1, :, 0]
    train_target_first = first_epoch_data[:-1, :, 1]
    train_pred_last = last_epoch_data[:-1, :, 0]
    train_target_last = last_epoch_data[:-1, :, 1]

    # Test part: the final slice, flattened to 1-D.
    test_target = first_epoch_data[-1, :, 1].flatten()
    test_pred_first = first_epoch_data[-1, :, 0].flatten()
    test_pred_last = last_epoch_data[-1, :, 0].flatten()

    train_part = {
        'train_mse_first': calculate_mse(train_pred_first, train_target_first),
        'train_mse_best': calculate_mse(train_pred_last, train_target_last),
        'train_mae_first': calculate_mae(train_pred_first, train_target_first),
        'train_mae_best': calculate_mae(train_pred_last, train_target_last),
        'train_mape_first': calculate_mape(train_pred_first, train_target_first),
        'train_mape_best': calculate_mape(train_pred_last, train_target_last),
    }

    test_part = {
        'test_mse_first': calculate_mse(test_pred_first, test_target),
        'test_mse_best': calculate_mse(test_pred_last, test_target),
        'test_mae_first': calculate_mae(test_pred_first, test_target),
        'test_mae_best': calculate_mae(test_pred_last, test_target),
        'test_mape_first': calculate_mape(test_pred_first, test_target),
        'test_mape_best': calculate_mape(test_pred_last, test_target),
    }

    improvement_part = {
        'train_mse_improvement': calculate_improvement(
            train_pred_first, train_pred_last, train_target_first, train_target_last
        ),
        'test_mse_improvement': calculate_improvement(
            test_pred_first, test_pred_last, test_target, None
        ),
    }

    # Merge in the original key order: train, test, improvements.
    return {**train_part, **test_part, **improvement_part}


def calculate_time_metrics(time_data: Dict, n_epochs: int) -> Dict:
    """
    Calculate time-related metrics.

    Args:
        time_data: Timing values loaded from a run's ``times.json``. The keys
            read are ``full_time``, ``epoch_train_time`` and
            ``epoch_test_time_one_step``; a missing key counts as 0.
        n_epochs: Number of epochs the total time is divided by.

    Returns:
        Empty dict when ``time_data`` is empty; otherwise a dict with
        ``total_time_minutes``, ``epoch_train_time``, ``one_step_time``,
        ``epoch_total_time`` and ``speed_epochs_per_minute``. The last two are
        0 when they cannot be computed (no epochs or zero time per epoch).
    """
    metrics = {}

    if time_data:
        total_time = time_data.get('full_time', 0)
        epoch_train_time = time_data.get('epoch_train_time', 0)
        one_step_time = time_data.get('epoch_test_time_one_step', 0)
        # Guard the division the same way the speed below is guarded, so a
        # zero epoch count yields 0 instead of ZeroDivisionError.
        epoch_total_time = total_time / n_epochs if n_epochs > 0 else 0

        metrics.update({
            'total_time_minutes': total_time / 60,
            'epoch_train_time': epoch_train_time,
            'one_step_time': one_step_time,
            'epoch_total_time': epoch_total_time,
            'speed_epochs_per_minute': 60 / epoch_total_time if epoch_total_time > 0 else 0,
        })

    return metrics


def calculate_derived_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate additional derived metrics.

    Adds ``train_mse_improvement_pct`` and ``test_mse_improvement_pct``, the
    percentage by which the best-epoch MSE is lower than the first-epoch MSE.

    Args:
        df: Experiment table. It is modified in place.

    Returns:
        The same DataFrame with the two percentage columns added. A column is
        filled with NaN when its source columns are absent, which happens when
        no collected run reported first-epoch metrics.
    """
    # Improvement percentages
    for split in ('train', 'test'):
        first_col = f'{split}_mse_first'
        best_col = f'{split}_mse_best'
        pct_col = f'{split}_mse_improvement_pct'

        if first_col in df.columns and best_col in df.columns:
            df[pct_col] = (1 - df[best_col] / df[first_col]) * 100
        else:
            # Keep the output schema stable instead of raising KeyError.
            df[pct_col] = np.nan

    return df


def calculate_mse(predictions: np.ndarray, targets: np.ndarray) -> float:
    """Return the mean of the squared element-wise errors as a Python float."""
    residuals = predictions - targets
    squared = np.square(residuals)
    return float(np.mean(squared))


def calculate_mae(predictions: np.ndarray, targets: np.ndarray) -> float:
    """Return the mean of the absolute element-wise errors as a Python float."""
    residuals = predictions - targets
    magnitudes = np.absolute(residuals)
    return float(np.mean(magnitudes))


def calculate_mape(predictions: np.ndarray, targets: np.ndarray, eps: float = 1e-8) -> float:
    """Return the mean absolute percentage error, in percent.

    ``eps`` is added to the absolute target in the denominator so that zero
    targets do not cause a division by zero.
    """
    denominator = np.abs(targets) + eps
    relative_error = np.abs((predictions - targets) / denominator)
    return float(np.mean(relative_error) * 100)


def calculate_improvement(pred_first: np.ndarray, pred_last: np.ndarray, target_first: np.ndarray, target_last: np.ndarray = None) -> float:
    """Return the relative MSE reduction from the first to the last epoch.

    When ``target_last`` is omitted, ``target_first`` is used for both epochs.
    The result is ``(mse_first - mse_last) / mse_first``, or 0.0 when the
    first-epoch MSE is not strictly positive.
    """
    baseline = calculate_mse(pred_first, target_first)
    reference = target_first if target_last is None else target_last
    final = calculate_mse(pred_last, reference)

    # Guard clause: nothing to normalise by.
    if not baseline > 0:
        return 0.0
    return (baseline - final) / baseline

In [14]:
# Load every run, then split the experiment directory name into its parts.
df = analyze_experiments("experiments")

# True when the directory name contains 'exp'.
df['exp'] = ['exp' in name for name in df['model']]

# The character right before 'comp' is read as the component count; 1 if absent.
df['components'] = [
    int(name[name.index('comp') - 1]) if 'comp' in name else 1
    for name in df['model']
]

# For names containing 'EM': remove 'comp' and '_exp', then drop the last two
# characters (presumably '_<count>' — confirm against the directory naming).
df['model'] = [
    name.replace('comp', '').replace('_exp', '')[:-2] if 'EM' in name else name
    for name in df['model']
]

# Everything after the first underscore is the distribution; the string 'None' if absent.
df['distr'] = [
    name.partition('_')[2] if '_' in name else 'None'
    for name in df['model']
]

# What precedes the first underscore is the bare model name.
df['model'] = [name.partition('_')[0] for name in df['model']]


In [15]:
# Export the per-run table, ordered by series and then by test MAPE.
(
    df.sort_values(by=['test_idx', 'test_mape_best'])
    .loc[:, ['test_idx', 'model', 'distr', 'components', 'exp',
             'train_mse_best', 'test_mape_best', 'n_epochs']]
    .round(3)
    .to_csv("full_results.csv", index=False)
)

In [16]:
# Columns to leave out of the summary (none at the moment).
drops = []
# Keys that identify one model configuration.
groupers = ['model', 'exp', 'components', 'distr']

# For every (series, configuration) pair keep the row with the lowest test MAPE.
df_bests = df.drop(columns=drops).loc[
    df.groupby(['test_idx', *groupers])['test_mape_best'].idxmin()
]
df_bests

Unnamed: 0,model,test_idx,n_epochs,train_mse_first,train_mse_best,train_mae_first,train_mae_best,train_mape_first,train_mape_best,test_mse_first,...,epoch_total_time,speed_epochs_per_minute,full_time,epoch_test_time_one_step,last_loss,train_mse_improvement_pct,test_mse_improvement_pct,exp,components,distr
7,ARIMA,0,1,,,,,,,,...,,,,,,,,False,1,
1,Attention,0,12,12685.158214,15.538971,86.875212,3.031921,188.966876,6.837475,3980.160529,...,0.595805,100.704105,7.149659,0.000403,0.423438,99.877503,99.959237,False,1,
19,EM,0,3,52.079557,37.228119,5.232609,4.650934,12.281657,10.679697,2.189774,...,2.552329,23.507941,7.656987,0.001582,0.247088,28.516829,52.908659,False,2,
13,EM,0,3,59.843963,33.239314,5.645617,4.088706,14.044395,9.998997,2.036090,...,2.547034,23.556811,7.641102,0.001542,0.407927,44.456695,54.219963,False,3,
121,EM,0,22,7.611783,7.971997,1.991531,2.432085,4.636300,5.016703,20.180489,...,0.324802,184.727724,7.145652,0.001535,0.414411,-4.732321,54.528552,True,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,SEM,9,7,70.325126,50.261969,7.588846,6.306801,102.571019,78.474391,1.506738,...,1.181602,50.778538,8.271211,0.003480,0.790066,28.529144,4.474540,True,2,student
132,SEM,9,4,90.518700,96.686836,8.593224,8.887824,134.685066,128.764187,3.289307,...,1.280426,46.859406,5.121704,0.001983,0.799057,-6.814212,54.450150,True,3,laplace
24,SEM,9,12,707.855113,43.603713,22.810794,5.339907,394.890773,53.414397,429.086226,...,0.382228,156.974438,4.586734,0.001736,0.806528,93.840023,94.217700,True,3,logistic
102,SEM,9,43,108.156294,182.254421,9.108991,12.110255,120.591350,165.925780,15.987825,...,0.105735,567.458773,4.546586,0.001709,0.713021,-68.510231,10.542281,True,3,normal


In [17]:
# Average every metric over the series for each model configuration.
assessor = (
    df_bests
    .groupby(by=groupers)
    .mean()
    .drop(columns=['test_idx'])
    .reset_index()
)

assessor.round(decimals=3).to_csv("results.csv")

In [18]:
# Configurations ranked by mean test MAPE, lowest first.
assessor.sort_values(by='test_mape_best')

Unnamed: 0,model,exp,components,distr,n_epochs,train_mse_first,train_mse_best,train_mae_first,train_mae_best,train_mape_first,...,total_time_minutes,epoch_train_time,one_step_time,epoch_total_time,speed_epochs_per_minute,full_time,epoch_test_time_one_step,last_loss,train_mse_improvement_pct,test_mse_improvement_pct
14,SEM,False,3,student,9.833333,150.959269,37.698182,8.454625,4.912228,60.165718,...,0.147409,0.017254,0.003744,3.457943,71.276414,8.844553,0.003744,0.629981,35.706743,45.54529
2,EM,False,2,,2.333333,37.016979,34.589601,4.699854,4.822311,45.58906,...,0.118211,0.079063,0.001416,4.255773,21.130579,7.092646,0.001416,0.304552,-1.365297,38.966082
13,SEM,False,3,normal,21.5,195.411276,38.593376,10.247412,4.62581,84.803553,...,0.082755,0.017712,0.001885,0.520043,249.026366,4.965279,0.001885,0.538717,58.910942,77.626314
8,SEM,False,2,logistic,24.0,73.134255,58.532093,6.193141,5.479073,53.525555,...,0.08362,0.017553,0.001912,0.759685,307.913107,5.017202,0.001912,0.678499,-1.399621,68.664692
11,SEM,False,3,laplace,19.5,108.288695,62.617817,7.641552,5.674556,105.499203,...,0.094893,0.017778,0.002231,1.564507,196.708163,5.693605,0.002231,0.583485,36.140366,49.689764
6,MLP,False,1,,10.833333,68.353454,29.802557,6.295089,4.079295,54.907944,...,0.017445,0.015051,8.8e-05,0.413426,550.762248,1.046705,8.8e-05,0.357051,39.105316,41.266306
0,ARIMA,False,1,,1.0,,,,,,...,,,,,,,,,,
3,EM,False,3,,10.0,60.23183,24.536805,5.988509,3.835824,68.429226,...,0.139756,0.095436,0.001641,2.785877,75.978117,8.385372,0.001641,0.399941,39.253774,62.847958
4,EM,True,2,,17.5,40.13087,26.111432,4.942979,4.005858,50.279852,...,0.12778,0.084985,0.001536,1.413372,136.598899,7.666786,0.001536,0.408898,23.916728,59.802803
9,SEM,False,2,normal,18.833333,164.978326,42.504888,9.749232,4.890967,94.056812,...,0.081808,0.017896,0.001852,0.430048,215.425752,4.908491,0.001852,0.589637,60.696027,85.040956


In [19]:
# Compact view: configuration keys plus the headline metrics.
res_df = assessor[[*groupers, 'test_mape_best', 'train_mse_best', 'n_epochs', 'total_time_minutes']]
res_df.sort_values(by='test_mape_best').reset_index(drop=True).head(n=100)

Unnamed: 0,model,exp,components,distr,test_mape_best,train_mse_best,n_epochs,total_time_minutes
0,SEM,False,3,student,4.617544,37.698182,9.833333,0.147409
1,EM,False,2,,4.651744,34.589601,2.333333,0.118211
2,SEM,False,3,normal,4.739989,38.593376,21.5,0.082755
3,SEM,False,2,logistic,5.533405,58.532093,24.0,0.08362
4,SEM,False,3,laplace,6.092717,62.617817,19.5,0.094893
5,MLP,False,1,,6.129416,29.802557,10.833333,0.017445
6,ARIMA,False,1,,6.167331,,1.0,
7,EM,False,3,,6.19916,24.536805,10.0,0.139756
8,EM,True,2,,6.710046,26.111432,17.5,0.12778
9,SEM,False,2,normal,6.902162,42.504888,18.833333,0.081808
