In [None]:
import sys 
sys.path.append('..')
import pandas as pd
import numpy as np


In [None]:
# Loss/metric columns that are averaged and reported in the results table.
columns_to_agg = [
    'test_loss',
    'test_loss_true',
    'test_mae',
    'test_mse',
]

# Cluster names (8 entries).
clusters = [
    'WEBSTER',
    'HIGH',
    'TED',
    'HAMILTON',
    'RINCONADA',
    'CAMBRIDGE',
    'MPL',
    'BRYANT',
]

# Directory holding the per-model loss CSV files, relative to the notebook.
data_path = "../data/LossCSVs"

In [None]:
import os

def load_csv(model, lead, strategy, censor_level, use_unaware=False):
    """Load the loss CSVs for one model / lead / strategy / censor-level combination.

    Files are looked up under the module-level ``data_path``:

    * aware:   ``loss_{model}lead{lead}_{strategy}{censor_level}.csv``
    * unaware: ``loss_{model}lead{lead}_{strategy}{censor_level}_unaware.csv``

    Rows whose ``State`` column equals ``'crashed'`` are dropped from every
    frame that is loaded.

    Parameters
    ----------
    model, lead, strategy, censor_level
        Pieces interpolated into the file name (see above).
    use_unaware : bool, default False
        The unaware results are currently treated as incorrect, so by default
        the unaware file is not read at all and ``None`` is returned in its
        place. Pass ``True`` to load it when the file exists.

    Returns
    -------
    tuple
        ``(aware_df, unaware_df)``; ``unaware_df`` is ``None`` unless
        ``use_unaware`` is true and the unaware file exists.

    Raises
    ------
    FileNotFoundError
        If the aware CSV does not exist. The message names the missing path.
    """
    base_name = f"{data_path}/loss_{model}lead{lead}_{strategy}{censor_level}"

    aware_path = f"{base_name}.csv"
    unaware_path = f"{base_name}_unaware.csv"

    if not os.path.exists(aware_path):
        message = (
            f"Missing data for aware {model} {lead} {strategy}{censor_level}. "
            f"Path {aware_path} does not exist"
        )
        # Keep the printed warning: callers that skip missing combinations
        # rely on it as the only visible trace.
        print(f"Warning: {message}")
        raise FileNotFoundError(message)

    aware_df = pd.read_csv(aware_path)
    aware_df = aware_df[aware_df['State'] != 'crashed']

    unaware_df = None
    # Only touch the unaware file when explicitly requested; previously it was
    # parsed and then thrown away, which cost I/O and could fail on a broken
    # file whose content was never used.
    if use_unaware and os.path.exists(unaware_path):
        unaware_df = pd.read_csv(unaware_path)
        unaware_df = unaware_df[unaware_df['State'] != 'crashed']

    return aware_df, unaware_df

def calculate_losses(df, df_unaware, model_name, columns_to_agg):
    """Build one table row of LaTeX-formatted ``mean \\pm std`` cells.

    For each column in ``columns_to_agg`` the mean and standard deviation over
    the rows of ``df`` are rounded to two decimals and rendered as
    ``$mean \\pm std$``. Every column except ``'test_loss'`` additionally gets
    `` / $mean \\pm std$`` for ``df_unaware``; when ``df_unaware`` is ``None``
    the placeholder ``xx`` is used for both numbers.

    Returns a list whose first element is ``model_name`` upper-cased, followed
    by one formatted string per entry of ``columns_to_agg``.
    """

    def _rounded_stats(frame):
        # Mean and std of the requested columns, both rounded to 2 decimals.
        subset = frame[columns_to_agg]
        return subset.mean().round(2), subset.std().round(2)

    def _pm(mean_text, std_text):
        return rf"{mean_text} \pm {std_text}"

    aware_mean, aware_std = _rounded_stats(df)
    have_unaware = df_unaware is not None
    if have_unaware:
        unaware_mean, unaware_std = _rounded_stats(df_unaware)

    row = [model_name.upper()]
    for name in columns_to_agg:
        aware_part = _pm(aware_mean[name].astype(str), aware_std[name].astype(str))

        # Unaware models don't have a censored loss, so this cell is aware-only.
        if name == 'test_loss':
            row.append(f"${aware_part}$")
            continue

        if have_unaware:
            unaware_part = _pm(unaware_mean[name].astype(str),
                               unaware_std[name].astype(str))
        else:
            unaware_part = _pm("xx", "xx")
        row.append(f"${aware_part}$ / ${unaware_part}$")
    return row

# Forecast leads to report.
leads = [1, 48]
# Strategy name -> censor levels evaluated for that strategy.
strategies = {
    'dyn': [1, 2],
    'stat': [2, 3]
}

models = ['tgcn', 'gru', 'lstm', 'ar']

no_runs_dict = {}


def _label_runs(results, model):
    """Return a copy of ``results`` with a 1-based ``run_no`` column.

    For ``'tgcn'`` every row is numbered as its own run. For every other model
    the k-th row of each ``cluster`` is assigned to run k, and only runs that
    are present for every cluster are kept. When the file holds the same
    number of rows for each cluster, in run order, this matches the previous
    positional numbering; it additionally keeps working when crashed rows were
    dropped unevenly across clusters.
    """
    labelled = results.copy()
    if model == 'tgcn':
        labelled['run_no'] = np.arange(1, len(labelled) + 1)
        return labelled

    by_cluster = labelled.groupby('cluster')
    labelled['run_no'] = by_cluster.cumcount() + 1
    # Number of runs available for *all* clusters.
    complete_runs = by_cluster.size().min()
    return labelled[labelled['run_no'] <= complete_runs]


def _mean_per_run(results, model):
    """Average the numeric columns of ``results`` within each run."""
    labelled = _label_runs(results, model)
    # numeric_only=True: the frames still carry string columns (e.g. State,
    # cluster), which make a plain .mean() raise on pandas >= 2.0.
    return labelled.groupby('run_no').mean(numeric_only=True).reset_index()


# Collect one row per (lead, strategy+level, model) and build the frame once.
loss_rows = []

for lead in leads:
    for strategy, censor_levels in strategies.items():
        for censor_level in censor_levels:
            for model in models:
                # Load data; combinations without an aware CSV are skipped
                # (load_csv has already printed a warning for them).
                try:
                    aware, unaware = load_csv(model, lead, strategy, censor_level)
                except FileNotFoundError:
                    continue

                # Mean over runs
                aware_agg = _mean_per_run(aware, model)
                unaware_agg = None if unaware is None else _mean_per_run(unaware, model)

                loss_rows.append(
                    [lead, strategy + str(censor_level)]
                    + calculate_losses(aware_agg, unaware_agg, model, columns_to_agg)
                )

# Column order matches the row layout: lead, strategy+level, model name, losses.
df_losses = pd.DataFrame(
    loss_rows,
    columns=['Forecast lead', 'Strategy', 'Model'] + columns_to_agg,
)

In [None]:
# Print the aggregated loss table as LaTeX source, rendered through the pandas Styler.
print(df_losses.style.to_latex())

In [None]:
# Ad-hoc inspection: show the `epoch` column of the aware results (first element
# of load_csv's return tuple) for model tgcn, lead 48, strategy stat, censor level 3.
load_csv('tgcn', 48, 'stat', 3)[0][['epoch']]