In [1]:
from pathlib import Path
import pandas as pd
from pandas import DataFrame, MultiIndex, merge, read_csv, concat


from reservoirs_synthetic_bph.utils.data import get_dataframe, remove_warmup_df
from reservoirs_synthetic_bph.utils.global_config import (
    N_WARMUPS,
    SERIES,
    TSTEPS,
    DATA_DIR,
    TEST_FILE,
    METRIC_CSV_FILE,
)
from reservoirs_synthetic_bph.utils.post_processing import METRIC, DSET, TGT, VAL

MDL = "model"
METD = "method"
SPEC = "spec."
NOISE = "w/ noise"
NONOISE = "w/o noise"

Data folder: /home/francois/Documents/SISTM/subSISTM/data/synthetic_bph_1


In [2]:
def generate_display(
    df: pd.DataFrame, metric: str, effect: str, column_order: list = None
):
    # we use pivot to make sur that there's a single value for each case
    # pivot_table use aggregating and will return a value instead of raising an error

    idxs = [MDL, SPEC]
    cols = [DSET, TGT]

    if METD in df.columns:
        cols = [METD] + cols

    df = df[df[METRIC] == metric]
    assert len(df) > 0

    df = df[df[DSET].str.endswith(effect)]
    assert len(df) > 0

    df = (
        df.pivot(
            index=idxs,
            columns=cols,
            values=[VAL],
        )
        .droplevel(level=0, axis=1)
        .sort_index(level=0, axis=1, ascending=False)
    )

    if column_order:
        df = df.reindex(columns=pd.MultiIndex.from_product(column_order))

    stlr = df.style
    stlr.format("{:#.2g}")
    # use df.to_html() to find the CSS labels
    stlr.set_table_styles(
        [
            {
                "selector": "table, th, td",
                "props": "border: 1px solid; text-align: center",
            },
        ]
    )
    # stlr.background_gradient(axis=0)
    return stlr


def base_convert_name(name, conversion_list):
    result = [nc for nc in conversion_list if name in nc[1]]
    assert len(result) == 1, name
    return result[0][0]

# Arthur's results

In [3]:
# put a comma at the end of a single value tuple


def arthur_convert_name(name):
    names_conversions = [
        (
            ("Linear model", "x1+…+x8"),
            ("lin_simple_fixed",),
        ),
        (
            ("Linear model", "x2*x5+x4*x7+x6*x8"),
            ("fixed",),
        ),
        (
            ("Mixed model", "x2*x5+x4*x7+x6*x8"),
            ("mixed",),
        ),
        (
            ("Mixed model", "x1+…+x8"),
            ("lin_mixed", "lin_fixed"),
        ),
        (
            ("Mixed model", "t^1+…+t^4"),
            ("naif_fixed", "naif_mixed"),
        ),
    ]
    return base_convert_name(name, names_conversions)

In [4]:
arthur_metrics = pd.read_csv(
    DATA_DIR + "/Résultats/Performances_moyennes.csv", index_col=0
).drop(
    columns=["mse_test_fixed_obs.1"]
)  # I've checked: it a duplicate
# there is also "Résultats simulation.csv" with MSE/MAE for each simulation
arthur_metrics

Unnamed: 0,mae_train_mixed_truth,mse_train_mixed_truth,mae_test_mixed_truth,mse_test_mixed_truth,mae_train_naif_mixed_truth,mse_train_naif_mixed_truth,mae_test_naif_mixed_truth,mse_test_naif_mixed_truth,mae_train_mixed_obs,mse_train_mixed_obs,...,mae_test_lin_fixed_obs,mse_test_lin_fixed_obs,mae_train_lin_simple_fixed_truth,mse_train_lin_simple_fixed_truth,mae_test_lin_simple_fixed_truth,mse_test_lin_simple_fixed_truth,mae_train_lin_simple_fixed_obs,mse_train_lin_simple_fixed_obs,mae_test_lin_simple_fixed_obs,mse_test_lin_simple_fixed_obs
1,0.211769,0.084689,0.20996,0.084497,1.480595,8.234363,1.43446,8.285175,0.760418,0.915951,...,0.753587,0.89818,3.408367,20.525309,3.393859,19.816737,3.506297,21.516265,3.483054,20.736139


In [5]:
def split_names(c: str) -> dict[str, str]:
    splt = c.split("_")
    model_ = "_".join(splt[2:-1])
    model, spec = arthur_convert_name(model_)
    return {
        METRIC: splt[0],
        DSET: f"{splt[1]}_{splt[-2]}",
        MDL: model,
        SPEC: spec,
        TGT: splt[-1],
    }


arthur_df = pd.DataFrame(
    [
        {**split_names(col), VAL: arthur_metrics[col].iloc[0]}
        for col in arthur_metrics.columns
    ]
)

arthur_df["target"] = arthur_df["target"].replace(
    {"truth": NONOISE, "obs": NOISE, "obs.1": NOISE}
)

In [6]:
generate_display(arthur_df, metric="mse", effect="fixed")

Unnamed: 0_level_0,dataset,train_fixed,train_fixed,test_fixed,test_fixed
Unnamed: 0_level_1,target,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Linear model,x1+…+x8,21.0,22.0,20.0,21.0
Linear model,x2*x5+x4*x7+x6*x8,0.00027,1.0,0.00026,0.99
Mixed model,t^1+…+t^4,3.0,3.9,2.9,3.7
Mixed model,x1+…+x8,0.13,0.91,0.13,0.9


In [7]:
generate_display(arthur_df, metric="mse", effect="mixed")

Unnamed: 0_level_0,dataset,train_mixed,train_mixed,test_mixed,test_mixed
Unnamed: 0_level_1,target,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Mixed model,t^1+…+t^4,8.2,9.1,8.3,9.1
Mixed model,x1+…+x8,0.16,0.89,0.16,0.89
Mixed model,x2*x5+x4*x7+x6*x8,0.085,0.92,0.084,0.91


# Results with a forcast method for the mixed model

The prediction of a mixed model normally uses the data at all timesteps.  

Here to predict the value for the timesteps $t0$ we only use the data of the previous steps.

Here's the corresponding R code:

```R
# initialization with the marginal predictions
pred <- as.vector(predictY(model, newdata = data, marg = TRUE)$pred)
for (t in temps[-1:-1]) {
# using only the previous time steps to predict the random effect
prev_data <- data[data[TSTEP] < t, ]
ui <- predictRE(model, prev_data)
# (some checks…)
# combining the marginal and the random effects prediction == subject specific
reffects <- ui$intercept + rowSums(data[data[TSTEP] == t, X_LABELS] * ui[, X_LABELS])
pred[data[TSTEP] == t] <- pred[data[TSTEP] == t] + reffects
}
```



In [8]:
# put a comma at the end of a single value tuple


def new_convert_name(name):
    names_conversions = [
        (
            ("Mixed model", "x2*x5+x4*x7+x6*x8"),
            ("mixed_oracle"),
        ),
        (
            ("Mixed model", "x1+…+x8"),
            ("mixed_linear", "fixed_linear"),
        ),
        (
            ("Mixed model", "t^1+…+t^4"),
            ("mixed_time-polynom", "fixed_time-polynom"),
        ),
    ]
    return base_convert_name(name, names_conversions)

In [9]:
new_metrics = pd.DataFrame()
for pth in Path("mixed_model_forecast").rglob("*csv"):
    model = str(pth).split("/")[1]
    tmp = pd.read_csv(pth, index_col=0)
    tmp[MDL] = model
    new_metrics = pd.concat([new_metrics, tmp])
new_metrics

Unnamed: 0,dataset,target,metric,value,model
0,train,y_mixed,mean_absolute_error,0.316157,mixed_oracle
1,train,y_mixed,mean_squared_error,0.175762,mixed_oracle
2,train,y_mixed_obs,mean_absolute_error,0.86387,mixed_oracle
3,train,y_mixed_obs,mean_squared_error,1.174989,mixed_oracle
4,test,y_mixed,mean_absolute_error,0.313691,mixed_oracle
5,test,y_mixed,mean_squared_error,0.173205,mixed_oracle
6,test,y_mixed_obs,mean_absolute_error,0.856459,mixed_oracle
7,test,y_mixed_obs,mean_squared_error,1.15779,mixed_oracle
0,test,y_mixed,quad-bias,0.172845,mixed_oracle
1,test,y_mixed_obs,quad-bias,1.15743,mixed_oracle


In [10]:
def new_restructure_row(row):
    try:
        if row[TGT].endswith("_obs"):
            tgt = NOISE
        else:
            tgt = NONOISE
    except AttributeError:
        tgt = row[TGT]

    dset = row[DSET] + "_" + row[MDL].split("_")[0]
    mdl = new_convert_name(row[MDL])
    met = (
        row[METRIC]
        .replace("mean_absolute_error", "mae")
        .replace("mean_squared_error", "mse")
    )

    row[DSET] = dset
    row[TGT] = tgt
    row[METRIC] = met
    row[MDL], row[SPEC] = mdl
    return row


new_df = new_metrics.apply(new_restructure_row, axis=1)

In [11]:
generate_display(new_df, metric="mse", effect="fixed")

Unnamed: 0_level_0,dataset,train_fixed,train_fixed,test_fixed,test_fixed
Unnamed: 0_level_1,target,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Mixed model,t^1+…+t^4,3.2,4.2,3.1,4.1
Mixed model,x1+…+x8,0.35,1.3,0.36,1.3


In [12]:
generate_display(new_df, metric="mse", effect="mixed")

Unnamed: 0_level_0,dataset,train_mixed,train_mixed,test_mixed,test_mixed
Unnamed: 0_level_1,target,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Mixed model,t^1+…+t^4,8.4,9.4,8.9,10.0
Mixed model,x1+…+x8,0.46,1.5,0.45,1.4
Mixed model,x2*x5+x4*x7+x6*x8,0.18,1.2,0.17,1.2


# Reservoirs results

In [13]:
# put a comma at the end of a single value tuple


def reservoir_convert_name(name):
    names_conversions = [
        (
            ("Reservoir model", "x2*x5+x4*x7+x6*x8"),
            ("SF_ME", "SF_FE"),
        ),
        (
            ("Reservoir model", "x2*x5+x4*x7+x6*x8+y(t-1)"),
            ("SF_ME_y", "SF_FE_y"),
        ),
        (
            ("Reservoir model", "x1+…+x8"),
            ("AF_ME", "AF_FE"),
        ),
        (
            ("Reservoir model", "x1+…+x8+y(t-1)"),
            ("AF_ME_y", "AF_FE_y"),
        ),
    ]
    return base_convert_name(name, names_conversions)

In [14]:
reservoir_metrics = pd.DataFrame()
for pth in Path("reservoirs_synthetic_bph").rglob("metrics.csv"):
    model = str(pth).split("/")[1]
    tmp = pd.read_csv(pth, index_col=0)
    tmp[MDL] = model
    reservoir_metrics = pd.concat([reservoir_metrics, tmp])
reservoir_metrics

Unnamed: 0,dataset,target,metric,value,model
0,train,y_mixed,mean_absolute_error,0.554750,SF_ME_y
1,train,y_mixed,mean_squared_error,0.515345,SF_ME_y
2,train,y_mixed_obs,mean_absolute_error,0.974460,SF_ME_y
3,train,y_mixed_obs,mean_squared_error,1.506047,SF_ME_y
4,test,y_mixed,mean_absolute_error,0.574440,SF_ME_y
...,...,...,...,...,...
6,test,y_mixed_obs,mean_absolute_error,10.799250,AF_ME
7,test,y_mixed_obs,mean_squared_error,275.186608,AF_ME
0,test,y_mixed,quad-bias,267.439341,AF_ME
1,test,y_mixed_obs,quad-bias,268.376239,AF_ME


In [15]:
def reservoir_restructure_row(row):
    try:
        if row[TGT].endswith("_obs"):
            tgt = NOISE
        else:
            tgt = NONOISE
    except AttributeError:
        tgt = row[TGT]

    tmp = row[MDL].split("_")[1]
    if tmp == "ME":
        dset = row[DSET] + "_mixed"
    elif tmp == "FE":
        dset = row[DSET] + "_fixed"
    else:
        raise UserWarning(tmp)

    mdl = reservoir_convert_name(row[MDL])
    met = (
        row[METRIC]
        .replace("mean_absolute_error", "mae")
        .replace("mean_squared_error", "mse")
    )

    row[DSET] = dset
    row[TGT] = tgt
    row[METRIC] = met
    row[MDL], row[SPEC] = mdl
    return row


reservoir_df = reservoir_metrics.apply(reservoir_restructure_row, axis=1)

In [16]:
generate_display(reservoir_df, metric="mse", effect="fixed")

Unnamed: 0_level_0,dataset,train_fixed,train_fixed,test_fixed,test_fixed
Unnamed: 0_level_1,target,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Reservoir model,x1+…+x8,0.035,0.99,0.074,1.1
Reservoir model,x2*x5+x4*x7+x6*x8,0.0046,1.0,0.0033,0.99


In [17]:
generate_display(reservoir_df, metric="mse", effect="mixed")

Unnamed: 0_level_0,dataset,train_mixed,train_mixed,test_mixed,test_mixed
Unnamed: 0_level_1,target,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Reservoir model,x1+…+x8,300.0,300.0,270.0,280.0
Reservoir model,x1+…+x8+y(t-1),0.61,1.6,0.67,1.7
Reservoir model,x2*x5+x4*x7+x6*x8,290.0,290.0,250.0,260.0
Reservoir model,x2*x5+x4*x7+x6*x8+y(t-1),0.52,1.5,0.57,1.6


# Comparison between "fit" and "forecast" mixed model predictions.

In [18]:
copy_1 = arthur_df.copy()
copy_1[METD] = "fit"
copy_2 = new_df.copy()
copy_2[METD] = "forecast"


comp_df = pd.concat([copy_1, copy_2])
comp_df = comp_df.dropna(subset=[DSET])

In [19]:
generate_display(
    comp_df,
    metric="mse",
    effect="fixed",
    column_order=[
        ("fit", "forecast"),
        ("train_fixed", "test_fixed"),
        ("w/o noise", "w/ noise"),
    ],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,fit,fit,fit,fit,forecast,forecast,forecast,forecast
Unnamed: 0_level_1,Unnamed: 1_level_1,train_fixed,train_fixed,test_fixed,test_fixed,train_fixed,train_fixed,test_fixed,test_fixed
Unnamed: 0_level_2,Unnamed: 1_level_2,w/o noise,w/ noise,w/o noise,w/ noise,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Linear model,x1+…+x8,21.0,22.0,20.0,21.0,,,,
Linear model,x2*x5+x4*x7+x6*x8,0.00027,1.0,0.00026,0.99,,,,
Mixed model,t^1+…+t^4,3.0,3.9,2.9,3.7,3.2,4.2,3.1,4.1
Mixed model,x1+…+x8,0.13,0.91,0.13,0.9,0.35,1.3,0.36,1.3


In [20]:
generate_display(
    comp_df,
    metric="mse",
    effect="mixed",
    column_order=[
        ("fit", "forecast"),
        ("train_mixed", "test_mixed"),
        ("w/o noise", "w/ noise"),
    ],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,fit,fit,fit,fit,forecast,forecast,forecast,forecast
Unnamed: 0_level_1,Unnamed: 1_level_1,train_mixed,train_mixed,test_mixed,test_mixed,train_mixed,train_mixed,test_mixed,test_mixed
Unnamed: 0_level_2,Unnamed: 1_level_2,w/o noise,w/ noise,w/o noise,w/ noise,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Mixed model,t^1+…+t^4,8.2,9.1,8.3,9.1,8.4,9.4,8.9,10.0
Mixed model,x1+…+x8,0.16,0.89,0.16,0.89,0.46,1.5,0.45,1.4
Mixed model,x2*x5+x4*x7+x6*x8,0.085,0.92,0.084,0.91,0.18,1.2,0.17,1.2


# Updated results

In [24]:
df_updated = pd.concat(
    [arthur_df[arthur_df[MDL] == "Linear model"], new_df, reservoir_df]
)

In [25]:
generate_display(df_updated, metric="mse", effect="fixed")

Unnamed: 0_level_0,dataset,train_fixed,train_fixed,test_fixed,test_fixed
Unnamed: 0_level_1,target,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Linear model,x1+…+x8,21.0,22.0,20.0,21.0
Linear model,x2*x5+x4*x7+x6*x8,0.00027,1.0,0.00026,0.99
Mixed model,t^1+…+t^4,3.2,4.2,3.1,4.1
Mixed model,x1+…+x8,0.35,1.3,0.36,1.3
Reservoir model,x1+…+x8,0.035,0.99,0.074,1.1
Reservoir model,x2*x5+x4*x7+x6*x8,0.0046,1.0,0.0033,0.99


In [26]:
generate_display(df_updated, metric="mse", effect="mixed")

Unnamed: 0_level_0,dataset,train_mixed,train_mixed,test_mixed,test_mixed
Unnamed: 0_level_1,target,w/o noise,w/ noise,w/o noise,w/ noise
model,spec.,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Mixed model,t^1+…+t^4,8.4,9.4,8.9,10.0
Mixed model,x1+…+x8,0.46,1.5,0.45,1.4
Mixed model,x2*x5+x4*x7+x6*x8,0.18,1.2,0.17,1.2
Reservoir model,x1+…+x8,300.0,300.0,270.0,280.0
Reservoir model,x1+…+x8+y(t-1),0.61,1.6,0.67,1.7
Reservoir model,x2*x5+x4*x7+x6*x8,290.0,290.0,250.0,260.0
Reservoir model,x2*x5+x4*x7+x6*x8+y(t-1),0.52,1.5,0.57,1.6
