# Evaluate Performance for Diffusion Problem

In [7]:
from os import path
import rareeventestimation as ree
import pandas as pd
import plotly.express as px
from rareeventestimation.evaluation.constants import INDICATOR_APPROX_LATEX_NAME, BM_SOLVER_SCATTER_STYLE, MY_LAYOUT, DF_COLUMNS_TO_LATEX, LATEX_TO_HTML, WRITE_SCALE
import plotly.graph_objects as go
from IPython.display import display, Markdown
# recommended: use autoreload for development: https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data
### Option 1: Get precomputed data online

In [8]:
# The precomputed results are archived here:
# https://archive.org/details/konstantinalthaus-rareeventestimation-data
# You can go to this link and inspect the files before loading them.
cbree_aggregated_url = "https://ia801504.us.archive.org/23/items/konstantinalthaus-rareeventestimation-data/cbree_diffusion_problem_aggregated.json"
benchmark_aggregated_url = "https://ia801504.us.archive.org/23/items/konstantinalthaus-rareeventestimation-data/benchmark_diffusion_problems_aggregated.json"

# Aggregated CBREE results and aggregated benchmark results, read directly from the archive.
df_agg = pd.read_json(cbree_aggregated_url)
df_bm_agg = pd.read_json(benchmark_aggregated_url)

### Option 2: Aggregate locally precomputed data

In [9]:
## uncomment to load existing data 
## or to compile data after computing it yourself:
# data_dir = "docs/benchmarking/data/cbree_sim/diffusion_sim"
# path_df = path.join(data_dir, "cbree_diffusion_problem_processed.json")
# path_df_agg = path.join(data_dir, "cbree_diffusion_problem_aggregated.json")
# if not (path.exists(path_df) and path.exists(path_df_agg)):
#     df = ree.load_data(data_dir, "*vmfnm*")
#     df.drop(columns=["index", "Unnamed: 0",  "VAR Weighted Average Estimate","CVAR", "callback"], inplace=True)
#     df.drop_duplicates(inplace=True)
#     df.reset_index(drop=True, inplace=True)
#     # Round parameters to compare floats safely
#     for col in [c for c in df.columns if c in DF_COLUMNS_TO_LATEX.keys()]:
#         if isinstance(df[col].values[0], float):
#             df[col] = df[col].round(5)
#     # melt aggregated estimates
#     df = df.rename(columns={"Estimate": "Last Estimate"})\
#         .melt(id_vars = [c for c in df.columns if not "Estimate" in c],
#               var_name="Averaging Method",
#               value_name="Estimate")
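#     # NOTE(review): `expand_cbree_name` is not imported in the import cell above;
#     # import it (presumably from `rareeventestimation` -- TODO confirm) before uncommenting.
#     df = df.apply(expand_cbree_name, axis=1, columns = ["Averaging Method", "observation_window"])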
#     # pretty names
#     df = df.rename(columns=DF_COLUMNS_TO_LATEX)
#     #process data: add evaluations etc
#     df = ree.add_evaluations(df)
#     df_agg = ree.aggregate_df(df)
#     #save
#     df.to_json(path_df)
#     df_agg.to_json(path_df_agg)
# else:
#     df = pd.read_json(path_df)
#     df_agg = pd.read_json(path_df_agg)
# load benchmarks
# bm_data_dirs = {
#     "enkf":"docs/benchmarking/data/enkf_sim_diffusion",
#     "sis": "docs/benchmarking/data/sis_sim_diffusion"
# }
# bm_df_names ={"df": "benchmark_diffusion_problems_processed.json",
#               "df_agg": "benchmark_diffusion_problems_aggregated.json"}
# df_bm, df_bm_agg = ree.get_benchmark_df(data_dirs=bm_data_dirs,
#                                         df_names=bm_df_names,
#                                         df_dir="docs/benchmarking/data")

## Make Figure



In [12]:
# Column names of `df_agg` that are used for filtering, sorting and faceting.
# They are bound once so that every use refers to exactly the same literal.
col_eps_target = "$\\epsilon_{{\\text{{Target}}}}$"
col_delta_target = "$\\Delta_{{\\text{{Target}}}}$"
col_n_obs = "$N_{{ \\text{{obs}} }}$"

# Loop-invariant lookups: axis/legend labels for the figure and the prefix of the
# facet-column annotation that gets blanked out below.
plot_labels = LATEX_TO_HTML | {"cvar_tgt_str": LATEX_TO_HTML[DF_COLUMNS_TO_LATEX["cvar_tgt"]]}
column_heading_prefix = LATEX_TO_HTML[DF_COLUMNS_TO_LATEX["stepsize_tolerance"]]

# One figure (cost vs. relative root MSE) and one LaTeX caption per problem in `df_agg`.
# NOTE(review): the output file names below do not depend on `prob`, so if `df_agg`
# contains several problems every iteration overwrites the files of the previous one.
for prob in df_agg["Problem"].unique():
    # filter: algebraic smoothing function, stepsize tolerance 1, selected N_obs values
    this_df = df_agg.query("Problem == @prob & `Smoothing Function` == 'algebraic'")
    this_df = this_df[this_df[col_eps_target] == 1]
    this_df = this_df[this_df[col_n_obs].isin([4, 8, 12])]
    this_df = this_df.sort_values([col_delta_target, col_n_obs])
    this_df_bm = df_bm_agg.query("Problem == @prob & cvar_tgt == 1")

    # plot: one facet row per N_obs value, one line per averaging method
    fig = px.line(
        this_df,
        x="Relative Root MSE",
        y="Cost Mean",
        facet_col=col_eps_target,
        facet_row=col_n_obs,
        color="Averaging Method",
        log_x=True,
        log_y=True,
        markers=True,
        labels=plot_labels,
    )

    # add benchmark: one trace per benchmark solver, passed to the subplot helper
    # together with the number of facet rows and columns
    num_rows = len(this_df[col_n_obs].unique())
    num_cols = len(this_df[col_eps_target].unique())
    for bm_solver in this_df_bm.Solver.unique():
        dat = this_df_bm.query("Solver == @bm_solver")
        dat = dat.sort_values(["Solver", "Sample Size"])
        trace_dict = {
            "x": dat["Relative Root MSE"],
            "y": dat["Cost Mean"],
            "legendgrouptitle_text": "Benchmark Methods",
            "name": bm_solver,
            "legendgroup": "group",
            "mode": "markers+lines",
            "opacity": 0.8,
            "text": dat["Sample Size"],
            "hoverinfo": "text",
        }
        # solver-specific style entries take precedence over the defaults above
        trace_dict = trace_dict | BM_SOLVER_SCATTER_STYLE[bm_solver]
        fig = ree.add_scatter_to_subplots(fig, num_rows, num_cols, **trace_dict)

    # style
    fig.update_layout(**MY_LAYOUT)
    fig.update_layout(height=900)
    # remove column heading: blank every annotation that starts with the stepsize-tolerance label
    fig.for_each_annotation(
        lambda a: a.update(text="" if a.text.startswith(column_heading_prefix) else a.text))

    # save
    fig.write_image("diffusion_problem.png", scale=WRITE_SCALE)
    fig.show()

    # make and save caption (adjacent literals are concatenated; spacing is significant)
    fig_description = (
        f"Solving the {prob} with the CBREE (vMFN) method using  "
        "different parameters. "
        "We vary the averaging method (color) and "
        "the divergence check $N_\\text{obs}$ (row). "
        "The choice of the stopping criterion $\\Delta_{\\text{Target}} = 2$, "
        "the stepsize tolerance $\\epsilon_{\\text{Target}}=1$ "
        f"and indicator approximation {INDICATOR_APPROX_LATEX_NAME['algebraic']} "
        "are fixed. "
        "Furthermore we plot also the performance of the benchmark methods EnKF "
        "and SIS.  "
        f"We used the sample sizes $J \\in {ree.vec_to_latex_set(df_agg['Sample Size'].unique())}$. "
        "Each marker represents the empirical estimates based on the successful portion of $200$ simulations."
    )
    display(Markdown(fig_description))
    with open("diffusion_problem_desc.tex", "w") as file:
        file.write(fig_description)

Solving the Diffusion Problem (d=150) with the CBREE (vMFN) method using  different parameters. We vary the averaging method (color) and the divergence check $N_\text{obs}$ (row). The choice of the stopping criterion $\Delta_{\text{Target}} = 2$, the stepsize tolerance $\epsilon_{\text{Target}}=1$ and indicator approximation $I_\text{alg}$ are fixed. Furthermore we plot also the performance of the benchmark methods EnKF and SIS.  We used the sample sizes $J \in \{1000, 2000, \ldots, 6000\}$. Each marker represents the empirical estimates based the successful portion of $200$ simulations.