In [87]:
import pandas as pd

# 1. Load both datasets
df_missing = pd.read_csv("results_missing.csv")
df_extreme = pd.read_csv("results_missing_extreme.csv")

# 2. Define a helper function to process dataframes identically
def get_grouped_stats(df, alpha=0.1):
    # Filter alpha
    df_filtered = df[df["alpha"] == alpha]
    # Compute mean and std per dataset
    return (
        df_filtered.groupby("dataset")[["coverage_nan", "coverage_full"]]
        .agg(["mean", "std"])
    )

# Calculate stats for both scenarios
stats_missing = get_grouped_stats(df_missing)
stats_extreme = get_grouped_stats(df_extreme)

# 3. Define formatting function
def fmt(mean, std):
    # Formats as "95.0 \pm 1.2"
    return f"${100*mean:.1f} \\pm {100*std:.1f}$"

name_map = {
    "biais": "Bias",
    "casp": "CASP",
    "house": "House",
    "rf1": "rf1",
    "rf2": "rf2",
    "scm1d": "scm1d",
    "scm20d": "scm20d",
    "taxi": "Taxi",
}

# 4. Construct the LaTeX Table
latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += "\\caption{Marginal coverage when conformalizing with missing outputs (desired coverage 0.9).}\n"
latex += "\\resizebox{\linewidth}{!}{\n"
# {l|cc|cc} creates vertical lines between the Dataset and the two main groups
latex += "\\begin{tabular}{l|cc|cc}\n"
latex += "\\hline\n"

# --- Header Row 1: The Main Groups ---
# \multicolumn{2}{c|}{...} spans 2 columns and adds a vertical line after
latex += "Dataset & \\multicolumn{2}{c|}{Missing at random} & \\multicolumn{2}{c}{10\\% extreme removed} \\\\ \n"

# --- Header Row 2: The Sub-columns ---
latex += " & With missing & Full output & With missing & Full output \\\\ \\hline\n"

# 5. Populate Rows
# We iterate through the index of one stats object (assuming datasets match in both files)
for dataset in stats_missing.index:
    # Clean up name using the map
    clean_name = name_map.get(dataset.lower(), dataset.capitalize())
    
    # Extract 'Missing at random' stats
    m_nan = fmt(stats_missing.loc[dataset, ("coverage_nan", "mean")], 
                stats_missing.loc[dataset, ("coverage_nan", "std")])
    m_full = fmt(stats_missing.loc[dataset, ("coverage_full", "mean")], 
                 stats_missing.loc[dataset, ("coverage_full", "std")])
    
    # Extract 'Extreme removed' stats
    e_nan = fmt(stats_extreme.loc[dataset, ("coverage_nan", "mean")], 
                stats_extreme.loc[dataset, ("coverage_nan", "std")])
    e_full = fmt(stats_extreme.loc[dataset, ("coverage_full", "mean")], 
                 stats_extreme.loc[dataset, ("coverage_full", "std")])

    # Append row to LaTeX string
    latex += f"{clean_name} & {m_nan} & {m_full} & {e_nan} & {e_full} \\\\ \\hline\n"

# 6. Close Table
latex += "\\end{tabular}\n"
latex += "}\n"
latex += "\\label{tab:coverage:missing:0.9}\n"
latex += "\\end{table}"

print(latex)

\begin{table}[ht]
\centering
\small
\caption{Marginal coverage when conformalizing with missing outputs (desired coverage 0.9).}
\resizebox{\linewidth}{!}{
\begin{tabular}{l|cc|cc}
\hline
Dataset & \multicolumn{2}{c|}{Missing at random} & \multicolumn{2}{c}{10\% extreme removed} \\ 
 & With missing & Full output & With missing & Full output \\ \hline
Bias & $90.4 \pm 1.1$ & $85.1 \pm 1.7$ & $90.8 \pm 1.2$ & $90.7 \pm 1.1$ \\ \hline
CASP & $89.8 \pm 0.7$ & $89.3 \pm 0.9$ & $90.2 \pm 0.4$ & $89.7 \pm 0.4$ \\ \hline
House & $89.7 \pm 1.1$ & $87.9 \pm 1.3$ & $89.6 \pm 1.0$ & $87.6 \pm 0.9$ \\ \hline
rf1 & $90.7 \pm 1.2$ & $89.0 \pm 1.6$ & $90.4 \pm 1.3$ & $89.2 \pm 1.3$ \\ \hline
rf2 & $90.5 \pm 1.5$ & $89.1 \pm 1.5$ & $90.1 \pm 1.2$ & $88.7 \pm 1.2$ \\ \hline
scm1d & $89.9 \pm 0.7$ & $81.3 \pm 2.1$ & $89.8 \pm 1.4$ & $86.9 \pm 1.9$ \\ \hline
scm20d & $89.9 \pm 1.2$ & $81.4 \pm 2.2$ & $89.6 \pm 1.4$ & $85.5 \pm 1.3$ \\ \hline
Taxi & $89.9 \pm 0.5$ & $89.3 \pm 0.7$ & $90.0 \pm 0.6$ & $82.7 

In [None]:
import pandas as pd
import numpy as np

alpha = 0.1

df = pd.read_csv("results_full.csv")



mask = df["coverage_HDR"] == 1.0
df.loc[mask, ["coverage_HDR", "ERT_HDR", "WSC_HDR", "volume_HDR", "time_HDR"]] = np.nan


methods = ["levelset", "HDR", "one", "ot", "MVCS"]

# dataset name mapping
dataset_map = {
    "biais": "Bias [2]",
    "casp": "CASP [2]",
    "house": "House [2]",
    "rf1": "rf1 [8]",
    "rf2": "rf2 [8]",
    "scm1d": "scm1d [16]",
    "scm20d": "scm20d [16]",
    "taxi": "Taxi [2]",
}

# method name mapping (for LaTeX header)
method_map = {
    "levelset": "Mahalanobis",
    "HDR": "HPD",
    "one": "ECM",
    "ot": "OT",
    "MVCS": "MVCS",
}

# keep alpha = 0.1 and compute means
res = (
    df[df["alpha"] == alpha]
    .groupby("dataset", as_index=False)
    .agg(lambda s: np.nan if s.isna().sum() > 2 else s.mean())
)

def format_row(values, mode="min"):
    """
    mode="min": smallest bold, second smallest underlined
    mode="max": largest bold, second largest underlined
    """
    vals = np.array(values, dtype=float)

    if mode == "min":
        order = np.argsort(vals)
    else:
        order = np.argsort(-vals)

    best, second = order[0], order[1]

    out = []
    for i, v in enumerate(vals):
        if np.isnan(v):
            s = "N/A"
            out.append(s)
        else:
            s = f"{v:.2f}"
            if mode is not None:
                if i == best:
                    s = f"\\textbf{{{s}}}"
                elif i == second:
                    s = f"\\underline{{{s}}}"
            out.append(s)
    return out


# ===== Build LaTeX table =====latex += f"\\caption{{Comparison of methods for $\\alpha={alpha}$. Best values in bold, second best underlined.}}\n"


header_methods = " & ".join(method_map[m] for m in methods)

latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += f"\\caption{{Comparison of conditional coverage ($\\alpha=0.1$) for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better) and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions}}\n"
latex += "\\resizebox{\linewidth}{!}{\n"
latex += "\\begin{tabular}{l l " + "c " * len(methods) + "}\n"
latex += "\\hline\n"
latex += f"{{\\begin{{tabular}}{{c}} Dataset \\\\ {{\small [output dim]}} \\end{{tabular}}}} & Metric & {header_methods} \\\\\n"
latex += "\\hline\n"

for _, row in res.iterrows():
    dataset_raw = row["dataset"]
    dataset = dataset_map.get(dataset_raw, dataset_raw)

    # ert = [row[f"ERT_{m}"]*100 for m in methods]
    # vol = [row[f"volume_{m}"] for m in methods]
    # wsc = [row[f"WSC_{m}"] for m in methods]
    # cov = [row[f"coverage_{m}"] for m in methods]
    ert = [row.get(f"ERT_{m}", np.nan) * 100 if f"ERT_{m}" in row else np.nan for m in methods]
    vol = [row.get(f"volume_{m}", np.nan) for m in methods]
    wsc = [row.get(f"WSC_{m}", np.nan) for m in methods]
    cov = [row.get(f"coverage_{m}", np.nan) for m in methods]
    time = [row.get(f"time_{m}", np.nan) for m in methods]

    ert_fmt = format_row(ert, mode="min")
    vol_fmt = format_row(vol, mode="min")
    wsc_fmt = format_row(wsc, mode="max")
    cov_fmt = format_row(cov, mode=None)
    time_fmt = format_row(time, mode="min")

    # latex += f"\\multirow{{3}}{{*}}{{\\rotatebox{{{90}}}{{{dataset}}}}} & ERT & " + " & ".join(ert_fmt) + " \\\\\n"
    latex += f"\\multirow{{3}}{{*}}{{{{{dataset}}}}} & ERT [\%] & " + " & ".join(ert_fmt) + " \\\\\n"
    latex += f" & WSC & " + " & ".join(wsc_fmt) + " \\\\\n"
    # latex += f" & Volume & " + " & ".join(vol_fmt) + " \\\\\n"
    # latex += f" & Coverage & " + " & ".join(cov_fmt) + " \\\\\n"
    latex += f" & Time [s] & " + " & ".join(time_fmt) + " \\\\\n"
    latex += "\\hline\n"

latex += "\\end{tabular}\n"
latex += "}\n"
latex += "\\label{tab:results_alpha01}\n"
latex += "\\end{table}\n"

print(latex)


\begin{table}[ht]
\centering
\small
\caption{Comparison of conditional coverage ($\alpha=0.1$) for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better) and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions}
\resizebox{\linewidth}{!}{
\begin{tabular}{l l c c c c c }
\hline
{\begin{tabular}{c} Dataset \\ {\small [output dim]} \end{tabular}} & Metric & Mahalanobis & HPD & ECM & OT & MVCS \\
\hline
\multirow{3}{*}{{Bias [2]}} & ERT [\%] & \textbf{1.68} & \underline{2.00} & 3.72 & 3.69 & 3.05 \\
 & WSC & 0.72 & 0.72 & 0.72 & \textbf{0.73} & \underline{0.72} \\
 & Time [s] & \underline{0.06} & 556.89 & \textbf{0.01} & 19.91 & 0.12 \\
\hline
\multirow{3}{*}{{CASP [2]}} & ERT [\%] & \underline{1.89} & \textbf{1.52} & 4.64 & 

In [None]:
import pandas as pd
import numpy as np

alpha = 0.1

df = pd.read_csv("results_full.csv")



mask = df["coverage_HDR"] == 1.0
df.loc[mask, ["coverage_HDR", "ERT_HDR", "WSC_HDR", "volume_HDR", "time_HDR"]] = np.nan
ert_cols = [c for c in df.columns if c.startswith("ERT_")]
df[ert_cols] = df[ert_cols] * 100

cols = df.select_dtypes(include=np.number).columns
mask = df.groupby(["alpha", "dataset"])[cols].transform(lambda s: s.isna().sum() >= 2)
df[cols] = df[cols].mask(mask)

methods = ["levelset", "HDR", "one", "ot", "MVCS"]

# dataset name mapping
dataset_map = {
    "biais": "Bias [2]",
    "casp": "CASP [2]",
    "house": "House [2]",
    "rf1": "rf1 [8]",
    "rf2": "rf2 [8]",
    "scm1d": "scm1d [16]",
    "scm20d": "scm20d [16]",
    "taxi": "Taxi [2]",
}

# method name mapping (for LaTeX header)
method_map = {
    "levelset": "Mahalanobis",
    "HDR": "HPD",
    "one": "ECM",
    "ot": "OT",
    "MVCS": "MVCS",
}

# keep alpha = 0.1 and compute means
res = ( 
    df[df["alpha"] == alpha] 
    .groupby("dataset") 
    .agg(["mean", "sem"]) 
)

def format_row_with_std(means, stds, mode="min"):
    """
    mode="min": smallest bold, second smallest underlined
    mode="max": largest bold, second largest underlined
    mode=None: no highlighting
    """
    means = np.array(means, dtype=float)
    stds = np.array(stds, dtype=float)

    valid = ~np.isnan(means)

    if mode == "min":
        order = np.argsort(means)
    elif mode == "max":
        order = np.argsort(-means)
    else:
        order = []

    best = order[0] if len(order) > 0 else None
    second = order[1] if len(order) > 1 else None

    out = []
    for i, (m, s) in enumerate(zip(means, stds)):
        if np.isnan(m):
            out.append("N/A")
        else:
            val = f"{m:.2f}_{{{s:.2f}}}"
            if mode is not None:
                if i == best:
                    val = f"\\mathbf{{{val}}}"
                elif i == second:
                    val = f"\\underline{{{val}}}"
            out.append("$"+val+"$")
    return out


# ===== Build LaTeX table =====latex += f"\\caption{{Comparison of methods for $\\alpha={alpha}$. Best values in bold, second best underlined.}}\n"


header_methods = " & ".join(method_map[m] for m in methods)

latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += f"\\caption{{Comparison of conditional coverage ($\\alpha=0.1$) for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better); volume, marginal coverage and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions. Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}}\n"
latex += "\\begin{tabular}{l l " + "c " * len(methods) + "}\n"
latex += "\\hline\n"
latex += f"{{\\begin{{tabular}}{{c}} Dataset \\\\ {{\small [output dim]}} \\end{{tabular}}}} & Metric & {header_methods} \\\\\n"
latex += "\\hline\n"

for dataset_raw, row in res.iterrows():
    dataset = dataset_map.get(dataset_raw, dataset_raw)

    def get_metric(metric, method):
        return (
            row[(f"{metric}_{method}", "mean")] if (f"{metric}_{method}", "mean") in row else np.nan,
            row[(f"{metric}_{method}", "sem")] if (f"{metric}_{method}", "sem") in row else np.nan,
        )

    ert_mean, ert_std = zip(*[get_metric("ERT", m) for m in methods])
    vol_mean, vol_std = zip(*[get_metric("volume", m) for m in methods])
    wsc_mean, wsc_std = zip(*[get_metric("WSC", m) for m in methods])
    cov_mean, cov_std = zip(*[get_metric("coverage", m) for m in methods])
    time_mean, time_std = zip(*[get_metric("time", m) for m in methods])

    ert_fmt = format_row_with_std(ert_mean, ert_std, mode="min")
    vol_fmt = format_row_with_std(vol_mean, vol_std, mode="min")
    wsc_fmt = format_row_with_std(wsc_mean, wsc_std, mode="max")
    cov_fmt = format_row_with_std(cov_mean, cov_std, mode=None)
    time_fmt = format_row_with_std(time_mean, time_std, mode=None)

    
    latex += f"\\multirow{{5}}{{*}}{{{{{dataset}}}}} & ERT [\%] & " + " & ".join(ert_fmt) + " \\\\\n"
    latex += f" & WSC & " + " & ".join(wsc_fmt) + " \\\\\n"
    latex += f" & Volume & " + " & ".join(vol_fmt) + " \\\\\n"
    latex += f" & Coverage & " + " & ".join(cov_fmt) + " \\\\\n"
    latex += f" & Time [s] & " + " & ".join(time_fmt) + " \\\\\n"
    latex += "\\hline\n"

latex += "\\end{tabular}\n"
latex += "\\label{tab:app:results_alpha01}\n"
latex += "\\end{table}\n"

print(latex)


\begin{table}[ht]
\centering
\small
\caption{Comparison of conditional coverage ($\alpha=0.1$) for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better); volume, marginal coverage and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions. Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}
\begin{tabular}{l l c c c c c }
\hline
{\begin{tabular}{c} Dataset \\ {\small [output dim]} \end{tabular}} & Metric & Mahalanobis & HPD & ECM & OT & MVCS \\
\hline
\multirow{5}{*}{{Bias [2]}} & ERT [\%] & $\mathbf{1.68_{0.45}}$ & $\underline{2.00_{0.58}}$ & $3.72_{0.51}$ & $3.69_{0.44}$ & $3.05_{0.40}$ \\
 & WSC & $0.72_{0.01}$ & $0.72_{0.01}$ & $0.72_{0.01}$ & $\mathbf{0.73_{0.01}

In [None]:
import pandas as pd
import numpy as np

alpha = 0.05

df = pd.read_csv("results_full.csv")



mask = df["coverage_HDR"] == 1.0
df.loc[mask, ["coverage_HDR", "ERT_HDR", "WSC_HDR", "volume_HDR", "time_HDR"]] = np.nan
ert_cols = [c for c in df.columns if c.startswith("ERT_")]
df[ert_cols] = df[ert_cols] * 100

cols = df.select_dtypes(include=np.number).columns
mask = df.groupby(["alpha", "dataset"])[cols].transform(lambda s: s.isna().sum() >= 2)
df[cols] = df[cols].mask(mask)

methods = ["levelset", "HDR", "one", "ot", "MVCS"]

# dataset name mapping
dataset_map = {
    "biais": "Bias [2]",
    "casp": "CASP [2]",
    "house": "House [2]",
    "rf1": "rf1 [8]",
    "rf2": "rf2 [8]",
    "scm1d": "scm1d [16]",
    "scm20d": "scm20d [16]",
    "taxi": "Taxi [2]",
}

# method name mapping (for LaTeX header)
method_map = {
    "levelset": "Mahalanobis",
    "HDR": "HPD",
    "one": "ECM",
    "ot": "OT",
    "MVCS": "MVCS",
}

# keep alpha = 0.1 and compute means
res = ( 
    df[df["alpha"] == alpha] 
    .groupby("dataset") 
    .agg(["mean", "sem"]) 
)

def format_row_with_std(means, stds, mode="min"):
    """
    mode="min": smallest bold, second smallest underlined
    mode="max": largest bold, second largest underlined
    mode=None: no highlighting
    """
    means = np.array(means, dtype=float)
    stds = np.array(stds, dtype=float)

    valid = ~np.isnan(means)

    if mode == "min":
        order = np.argsort(means)
    elif mode == "max":
        order = np.argsort(-means)
    else:
        order = []

    best = order[0] if len(order) > 0 else None
    second = order[1] if len(order) > 1 else None

    out = []
    for i, (m, s) in enumerate(zip(means, stds)):
        if np.isnan(m):
            out.append("N/A")
        else:
            val = f"{m:.2f}_{{{s:.2f}}}"
            if mode is not None:
                if i == best:
                    val = f"\\mathbf{{{val}}}"
                elif i == second:
                    val = f"\\underline{{{val}}}"
            out.append("$"+val+"$")
    return out


# ===== Build LaTeX table =====latex += f"\\caption{{Comparison of methods for $\\alpha={alpha}$. Best values in bold, second best underlined.}}\n"


header_methods = " & ".join(method_map[m] for m in methods)

latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += f"\\caption{{Comparison of conditional coverage ($\\alpha=0.05$) for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.95$ is better); volume, marginal coverage and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions. Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}}\n"
latex += "\\begin{tabular}{l l " + "c " * len(methods) + "}\n"
latex += "\\hline\n"
latex += f"{{\\begin{{tabular}}{{c}} Dataset \\\\ {{\small [output dim]}} \\end{{tabular}}}} & Metric & {header_methods} \\\\\n"
latex += "\\hline\n"

for dataset_raw, row in res.iterrows():
    dataset = dataset_map.get(dataset_raw, dataset_raw)

    def get_metric(metric, method):
        return (
            row[(f"{metric}_{method}", "mean")] if (f"{metric}_{method}", "mean") in row else np.nan,
            row[(f"{metric}_{method}", "sem")] if (f"{metric}_{method}", "sem") in row else np.nan,
        )

    ert_mean, ert_std = zip(*[get_metric("ERT", m) for m in methods])
    vol_mean, vol_std = zip(*[get_metric("volume", m) for m in methods])
    wsc_mean, wsc_std = zip(*[get_metric("WSC", m) for m in methods])
    cov_mean, cov_std = zip(*[get_metric("coverage", m) for m in methods])
    time_mean, time_std = zip(*[get_metric("time", m) for m in methods])

    ert_fmt = format_row_with_std(ert_mean, ert_std, mode="min")
    vol_fmt = format_row_with_std(vol_mean, vol_std, mode="min")
    wsc_fmt = format_row_with_std(wsc_mean, wsc_std, mode="max")
    cov_fmt = format_row_with_std(cov_mean, cov_std, mode=None)
    time_fmt = format_row_with_std(time_mean, time_std, mode=None)

    
    latex += f"\\multirow{{5}}{{*}}{{{{{dataset}}}}} & ERT [\%] & " + " & ".join(ert_fmt) + " \\\\\n"
    latex += f" & WSC & " + " & ".join(wsc_fmt) + " \\\\\n"
    latex += f" & Volume & " + " & ".join(vol_fmt) + " \\\\\n"
    latex += f" & Coverage & " + " & ".join(cov_fmt) + " \\\\\n"
    latex += f" & Time [s] & " + " & ".join(time_fmt) + " \\\\\n"
    latex += "\\hline\n"

latex += "\\end{tabular}\n"
latex += "\\label{tab:app:results_alpha005}\n"
latex += "\\end{table}\n"

print(latex)


\begin{table}[ht]
\centering
\small
\caption{Comparison of conditional coverage ($\alpha=0.05$) for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.95$ is better); volume, marginal coverage and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions. Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}
\begin{tabular}{l l c c c c c }
\hline
{\begin{tabular}{c} Dataset \\ {\small [output dim]} \end{tabular}} & Metric & Mahalanobis & HPD & ECM & OT & MVCS \\
\hline
\multirow{5}{*}{{Bias [2]}} & ERT [\%] & $\mathbf{1.26_{0.24}}$ & $1.61_{0.23}$ & $2.12_{0.20}$ & $1.82_{0.43}$ & $\underline{1.28_{0.25}}$ \\
 & WSC & $0.72_{0.02}$ & $\underline{0.76_{0.02}}$ & $0.75_{0.01}$ & $\math

In [None]:
import pandas as pd

# 1. Load both datasets
df_missing = pd.read_csv("results_missing.csv")
df_extreme = pd.read_csv("results_missing_extreme.csv")

# 2. Define a helper function to process dataframes identically
def get_grouped_stats(df, alpha=0.05):
    # Filter alpha
    df_filtered = df[df["alpha"] == alpha]
    # Compute mean and std per dataset
    return (
        df_filtered.groupby("dataset")[["coverage_nan", "coverage_full"]]
        .agg(["mean", "std"])
    )

# Calculate stats for both scenarios
stats_missing = get_grouped_stats(df_missing)
stats_extreme = get_grouped_stats(df_extreme)

# 3. Define formatting function
def fmt(mean, std):
    # Formats as "95.0 \pm 1.2"
    return f"${100*mean:.1f} \\pm {100*std:.1f}$"

name_map = {
    "biais": "Bias",
    "casp": "CASP",
    "house": "House",
    "rf1": "rf1",
    "rf2": "rf2",
    "scm1d": "scm1d",
    "scm20d": "scm20d",
    "taxi": "Taxi",
}

# 4. Construct the LaTeX Table
latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += "\\caption{Marginal coverage when conformalizing with missing outputs (desired coverage 0.9).}\n"
# {l|cc|cc} creates vertical lines between the Dataset and the two main groups
latex += "\\begin{tabular}{l|cc|cc}\n"
latex += "\\hline\n"

# --- Header Row 1: The Main Groups ---
# \multicolumn{2}{c|}{...} spans 2 columns and adds a vertical line after
latex += "Dataset & \\multicolumn{2}{c|}{Missing at random} & \\multicolumn{2}{c}{10\\% extreme removed} \\\\ \n"

# --- Header Row 2: The Sub-columns ---
latex += " & With missing & Full output & With missing & Full output \\\\ \\hline\n"

# 5. Populate Rows
# We iterate through the index of one stats object (assuming datasets match in both files)
for dataset in stats_missing.index:
    # Clean up name using the map
    clean_name = name_map.get(dataset.lower(), dataset.capitalize())
    
    # Extract 'Missing at random' stats
    m_nan = fmt(stats_missing.loc[dataset, ("coverage_nan", "mean")], 
                stats_missing.loc[dataset, ("coverage_nan", "std")])
    m_full = fmt(stats_missing.loc[dataset, ("coverage_full", "mean")], 
                 stats_missing.loc[dataset, ("coverage_full", "std")])
    
    # Extract 'Extreme removed' stats
    e_nan = fmt(stats_extreme.loc[dataset, ("coverage_nan", "mean")], 
                stats_extreme.loc[dataset, ("coverage_nan", "std")])
    e_full = fmt(stats_extreme.loc[dataset, ("coverage_full", "mean")], 
                 stats_extreme.loc[dataset, ("coverage_full", "std")])

    # Append row to LaTeX string
    latex += f"{clean_name} & {m_nan} & {m_full} & {e_nan} & {e_full} \\\\ \\hline\n"

# 6. Close Table
latex += "\\end{tabular}\n"
latex += "\\label{tab:coverage:missing:0.95}\n"
latex += "\\end{table}"

print(latex)

\begin{table}[ht]
\centering
\small
\caption{Marginal coverage when conformalizing with missing outputs (desired coverage 0.9).}
\begin{tabular}{l|cc|cc}
\hline
Dataset & \multicolumn{2}{c|}{Missing at random} & \multicolumn{2}{c}{10\% extreme removed} \\ 
 & With missing & Full output & With missing & Full output \\ \hline
Bias & $95.8 \pm 0.6$ & $92.1 \pm 0.9$ & $95.7 \pm 0.7$ & $96.2 \pm 0.6$ \\ \hline
CASP & $95.0 \pm 0.4$ & $93.7 \pm 0.7$ & $95.1 \pm 0.4$ & $95.2 \pm 0.4$ \\ \hline
House & $94.9 \pm 0.8$ & $93.1 \pm 1.1$ & $94.9 \pm 0.9$ & $93.5 \pm 1.0$ \\ \hline
rf1 & $95.6 \pm 0.8$ & $93.4 \pm 1.1$ & $95.6 \pm 0.9$ & $94.6 \pm 1.1$ \\ \hline
rf2 & $95.1 \pm 0.8$ & $93.1 \pm 1.1$ & $95.2 \pm 1.1$ & $94.0 \pm 1.4$ \\ \hline
scm1d & $94.8 \pm 0.8$ & $89.0 \pm 1.9$ & $94.8 \pm 0.7$ & $93.2 \pm 1.2$ \\ \hline
scm20d & $94.7 \pm 0.7$ & $88.8 \pm 1.7$ & $95.3 \pm 1.5$ & $92.9 \pm 1.3$ \\ \hline
Taxi & $95.0 \pm 0.5$ & $93.8 \pm 0.5$ & $94.9 \pm 0.4$ & $88.9 \pm 0.6$ \\ \hline
\end{tab

In [None]:
import pandas as pd
import numpy as np

alpha = 0.1

df = pd.read_csv("results_revealed.csv")



mask = df["coverage_HDR_known"] == 1.0
df.loc[mask, ["coverage_HDR_known", "ERT_HDR_known", "WSC_HDR_known", "volume_HDR_known", 'time_HDR_known']] = np.nan
ert_cols = [c for c in df.columns if c.startswith("ERT_")]
df[ert_cols] = df[ert_cols] * 100

cols = df.select_dtypes(include=np.number).columns
mask = df.groupby(["alpha", "dataset"])[cols].transform(lambda s: s.isna().sum() >= 2)
df[cols] = df[cols].mask(mask)

methods = ["with_bayes_levelset", "HDR_known", "one_covariance_known", "ot_known", "MVCS_known"]

# dataset name mapping
dataset_map = {
    "biais": "Bias [2]",
    "casp": "CASP [2]",
    "house": "House [2]",
    "rf1": "rf1 [8]",
    "rf2": "rf2 [8]",
    "scm1d": "scm1d [16]",
    "scm20d": "scm20d [16]",
    "taxi": "Taxi [2]",
}

# method name mapping (for LaTeX header)
method_map = {
    "with_bayes_levelset": "Mahalanobis",
    "HDR_known": "HPD",
    "one_covariance_known": "ECM",
    "ot_known": "OT",
    "MVCS_known": "MVCS",
}

# keep alpha = 0.1 and compute means
res = ( 
    df[df["alpha"] == alpha] 
    .groupby("dataset") 
    .agg(["mean", "sem"]) 
)



def format_row_with_std(means, stds, mode="min"):
    """
    mode="min": smallest bold, second smallest underlined
    mode="max": largest bold, second largest underlined
    mode=None: no highlighting
    """
    means = np.array(means, dtype=float)
    stds = np.array(stds, dtype=float)

    valid = ~np.isnan(means)

    if mode == "min":
        order = np.argsort(means)
    elif mode == "max":
        order = np.argsort(-means)
    else:
        order = []

    best = order[0] if len(order) > 0 else None
    second = order[1] if len(order) > 1 else None

    out = []
    for i, (m, s) in enumerate(zip(means, stds)):
        if np.isnan(m):
            out.append("N/A")
        else:
            val = f"{m:.2f}_{{{s:.2f}}}"
            if mode is not None:
                if i == best:
                    val = f"\\mathbf{{{val}}}"
                elif i == second:
                    val = f"\\underline{{{val}}}"
            out.append("$"+val+"$")
    return out


# ===== Build LaTeX table =====latex += f"\\caption{{Comparison of methods for $\\alpha={alpha}$. Best values in bold, second best underlined.}}\n"


header_methods = " & ".join(method_map[m] for m in methods)

latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += f"\\caption{{Comparison of conditional coverage ($\\alpha=0.1$) for partially reveled outputs for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better); volume, marginal coverage and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions. Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}}\n"
latex += "\\begin{tabular}{l l " + "c " * len(methods) + "}\n"
latex += "\\hline\n"
latex += f"{{\\begin{{tabular}}{{c}} Dataset \\\\ {{\small [output dim]}} \\end{{tabular}}}} & Metric & {header_methods} \\\\\n"
latex += "\\hline\n"

for dataset_raw, row in res.iterrows():
    dataset = dataset_map.get(dataset_raw, dataset_raw)

    

    def get_metric(metric, method):
        
        return (
            row[(f"{metric}_{method}", "mean")] if (f"{metric}_{method}", "mean") in row else np.nan,
            row[(f"{metric}_{method}", "sem")] if (f"{metric}_{method}", "sem") in row else np.nan,
        )

    ert_mean, ert_std = zip(*[get_metric("ERT", m) for m in methods])
    vol_mean, vol_std = zip(*[get_metric("volume", m) for m in methods])
    wsc_mean, wsc_std = zip(*[get_metric("WSC", m) for m in methods])
    cov_mean, cov_std = zip(*[get_metric("coverage", m) for m in methods])
    time_mean, time_std = zip(*[get_metric("time", m) for m in methods])

    ert_fmt = format_row_with_std(ert_mean, ert_std, mode="min")
    vol_fmt = format_row_with_std(vol_mean, vol_std, mode="min")
    wsc_fmt = format_row_with_std(wsc_mean, wsc_std, mode="max")
    cov_fmt = format_row_with_std(cov_mean, cov_std, mode=None)
    time_fmt = format_row_with_std(time_mean, time_std, mode="min")

    
    latex += f"\\multirow{{5}}{{*}}{{{{{dataset}}}}} & ERT [\%] & " + " & ".join(ert_fmt) + " \\\\\n"
    latex += f" & WSC & " + " & ".join(wsc_fmt) + " \\\\\n"
    latex += f" & Volume & " + " & ".join(vol_fmt) + " \\\\\n"
    latex += f" & Coverage & " + " & ".join(cov_fmt) + " \\\\\n"
    latex += f" & Time [s] & " + " & ".join(time_fmt) + " \\\\\n"
    latex += "\\hline\n"

latex += "\\end{tabular}\n"
latex += "\\label{tab:app:results:revealed:alpha01}\n"
latex += "\\end{table}\n"

print(latex)


\begin{table}[ht]
\centering
\small
\caption{Comparison of conditional coverage ($\alpha=0.1$) for partially reveled outputs for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better); volume, marginal coverage and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions. Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}
\begin{tabular}{l l c c c c c }
\hline
{\begin{tabular}{c} Dataset \\ {\small [output dim]} \end{tabular}} & Metric & Mahalanobis & HPD & ECM & OT & MVCS \\
\hline
\multirow{5}{*}{{Bias [2]}} & ERT [\%] & $\mathbf{1.74_{0.32}}$ & $\underline{1.98_{0.53}}$ & $4.03_{0.44}$ & $3.76_{0.37}$ & $3.31_{0.33}$ \\
 & WSC & $\mathbf{0.73_{0.01}}$ & $0.72_{0.02}$

In [None]:
import pandas as pd
import numpy as np

alpha = 0.05

df = pd.read_csv("results_revealed.csv")



mask = df["coverage_HDR_known"] == 1.0
df.loc[mask, ["coverage_HDR_known", "ERT_HDR_known", "WSC_HDR_known", "volume_HDR_known", 'time_HDR_known']] = np.nan
ert_cols = [c for c in df.columns if c.startswith("ERT_")]
df[ert_cols] = df[ert_cols] * 100

cols = df.select_dtypes(include=np.number).columns
mask = df.groupby(["alpha", "dataset"])[cols].transform(lambda s: s.isna().sum() >= 2)
df[cols] = df[cols].mask(mask)

methods = ["with_bayes_levelset", "HDR_known", "one_covariance_known", "ot_known", "MVCS_known"]

# dataset name mapping
dataset_map = {
    "biais": "Bias [2]",
    "casp": "CASP [2]",
    "house": "House [2]",
    "rf1": "rf1 [8]",
    "rf2": "rf2 [8]",
    "scm1d": "scm1d [16]",
    "scm20d": "scm20d [16]",
    "taxi": "Taxi [2]",
}

# method name mapping (for LaTeX header)
method_map = {
    "with_bayes_levelset": "Mahalanobis",
    "HDR_known": "HPD",
    "one_covariance_known": "ECM",
    "ot_known": "OT",
    "MVCS_known": "MVCS",
}

# keep alpha = 0.1 and compute means
res = ( 
    df[df["alpha"] == alpha] 
    .groupby("dataset") 
    .agg(["mean", "sem"]) 
)



def format_row_with_std(means, stds, mode="min"):
    """
    mode="min": smallest bold, second smallest underlined
    mode="max": largest bold, second largest underlined
    mode=None: no highlighting
    """
    means = np.array(means, dtype=float)
    stds = np.array(stds, dtype=float)

    valid = ~np.isnan(means)

    if mode == "min":
        order = np.argsort(means)
    elif mode == "max":
        order = np.argsort(-means)
    else:
        order = []

    best = order[0] if len(order) > 0 else None
    second = order[1] if len(order) > 1 else None

    out = []
    for i, (m, s) in enumerate(zip(means, stds)):
        if np.isnan(m):
            out.append("N/A")
        else:
            val = f"{m:.2f}_{{{s:.2f}}}"
            if mode is not None:
                if i == best:
                    val = f"\\mathbf{{{val}}}"
                elif i == second:
                    val = f"\\underline{{{val}}}"
            out.append("$"+val+"$")
    return out


# ===== Build LaTeX table =====latex += f"\\caption{{Comparison of methods for $\\alpha={alpha}$. Best values in bold, second best underlined.}}\n"


header_methods = " & ".join(method_map[m] for m in methods)

latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += f"\\caption{{Comparison of conditional coverage ($\\alpha=0.05$) for partially reveled outputs for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.95$ is better); volume, marginal coverage and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}}\n"
latex += "\\begin{tabular}{l l " + "c " * len(methods) + "}\n"
latex += "\\hline\n"
latex += f"{{\\begin{{tabular}}{{c}} Dataset \\\\ {{\small [output dim]}} \\end{{tabular}}}} & Metric & {header_methods} \\\\\n"
latex += "\\hline\n"

for dataset_raw, row in res.iterrows():
    dataset = dataset_map.get(dataset_raw, dataset_raw)

    

    def get_metric(metric, method):
        
        return (
            row[(f"{metric}_{method}", "mean")] if (f"{metric}_{method}", "mean") in row else np.nan,
            row[(f"{metric}_{method}", "sem")] if (f"{metric}_{method}", "sem") in row else np.nan,
        )

    ert_mean, ert_std = zip(*[get_metric("ERT", m) for m in methods])
    vol_mean, vol_std = zip(*[get_metric("volume", m) for m in methods])
    wsc_mean, wsc_std = zip(*[get_metric("WSC", m) for m in methods])
    cov_mean, cov_std = zip(*[get_metric("coverage", m) for m in methods])
    time_mean, time_std = zip(*[get_metric("time", m) for m in methods])

    ert_fmt = format_row_with_std(ert_mean, ert_std, mode="min")
    vol_fmt = format_row_with_std(vol_mean, vol_std, mode="min")
    wsc_fmt = format_row_with_std(wsc_mean, wsc_std, mode="max")
    cov_fmt = format_row_with_std(cov_mean, cov_std, mode=None)
    time_fmt = format_row_with_std(time_mean, time_std, mode="min")

    
    latex += f"\\multirow{{4}}{{*}}{{{{{dataset}}}}} & ERT [\%] & " + " & ".join(ert_fmt) + " \\\\\n"
    latex += f" & WSC & " + " & ".join(wsc_fmt) + " \\\\\n"
    latex += f" & Volume & " + " & ".join(vol_fmt) + " \\\\\n"
    latex += f" & Coverage & " + " & ".join(cov_fmt) + " \\\\\n"
    latex += f" & Time [s] & " + " & ".join(time_fmt) + " \\\\\n"
    latex += "\\hline\n"

latex += "\\end{tabular}\n"
latex += "\\label{tab:app:results:revealed:alpha005}\n"
latex += "\\end{table}\n"

print(latex)


\begin{table}[ht]
\centering
\small
\caption{Comparison of conditional coverage ($\alpha=0.05$) for partially reveled outputs for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.95$ is better); volume, marginal coverage and the running time to calibrate the predictions. Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}
\begin{tabular}{l l c c c c c }
\hline
{\begin{tabular}{c} Dataset \\ {\small [output dim]} \end{tabular}} & Metric & Mahalanobis & HPD & ECM & OT & MVCS \\
\hline
\multirow{4}{*}{{Bias [2]}} & ERT [\%] & $\mathbf{1.30_{0.22}}$ & $1.67_{0.23}$ & $2.25_{0.20}$ & $2.14_{0.34}$ & $\underline{1.54_{0.22}}$ \\
 & WSC & $0.75_{0.01}$ & $\mathbf{0.76_{0.02}}

In [None]:
import pandas as pd
import numpy as np

alpha = 0.1

df = pd.read_csv("results_projection.csv")


ert_cols = [c for c in df.columns if c.startswith("ERT_")]
df[ert_cols] = df[ert_cols] * 100

cols = df.select_dtypes(include=np.number).columns
mask = df.groupby(["alpha", "dataset"])[cols].transform(lambda s: s.isna().sum() >= 2)
df[cols] = df[cols].mask(mask)

methods = ["levelset", "one", "ot"]

# dataset name mapping
dataset_map = {
    "biais": "Bias",
    "casp": "CASP",
    "house": "House",
    "rf1": "rf1",
    "rf2": "rf2",
    "taxi": "Taxi",
}

# method name mapping (for LaTeX header)
method_map = {
    "levelset": "Mahalanobis",
    "one": "ECM",
    "ot": "OT",
}

# keep alpha = 0.1 and compute means
res = (
    df[df["alpha"] == alpha]
    .groupby("dataset")
    .agg(["mean", "sem"])
)


def format_row_with_std(means, stds, mode="min"):
    """
    mode="min": smallest bold, second smallest underlined
    mode="max": largest bold, second largest underlined
    mode=None: no highlighting
    """
    means = np.array(means, dtype=float)
    stds = np.array(stds, dtype=float)

    valid = ~np.isnan(means)

    if mode == "min":
        order = np.argsort(means)
    elif mode == "max":
        order = np.argsort(-means)
    else:
        order = []

    best = order[0] if len(order) > 0 else None
    second = order[1] if len(order) > 1 else None

    out = []
    for i, (m, s) in enumerate(zip(means, stds)):
        if np.isnan(m):
            out.append("N/A")
        else:
            val = f"{m:.2f}_{{{s:.2f}}}"
            if mode is not None:
                if i == best:
                    val = f"\\mathbf{{{val}}}"
                elif i == second:
                    val = f"\\underline{{{val}}}"
            out.append("$"+val+"$")
    return out


# ===== Build LaTeX table =====latex += f"\\caption{{Comparison of methods for $\\alpha={alpha}$. Best values in bold, second best underlined.}}\n"


header_methods = " & ".join(method_map[m] for m in methods)

latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += f"\\caption{{Comparison of conditional coverage ($\\alpha=0.1$) for a projection of the outputs for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better). Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}}\n"
latex += "\\begin{tabular}{l l " + "c " * len(methods) + "}\n"
latex += "\\hline\n"
latex += f"Dataset & Metric & {header_methods} \\\\\n"
latex += "\\hline\n"

for dataset_raw, row in res.iterrows():
    dataset = dataset_map.get(dataset_raw, dataset_raw)

    def get_metric(metric, method):
        return (
            row[(f"{metric}_{method}", "mean")] if (f"{metric}_{method}", "mean") in row else np.nan,
            row[(f"{metric}_{method}", "sem")] if (f"{metric}_{method}", "sem") in row else np.nan,
        )

    ert_mean, ert_std = zip(*[get_metric("ERT", m) for m in methods])
    vol_mean, vol_std = zip(*[get_metric("volume", m) for m in methods])
    wsc_mean, wsc_std = zip(*[get_metric("WSC", m) for m in methods])
    cov_mean, cov_std = zip(*[get_metric("coverage", m) for m in methods])

    ert_fmt = format_row_with_std(ert_mean, ert_std, mode="min")
    vol_fmt = format_row_with_std(vol_mean, vol_std, mode="min")
    wsc_fmt = format_row_with_std(wsc_mean, wsc_std, mode="max")
    cov_fmt = format_row_with_std(cov_mean, cov_std, mode=None)

    
    latex += f"\\multirow{{4}}{{*}}{{{{{dataset}}}}} & ERT [\%] & " + " & ".join(ert_fmt) + " \\\\\n"
    latex += f" & WSC & " + " & ".join(wsc_fmt) + " \\\\\n"
    latex += f" & Volume & " + " & ".join(vol_fmt) + " \\\\\n"
    latex += f" & Coverage & " + " & ".join(cov_fmt) + " \\\\\n"
    latex += "\\hline\n"

latex += "\\end{tabular}\n"
latex += "\\label{tab:app:results:projection:alpha01}\n"
latex += "\\end{table}\n"

print(latex)


\begin{table}[ht]
\centering
\small
\caption{Comparison of conditional coverage ($\alpha=0.1$) for a projection of the outputs for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better). Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}
\begin{tabular}{l l c c c }
\hline
Dataset & Metric & Mahalanobis & ECM & OT \\
\hline
\multirow{4}{*}{{Bias}} & ERT [\%] & $\mathbf{2.62_{0.39}}$ & $4.70_{0.49}$ & $\underline{3.64_{0.50}}$ \\
 & WSC & $\mathbf{0.73_{0.01}}$ & $0.72_{0.01}$ & $\underline{0.73_{0.01}}$ \\
 & Volume & $\mathbf{1.34_{0.15}}$ & $\underline{1.52_{0.16}}$ & $2.00_{0.22}$ \\
 & Coverage & $0.90_{0.01}$ & $0.90_{0.01}$ & $0.90_{0.01}$ \\
\hline
\mult

In [None]:
import pandas as pd
import numpy as np

alpha = 0.05

df = pd.read_csv("results_projection.csv")


ert_cols = [c for c in df.columns if c.startswith("ERT_")]
df[ert_cols] = df[ert_cols] * 100

cols = df.select_dtypes(include=np.number).columns
mask = df.groupby(["alpha", "dataset"])[cols].transform(lambda s: s.isna().sum() >= 2)
df[cols] = df[cols].mask(mask)

methods = ["levelset", "one", "ot"]

# dataset name mapping
dataset_map = {
    "biais": "Bias",
    "casp": "CASP",
    "house": "House",
    "rf1": "rf1",
    "rf2": "rf2",
    "taxi": "Taxi",
}

# method name mapping (for LaTeX header)
method_map = {
    "levelset": "Mahalanobis",
    "one": "ECM",
    "ot": "OT",
}

# keep alpha = 0.1 and compute means
res = (
    df[df["alpha"] == alpha]
    .groupby("dataset")
    .agg(["mean", "sem"])
)


def format_row_with_std(means, stds, mode="min"):
    """
    mode="min": smallest bold, second smallest underlined
    mode="max": largest bold, second largest underlined
    mode=None: no highlighting
    """
    means = np.array(means, dtype=float)
    stds = np.array(stds, dtype=float)

    valid = ~np.isnan(means)

    if mode == "min":
        order = np.argsort(means)
    elif mode == "max":
        order = np.argsort(-means)
    else:
        order = []

    best = order[0] if len(order) > 0 else None
    second = order[1] if len(order) > 1 else None

    out = []
    for i, (m, s) in enumerate(zip(means, stds)):
        if np.isnan(m):
            out.append("N/A")
        else:
            val = f"{m:.2f}_{{{s:.2f}}}"
            if mode is not None:
                if i == best:
                    val = f"\\mathbf{{{val}}}"
                elif i == second:
                    val = f"\\underline{{{val}}}"
            out.append("$"+val+"$")
    return out


# ===== Build LaTeX table =====latex += f"\\caption{{Comparison of methods for $\\alpha={alpha}$. Best values in bold, second best underlined.}}\n"


header_methods = " & ".join(method_map[m] for m in methods)

latex = ""
latex += "\\begin{table}[ht]\n"
latex += "\\centering\n"
latex += "\\small\n"
latex += f"\\caption{{Comparison of conditional coverage ($\\alpha=0.05$) for a projection of the outputs for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better). Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}}\n"
latex += "\\begin{tabular}{l l " + "c " * len(methods) + "}\n"
latex += "\\hline\n"
latex += f"Dataset & Metric & {header_methods} \\\\\n"
latex += "\\hline\n"

for dataset_raw, row in res.iterrows():
    dataset = dataset_map.get(dataset_raw, dataset_raw)

    def get_metric(metric, method):
        return (
            row[(f"{metric}_{method}", "mean")] if (f"{metric}_{method}", "mean") in row else np.nan,
            row[(f"{metric}_{method}", "sem")] if (f"{metric}_{method}", "sem") in row else np.nan,
        )

    ert_mean, ert_std = zip(*[get_metric("ERT", m) for m in methods])
    vol_mean, vol_std = zip(*[get_metric("volume", m) for m in methods])
    wsc_mean, wsc_std = zip(*[get_metric("WSC", m) for m in methods])
    cov_mean, cov_std = zip(*[get_metric("coverage", m) for m in methods])

    ert_fmt = format_row_with_std(ert_mean, ert_std, mode="min")
    vol_fmt = format_row_with_std(vol_mean, vol_std, mode="min")
    wsc_fmt = format_row_with_std(wsc_mean, wsc_std, mode="max")
    cov_fmt = format_row_with_std(cov_mean, cov_std, mode=None)

    
    latex += f"\\multirow{{4}}{{*}}{{{{{dataset}}}}} & ERT [\%] & " + " & ".join(ert_fmt) + " \\\\\n"
    latex += f" & WSC & " + " & ".join(wsc_fmt) + " \\\\\n"
    latex += f" & Volume & " + " & ".join(vol_fmt) + " \\\\\n"
    latex += f" & Coverage & " + " & ".join(cov_fmt) + " \\\\\n"
    latex += "\\hline\n"

latex += "\\end{tabular}\n"
latex += "\\label{tab:app:results:projection:alpha005}\n"
latex += "\\end{table}\n"

print(latex)


\begin{table}[ht]
\centering
\small
\caption{Comparison of conditional coverage ($\alpha=0.05$) for a projection of the outputs for different conformal methods using the ERT metric (lower is better) and WSC (closer to $0.9$ is better). Best values in bold, second best underlined. N/A indicates that the method failed to produce valid results for the corresponding dataset due to poor conditional density estimation leading to numerical issues in high dimensions Experiments are repeated 10 times, and the index number is the standard error across those 10 experiments.}
\begin{tabular}{l l c c c }
\hline
Dataset & Metric & Mahalanobis & ECM & OT \\
\hline
\multirow{4}{*}{{Bias}} & ERT [\%] & $\mathbf{0.93_{0.29}}$ & $\underline{2.20_{0.29}}$ & $2.35_{0.40}$ \\
 & WSC & $0.74_{0.01}$ & $\underline{0.75_{0.01}}$ & $\mathbf{0.77_{0.02}}$ \\
 & Volume & $\mathbf{1.64_{0.18}}$ & $\underline{1.96_{0.20}}$ & $2.47_{0.26}$ \\
 & Coverage & $0.95_{0.00}$ & $0.95_{0.00}$ & $0.96_{0.01}$ \\
\hline
\mul