In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

fl = pd.read_csv("FL_DeepEval.csv")
ll = pd.read_csv("LL_DeepEval.csv")
oll = pd.read_csv("Obscured_LL_DeepEval.csv")

fl["Environment"] = "Frozen Lake"
ll["Environment"] = "Lunar Lander"
oll["Environment"] = "Obscured Lunar Lander"

df = pd.concat([fl, ll, oll], ignore_index=True)

metrics = ["Prompt Alignment Score", "Correctness Score", "Helpfulness Score",
           "Conciseness Score", "Relevance Score", "Coherence Score"]

agg = df.groupby(["Environment", "PromptType", "LLM"])[metrics].agg(["mean","std"]).reset_index()
print(agg)

              Environment             PromptType       LLM  \
                                                             
0             Frozen Lake           LongDetailed  DeepSeek   
1             Frozen Lake           LongDetailed     GPT-5   
2             Frozen Lake           LongDetailed   Llama-3   
3             Frozen Lake          ShortSpecific  DeepSeek   
4             Frozen Lake          ShortSpecific     GPT-5   
5             Frozen Lake          ShortSpecific   Llama-3   
6            Lunar Lander           LongDetailed  DeepSeek   
7            Lunar Lander           LongDetailed     GPT-5   
8            Lunar Lander           LongDetailed   Llama-3   
9            Lunar Lander          ShortSpecific  DeepSeek   
10           Lunar Lander          ShortSpecific     GPT-5   
11           Lunar Lander          ShortSpecific   Llama-3   
12  Obscured Lunar Lander  ShortSpecificModified  DeepSeek   
13  Obscured Lunar Lander  ShortSpecificModified     GPT-5   
14  Obsc

In [23]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

os.makedirs("figures", exist_ok=True)

paths = {
    "FrozenLake": "FL_DeepEval.csv",
    "LunarLander": "LL_DeepEval.csv",
    "LunarLanderObscured": "Obscured_LL_DeepEval.csv"
}

dfs = []

for env_name, filename in paths.items():
    if os.path.exists(filename):
        df = pd.read_csv(filename)
        df["Environment"] = env_name
        dfs.append(df)

if len(dfs) == 0:
    raise FileNotFoundError("No evaluation CSVs found.")

df = pd.concat(dfs, ignore_index=True)

df["PromptType"] = df["PromptType"].astype(str).str.strip()
df["LLM"] = df["LLM"].astype(str).str.strip()

metrics = [
    "Prompt Alignment Score",
    "Correctness Score",
    "Helpfulness Score",
    "Conciseness Score",
    "Relevance Score",
    "Coherence Score"
]

mean_df = df.groupby(["Environment", "PromptType", "LLM"])[metrics].mean().reset_index()
count_df = df.groupby(["Environment", "PromptType", "LLM"]).size().reset_index(name="N")

melted = mean_df.melt(
    id_vars=["Environment", "PromptType", "LLM"],
    value_vars=metrics,
    var_name="Metric",
    value_name="MeanScore"
)


for env in sorted(df["Environment"].unique()):
    sub = melted[melted["Environment"] == env]


    g = sns.catplot(
        data=sub,
        x="LLM",
        y="MeanScore",
        hue="PromptType",
        col="Metric",
        kind="bar",
        col_wrap=3,
        height=3.8,
        aspect=1
    )

    for ax in g.axes.flatten():
        ax.set_xlabel("")
        ax.set_ylabel("Mean Score")

    g._legend.set_title("")
    g.fig.suptitle(f"{env} — Mean Scores by LLM and Prompt Type", y=1.02)
    plt.tight_layout()
    plt.savefig(f"figures/mean_scores_{env}.png", bbox_inches="tight")
    plt.close()

for metric in metrics:
    plt.figure(figsize=(9, 5))
    sns.boxplot(data=df, x="LLM", y=metric, hue="PromptType")
    plt.title(f"Distribution of {metric} by LLM and Prompt Type")
    plt.xlabel("")
    plt.ylabel(metric)
    plt.legend(title="")
    plt.tight_layout()
    plt.savefig(f"figures/boxplot_{metric.replace(' ', '_')}.png", bbox_inches="tight")
    plt.close()

for env in sorted(df["Environment"].unique()):
    env_subset = mean_df[mean_df["Environment"] == env]

    for metric in metrics:
        pivot = env_subset.pivot(index="LLM", columns="PromptType", values=metric)

        if pivot.empty:
            continue

        plt.figure(figsize=(6, max(2, 0.8 * pivot.shape[0])))
        sns.heatmap(
            pivot,
            annot=True,
            fmt=".2f",
            cbar_kws={"label": metric},
            linewidths=0.5
        )
        plt.xlabel("Prompt Type")
        plt.ylabel("LLM")
        plt.title(f"{env} — Mean {metric}")
        plt.tight_layout()
        plt.savefig(f"figures/heatmap_{env}_{metric.replace(' ', '_')}.png", bbox_inches="tight")
        plt.close()

corr = df[metrics].corr()
plt.figure(figsize=(7, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag", center=0)
plt.title("Correlation Matrix of Evaluation Metrics")
plt.tight_layout()
plt.savefig("figures/correlation_matrix.png", bbox_inches="tight")
plt.close()


regular_envs = df[df["Environment"] != "LunarLanderObscured"]

mean_df_regular = mean_df[mean_df["Environment"] != "LunarLanderObscured"]

pivot_multi = mean_df_regular.pivot_table(
    index=["Environment", "LLM"],
    columns="PromptType",
    values=metrics
)

pivot_multi.columns = [f"{metric}__{prompt}" for metric, prompt in pivot_multi.columns]

diff_df = {}

for metric in metrics:
    col_long = f"{metric}__LongDetailed"
    col_short = f"{metric}__ShortSpecific"
    if col_long in pivot_multi.columns and col_short in pivot_multi.columns:
        diff_df[metric] = pivot_multi[col_long] - pivot_multi[col_short]

diff_df = pd.DataFrame(diff_df)

if not diff_df.empty:
    mean_gain = diff_df.mean().sort_values(ascending=False)

    plt.figure(figsize=(8, 4))
    mean_gain.plot(kind="bar")
    plt.title("Average difference of Long Detailed over Short Specific (Original Lunar + FrozenLake)")
    plt.ylabel("Mean Score Difference")
    plt.tight_layout()
    plt.savefig("figures/mean_difference_bar.png", bbox_inches="tight")
    plt.close()

for env in sorted(df["Environment"].unique()):
    c = count_df[count_df["Environment"] == env].pivot(
        index="LLM", columns="PromptType", values="N"
    ).fillna(0)

    plt.figure(figsize=(6, max(2, 0.8 * c.shape[0])))
    sns.heatmap(c, annot=True, fmt=".0f", cmap="Blues")
    plt.xlabel("Prompt Type")
    plt.ylabel("LLM")
    plt.title(f"{env} — Example Counts per LLM and Prompt Type")
    plt.tight_layout()
    plt.savefig(f"figures/counts_{env}.png", bbox_inches="tight")
    plt.close()

for metric in metrics:
    plt.figure(figsize=(10, 5))
    sns.pointplot(
        data=mean_df,
        x="LLM",
        y=metric,
        hue="PromptType",
        dodge=0.4,
        join=False
    )
    plt.title(f"Mean {metric} by LLM and Prompt Type")
    plt.xlabel("")
    plt.ylabel(metric)
    plt.legend(title="")
    plt.tight_layout()
    plt.savefig(f"figures/pointplot_{metric.replace(' ', '_')}.png", bbox_inches="tight")
    plt.close()



The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(

The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`.

  sns.pointplot(


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_rel, wilcoxon
from math import sqrt

sns.set(style="whitegrid")

os.makedirs("figures/LL_comparison", exist_ok=True)

orig_path = "LL_DeepEval.csv"
mod_path = "Obscured_LL_DeepEval.csv"

orig = pd.read_csv(orig_path)
mod = pd.read_csv(mod_path)

orig["LLM"] = orig["LLM"].astype(str).str.strip()
mod["LLM"] = mod["LLM"].astype(str).str.strip()

orig = orig[orig["PromptType"].isin(["ShortSpecific", "LongDetailed"])]

mod = mod[mod["PromptType"] == "ShortSpecificModified"]

metrics = [
    "Prompt Alignment Score",
    "Correctness Score",
    "Helpfulness Score",
    "Conciseness Score",
    "Relevance Score",
    "Coherence Score"
]

matched = orig.merge(
    mod,
    on=["Episode", "LLM"],
    suffixes=("_orig", "_mod"),
    how="inner"
)

if matched.empty:
    raise ValueError("No overlapping episodes between LL and Obscured LL! Check seeds or file contents.")

print(f"Matched episodes: {matched['Episode'].nunique()}")

for metric in metrics:
    plt.figure(figsize=(10,5))

    data = matched[["LLM", f"{metric}_orig", f"{metric}_mod"]]
    melted = pd.melt(
        data,
        id_vars="LLM",
        value_vars=[f"{metric}_orig", f"{metric}_mod"],
        var_name="Version",
        value_name="Score"
    )
    melted["Version"] = melted["Version"].map({
        f"{metric}_orig": "Original",
        f"{metric}_mod": "Obscured"
    })

    sns.barplot(data=melted, x="LLM", y="Score", hue="Version")
    plt.title(f"{metric}: Original vs Obscured Lunar Lander")
    plt.tight_layout()
    plt.savefig(f"figures/LL_comparison/{metric.replace(' ','_')}_bar.png", bbox_inches="tight")
    plt.close()

for metric in metrics:
    matched[f"{metric}_diff"] = matched[f"{metric}_mod"] - matched[f"{metric}_orig"]

    plt.figure(figsize=(10,5))
    sns.barplot(data=matched, x="LLM", y=f"{metric}_diff")
    plt.axhline(0, color="black", linewidth=1)
    plt.title(f"Difference ({metric}): Obscured − Original")
    plt.ylabel("Score Difference")
    plt.tight_layout()
    plt.savefig(f"figures/LL_comparison/{metric.replace(' ','_')}_difference.png", bbox_inches="tight")
    plt.close()

for metric in metrics:
    plt.figure(figsize=(6,6))
    sns.scatterplot(
        x=matched[f"{metric}_orig"],
        y=matched[f"{metric}_mod"],
        hue=matched["LLM"]
    )
    plt.plot([0,1],[0,1], "--", color="black")
    plt.xlabel("Original")
    plt.ylabel("Obscured")
    plt.title(f"{metric}: Original vs Obscured (paired scatter)")
    plt.tight_layout()
    plt.savefig(f"figures/LL_comparison/{metric.replace(' ','_')}_scatter.png", bbox_inches="tight")
    plt.close()

stats_results = []

def cohens_d(x, y):
    return (np.mean(y) - np.mean(x)) / np.std(y - x, ddof=1)

for metric in metrics:
    x = matched[f"{metric}_orig"]
    y = matched[f"{metric}_mod"]

    t_stat, p_ttest = ttest_rel(y, x)

    w_stat, p_wilcox = wilcoxon(y - x)

    d = cohens_d(x, y)

    diff = (y - x)
    mean_diff = diff.mean()
    ci_low = mean_diff - 1.96 * diff.std(ddof=1) / sqrt(len(diff))
    ci_high = mean_diff + 1.96 * diff.std(ddof=1) / sqrt(len(diff))

    stats_results.append({
        "Metric": metric,
        "T-test p": p_ttest,
        "Wilcoxon p": p_wilcox,
        "Cohen d": d,
        "Mean diff": mean_diff,
        "CI 95% low": ci_low,
        "CI 95% high": ci_high
    })

stats_df = pd.DataFrame(stats_results)
stats_df.to_csv("figures/LL_comparison/LL_original_vs_obscured_statistics.csv", index=False)

print(stats_df)


Matched episodes: 44
                   Metric      T-test p    Wilcoxon p   Cohen d  Mean diff  \
0  Prompt Alignment Score  1.084777e-03  1.287816e-03  0.203360   0.092424   
1       Correctness Score  4.825080e-04  4.784445e-05 -0.217534  -0.026809   
2       Helpfulness Score  2.324693e-01  2.792826e-03  0.073657   0.016083   
3       Conciseness Score  1.649081e-16  1.429083e-16  0.542658   0.118545   
4         Relevance Score  1.570890e-01  9.935268e-01  0.087332   0.019132   
5         Coherence Score  3.530438e-01  3.389745e-02  0.057259   0.011731   

   CI 95% low  CI 95% high  
0    0.037600     0.147249  
1   -0.041675    -0.011943  
2   -0.010257     0.042423  
3    0.092193     0.144897  
4   -0.007295     0.045559  
5   -0.012983     0.036444  


In [None]:
import pandas as pd

fl = pd.read_csv("FL_DeepEval.csv")
ll = pd.read_csv("LL_DeepEval.csv")

fl["Environment"] = "Frozen Lake"
ll["Environment"] = "Lunar Lander"

df = pd.concat([fl, ll], ignore_index=True)

metrics = [
    "Prompt Alignment Score",
    "Correctness Score",
    "Helpfulness Score",
    "Conciseness Score",
    "Relevance Score",
    "Coherence Score",
]

means = (
    df.groupby(["Environment", "PromptType", "LLM"])[metrics]
      .mean()
      .reset_index()
)

long_df = means[means["PromptType"] == "LongDetailed"]
short_df = means[means["PromptType"] == "ShortSpecific"]

merged = pd.merge(
    long_df,
    short_df,
    on=["Environment", "LLM"],
    suffixes=("_Long", "_Short")
)

diffs = pd.DataFrame({
    "Environment": merged["Environment"],
    "LLM": merged["LLM"],
})

for metric in metrics:
    diffs[metric.replace(" Score", "")] = (
        merged[f"{metric}_Long"] - merged[f"{metric}_Short"]
    )

diffs = diffs.round(2)

print(diffs)

    Environment       LLM  Prompt Alignment  Correctness  Helpfulness  \
0   Frozen Lake  DeepSeek             -0.34        -0.09        -0.15   
1   Frozen Lake     GPT-5             -0.68        -0.03         0.01   
2   Frozen Lake   Llama-3             -0.34        -0.14        -0.03   
3  Lunar Lander  DeepSeek             -0.14        -0.01        -0.01   
4  Lunar Lander     GPT-5             -0.23        -0.04        -0.00   
5  Lunar Lander   Llama-3             -0.30         0.04        -0.07   

   Conciseness  Relevance  Coherence  
0         0.01      -0.04      -0.20  
1        -0.09       0.00       0.00  
2        -0.15      -0.04      -0.06  
3        -0.33      -0.00      -0.01  
4        -0.13      -0.00      -0.01  
5        -0.23      -0.07      -0.07  


In [None]:
import pandas as pd

ll = pd.read_csv("LL_DeepEval.csv")
oll = pd.read_csv("Obscured_LL_DeepEval.csv")

ll["Environment"] = "Lunar Lander"
oll["Environment"] = "Obscured Lunar Lander"

df = pd.concat([ll, oll], ignore_index=True)

metrics = [
    "Prompt Alignment Score",
    "Correctness Score",
    "Helpfulness Score",
    "Conciseness Score",
    "Relevance Score",
    "Coherence Score",
]

means = (
    df.groupby(["Environment", "PromptType", "LLM"])[metrics]
      .mean()
      .reset_index()
)

orig = means[
    (means["Environment"] == "Lunar Lander") &
    (means["PromptType"] == "ShortSpecific")
]

obsc = means[
    (means["Environment"] == "Obscured Lunar Lander") &
    (means["PromptType"] == "ShortSpecificModified")
]

merged = pd.merge(
    obsc,
    orig,
    on="LLM",
    suffixes=("_Obscured", "_Original")
)

diffs = pd.DataFrame({
    "Environment": "Obscured vs Original Lunar Lander",
    "LLM": merged["LLM"],
})

for metric in metrics:
    diffs[metric.replace(" Score", "")] = (
        merged[f"{metric}_Obscured"] - merged[f"{metric}_Original"]
    )

diffs = diffs.round(2)

print(diffs)

                         Environment       LLM  Prompt Alignment  Correctness  \
0  Obscured vs Original Lunar Lander  DeepSeek              0.07         0.01   
1  Obscured vs Original Lunar Lander     GPT-5              0.02        -0.04   
2  Obscured vs Original Lunar Lander   Llama-3             -0.14        -0.05   

   Helpfulness  Conciseness  Relevance  Coherence  
0        -0.00        -0.01      -0.00      -0.00  
1        -0.00        -0.01      -0.00      -0.01  
2         0.01         0.03       0.02       0.00  


In [24]:
import pandas as pd
from scipy.stats import ttest_rel

fl = pd.read_csv("FL_DeepEval.csv")
ll = pd.read_csv("LL_DeepEval.csv")
oll = pd.read_csv("Obscured_LL_DeepEval.csv")

fl["Environment"] = "Frozen Lake"
ll["Environment"] = "Lunar Lander"
oll["Environment"] = "Obscured Lunar Lander"

metrics = [
    "Prompt Alignment Score",
    "Correctness Score",
    "Helpfulness Score",
    "Conciseness Score",
    "Relevance Score",
    "Coherence Score",
]

def compute_paired_ttests(df1, df2, env_label):
    merged = pd.merge(df1, df2, on="LLM", suffixes=("_1", "_2"))
    results = []

    for llm in merged["LLM"].unique():
        row = merged[merged["LLM"] == llm]
        for metric in metrics:
            t_stat, p_val = ttest_rel(row[f"{metric}_1"].values, row[f"{metric}_2"].values)
            results.append([env_label, llm, metric, p_val])

    for metric in metrics:
        t_stat, p_val = ttest_rel(merged[[f"{metric}_1"]].values.flatten(),
                                  merged[[f"{metric}_2"]].values.flatten())
        results.append([env_label, "All LLMs", metric, p_val])

    return pd.DataFrame(results, columns=["Environment", "Model", "Metric", "p-value"])

fl_long = fl[fl["PromptType"] == "LongDetailed"]
fl_short = fl[fl["PromptType"] == "ShortSpecific"]
fl_ttests_df = compute_paired_ttests(fl_long, fl_short, "Frozen Lake")

ll_short = ll[ll["PromptType"] == "ShortSpecific"]
oll_short = oll[oll["PromptType"] == "ShortSpecificModified"]
ll_ttests_df = compute_paired_ttests(oll_short, ll_short, "Obscured vs Original Lunar Lander")

ttest_results = pd.concat([fl_ttests_df, ll_ttests_df], ignore_index=True)
ttest_results["p-value"] = ttest_results["p-value"].apply(lambda x: f"{x:.2e}")

print(ttest_results)

                          Environment     Model                  Metric  \
0                         Frozen Lake     GPT-5  Prompt Alignment Score   
1                         Frozen Lake     GPT-5       Correctness Score   
2                         Frozen Lake     GPT-5       Helpfulness Score   
3                         Frozen Lake     GPT-5       Conciseness Score   
4                         Frozen Lake     GPT-5         Relevance Score   
5                         Frozen Lake     GPT-5         Coherence Score   
6                         Frozen Lake   Llama-3  Prompt Alignment Score   
7                         Frozen Lake   Llama-3       Correctness Score   
8                         Frozen Lake   Llama-3       Helpfulness Score   
9                         Frozen Lake   Llama-3       Conciseness Score   
10                        Frozen Lake   Llama-3         Relevance Score   
11                        Frozen Lake   Llama-3         Coherence Score   
12                       

In [22]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

sns.set(style="whitegrid")
os.makedirs("figures/mean_scores_multi", exist_ok=True)

paths = {
    "FrozenLake": "FL_DeepEval.csv",
    "LunarLander": "LL_DeepEval.csv",
    "LunarLanderObscured": "Obscured_LL_DeepEval.csv"
}

dfs = []
for env_name, filename in paths.items():
    if os.path.exists(filename):
        df = pd.read_csv(filename)
        df["Environment"] = env_name
        dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
df["PromptType"] = df["PromptType"].astype(str).str.strip()
df["LLM"] = df["LLM"].astype(str).str.strip()

metrics = [
    "Prompt Alignment Score",
    "Correctness Score",
    "Helpfulness Score",
    "Conciseness Score",
    "Relevance Score",
    "Coherence Score"
]

grouped = df.groupby(["Environment", "PromptType", "LLM"])[metrics]
mean_df = grouped.mean().reset_index()
sem_df = pd.DataFrame()

for env, prompt, llm in mean_df[["Environment","PromptType","LLM"]].values:
    subset = df[(df["Environment"]==env) & (df["PromptType"]==prompt) & (df["LLM"]==llm)]
    sems = []
    for metric in metrics:
        if metric == "Prompt Alignment Score":
            mean_val = subset[metric].mean()
            N = len(subset)
            sem = np.sqrt(mean_val*(1-mean_val)/N)
        else:
            sem = subset[metric].std(ddof=1)/np.sqrt(len(subset))
        sems.append(sem)
    sem_df = pd.concat([sem_df, pd.DataFrame([[env,prompt,llm]+sems], columns=["Environment","PromptType","LLM"]+metrics)], ignore_index=True)

melted_mean = mean_df.melt(id_vars=["Environment","PromptType","LLM"], value_vars=metrics, var_name="Metric", value_name="MeanScore")
melted_sem = sem_df.melt(id_vars=["Environment","PromptType","LLM"], value_vars=metrics, var_name="Metric", value_name="SEM")
plot_df = melted_mean.merge(melted_sem, on=["Environment","PromptType","LLM","Metric"])

for env in plot_df["Environment"].unique():
    sub = plot_df[plot_df["Environment"]==env]
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes = axes.flatten()
    for i, metric in enumerate(metrics):
        ax = axes[i]
        metric_sub = sub[sub["Metric"]==metric]
        sns.barplot(data=metric_sub, x="LLM", y="MeanScore", hue="PromptType", ci=None, ax=ax)
        for j, row in metric_sub.iterrows():
            x_positions = list(metric_sub["LLM"].unique())
            x = x_positions.index(row["LLM"]) + (0 if row["PromptType"] in ["ShortSpecific","ShortSpecificModified"] else 0.2)
            ax.errorbar(x, row["MeanScore"], yerr=row["SEM"], fmt="none", c='black', capsize=4)
        ax.set_title(metric, fontsize=14)
        ax.set_xlabel("")
        ax.set_ylabel("Mean Score", fontsize=12)
        ax.tick_params(axis='x', labelsize=12)
        ax.tick_params(axis='y', labelsize=12)
    handles, labels = ax.get_legend_handles_labels()
    fig.legend(handles, labels, title="", loc="upper left", ncol=2, fontsize=12)
    fig.suptitle(f"{env} — Mean Scores by LLM and Prompt Type", fontsize=16)
    plt.tight_layout(rect=[0,0,1,0.95])
    safe_env = env.replace(" ", "_")
    plt.savefig(f"figures/mean_scores_multi/{safe_env}_all_metrics.png", bbox_inches="tight")
    plt.close()



The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=metric_sub, x="LLM", y="MeanScore", hue="PromptType", ci=None, ax=ax)

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=metric_sub, x="LLM", y="MeanScore", hue="PromptType", ci=None, ax=ax)

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=metric_sub, x="LLM", y="MeanScore", hue="PromptType", ci=None, ax=ax)

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=metric_sub, x="LLM", y="MeanScore", hue="PromptType", ci=None, ax=ax)

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=metric_sub, x="LLM", y="MeanScore", hue="PromptType", ci=None, ax=ax)

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(data=metric_sub, x="LLM", y="MeanScore", hue="PromptType", ci=None, ax=ax)

The

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
os.makedirs("figures/correlation", exist_ok=True)

metrics = [
    "Prompt Alignment Score",
    "Correctness Score",
    "Helpfulness Score",
    "Conciseness Score",
    "Relevance Score",
    "Coherence Score"
]

orig = pd.read_csv("LL_DeepEval.csv")
orig["PromptType"] = orig["PromptType"].astype(str).str.strip()
orig["LLM"] = orig["LLM"].astype(str).str.strip()

orig_corr = orig[metrics].corr()

plt.figure(figsize=(7, 6))
sns.heatmap(
    orig_corr,
    annot=True,
    fmt=".2f",
    cmap="vlag",
    center=0,
    square=True,
    cbar_kws={"label": "Correlation"}
)
plt.title("Original Lunar Lander — Metric Correlation Matrix", fontsize=14)
plt.tight_layout()
plt.savefig("figures/correlation/original_lunarlander_corr.png", bbox_inches="tight")
plt.close()

obs = pd.read_csv("Obscured_LL_DeepEval.csv")
obs["PromptType"] = obs["PromptType"].astype(str).str.strip()
obs["LLM"] = obs["LLM"].astype(str).str.strip()

obs_corr = obs[metrics].corr()

plt.figure(figsize=(7, 6))
sns.heatmap(
    obs_corr,
    annot=True,
    fmt=".2f",
    cmap="vlag",
    center=0,
    square=True,
    cbar_kws={"label": "Correlation"}
)
plt.title("Obscured Lunar Lander — Metric Correlation Matrix", fontsize=14)
plt.tight_layout()
plt.savefig("figures/correlation/obscured_lunarlander_corr.png", bbox_inches="tight")
plt.close()


In [13]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
os.makedirs("figures/prompt_alignment", exist_ok=True)

files = {
    "Frozen Lake": "FL_DeepEval.csv",
    "Original Lunar Lander": "LL_DeepEval.csv",
    "Obscured Lunar Lander": "Obscured_LL_DeepEval.csv"
}

metric = "Prompt Alignment Score"
all_data = []

for env, path in files.items():
    df = pd.read_csv(path)

    df["PromptType"] = df["PromptType"].astype(str).str.strip()
    df["LLM"] = df["LLM"].astype(str).str.strip()

    grouped = df.groupby(["LLM", "PromptType"])[metric]
    means = grouped.mean()
    ses = grouped.apply(lambda x: np.sqrt(x.mean() * (1 - x.mean()) / len(x)))

    summary = pd.DataFrame({
        "Mean": means,
        "SE": ses,
        "Environment": env
    }).reset_index()

    all_data.append(summary)

df_all = pd.concat(all_data, ignore_index=True)

plt.figure(figsize=(12, 6))
sns.barplot(
    data=df_all,
    x="Environment",
    y="Mean",
    hue="PromptType",
    ci=None,
    palette="Set2"
)

for i, row in df_all.iterrows():
    env_idx = list(df_all["Environment"].unique()).index(row["Environment"])
    prompt_types = df_all["PromptType"].unique()
    hue_idx = list(prompt_types).index(row["PromptType"])
    total_prompts = len(prompt_types)
    offset = (hue_idx - total_prompts/2) * 0.2 + 0.1
    plt.errorbar(
        x=env_idx + offset,
        y=row["Mean"],
        yerr=row["SE"],
        fmt='none',
        c='black',
        capsize=4
    )

plt.ylim(0, 1)
plt.ylabel("Mean Prompt Alignment")
plt.title("Prompt Alignment Across Environments, LLMs, and Prompt Types")
plt.legend(title="Prompt Type")
plt.tight_layout()
plt.savefig("figures/prompt_alignment/all_environments_prompt_alignment.png", bbox_inches="tight")
plt.close()


The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(


In [15]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")
os.makedirs("figures/prompt_alignment", exist_ok=True)

files = {
    "Frozen Lake": "FL_DeepEval.csv",
    "Original Lunar Lander": "LL_DeepEval.csv",
    "Obscured Lunar Lander": "Obscured_LL_DeepEval.csv"
}

metric = "Prompt Alignment Score"
all_data = []

for env, path in files.items():
    df = pd.read_csv(path)
    df["PromptType"] = df["PromptType"].astype(str).str.strip()
    df["LLM"] = df["LLM"].astype(str).str.strip()

    grouped = df.groupby(["LLM", "PromptType"])[metric]
    means = grouped.mean()
    ses = grouped.apply(lambda x: np.sqrt(x.mean() * (1 - x.mean()) / len(x)))

    summary = pd.DataFrame({
        "Mean": means,
        "SE": ses,
        "Environment": env
    }).reset_index()

    all_data.append(summary)

df_all = pd.concat(all_data, ignore_index=True)

g = sns.catplot(
    data=df_all,
    x="LLM",
    y="Mean",
    hue="PromptType",
    col="Environment",
    kind="bar",
    palette="Set2",
    ci=None,
    height=5,
    aspect=1
)

for ax, env in zip(g.axes.flat, df_all["Environment"].unique()):
    env_data = df_all[df_all["Environment"] == env]
    for i, row in env_data.iterrows():
        llm_idx = list(env_data["LLM"].unique()).index(row["LLM"])
        prompt_idx = list(env_data["PromptType"].unique()).index(row["PromptType"])
        total_prompts = len(env_data["PromptType"].unique())
        offset = (prompt_idx - total_prompts / 2) * 0.8 / total_prompts + 0.4 / total_prompts
        ax.errorbar(
            x=llm_idx + offset,
            y=row["Mean"],
            yerr=row["SE"],
            fmt='none',
            c='black',
            capsize=4
        )

g.set(ylim=(0, 1))
g.set_axis_labels("LLM", "Mean Prompt Alignment")
g.set_titles("{col_name}")
g.fig.suptitle("Prompt Alignment Across Environments, LLMs, and Prompt Types", y=1.05)
plt.tight_layout()
plt.savefig("figures/prompt_alignment/all_environments_prompt_alignment.png", bbox_inches="tight")
plt.close()



The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  g = sns.catplot(


In [23]:
import pandas as pd

files = {
    "Frozen Lake": "FL_DeepEval.csv",
    "Original Lunar Lander": "LL_DeepEval.csv",
    "Obscured Lunar Lander": "Obscured_LL_DeepEval.csv"
}

bands = {
    "Low (0--0.2)": (0.0, 0.2),
    "Medium (0.3--0.5)": (0.3, 0.5),
    "High (0.8--1.0)": (0.8, 1.0)
}

metrics = {
    "Prompt Alignment": "Prompt Alignment Score",
    "Correctness": "Correctness Score",
    "Helpfulness": "Helpfulness Score",
    "Conciseness": "Conciseness Score",
    "Relevance": "Relevance Score",
    "Coherence": "Coherence Score"
}

def in_band(s, lo, hi):
    return (s >= lo) & (s <= hi)

def escape_latex(t):
    return (
        str(t)
        .replace("\\", "\\textbackslash{}")
        .replace("&", "\\&")
        .replace("%", "\\%")
        .replace("$", "\\$")
        .replace("#", "\\#")
        .replace("_", "\\_")
        .replace("{", "\\{")
        .replace("}", "\\}")
        .replace("~", "\\textasciitilde{}")
        .replace("^", "\\textasciicircum{}")
    )

with open("appendix_qualitative_examples.tex", "w") as f:
    f.write("\\section{Automatically Selected Qualitative Examples}\n")
    f.write("\\label{app:qualitative_examples}\n")

    for env, path in files.items():
        df = pd.read_csv(path)
        df["LLM"] = df["LLM"].astype(str).str.strip()
        df["PromptType"] = df["PromptType"].astype(str).str.strip()

        f.write(f"\\subsection{{{env}}}\n")

        for metric_name, col in metrics.items():

            if metric_name == "Prompt Alignment":
                selections = [(0.0, "0"), (1.0, "1")]
            else:
                selections = bands.items()

            for label, rng in selections:

                if metric_name == "Prompt Alignment":
                    subset = df[df[col] == label]
                    title = f"{metric_name} = {rng}"
                else:
                    lo, hi = rng
                    subset = df[in_band(df[col], lo, hi)]
                    title = f"{metric_name} -- {label}"

                if subset.empty:
                    continue

                r = subset.iloc[0]
                text = escape_latex(r["Explanation"])

                f.write(rf"""
\begin{{tcolorbox}}[
    colback=gray!5,
    colframe=black,
    width=\linewidth,
    breakable,
    enhanced,
    boxrule=0.6pt
]
\justifying
\tiny
\textbf{{LLM:}} {r["LLM"]} \\
\textbf{{Prompt Type:}} {r["PromptType"]} \\[2pt]
\textbf{{Metric:}} {title} \\[2pt]
\textit{{Explanation:}} ``{text}''
\end{{tcolorbox}}
""")