In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Results table to read, and the names of its key columns.
csv_path = "rq2_results_relative.csv"
p_col = "p"          # plotted on the x-axis as "Homogeneity (p)"
n_col = "N"          # appears in every legend entry as "N=<value>"
trial_col = "trial"  # repeated runs that get collapsed during aggregation

# -----------------------
# Plot 1: split A and B (Relative HR)
# -----------------------
hr_cols_A = ["rA_prom", "rA_demo"]
hr_cols_B = ["rB_prom", "rB_demo"]
# Relative-HR series are labelled with their raw column names.
label_map_hr_A = dict(zip(hr_cols_A, hr_cols_A))
label_map_hr_B = dict(zip(hr_cols_B, hr_cols_B))

# -----------------------
# Plot 2: split A and B (selected CT combos only)
# -----------------------
ct_cols_A = ["ctA_pp", "ctA_dd", "ctA_pd"]
ct_cols_B = ["ctB_pp", "ctB_dd", "ctB_pd"]

# Legend text shown for each CT column.
label_map_ct_A = dict([
    ("ctA_pp", "CT(a↑ 1,b↑ 2)"),
    ("ctA_dd", "CT(a↓ 1,b↓ 2)"),
    ("ctA_pd", "CT(a↑ 1,b↓ 2)"),
])
label_map_ct_B = dict([
    ("ctB_pp", "CT(a↑ 2,b↑ 1)"),
    ("ctB_dd", "CT(a↓ 2,b↓ 1)"),
    ("ctB_pd", "CT(a↑ 2,b↓ 1)"),
])

# Filters: observations outside [MIN_VAL, MAX_VAL] are dropped before aggregation.
MIN_VAL = -15   # drop values < -15
MAX_VAL = 20    # drop values > 20

# Critical value for a two-sided 95% interval: Student t when SciPy imports
# cleanly, otherwise the constant 1.96.
try:
    from scipy import stats
except Exception:
    def tcrit(df):
        """Fallback used when SciPy cannot be imported: return 1.96 for any ``df``."""
        return 1.96
else:
    def tcrit(df):
        """Return the 0.975 quantile of Student's t with ``df`` degrees of freedom."""
        return stats.t.ppf(0.975, df)

def add_ci(df, mean_col="mean"):
    """
    Return a copy of ``df`` with confidence-interval columns appended.

    Reads the ``n`` and ``std`` columns (plus ``mean_col``) and adds:
      * ``halfwidth`` = tcrit(n - 1) * std / sqrt(n), or NaN when n <= 1 or std is missing
      * ``lower``     = mean_col - halfwidth
      * ``upper``     = mean_col + halfwidth

    The input frame is not modified.
    """
    out = df.copy()
    # One half-width per row; the guard is evaluated first so tcrit is never
    # called for rows that cannot support an interval.
    halfwidths = [
        tcrit(count - 1) * spread / np.sqrt(count)
        if (count > 1 and pd.notna(spread))
        else np.nan
        for count, spread in zip(out["n"], out["std"])
    ]
    out["halfwidth"] = np.asarray(halfwidths, dtype=float)
    out["lower"] = out[mean_col] - out["halfwidth"]
    out["upper"] = out[mean_col] + out["halfwidth"]
    return out

def summarize_across_trials(df, value_cols, family_name=""):
    """
    Aggregate the trial-level columns in ``value_cols`` over trials.

    The frame is reshaped to long form (one row per metric/N/p/trial), values
    outside [MIN_VAL, MAX_VAL] are discarded, and for every (metric, N, p)
    group the mean, sample standard deviation (ddof=1), count and CI columns
    from ``add_ci`` are computed. The N and p columns are coerced to numeric
    and the result is sorted by metric, N, p.

    ``family_name`` is accepted for call-site readability; the body does not use it.
    """
    def _sample_std(values):
        return values.std(ddof=1)

    tidy = pd.melt(
        df,
        id_vars=[n_col, p_col, trial_col],
        value_vars=value_cols,
        var_name="metric",
        value_name="value",
    )
    # Inclusive on both ends; missing values fail the test and are dropped too.
    within_limits = tidy["value"].between(MIN_VAL, MAX_VAL)
    kept = tidy[within_limits]

    per_group = kept.groupby(["metric", n_col, p_col])["value"]
    table = per_group.agg(mean="mean", std=_sample_std, n="count").reset_index()
    table = add_ci(table, "mean")

    for key in (p_col, n_col):
        table[key] = pd.to_numeric(table[key], errors="coerce")
    return table.sort_values(["metric", n_col, p_col])

def plot_family(summary, ylabel, label_map, baseline=None, outfile=None, legend_title="Condition, N"):
    """
    Draw one line (mean vs. p) per (metric, N) pair, with a shaded CI band.

    summary      : frame with "metric", "mean", "lower", "upper" and the N / p columns.
    ylabel       : y-axis label.
    label_map    : metric name -> legend text; metrics not in the map keep their raw name.
    baseline     : optional y value drawn as a solid black horizontal line.
    outfile      : optional path; when truthy the figure is saved there at 300 dpi.
    legend_title : title shown above the legend.
    """
    data = summary.copy()
    # Legend entries read "<metric pretty>, N=<N>".
    data[n_col] = data[n_col].astype("Int64")
    pretty_metric = data["metric"].map(label_map).fillna(data["metric"])
    data["series"] = pretty_metric + ", N=" + data[n_col].astype(str)

    palette = plt.cm.tab20.colors

    plt.figure(figsize=(11, 5))
    # Series are drawn in order of first appearance; colours cycle through the palette.
    for idx, series_name in enumerate(data["series"].unique().tolist()):
        rows = data.loc[data["series"] == series_name].sort_values(p_col)
        xs = rows[p_col].values
        means = rows["mean"].values
        band_lo = rows["lower"].values
        band_hi = rows["upper"].values
        shade = palette[idx % len(palette)]

        plt.plot(xs, means, label=series_name, linewidth=2, color=shade)
        # Skip the band when either bound has no finite value at all.
        band_available = np.isfinite(band_lo).any() and np.isfinite(band_hi).any()
        if band_available:
            plt.fill_between(xs, band_lo, band_hi, alpha=0.20, color=shade, linewidth=0)

    if baseline is not None:
        plt.axhline(baseline, color="black", linewidth=1.5)

    plt.xlabel("Homogeneity (p)")
    plt.ylabel(ylabel)
    plt.grid(alpha=0.25)
    plt.legend(
        title=legend_title,
        ncol=2,
        bbox_to_anchor=(1.02, 1),
        loc="upper left",
        frameon=False,
    )
    plt.tight_layout()
    if outfile:
        plt.savefig(outfile, dpi=300)
    plt.show()

# -----------------------
# Run
# -----------------------
df = pd.read_csv(csv_path)

# One entry per figure. Every per-family setting lives in this table so the
# summary CSV, the saved figure and the final "Saved figures" listing are all
# driven by the same values and cannot drift apart.
plot_specs = [
    {   # Plot 1A: Relative HR for A (baseline = 1.0)
        "name": "hrA",
        "value_cols": hr_cols_A,
        "family_name": "HR A",
        "summary_csv": "agg_relative_hr_A_by_(metric,N,p)_filtered.csv",
        "ylabel": "Relative Hit Ratio (A)",
        "label_map": label_map_hr_A,
        "baseline": 1.0,
        "outfile": "plot_relative_hr_A_single_solid_baseline.png",
        "legend_title": "A conditions, N",
    },
    {   # Plot 1B: Relative HR for B (baseline = 1.0)
        "name": "hrB",
        "value_cols": hr_cols_B,
        "family_name": "HR B",
        "summary_csv": "agg_relative_hr_B_by_(metric,N,p)_filtered.csv",
        "ylabel": "Relative Hit Ratio (B)",
        "label_map": label_map_hr_B,
        "baseline": 1.0,
        "outfile": "plot_relative_hr_B_single_solid_baseline.png",
        "legend_title": "B conditions, N",
    },
    {   # Plot 2A: CT combos for A (baseline = 0.0)
        "name": "ctA",
        "value_cols": ct_cols_A,
        "family_name": "CT A combos",
        "summary_csv": "agg_ct_A_selected_combos_by_(metric,N,p)_filtered.csv",
        "ylabel": "CT Score (A)",
        "label_map": label_map_ct_A,
        "baseline": 0.0,
        "outfile": "plot_ct_A_selected_combos_solid_baseline.png",
        "legend_title": "CT A combo, N",
    },
    {   # Plot 2B: CT combos for B (baseline = 0.0)
        "name": "ctB",
        "value_cols": ct_cols_B,
        "family_name": "CT B combos",
        "summary_csv": "agg_ct_B_selected_combos_by_(metric,N,p)_filtered.csv",
        "ylabel": "CT Score (B)",
        "label_map": label_map_ct_B,
        "baseline": 0.0,
        "outfile": "plot_ct_B_selected_combos_solid_baseline.png",
        "legend_title": "CT B combo, N",
    },
]

# Summarize -> write summary CSV -> plot, once per family, in the listed order.
summaries = {}
for spec in plot_specs:
    summary = summarize_across_trials(df, spec["value_cols"], family_name=spec["family_name"])
    summary.to_csv(spec["summary_csv"], index=False)
    plot_family(
        summary,
        ylabel=spec["ylabel"],
        label_map=spec["label_map"],
        baseline=spec["baseline"],
        outfile=spec["outfile"],
        legend_title=spec["legend_title"],
    )
    summaries[spec["name"]] = summary

# Keep the per-family names bound for any later cell that reads them.
hrA_summary = summaries["hrA"]
hrB_summary = summaries["hrB"]
ctA_summary = summaries["ctA"]
ctB_summary = summaries["ctB"]

print("Saved figures:")
for spec in plot_specs:
    print(" - " + spec["outfile"])




FileNotFoundError: [Errno 2] No such file or directory: 'rq2_results_relative_hr.csv'

In [None]:
# Condition to inspect
N_val = 20
p_val = 0.1

# Trial-level rows for that (N, p) pair, restricted to the g*/r* columns listed below
subset = df.loc[
    df["N"].eq(N_val) & df["p"].eq(p_val),
    [
        "N", "p", "trial",
        "gA_base", "gB_base",
        "gA_prom", "gA_demo",
        "gB_prom", "gB_demo",
        "rA_prom", "rB_prom",
        "rA_demo", "rB_demo",
    ],
]
subset

In [None]:
# All columns for the N=10, p=0.1 condition (rebinds `subset`).
subset = df.loc[df["N"].eq(10) & df["p"].eq(0.1)]
subset

In [None]:
# Average every non-key column over trials within each (N, p) group.
averaged_df = df.groupby(["N", "p"], as_index=False).mean()
# The row mask must come from averaged_df itself. A mask built from `df` is
# indexed by df's row labels, which pandas aligns by label against the grouped
# frame, so it would pick rows by their position in the raw data rather than
# by their (N, p) values.
averaged_df = averaged_df.loc[
    (averaged_df["N"] == N_val) & (averaged_df["p"] == p_val),
    # "trial" here is the mean of the trial ids; it is kept to preserve the column layout.
    ["N", "p", "trial", "gA_base", "gB_base", "gA_prom", "gA_demo", "gB_prom", "gB_demo", "rA_prom", "rB_prom", "rA_demo","rB_demo"]
]
averaged_df

In [None]:
# Display the raw results frame that was read from csv_path.
df