In [1]:
#!/usr/bin/env python3

import sys
import pandas as pd
import os, re
import math
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

def save_df(df: pd.DataFrame, name: str, directory: str = ".") -> None:
    # ensure directory exists
    os.makedirs(directory, exist_ok=True)

    # create full path
    filename = os.path.join(directory, f"{name}.csv")

    try:
        df.to_csv(filename, index=False)
        print(f"✅ Saved DataFrame to {filename}")
    except Exception as e:
        print(f"❌ Failed to save DataFrame: {e}")


def load_csv_to_df(csv_path: str) -> pd.DataFrame:
    """Load a CSV file into a pandas DataFrame."""
    try:
        df = pd.read_csv(csv_path)
        return df
    except FileNotFoundError:
        print(f"Error: File '{csv_path}' not found.")
        sys.exit(1)
    except pd.errors.EmptyDataError:
        print(f"Error: File '{csv_path}' is empty.")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)


def reshape_rubric(df):
    # rename rubric column
    df = df.rename(columns={df.columns[0]: "rubric"})

    # known model order cycling through triplets
    models = ["gemini", "grok", "chatgpt"]

    # extract graph types from non-"Unnamed" columns
    graph_cols = [c for c in df.columns if not re.match(r"Unnamed", c) and c != "rubric"]

    long_rows = []

    for graph in graph_cols:
        # find this graph's 3 column window (graph + next 2 cols)
        idx = df.columns.get_loc(graph)
        triplet = df.columns[idx:idx+3]  # assumes consistent structure

        for model, col in zip(models, triplet):
            temp = df[["rubric", col]].copy()
            temp["graph_type"] = graph
            temp["model"] = model
            temp = temp.rename(columns={col: "value"})
            long_rows.append(temp)

    tidy_df = pd.concat(long_rows, ignore_index=True)
    return tidy_df

def explode_prompt_types(df):
    prompt_types = ["baseline", "selfcheck", "standards"]

    verdict_map = {
        "y": "YES",
        "n": "NO",
        "-": "N/A"
    }

    long_rows = []

    for _, row in df.iterrows():
        val = str(row["value"])  # original value column

        if len(val) != 3:
            raise ValueError(f"Value '{val}' does not have 3 characters.")

        for char, pt in zip(val.lower(), prompt_types):  # lower-case normalize
            if char not in verdict_map:
                raise ValueError(f"Unexpected verdict character '{char}' in '{val}'")

            new_row = row.copy()
            new_row["verdict"] = verdict_map[char]
            new_row["prompt_type"] = pt
            new_row = new_row.drop(labels=["value"])  # remove old column
            long_rows.append(new_row)

    return pd.DataFrame(long_rows)


In [2]:
df = load_csv_to_df("LLM_Plotting_Rubric.csv")
print(df)

                               rubric    graph_type    model prompt_type  \
0     Code validity & reproducibility          Bars   gemini    baseline   
1     Code validity & reproducibility          Bars   gemini   selfcheck   
2     Code validity & reproducibility          Bars   gemini   standards   
3        Encoding choice matches task          Bars   gemini    baseline   
4        Encoding choice matches task          Bars   gemini   selfcheck   
...                               ...           ...      ...         ...   
1003            Task intent adherence  stacked bars  chatgpt   selfcheck   
1004            Task intent adherence  stacked bars  chatgpt   standards   
1005             Holistic readability  stacked bars  chatgpt    baseline   
1006             Holistic readability  stacked bars  chatgpt   selfcheck   
1007             Holistic readability  stacked bars  chatgpt   standards   

     verdict  
0        YES  
1        YES  
2        YES  
3        YES  
4        YES

In [3]:
save_df(df, "LLM_Plotting_Rubric")

✅ Saved DataFrame to ./LLM_Plotting_Rubric.csv


In [4]:
from scipy.stats import chi2_contingency

# Convert verdict to binary success for analysis (ignore N/A)
df_eval = df[df["verdict"] != "N/A"].copy()
df_eval["success"] = (df_eval["verdict"] == "YES").astype(int)

def print_section(title):
    print("\n" + "="*80)
    print(title)
    print("="*80)

# ---------------------------------------------------------------------------
# Overall success rate
# ---------------------------------------------------------------------------
overall_success = df_eval["success"].mean()
print_section("OVERALL SUCCESS RATE")
print(f"Overall success rate: {overall_success:.3f} ({overall_success*100:.1f}%)")

# ---------------------------------------------------------------------------
# Hardest/Easiest Rubrics
# ---------------------------------------------------------------------------
rubric_stats = df_eval.groupby("rubric")["success"].mean().sort_values()

print_section("HARDEST / EASIEST RUBRICS")
print("Bottom 5 (hardest):")
print(rubric_stats.head(5).to_string())
print("\nTop 5 (easiest):")
print(rubric_stats.tail(5).to_string())

# ---------------------------------------------------------------------------
# Hardest/Easiest Graph Types
# ---------------------------------------------------------------------------
graph_stats = df_eval.groupby("graph_type")["success"].mean().sort_values()

print_section("HARDEST / EASIEST GRAPH TYPES")
print("Hardest graph types:")
print(graph_stats.head().to_string())
print("\nEasiest graph types:")
print(graph_stats.tail().to_string())

# ---------------------------------------------------------------------------
# Best Performing Model
# ---------------------------------------------------------------------------
model_stats = df_eval.groupby("model")["success"].mean().sort_values()

print_section("BEST PERFORMING MODELS")
print(model_stats.to_string())

# ---------------------------------------------------------------------------
# Prompt Type Effects
# ---------------------------------------------------------------------------
prompt_stats = df_eval.groupby("prompt_type")["success"].mean().sort_values()

print_section("PROMPT TYPE EFFECTS")
print(prompt_stats.to_string())

# ---------------------------------------------------------------------------
# Chi-Square Test: Does prompt type matter?
# ---------------------------------------------------------------------------
cont_table = pd.crosstab(df_eval["prompt_type"], df_eval["success"])
chi2, p, dof, expected = chi2_contingency(cont_table)

print_section("STATISTICAL SIGNIFICANCE: PROMPT TYPE EFFECT")
print("Chi-square test on prompt_type vs success:")
print(f"Chi² = {chi2:.3f}, df = {dof}, p = {p:.5f}")

if p < 0.05:
    print("✅ Prompt type has a statistically significant effect on success.")
else:
    print("❌ No statistically significant evidence that prompt type affects success.")



OVERALL SUCCESS RATE
Overall success rate: 0.733 (73.3%)

HARDEST / EASIEST RUBRICS
Bottom 5 (hardest):
rubric
Axis integrity                     0.361111
Legend–encoding alignment          0.402778
Redundancy                         0.500000
Appropriate Data‑ink ratio         0.652778
Scale consistency across panels    0.694444

Top 5 (easiest):
rubric
Gridlines/ticks & sizing           0.805556
Data faithfulness                  0.847222
Code validity & reproducibility    0.972222
Encoding choice matches task       0.972222
Holistic readability               0.972222

HARDEST / EASIEST GRAPH TYPES
Hardest graph types:
graph_type
dual axis       0.539683
small multi     0.619048
heatmap         0.674603
line gaps       0.746032
stacked bars    0.777778

Easiest graph types:
graph_type
line gaps        0.746032
stacked bars     0.777778
scatter group    0.817460
Bars             0.833333
histogram        0.857143

BEST PERFORMING MODELS
model
chatgpt    0.720238
grok       0.720238
ge

In [5]:
# Compute success rate by (model, prompt_type)
pair_stats = (
    df_eval
    .groupby(["model", "prompt_type"])["success"]
    .agg(["mean", "count"])
    .sort_values(by="mean", ascending=False)
)

print("\nSuccess rate by model–prompt pair:")
print(pair_stats)

# Extract top-performing combination
best_pair = pair_stats.iloc[0]
best_model, best_prompt = pair_stats.index[0]

print("\nBest prompt–model pair:")
print(f"Model: {best_model}")
print(f"Prompt type: {best_prompt}")
print(f"Success rate: {best_pair['mean']:.3f} ({best_pair['mean']*100:.1f}%)")
print(f"Sample size: {best_pair['count']}")



Success rate by model–prompt pair:
                         mean  count
model   prompt_type                 
gemini  baseline     0.776786    112
        standards    0.776786    112
grok    standards    0.767857    112
        selfcheck    0.750000    112
chatgpt selfcheck    0.741071    112
gemini  selfcheck    0.723214    112
chatgpt standards    0.723214    112
        baseline     0.696429    112
grok    baseline     0.642857    112

Best prompt–model pair:
Model: gemini
Prompt type: baseline
Success rate: 0.777 (77.7%)
Sample size: 112.0


In [9]:

# --------------------------------------------------------------------------------------
# I/O
# --------------------------------------------------------------------------------------
LINTER_CSV    = "lint_summary.csv"
VIOLATIONS_CSV= "violations.csv"

OUT_TAB_DIR = "metrics"
OUT_FIG_DIR = "figs"
os.makedirs(OUT_TAB_DIR, exist_ok=True)
os.makedirs(OUT_FIG_DIR, exist_ok=True)

# --------------------------------------------------------------------------------------
# Load
# --------------------------------------------------------------------------------------
linter_df = pd.read_csv(LINTER_CSV)
viol_df   = pd.read_csv(VIOLATIONS_CSV)

# Minimal cleanup: normalize labels
def norm(s): return str(s).strip().lower().replace(" ", "").replace("-", "_")
linter_df["task"]      = linter_df["task"].astype(str)
linter_df["model"]     = linter_df["model"].astype(str)
linter_df["condition"] = linter_df["condition"].astype(str)
linter_df["rule"]      = linter_df["rule"].astype(str)
linter_df["status"]    = linter_df["status"].astype(str)

# --------------------------------------------------------------------------------------
# 1) “Gates” / objective heuristics from linter (partial, no s2)
#    G2 baseline_zero_bar; G3 no dual_axes; + useful heuristics (labels, legend, contrast)
# --------------------------------------------------------------------------------------

# Helper: status -> violation (1 if warn/fail, else 0)
VIOL_STATUSES = {"warn", "fail"}
def is_viol(row):
    return 1 if str(row["status"]).lower() in VIOL_STATUSES else 0

linter_df["viol"] = linter_df.apply(is_viol, axis=1)

# Gate-like rules we can quantify directly from linter
GATE_RULES = {
    "baseline_zero_bar": "Bars baseline at zero (or justified)",  # warn/fail => violation
    "dual_axes":         "No unjustified dual axes",              # fail => violation
}

# Useful non-gate heuristics for the paper
AUX_RULES = {
    "labels_present": "Title + X + Y labels present",
    "legend_call":    "Legend present when needed",
    "contrast_text":  "Text contrast >= threshold",
}

KEEP_RULES = list(GATE_RULES.keys()) + list(AUX_RULES.keys())

sub = linter_df[linter_df["rule"].isin(KEEP_RULES)].copy()

# Group to violation rate by (model, condition, rule)
viol_rate = (
    sub.groupby(["model", "condition", "rule"])["viol"]
       .mean()
       .reset_index()
       .rename(columns={"viol":"violation_rate"})
)

# Save table
viol_rate.sort_values(["rule","model","condition"]).to_csv(
    os.path.join(OUT_TAB_DIR, "violation_rate_by_model_condition_rule.csv"),
    index=False
)

# Also compute pass rate (1 - violation_rate)
pass_rate = viol_rate.copy()
pass_rate["pass_rate"] = 1 - pass_rate["violation_rate"]
pass_rate.to_csv(os.path.join(OUT_TAB_DIR, "pass_rate_by_model_condition_rule.csv"), index=False)

# Pivot for a heatmap: rule x (model,condition) → violation rate
heat_piv = viol_rate.pivot_table(index="rule", columns=["model","condition"], values="violation_rate")
heat_piv.to_csv(os.path.join(OUT_TAB_DIR, "violation_heatmap_table.csv"))

# --------------------------------------------------------------------------------------
# 2) Prompt-gain by condition (baseline→standards; standards→selfcheck) per rule and per model
# --------------------------------------------------------------------------------------
pg = viol_rate.pivot_table(index=["rule","model"], columns="condition", values="violation_rate")
for col in ("baseline","standards","selfcheck"):
    if col not in pg.columns: pg[col] = np.nan

pg["gain_baseline_to_standards"] = pg["baseline"] - pg["standards"]  # positive = fewer violations under standards
pg["gain_standards_to_selfcheck"] = pg["standards"] - pg["selfcheck"]
pg = pg.reset_index()
pg.to_csv(os.path.join(OUT_TAB_DIR, "prompt_gain_by_rule_model.csv"), index=False)

# Also aggregate across models for a rule-level view
pg_rule = (
    pg.groupby("rule")[["gain_baseline_to_standards","gain_standards_to_selfcheck"]]
      .mean()
      .sort_values("gain_baseline_to_standards", ascending=False)
      .reset_index()
)
pg_rule.to_csv(os.path.join(OUT_TAB_DIR, "prompt_gain_by_rule_mean.csv"), index=False)

# --------------------------------------------------------------------------------------
# 3) Contrast distributions (WCAG-like) by model and by condition
# --------------------------------------------------------------------------------------
contrast = linter_df[linter_df["rule"]=="contrast_text"].copy()
# Keep only rows with numeric ratio
contrast = contrast[pd.to_numeric(contrast["ratio"], errors="coerce").notna()]
contrast["ratio"] = contrast["ratio"].astype(float)

# Table: % below thresholds by (model, condition)
def below_thresh_rate(group, thr):
    return (group["ratio"] < thr).mean()

rows = []
for (m, c), grp in contrast.groupby(["model","condition"]):
    rows.append({"model": m, "condition": c,
                 "pct_below_4p5": below_thresh_rate(grp, 4.5),
                 "pct_below_3p0": below_thresh_rate(grp, 3.0),
                 "n": len(grp)})
contrast_summary = pd.DataFrame(rows).sort_values(["model","condition"])
contrast_summary.to_csv(os.path.join(OUT_TAB_DIR, "contrast_summary.csv"), index=False)

# Figure: boxplot of contrast ratios by model (collapsing conditions) and by condition (collapsing models)
def boxplot_grouped(df, group_col, value_col, title, outfile):
    order = sorted(df[group_col].unique())
    data = [df[df[group_col]==g][value_col].values for g in order]
    fig, ax = plt.subplots(figsize=(7,4))
    ax.boxplot(data, tick_labels=order, showmeans=True)
    ax.axhline(4.5, linestyle="--")  # text threshold (reference)
    ax.axhline(3.0, linestyle=":")   # non-text threshold (reference)
    ax.set_ylabel("Contrast ratio")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(outfile, dpi=200)
    plt.close(fig)

boxplot_grouped(
    contrast, "model", "ratio",
    "Contrast ratio by model (all conditions)",
    os.path.join(OUT_FIG_DIR, "contrast_by_model.png")
)
boxplot_grouped(
    contrast, "condition", "ratio",
    "Contrast ratio by prompt condition (all models)",
    os.path.join(OUT_FIG_DIR, "contrast_by_condition.png")
)
# Pretty boxplot of contrast ratios by model with WCAG legend
# Expects a DataFrame `contrast` with columns: model, condition, ratio (float)

def pretty_contrast_boxplot_by_model(df: pd.DataFrame,
                                     value_col: str = "ratio",
                                     group_col: str = "model",
                                     title: str = "Contrast ratio by model (all conditions)",
                                     outfile: str = "reports/figs/contrast_by_model_pretty.png",
                                     sort_desc: bool = True,  # sort by median (desc = best on the right)
                                     display_name_map=None
                                     ):
    # Order groups alphabetically (or switch to median/mean order if preferred)
    # --- order models by median contrast ---
    med = df.groupby(group_col)[value_col].median().sort_values(ascending=not sort_desc)
    order = med.index.tolist()

    # pretty names only for display
    if display_name_map is None: 
        display_name_map = {}
    labels = [display_name_map.get(g, g) for g in order]

    # --- gather data and sample sizes in the chosen order ---
    data = [df[df[group_col] == g][value_col].dropna().values for g in order]
    ns   = [len(x) for x in data]

    fig, ax = plt.subplots(figsize=(9.5, 4.8))

    # --- styling props so legend matches what we draw ---
    meanprops   = dict(marker="D", markersize=6, markerfacecolor="tab:blue",
                       markeredgecolor="white")
    medianprops = dict(color="tab:orange", linewidth=2)
    boxprops    = dict(linewidth=1.2)
    whiskerprops= dict(linewidth=1.0)
    capprops    = dict(linewidth=1.0)
    flierprops  = dict(marker="o", markersize=4, markerfacecolor="none",
                       markeredgecolor="0.4", alpha=0.7)

    bp = ax.boxplot(
        data,
        tick_labels=labels,
        vert=True,
        showmeans=True, # show a mean marker per box
        meanline=False, # use a marker rather than a line
        meanprops=meanprops,
        medianprops=medianprops,
        boxprops=boxprops,
        whiskerprops=whiskerprops,
        capprops=capprops,
        flierprops=flierprops,
        manage_ticks=True
    )

    # Axis, grid, and limits: start at 0 for interpretability wrt thresholds
    ymax = max([x.max() if len(x) else 0 for x in data] + [5.0]) * 1.15
    ax.set_ylim(0, ymax)                          # start y at 0
    ax.yaxis.grid(True, linestyle=":", linewidth=0.8, alpha=0.8)
    ax.set_ylabel("Contrast ratio")
    ax.set_title(title, pad=10)

    # WCAG reference lines
    line_text    = ax.axhline(4.5, linestyle="--", linewidth=1.4, color="tab:blue")
    line_nontext = ax.axhline(3.0, linestyle=":",  linewidth=1.4, color="tab:blue")

    # Sample size under each tick
    y0, y1 = ax.get_ylim()
    for xtick, n in zip(ax.get_xticks(), ns):
        ax.text(xtick, y0 + 0.02*(y1 - y0), f"n={n}", ha="center", va="bottom", fontsize=8)

    # Legend outside: thresholds + mean/median glyphs
    legend_handles = [
        Line2D([0], [0], linestyle="--", color="tab:blue", linewidth=1.4, label="WCAG text 4.5:1"),
        Line2D([0], [0], linestyle=":",  color="tab:blue", linewidth=1.4, label="WCAG non-text 3:1"),
        Line2D([0], [0], marker="D", color="w", markerfacecolor="tab:blue",
               markeredgecolor="white", markersize=6, linestyle="None", label="Mean"),
        Line2D([0], [0], color="tab:orange", linewidth=2, label="Median"), # median line is drawn inside the box
    ]
    ax.legend(handles=legend_handles, loc="upper left", bbox_to_anchor=(1.02, 1),
              borderaxespad=0., frameon=False)

    fig.tight_layout()
    fig.savefig(outfile, dpi=300, bbox_inches="tight")  # tight to include outside legend
    plt.close(fig)

pretty_contrast_boxplot_by_model(
    contrast,
    outfile=os.path.join(OUT_FIG_DIR, "contrast_by_model_pretty.png"),
    display_name_map={"gemini25pro": "gemini2.5pro"}
)


# --------------------------------------------------------------------------------------
# 4) Rule × task heatmap: where violations concentrate (supports RQ2)
# --------------------------------------------------------------------------------------
viol_task = (
    sub.groupby(["task","rule"])["viol"]
       .mean()
       .reset_index()
       .rename(columns={"viol":"violation_rate"})
)
# Save table
viol_task.to_csv(os.path.join(OUT_TAB_DIR, "violation_rate_by_task_rule.csv"), index=False)

# Heatmap figure (rule x task)
hm = viol_task.pivot_table(index="rule", columns="task", values="violation_rate")
# Simple imshow heatmap
fig, ax = plt.subplots(figsize=(8, 5))
im = ax.imshow(hm.values, aspect="auto")
ax.set_yticks(range(hm.shape[0])); ax.set_yticklabels(hm.index)
ax.set_xticks(range(hm.shape[1])); ax.set_xticklabels(hm.columns, rotation=45, ha="right")
ax.set_title("Violation rates by rule × task (lower is better)")
fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
fig.tight_layout()
fig.savefig(os.path.join(OUT_FIG_DIR, "violation_heatmap_rule_by_task.png"), dpi=200)
plt.close(fig)

# --------------------------------------------------------------------------------------
# 5) Gate-like pass rates by model (bar chart)
# --------------------------------------------------------------------------------------
gate_like = sub[sub["rule"].isin(GATE_RULES.keys())].copy()
gate_like["pass"] = 1 - gate_like["viol"]
gate_model = gate_like.groupby(["model","rule"])["pass"].mean().reset_index()

# Table
gate_model_piv = gate_model.pivot_table(index="rule", columns="model", values="pass")
gate_model_piv.to_csv(os.path.join(OUT_TAB_DIR, "gate_like_pass_rate_by_model.csv"))

# Figure: bars per rule (grouped by model)
rules_order = list(gate_model["rule"].unique())
models_order = sorted(gate_model["model"].unique())
x = np.arange(len(rules_order))
width = 0.8 / max(1, len(models_order))

fig, ax = plt.subplots(figsize=(7,4))
for i, m in enumerate(models_order):
    vals = [gate_model[(gate_model["rule"]==r) & (gate_model["model"]==m)]["pass"].mean() for r in rules_order]
    ax.bar(x + i*width, vals, width, label=m)
ax.set_xticks(x + width*(len(models_order)-1)/2)
ax.set_xticklabels(rules_order, rotation=30, ha="right")
ax.set_ylim(0,1)
ax.set_ylabel("Pass rate")
ax.set_title("Gate-like pass rates by model")
ax.legend()
fig.tight_layout()
fig.savefig(os.path.join(OUT_FIG_DIR, "gate_like_pass_by_model.png"), dpi=200)
plt.close(fig)

# --------------------------------------------------------------------------------------
# 6) Top violations (from violations.csv) — counts and normalized rates
# --------------------------------------------------------------------------------------
# Raw counts already in violations.csv
viol_df.to_csv(os.path.join(OUT_TAB_DIR, "top_violations_raw_counts.csv"), index=False)

# Normalize by opportunity: approximate denominator = number of trials where rule was applicable
# We estimate applicability as: count of trials that have that rule at all in linter_df
denom = (linter_df.groupby(["task","model","condition","rule"])["trial_id"]
                 .nunique()
                 .groupby(["rule"])
                 .sum()
                 .rename("applicable_trials")
                 .reset_index())

viol_norm = (viol_df.groupby("rule")["violations"].sum().reset_index()
             .merge(denom, on="rule", how="left"))
viol_norm["violations_per_100_trials"] = 100 * viol_norm["violations"] / viol_norm["applicable_trials"].clip(lower=1)
viol_norm.sort_values("violations_per_100_trials", ascending=False, inplace=True)
viol_norm.to_csv(os.path.join(OUT_TAB_DIR, "top_violations_normalized.csv"), index=False)

# Simple bar plot of normalized top-10
top10 = viol_norm.head(10)
fig, ax = plt.subplots(figsize=(7,4))
ax.barh(top10["rule"][::-1], top10["violations_per_100_trials"][::-1])
ax.set_xlabel("Violations per 100 applicable trials")
ax.set_title("Top violation rules (normalized)")
fig.tight_layout()
fig.savefig(os.path.join(OUT_FIG_DIR, "top_violations_normalized.png"), dpi=200)
plt.close(fig)

# --------------------------------------------------------------------------------------
# 7) Summaries for manuscript tables
# --------------------------------------------------------------------------------------

# (a) Model × condition overall pass rate on selected rules
sel_rules = ["baseline_zero_bar","dual_axes","labels_present","legend_call","contrast_text"]
overall = (
    sub[sub["rule"].isin(sel_rules)]
    .assign(pass_flag=lambda d: 1 - d["viol"])
    .groupby(["model","condition"])["pass_flag"]
    .mean()
    .reset_index()
    .rename(columns={"pass_flag":"overall_pass_rate_on_selected_rules"})
)
overall.to_csv(os.path.join(OUT_TAB_DIR, "overall_selected_pass_by_model_condition.csv"), index=False)

# (b) Prompt-gain table (mean across models) for selected rules
pg_sel = pg[pg["rule"].isin(sel_rules)].copy()
pg_sel_mean = (
    pg_sel.groupby("rule")[["gain_baseline_to_standards","gain_standards_to_selfcheck"]]
          .mean()
          .reset_index()
          .sort_values("gain_baseline_to_standards", ascending=False)
)
pg_sel_mean.to_csv(os.path.join(OUT_TAB_DIR, "prompt_gain_selected_rules_mean.csv"), index=False)

print("✅ Wrote tables to:", OUT_TAB_DIR)
print("✅ Wrote figures to:", OUT_FIG_DIR)


✅ Wrote tables to: metrics
✅ Wrote figures to: figs
