In [None]:
print("hello world")

In [None]:
#!/usr/bin/env python3

import sys
import pandas as pd
import os, re

def save_df(df: pd.DataFrame, name: str, directory: str = ".") -> None:
    # ensure directory exists
    os.makedirs(directory, exist_ok=True)

    # create full path
    filename = os.path.join(directory, f"{name}.csv")

    try:
        df.to_csv(filename, index=False)
        print(f"✅ Saved DataFrame to {filename}")
    except Exception as e:
        print(f"❌ Failed to save DataFrame: {e}")


def load_csv_to_df(csv_path: str) -> pd.DataFrame:
    """Load a CSV file into a pandas DataFrame."""
    try:
        df = pd.read_csv(csv_path)
        return df
    except FileNotFoundError:
        print(f"Error: File '{csv_path}' not found.")
        sys.exit(1)
    except pd.errors.EmptyDataError:
        print(f"Error: File '{csv_path}' is empty.")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)


def reshape_rubric(df):
    # rename rubric column
    df = df.rename(columns={df.columns[0]: "rubric"})

    # known model order cycling through triplets
    models = ["gemini", "grok", "chatgpt"]

    # extract graph types from non-"Unnamed" columns
    graph_cols = [c for c in df.columns if not re.match(r"Unnamed", c) and c != "rubric"]

    long_rows = []

    for graph in graph_cols:
        # find this graph's 3 column window (graph + next 2 cols)
        idx = df.columns.get_loc(graph)
        triplet = df.columns[idx:idx+3]  # assumes consistent structure

        for model, col in zip(models, triplet):
            temp = df[["rubric", col]].copy()
            temp["graph_type"] = graph
            temp["model"] = model
            temp = temp.rename(columns={col: "value"})
            long_rows.append(temp)

    tidy_df = pd.concat(long_rows, ignore_index=True)
    return tidy_df

def explode_prompt_types(df):
    prompt_types = ["baseline", "selfcheck", "standards"]

    verdict_map = {
        "y": "YES",
        "n": "NO",
        "-": "N/A"
    }

    long_rows = []

    for _, row in df.iterrows():
        val = str(row["value"])  # original value column

        if len(val) != 3:
            raise ValueError(f"Value '{val}' does not have 3 characters.")

        for char, pt in zip(val.lower(), prompt_types):  # lower-case normalize
            if char not in verdict_map:
                raise ValueError(f"Unexpected verdict character '{char}' in '{val}'")

            new_row = row.copy()
            new_row["verdict"] = verdict_map[char]
            new_row["prompt_type"] = pt
            new_row = new_row.drop(labels=["value"])  # remove old column
            long_rows.append(new_row)

    return pd.DataFrame(long_rows)


In [None]:
df = load_csv_to_df("LLM_Plotting_Rubric.csv")
print(df)

                               rubric    graph_type    model prompt_type  \
0     Code validity & reproducibility          Bars   gemini    baseline   
1     Code validity & reproducibility          Bars   gemini   selfcheck   
2     Code validity & reproducibility          Bars   gemini   standards   
3        Encoding choice matches task          Bars   gemini    baseline   
4        Encoding choice matches task          Bars   gemini   selfcheck   
...                               ...           ...      ...         ...   
1075            Task intent adherence  stacked bars  chatgpt   selfcheck   
1076            Task intent adherence  stacked bars  chatgpt   standards   
1077             Holistic readability  stacked bars  chatgpt    baseline   
1078             Holistic readability  stacked bars  chatgpt   selfcheck   
1079             Holistic readability  stacked bars  chatgpt   standards   

     verdict  
0        YES  
1        YES  
2        YES  
3        YES  
4        YES

In [25]:
save_df(df, "LLM_Plotting_Rubric")

✅ Saved DataFrame to ./LLM_Plotting_Rubric.csv


In [27]:
import pandas as pd
from scipy.stats import chi2_contingency

# Convert verdict to binary success for analysis (ignore N/A)
df_eval = df[df["verdict"] != "N/A"].copy()
df_eval["success"] = (df_eval["verdict"] == "YES").astype(int)

def print_section(title):
    print("\n" + "="*80)
    print(title)
    print("="*80)

# ---------------------------------------------------------------------------
# Overall success rate
# ---------------------------------------------------------------------------
overall_success = df_eval["success"].mean()
print_section("OVERALL SUCCESS RATE")
print(f"Overall success rate: {overall_success:.3f} ({overall_success*100:.1f}%)")

# ---------------------------------------------------------------------------
# Hardest/Easiest Rubrics
# ---------------------------------------------------------------------------
rubric_stats = df_eval.groupby("rubric")["success"].mean().sort_values()

print_section("HARDEST / EASIEST RUBRICS")
print("Bottom 5 (hardest):")
print(rubric_stats.head(5).to_string())
print("\nTop 5 (easiest):")
print(rubric_stats.tail(5).to_string())

# ---------------------------------------------------------------------------
# Hardest/Easiest Graph Types
# ---------------------------------------------------------------------------
graph_stats = df_eval.groupby("graph_type")["success"].mean().sort_values()

print_section("HARDEST / EASIEST GRAPH TYPES")
print("Hardest graph types:")
print(graph_stats.head().to_string())
print("\nEasiest graph types:")
print(graph_stats.tail().to_string())

# ---------------------------------------------------------------------------
# Best Performing Model
# ---------------------------------------------------------------------------
model_stats = df_eval.groupby("model")["success"].mean().sort_values()

print_section("BEST PERFORMING MODELS")
print(model_stats.to_string())

# ---------------------------------------------------------------------------
# Prompt Type Effects
# ---------------------------------------------------------------------------
prompt_stats = df_eval.groupby("prompt_type")["success"].mean().sort_values()

print_section("PROMPT TYPE EFFECTS")
print(prompt_stats.to_string())

# ---------------------------------------------------------------------------
# Chi-Square Test: Does prompt type matter?
# ---------------------------------------------------------------------------
cont_table = pd.crosstab(df_eval["prompt_type"], df_eval["success"])
chi2, p, dof, expected = chi2_contingency(cont_table)

print_section("STATISTICAL SIGNIFICANCE: PROMPT TYPE EFFECT")
print("Chi-square test on prompt_type vs success:")
print(f"Chi² = {chi2:.3f}, df = {dof}, p = {p:.5f}")

if p < 0.05:
    print("✅ Prompt type has a statistically significant effect on success.")
else:
    print("❌ No statistically significant evidence that prompt type affects success.")



OVERALL SUCCESS RATE
Overall success rate: 0.684 (68.4%)

HARDEST / EASIEST RUBRICS
Bottom 5 (hardest):
rubric
Uncertainty depiction (when applicable)    0.000000
Axis integrity                             0.361111
Legend–encoding alignment                  0.402778
Redundancy                                 0.500000
Appropriate Data‑ink ratio                 0.652778

Top 5 (easiest):
rubric
Gridlines/ticks & sizing           0.805556
Data faithfulness                  0.847222
Encoding choice matches task       0.972222
Code validity & reproducibility    0.972222
Holistic readability               0.972222

HARDEST / EASIEST GRAPH TYPES
Hardest graph types:
graph_type
dual axis       0.503704
small multi     0.577778
heatmap         0.629630
line gaps       0.696296
stacked bars    0.725926

Easiest graph types:
graph_type
line gaps        0.696296
stacked bars     0.725926
scatter group    0.762963
Bars             0.777778
histogram        0.800000

BEST PERFORMING MODELS
model
ch

In [28]:
# Compute success rate by (model, prompt_type)
pair_stats = (
    df_eval
    .groupby(["model", "prompt_type"])["success"]
    .agg(["mean", "count"])
    .sort_values(by="mean", ascending=False)
)

print("\nSuccess rate by model–prompt pair:")
print(pair_stats)

# Extract top-performing combination
best_pair = pair_stats.iloc[0]
best_model, best_prompt = pair_stats.index[0]

print("\nBest prompt–model pair:")
print(f"Model: {best_model}")
print(f"Prompt type: {best_prompt}")
print(f"Success rate: {best_pair['mean']:.3f} ({best_pair['mean']*100:.1f}%)")
print(f"Sample size: {best_pair['count']}")



Success rate by model–prompt pair:
                         mean  count
model   prompt_type                 
gemini  baseline     0.725000    120
        standards    0.725000    120
grok    standards    0.716667    120
        selfcheck    0.700000    120
chatgpt selfcheck    0.691667    120
gemini  selfcheck    0.675000    120
chatgpt standards    0.675000    120
        baseline     0.650000    120
grok    baseline     0.600000    120

Best prompt–model pair:
Model: gemini
Prompt type: baseline
Success rate: 0.725 (72.5%)
Sample size: 120.0
