# Evaluating SimplePromptPipeline

In [None]:
import os
import sys
import json
import pandas as pd
from datetime import datetime

# Make the project importable from this notebook: the project root is the
# parent of the current working directory (the directory that contains "src").
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    # Prepend so the project's packages are found before anything else on the path.
    sys.path.insert(0, project_root)

# --- Imports from your project ---
from src.pipelines.SimplePromptPipeline import SimplePromptPipeline
from src.app.Evaluator import Evaluator
from src.llms.LLM_Wrappers import AbstractLLM

# --- Define Models to Evaluate ---
# Maps the display label used in the result tables and output file names to the
# LLM wrapper built from the corresponding model identifier.
models = {
    label: AbstractLLM.from_name(model_name=identifier)
    for label, identifier in (
        ("GPT-3", "gpt-3.5-turbo"),
        ("GPT-4o-mini", "gpt-4o-mini"),
        ("GPT-4o", "gpt-4o"),
    )
}

# --- Files to Evaluate ---
# Each entry is used both as the pipeline input and as the evaluation reference.
# Uncomment further questions to include them in the run.
ground_truth_files = [
    "../src/data/Q17_Annotated_Responses.json",
    # "../src/data/Q19_Annotated_Responses.json",
    # "../src/data/Q20_Annotated_Responses.json",
    # "../src/data/Q21_Annotated_Responses.json",
    # "../src/data/Q22_Annotated_Responses.json",
]

# --- Output and Results Directories ---
# output_dir receives the pipeline outputs; results_dir receives the CSV summaries.
output_dir = "../outputs"
results_dir = os.path.join("../analysis", "results")
for _directory in (output_dir, results_dir):
    # exist_ok=True keeps re-running the cell harmless.
    os.makedirs(_directory, exist_ok=True)

# --- Parameters ---
# Passed to the evaluator as its `min_confidence` argument.
min_conf = 0.7
# Filled with one metrics dict per (model, question) pair by the evaluation loop.
records = []

# --- Run pipelines + evaluate ---
# For every (model, ground-truth file) pair: run the pipeline, give its output a
# model-specific file name, evaluate that file against the ground truth and
# append the global precision / recall / F1 to `records`.
for model_name, llm in models.items():
    print(f"\n=== Running {model_name} ===")
    # File-name-safe form of the model label; constant for the whole inner loop.
    model_suffix = model_name.replace(" ", "_").replace("/", "-")

    for gt_path in ground_truth_files:
        base_name = os.path.splitext(os.path.basename(gt_path))[0]
        qname = os.path.basename(gt_path).replace("_Annotated_Responses.json", "")
        print(f" → Processing {qname}...")

        # Run pipeline. Its return value is not relied upon; the output file is
        # located on disk below.
        pipeline = SimplePromptPipeline(llm=llm, input_path=gt_path, output_dir=output_dir)
        pipeline.run()

        # Model-specific target name, so outputs of different models do not collide.
        generic_output = os.path.join(output_dir, f"{base_name}_annotated.json")
        output_path = os.path.join(output_dir, f"{base_name}_{model_suffix}_annotated.json")

        # NOTE(review): assumes the pipeline writes "<base_name>_annotated.json"
        # into output_dir, as the original rename fallback implied - confirm.
        # The freshly written generic file always wins: previously a model-specific
        # file left over from an earlier run suppressed the rename, so the stale
        # file was evaluated instead of the new output. os.replace overwrites an
        # existing destination on every platform (os.rename fails on Windows).
        if os.path.exists(generic_output):
            os.replace(generic_output, output_path)
        elif not os.path.exists(output_path):
            # Neither the generic nor the model-specific file exists.
            raise FileNotFoundError(f"Expected output file not found: {output_path}")

        # Evaluate results
        evaluator = Evaluator(output_path, gt_path)
        results = evaluator.evaluate_precision_recall(min_confidence=min_conf)
        global_metrics = results["global"]

        records.append({
            "Model": model_name,
            "Question": qname,
            "Precision": global_metrics["precision"],
            "Recall": global_metrics["recall"],
            "F1-Score": global_metrics["f1-score"],
        })

# --- Build DataFrames ---
# One row per (model, question) pair collected by the evaluation loop.
df = pd.DataFrame(records)

# Mean of every metric per model, flattened back to ordinary columns and
# ordered by model name.
_per_model_means = df.groupby("Model")[["Precision", "Recall", "F1-Score"]].mean()
summary_df = _per_model_means.reset_index().sort_values("Model")

# --- Save Results ---
# Both tables are written without the index column.
for _frame, _csv_name in (
    (df, "per_question_results.csv"),
    (summary_df, "summary_results.csv"),
):
    _frame.to_csv(os.path.join(results_dir, _csv_name), index=False)

In [9]:
# --- Display Results (with graceful fallback if rich rendering is unavailable) ---
def _show_metrics(frame):
    """Show *frame* with Precision, Recall and F1-Score formatted to three decimals.

    A styled rich display is attempted first; if that is not possible the
    frame is printed as plain text (without the index) using the same format.
    """
    three_decimals = "{:.3f}"
    metric_columns = ("Precision", "Recall", "F1-Score")
    try:
        display(frame.style.format({column: three_decimals for column in metric_columns}))
    except (ImportError, AttributeError, NameError):
        # ImportError: pandas raises it from `.style` when jinja2 is not installed
        #              (the case the fallback was written for, previously uncaught).
        # NameError:   `display` is only defined inside IPython / Jupyter.
        # AttributeError: kept from the original fallback.
        print(frame.to_string(
            index=False,
            formatters={column: three_decimals.format for column in metric_columns},
        ))


print("\n\n=== 📊 Per-Question Results ===")
_show_metrics(df)

print("\n\n=== 🧮 Average Precision/Recall per Model ===")
_show_metrics(summary_df)



=== 📊 Per-Question Results ===


Unnamed: 0,Model,Question,Precision,Recall,F1-Score
0,GPT-3,Q17,0.362,0.602,0.452
1,GPT-4o-mini,Q17,0.417,0.759,0.538
2,GPT-4o,Q17,0.457,0.759,0.57




=== 🧮 Average Precision/Recall per Model ===


Unnamed: 0,Model,Precision,Recall,F1-Score
0,GPT-3,0.362,0.602,0.452
1,GPT-4o,0.457,0.759,0.57
2,GPT-4o-mini,0.417,0.759,0.538
