# Evaluating SimplePromptPipeline

In [7]:
import os
import sys
import json
import pandas as pd
from datetime import datetime

# Add project root (the directory that contains "src")
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# --- Imports from your project ---
from src.pipelines.SimplePromptPipeline import SimplePromptPipeline
from src.app.Evaluator import Evaluator
from src.llms.LLM_Wrappers import AbstractLLM

# --- Define Models to Evaluate ---
# Assuming AbstractLLM is subclassed for OpenAI-like models
# If you have specific wrappers like GPT3LLM, replace AbstractLLM() calls accordingly
models = {
    "GPT-3": AbstractLLM.from_name("gpt-3.5-turbo"),
    "GPT-4o-mini": AbstractLLM.from_name(model_name="gpt-4o-mini"),
    "GPT-4o": AbstractLLM.from_name(model_name="gpt-4o"),
}

# --- Files to Evaluate ---
ground_truth_files = [
    "../src/data/Q17_Annotated_Responses.json",
    # "../src/data/Q19_Annotated_Responses.json",
    # "../src/data/Q20_Annotated_Responses.json",
    # "../src/data/Q21_Annotated_Responses.json",
    # "../src/data/Q22_Annotated_Responses.json",
]

# --- Output directory ---
output_dir = "../outputs"
os.makedirs(output_dir, exist_ok=True)

# --- Parameters ---
min_conf = 0.7
records = []  # will hold all evaluation rows

# --- Run pipelines + evaluate ---
for model_name, llm in models.items():
    print(f"\n=== Running {model_name} ===")
    for gt_path in ground_truth_files:
        qname = os.path.basename(gt_path).replace("_Annotated_Responses.json", "")
        print(f" â†’ Processing {qname}...")

        # Run pipeline
        pipeline = SimplePromptPipeline(llm=llm, input_path=gt_path, output_dir=output_dir)
        # Run the pipeline (sometimes returns None even if file is written)
        maybe_path = pipeline.run()

        # Robustly determine the actual output path
        if maybe_path and os.path.exists(maybe_path):
            output_path = maybe_path
        else:
            # Reconstruct expected path from AbstractTAPipeline logic
            base_name = os.path.splitext(os.path.basename(gt_path))[0]
            output_path = os.path.join(output_dir, f"{base_name}_annotated.json")
            if not os.path.exists(output_path):
                raise FileNotFoundError(f"Expected output file not found: {output_path}")

        # Evaluate results
        evaluator = Evaluator(output_path, gt_path)

        results = evaluator.evaluate_precision_recall(min_confidence=min_conf)
        global_metrics = results["global"]

        records.append({
            "Model": model_name,
            "Question": qname,
            "Precision": global_metrics["precision"],
            "Recall": global_metrics["recall"]
        })

# --- Build DataFrames ---
df = pd.DataFrame(records)

# Compute average metrics across all questions per model
summary_df = (
    df.groupby("Model")[["Precision", "Recall"]]
      .mean()
      .reset_index()
      .sort_values("Model")
)

print("\n\n=== ðŸ“Š Per-Question Results ===")
display(df.style.format({"Precision": "{:.3f}", "Recall": "{:.3f}"}))

print("\n\n=== ðŸ§® Average Precision/Recall per Model ===")
display(summary_df.style.format({"Precision": "{:.3f}", "Recall": "{:.3f}"}))



=== Running GPT-3 ===
 â†’ Processing Q17...


Annotating entries: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 104/104 [01:29<00:00,  1.17entry/s, Last: 1.32s]


Annotated JSON written to ../outputs/Q17_Annotated_Responses_annotated.json
Logs saved to logs/Q17_Annotated_Responses_20251103_101049.log

=== Running GPT-4o-mini ===
 â†’ Processing Q17...


Annotating entries: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 104/104 [02:26<00:00,  1.41s/entry, Last: 1.64s]


Annotated JSON written to ../outputs/Q17_Annotated_Responses_annotated.json
Logs saved to logs/Q17_Annotated_Responses_20251103_101218.log

=== Running GPT-4o ===
 â†’ Processing Q17...


Annotating entries: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 104/104 [01:57<00:00,  1.13s/entry, Last: 1.78s]

Annotated JSON written to ../outputs/Q17_Annotated_Responses_annotated.json
Logs saved to logs/Q17_Annotated_Responses_20251103_101444.log


=== ðŸ“Š Per-Question Results ===





AttributeError: The '.style' accessor requires jinja2