In [None]:
import os
# Run the notebook from the project root so that the relative paths used by the
# other cells (`data/...`, `../../datasets/...`) resolve.
# The location can be overridden through the P04_WORKING_DIR environment
# variable; it has to be present in the process environment when this cell
# runs, because the .env file is only loaded later, after the chdir.
# The fallback is the original hard-coded checkout location.
working_dir = os.environ.get(
    "P04_WORKING_DIR",
    "/home/gpinon/more_europa/clean_rdc_experiments/projects/P04_official_reg_db_creation",
)
os.chdir(working_dir)
print(f"Changed working directory to {working_dir}")
import logging
import time
import pandas as pd
import json
from pathlib import Path
from dotenv import load_dotenv

from src.p04_official_reg_db_creation import config
import llm_backends
from llm_backends.cache import DiskCacheStorage
from llm_backends.mistral import dummy_config
# NOTE(review): the import below binds the same name `dummy_config`, so it
# shadows the mistral one imported on the previous line. If both are needed,
# alias them (e.g. `as mistral_dummy_config` / `as openai_dummy_config`).
from llm_backends.openai import dummy_config

In [None]:
# --- Run configuration ---------------------------------------------------
# Dataset flavour for this run; the alternative value is "test".
# Presumably used as a key into RAW_PUBLICATIONS_DICT — confirm against the
# cells that consume it.
DATASET_TYPE = "eval"

# Dataset flavour -> path of the raw publications JSONL file (relative to the
# working directory).
RAW_PUBLICATIONS_DICT = dict(
    eval="../../datasets/001_publications_dataset/publications_dataset.jsonl",
    test="../../datasets/001_publications_dataset/prod_publication_test_dataset.jsonl",
)

# Field and model identifiers; both are interpolated into the input/output
# paths built in the next cell.
FIELD = "registry_name"
MODEL = "small_mistral"

# --- Credentials ---------------------------------------------------------
# Pull variables from the .env file into the environment, then read the API
# keys. Either key is None when the corresponding variable is not set.
load_dotenv()
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [None]:
# Input: JSON file with the comparison records for the configured model/field.
extraction_results = (
    f"data/from_notebooks/NW01/R02_comparison/{MODEL}/{FIELD}/compare_{FIELD}.json"
)

# Output: JSON file the performance metrics are written to.
output_json = (
    f"data/from_notebooks/NW01/R03_evaluation/{MODEL}/{FIELD}/perf_metrics.json"
)

In [None]:
def format_percentage(value):
    """Render a fraction as a percentage string with one decimal place.

    Args:
        value: Fraction to render (e.g. 0.5 for fifty percent).

    Returns:
        The value multiplied by 100, formatted with one decimal and a
        trailing percent sign, e.g. "50.0%".
    """
    scaled = value * 100
    return f"{scaled:.1f}%"

In [None]:
# Ensure output directory exists
out_dir = Path(output_json).parent
out_dir.mkdir(parents=True, exist_ok=True)

# Load extraction results
with open(extraction_results, "r", encoding="utf-8") as f:
    results = json.load(f)

print(f"Loaded {len(results)} records from {extraction_results}")
print(f"Computing metrics for field '{FIELD}' and model '{MODEL}'")

# Convert to DataFrame for easier analysis
df = pd.DataFrame(results)

# The metrics below read these two columns. An empty results file yields a
# frame without any columns, which used to raise KeyError before the
# `total_samples > 0` guard could apply; give the empty frame the expected
# (empty) columns instead. Non-empty records that lack a column are a real
# data problem, so fail with an explicit message.
required_columns = ("final_label", "labeling_reason")
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    if len(df) > 0:
        raise KeyError(
            f"{extraction_results} is missing required column(s): {missing_columns}"
        )
    df = df.reindex(columns=[*df.columns, *missing_columns])

# Calculate overall metrics
total_samples = len(df)
# Summing final_label counts the correct extractions (the breakdown cell treats
# final_label == 1 as correct); the sum of an empty column is 0.
correct_extractions = int(df["final_label"].sum())
incorrect_extractions = int(total_samples - correct_extractions)
# Guard against division by zero when there are no records.
accuracy = correct_extractions / total_samples if total_samples > 0 else 0

In [None]:
# Count records for each (labeling_reason, final_label) pair.
# NOTE(review): groupby excludes rows whose key is null by default, so records
# with a missing labeling_reason would not show up in the breakdown — confirm
# the comparison step always fills it in.
reason_groups = (
    df.groupby(["labeling_reason", "final_label"]).size().reset_index(name="count")
)

# Split the per-reason counts by outcome: final_label == 1 marks a correct
# extraction, anything else an incorrect one.
correct_reasons = {}
incorrect_reasons = {}
for reason, label, count in zip(
    reason_groups["labeling_reason"],
    reason_groups["final_label"],
    reason_groups["count"],
):
    bucket = correct_reasons if label == 1 else incorrect_reasons
    bucket[reason] = int(count)  # plain int so the value is JSON-serialisable

# Derive the error rate from the counts instead of `1 - accuracy`: the
# subtraction reports 100% errors when there are no samples at all and leaks
# floating-point noise (e.g. 0.09999999999999998) into the saved JSON.
error_rate = incorrect_extractions / total_samples if total_samples > 0 else 0

# Create a structured breakdown
reason_breakdown = {
    "correct_extractions": {
        "total": correct_extractions,
        "percentage": accuracy,
        "reasons": correct_reasons,
    },
    "incorrect_extractions": {
        "total": incorrect_extractions,
        "percentage": error_rate,
        "reasons": incorrect_reasons,
    },
}

# Create the summary structure
summary = {
    "total_samples": total_samples,
    "accuracy": accuracy,
    "correct_extractions": correct_extractions,
    "incorrect_extractions": incorrect_extractions,
}

# Collect all metrics into a single dictionary
all_metrics = {
    "field": FIELD,
    "model": MODEL,
    "summary": summary,
    "reason_breakdown": reason_breakdown,
}

# Display summary metrics
print("Overall performance metrics:")
print(f"  Total samples: {summary['total_samples']}")
print(
    f"  Accuracy: {format_percentage(accuracy)} ({correct_extractions} / {total_samples})"
)

# Display the correct and incorrect breakdowns with one shared loop; each
# reason's share is expressed relative to all samples, not to its own group.
for outcome, outcome_total, outcome_rate, outcome_reasons in (
    ("Correct", correct_extractions, accuracy, correct_reasons),
    ("Incorrect", incorrect_extractions, error_rate, incorrect_reasons),
):
    print(
        f"  {outcome} extractions: {outcome_total} ({format_percentage(outcome_rate)})"
    )
    for reason, count in outcome_reasons.items():
        percentage = count / total_samples if total_samples > 0 else 0
        print(f"        {reason}: {count} ({format_percentage(percentage)})")

# Save metrics to JSON
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(all_metrics, f, indent=4, ensure_ascii=False)

print(f"Saved performance metrics to {output_json}")