In [3]:
import os
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === Paths ===
# Benchmark root folder; each model writes its scored-theme JSON files
# into its own subfolder underneath it.
base = Path.home().joinpath("Desktop", "Benchmark")
model_dirs = dict(
    claude=base / "scored_themes",
    openai=base / "scored_themes_openai",
    grok=base / "scored_themes_grok",
)

# === Collect results from each model ===
def collect_model_outputs(model_name, dir_path):
    """Load every per-chunk theme JSON in ``dir_path`` into flat row dicts.

    Only ``*.json`` files whose stem contains ``_chunk`` are considered. The
    stem is expected to look like ``<subtitle>_chunk<N>[_suffix]``, for
    example ``2006Blood.Diamond_chunk8_themes``. A file that cannot be read
    or parsed is reported with a warning and skipped, so one bad file does
    not abort the whole load.

    Args:
        model_name: Label stored in each row's ``"model"`` field and shown in
            the progress bar.
        dir_path: ``pathlib.Path`` of the directory to scan.

    Returns:
        A list of dicts with the keys ``model``, ``subtitle_filename``,
        ``chunk`` (int), ``themes`` (a ``"; "``-joined string),
        ``confidence`` and ``explanation``.
    """
    results = []
    # Sort so the row order does not depend on filesystem enumeration order.
    json_files = sorted(dir_path.glob("*.json"))
    for file in tqdm(json_files, desc=f"Loading {model_name} outputs"):
        name = file.stem
        if "_chunk" not in name:
            continue
        try:
            # Split on the LAST "_chunk" so a subtitle name that itself
            # contains "_chunk" (e.g. "A_chunky_chunk1_themes") still parses.
            before_chunk, chunk_suffix = name.rsplit("_chunk", 1)
            # Keep just the number, dropping any trailing "_themes" part.
            chunk_num = int(chunk_suffix.split("_")[0])
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")

            # Normalise "themes": null/missing -> no themes; a bare scalar
            # (including a single string) -> a one-item list. Joining a raw
            # string would otherwise split it into single characters.
            themes = data.get("themes")
            if themes is None:
                themes = []
            elif not isinstance(themes, (list, tuple)):
                themes = [themes]

            results.append({
                "model": model_name,
                "subtitle_filename": before_chunk,
                "chunk": chunk_num,
                "themes": "; ".join(str(theme) for theme in themes),
                "confidence": data.get("confidence", None),
                "explanation": data.get("explanation", "")
            })
        except Exception as e:
            # Deliberately broad: loading is best-effort per file.
            print(f"⚠️ Failed to parse {file.name}: {e}")
    return results

# === Aggregate all outputs ===
# Visit the configured model folders in order; a missing folder only
# produces a warning so the remaining models are still collected.
all_results = []
for model_name, dir_path in model_dirs.items():
    if not dir_path.exists():
        print(f"⚠️ Directory not found: {dir_path}")
        continue
    all_results += collect_model_outputs(model_name, dir_path)

# === Save results to CSV ===
# Nothing is written when no rows were collected.
if not all_results:
    print("⚠️ No data found. Make sure your model output directories are populated.")
else:
    output_path = base / "aggregated_theme_outputs.csv"
    df = pd.DataFrame(all_results)
    df.to_csv(output_path, index=False)
    print(f"✅ Aggregated results saved to: {output_path}")


Loading claude outputs: 100%|███████████████| 211/211 [00:00<00:00, 4391.27it/s]
Loading openai outputs: 100%|███████████████| 213/213 [00:00<00:00, 5405.82it/s]
Loading grok outputs: 100%|█████████████████| 213/213 [00:00<00:00, 5694.28it/s]

✅ Aggregated results saved to: /Users/cedricroetheli/Desktop/Benchmark/aggregated_theme_outputs.csv





In [11]:
import pandas as pd
from pathlib import Path

# Input is the per-chunk CSV written by the previous cell; output is the
# per-movie summary produced at the end of this cell.
base = Path.home().joinpath("Desktop", "Benchmark")
input_path = base.joinpath("aggregated_theme_outputs.csv")
output_path = base.joinpath("per_movie_deduplicated_movie_themes.csv")

# Read the per-chunk rows into a DataFrame
df = pd.read_csv(input_path)

# Parse themes safely: handles both list-like strings and semicolon-separated strings
def parse_themes(raw):
    """Turn one raw ``themes`` cell into a list of theme strings.

    Accepted inputs:
      * a ``list`` - returned unchanged;
      * a list-literal string such as ``"['love', 'war']"`` - parsed with
        ``ast.literal_eval`` so themes containing commas stay intact, with a
        plain comma split as fallback for unquoted text like ``"[a, b]"``;
      * a semicolon-separated string such as ``"love; war"``;
      * anything else (e.g. the NaN pandas uses for an empty cell) - ``[]``.

    Empty entries are dropped, so ``""`` and ``"[]"`` both give ``[]``
    instead of a list holding one empty theme.

    Args:
        raw: The cell value to parse.

    Returns:
        A list of themes (stripped, non-empty strings unless ``raw`` was
        already a list).
    """
    import ast  # local import: the notebook's import lines do not include ast

    if isinstance(raw, list):
        return raw
    if not isinstance(raw, str):
        return []

    text = raw.strip()
    if text.startswith("[") and text.endswith("]"):
        try:
            candidates = [str(item).strip() for item in ast.literal_eval(text)]
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid Python literal: split on commas and peel off any
            # stray quote characters around each piece.
            candidates = [
                piece.strip().strip("'\"").strip()
                for piece in text[1:-1].split(",")
            ]
    else:
        candidates = [piece.strip() for piece in text.split(";")]

    return [theme for theme in candidates if theme]

# Replace each raw themes cell with its parsed list of themes
df["themes"] = df["themes"].map(parse_themes)

# Aggregate and deduplicate themes by movie + model
def aggregate_themes(theme_lists):
    """Merge several theme lists into one list without duplicates.

    Themes are kept in first-seen order. Going through a plain ``set`` would
    make the order depend on string hashing, which can differ between
    interpreter runs and so change the saved output from run to run.

    Args:
        theme_lists: Iterable of iterables of hashable themes (for example a
            pandas Series whose values are lists of strings).

    Returns:
        A list holding each distinct theme once, in order of first appearance.
    """
    ordered = {}  # dict keys give de-duplication plus insertion order
    for lst in theme_lists:
        ordered.update(dict.fromkeys(lst))
    return list(ordered)

# Collapse the per-chunk rows into one row per (movie, model) pair.
aggregated_df = df.groupby(['subtitle_filename', 'model']).agg({
    'themes': aggregate_themes,
    'confidence': 'mean',
    # A blank explanation is read back from the CSV as NaN (a float), which
    # str.join rejects - drop those first. dict.fromkeys de-duplicates while
    # keeping first-seen order, so the joined text is stable across runs.
    'explanation': lambda x: ' '.join(dict.fromkeys(x.dropna().astype(str))),
}).reset_index()

# Save result
aggregated_df.to_csv(output_path, index=False)
print(f"✅ Aggregated and deduplicated themes saved to: {output_path}")


✅ Aggregated and deduplicated themes saved to: /Users/cedricroetheli/Desktop/Benchmark/per_movie_deduplicated_movie_themes.csv
