In [3]:
import os
import json
import time
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === OpenAI API setup ===
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the original placeholder stays as the fallback.
api_key = os.environ.get("OPENAI_API_KEY", "api-key")  # 🔒 Replace with your OpenAI key
api_url = "https://api.openai.com/v1/chat/completions"
model_name = "gpt-4o"

# Sent with every request made against ``api_url``.
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json",
}

# === Paths ===
base = Path.home() / "Desktop" / "Benchmark"
subs_dir = base / "json_benchmark"
summaries_dir = base / "summaries"
output_dir = base / "scored_hero_type_openai"
prompts_dir = base / "prompts_hero_type_openai"

# Create the result folders up front; existing folders are left as they are.
output_dir.mkdir(parents=True, exist_ok=True)
prompts_dir.mkdir(parents=True, exist_ok=True)

# === Load match file ===
# Resolved against ``base`` rather than the current working directory so the
# cell behaves the same no matter where the notebook was started from.
matches_df = pd.read_csv(base / "matches_benchmark.csv")

# === Prompt template ===
def create_hero_type_prompt(summary, chunk):
    """Build the hero-type classification prompt for one dialogue chunk.

    The fixed instruction text (task, the six category definitions and the
    required JSON reply format) is followed by the film summary and the
    dialogue chunk, each wrapped in triple double quotes.

    Args:
        summary: Plot summary text inserted under "Film Summary:".
        chunk: Subtitle text inserted under "Dialogue:".

    Returns:
        The prompt string, without leading or trailing blank lines.
    """
    instruction_lines = [
        "You are analyzing the protagonist(s) of a film based on its subtitles and plot summary.",
        "",
        "Your task is to classify the central figure(s) in the film into one of six predefined hero type categories.",
        "",
        "The six hero type categories are defined as follows:",
        "",
        "- Ordinary Individual: A regular person caught in extraordinary circumstances; succeeds through grit, not destiny.",
        "- Heroic Individual: A character with clear heroic traits—bravery, leadership, strength, often iconic or idealized.",
        "- Group / Ensemble: A collective effort drives the story; no single person is the exclusive hero.",
        "- Institution / System: An organization or societal body serves as protagonist or moral force.",
        "- None: There is no clear hero in the film.",
        "- Anti-Hero: The main character may be active but is morally ambiguous, selfish, or destructive.",
        "",
        "Please:",
        "1. Select the most appropriate hero type label",
        "2. Provide a brief explanation (1–2 sentences max)",
        "3. Include a confidence score from 0.0 to 1.0",
        "",
        "Strictly return your answer in the following JSON format:",
        "",
        "{",
        '  "hero_type": "...",',
        '  "confidence": ...,',
        '  "explanation": "..."',
        "}",
    ]
    instructions = "\n".join(instruction_lines)

    # The variable parts go last, each fenced by triple double quotes.
    return f'{instructions}\n\nFilm Summary:\n"""{summary}"""\n\nDialogue:\n"""{chunk}"""'

# === OpenAI API call ===
def call_openai(prompt, max_retries=5, timeout=120):
    """Send ``prompt`` to the chat-completions endpoint and return the parsed JSON reply.

    The request uses the module-level ``api_url``, ``headers`` and ``model_name``.

    Args:
        prompt: User message text.
        max_retries: How many times an HTTP 429 response is retried (after a
            60-second pause each time) before giving up.
        timeout: Seconds ``requests`` waits for the server before aborting.

    Returns:
        The value decoded from the model's JSON reply, or ``None`` when the
        request fails, the API answers with an error status, the retries are
        exhausted, or the reply is not valid JSON.
    """
    payload = {
        "model": model_name,
        "temperature": 0,
        "max_tokens": 500,
        "messages": [
            {"role": "system", "content": "You are a careful film analyst. Always return exactly and only the required JSON."},
            {"role": "user", "content": prompt}
        ]
    }

    # A bounded loop replaces the former self-recursion, which had no upper
    # limit while the API kept answering 429.
    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        except requests.RequestException as exc:
            # Network problems are reported as a failed chunk instead of
            # aborting the whole scoring run.
            print(f"❌ Request failed: {exc}")
            return None

        if response.status_code != 429:
            break

        if attempt == attempts:
            print("❌ Rate limit still in effect after retries. Giving up.")
            return None
        print("⚠️ Rate limit hit. Waiting...")
        time.sleep(60)

    if response.status_code != 200:
        print(f"❌ API Error: {response.status_code}")
        print(response.text)
        return None

    try:
        content = response.json()["choices"][0]["message"]["content"].strip()

        # Strip code fences if present
        if content.startswith("```"):
            content = content.strip("`").strip()  # removes leading/trailing ```
            if content.lower().startswith("json"):
                content = content[4:]  # remove the 'json' language tag

        return json.loads(content)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Covers a missing or None message content, an unexpected response
        # shape and undecodable JSON (JSONDecodeError is a ValueError).
        print(f"⚠️ JSON parsing error:\n{response.text}")
        return None


# === Subtitle chunking ===
def chunk_dialogue(subs, chunk_size=5000, overlap=200):
    """Join subtitle texts and cut the result into overlapping character windows.

    Args:
        subs: Iterable of dict-like subtitle entries; only the ``"text"`` value
            of each entry is used, and entries without text are skipped.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        List of chunk strings in reading order (empty when there is no text).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window could never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    # Strip each cue and drop the ones that end up empty so whitespace-only
    # cues do not add doubled spaces to the joined text.
    stripped = (line.get("text", "").strip() for line in subs if line.get("text"))
    full_text = " ".join(text for text in stripped if text)

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(full_text):
        end = start + chunk_size
        chunks.append(full_text[start:end])
        if end >= len(full_text):
            # The text is exhausted; stepping back by ``overlap`` would only
            # re-send characters that the chunk just emitted already contains.
            break
        start += step
    return chunks

# === Main loop ===
# Score every movie listed in matches_df, one API call per dialogue chunk.
# The prompt is saved next to the result, and chunks whose result file already
# exists are skipped.
progress = tqdm(matches_df.iterrows(), total=len(matches_df), desc="Scoring hero type (OpenAI)")
for _, row in progress:
    filename = row["subtitle_filename"]
    json_path = subs_dir / f"{filename}.json"
    summary_path = summaries_dir / f"{filename}.srt_summary.txt"

    # Both the subtitle JSON and the summary are required for a prompt.
    if not (json_path.exists() and summary_path.exists()):
        print(f"⚠️ Missing files for: {filename}")
        continue

    subs = json.loads(json_path.read_text(encoding="utf-8"))
    summary = summary_path.read_text(encoding="utf-8").strip()

    chunks = chunk_dialogue(subs)

    for chunk_no, chunk in enumerate(chunks, start=1):
        output_path = output_dir / f"{filename}_chunk{chunk_no}_hero_type.json"
        prompt_path = prompts_dir / f"{filename}_chunk{chunk_no}_hero_type_prompt.txt"

        if output_path.exists():
            print(f"🟡 Already scored: {filename} chunk {chunk_no}")
            continue

        prompt = create_hero_type_prompt(summary, chunk)
        prompt_path.write_text(prompt, encoding="utf-8")

        result = call_openai(prompt)
        if not result:
            print(f"❌ Failed: {filename} chunk {chunk_no}")
        else:
            output_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
            print(f"✅ Scored: {filename} chunk {chunk_no}")

        # Pause between consecutive API calls.
        time.sleep(1.5)


Scoring hero type (OpenAI):   0%|                        | 0/20 [00:00<?, ?it/s]

✅ Scored: 2006Blood.Diamond chunk 1
✅ Scored: 2006Blood.Diamond chunk 2
✅ Scored: 2006Blood.Diamond chunk 3
✅ Scored: 2006Blood.Diamond chunk 4
✅ Scored: 2006Blood.Diamond chunk 5
✅ Scored: 2006Blood.Diamond chunk 6
✅ Scored: 2006Blood.Diamond chunk 7
✅ Scored: 2006Blood.Diamond chunk 8
✅ Scored: 2006Blood.Diamond chunk 9
✅ Scored: 2006Blood.Diamond chunk 10
✅ Scored: 2006Blood.Diamond chunk 11


Scoring hero type (OpenAI):   5%|▊               | 1/20 [00:36<11:28, 36.26s/it]

✅ Scored: 2005The.Constant.Gardener chunk 1
✅ Scored: 2005The.Constant.Gardener chunk 2
✅ Scored: 2005The.Constant.Gardener chunk 3
✅ Scored: 2005The.Constant.Gardener chunk 4
✅ Scored: 2005The.Constant.Gardener chunk 5
✅ Scored: 2005The.Constant.Gardener chunk 6
✅ Scored: 2005The.Constant.Gardener chunk 7
✅ Scored: 2005The.Constant.Gardener chunk 8
✅ Scored: 2005The.Constant.Gardener chunk 9
✅ Scored: 2005The.Constant.Gardener chunk 10
✅ Scored: 2005The.Constant.Gardener chunk 11
✅ Scored: 2005The.Constant.Gardener chunk 12
✅ Scored: 2005The.Constant.Gardener chunk 13


Scoring hero type (OpenAI):  10%|█▌              | 2/20 [01:20<12:15, 40.85s/it]

✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 1
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 2
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 3
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 4
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 5
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 6


Scoring hero type (OpenAI):  15%|██▍             | 3/20 [01:40<08:55, 31.47s/it]

✅ Scored: 2009Avatar chunk 1
✅ Scored: 2009Avatar chunk 2
✅ Scored: 2009Avatar chunk 3
✅ Scored: 2009Avatar chunk 4
✅ Scored: 2009Avatar chunk 5
✅ Scored: 2009Avatar chunk 6
✅ Scored: 2009Avatar chunk 7
✅ Scored: 2009Avatar chunk 8
✅ Scored: 2009Avatar chunk 9
✅ Scored: 2009Avatar chunk 10
✅ Scored: 2009Avatar chunk 11


Scoring hero type (OpenAI):  20%|███▏            | 4/20 [02:20<09:17, 34.86s/it]

✅ Scored: 2012The.Hunger.Games chunk 1
✅ Scored: 2012The.Hunger.Games chunk 2
✅ Scored: 2012The.Hunger.Games chunk 3
✅ Scored: 2012The.Hunger.Games chunk 4
✅ Scored: 2012The.Hunger.Games chunk 5
✅ Scored: 2012The.Hunger.Games chunk 6
✅ Scored: 2012The.Hunger.Games chunk 7
✅ Scored: 2012The.Hunger.Games chunk 8


Scoring hero type (OpenAI):  25%|████            | 5/20 [02:44<07:43, 30.93s/it]

✅ Scored: 1984Ghostbusters chunk 1
✅ Scored: 1984Ghostbusters chunk 2
✅ Scored: 1984Ghostbusters chunk 3
✅ Scored: 1984Ghostbusters chunk 4
✅ Scored: 1984Ghostbusters chunk 5
✅ Scored: 1984Ghostbusters chunk 6
✅ Scored: 1984Ghostbusters chunk 7
✅ Scored: 1984Ghostbusters chunk 8
✅ Scored: 1984Ghostbusters chunk 9
✅ Scored: 1984Ghostbusters chunk 10


Scoring hero type (OpenAI):  30%|████▊           | 6/20 [03:27<08:07, 34.82s/it]

✅ Scored: 1978Superman chunk 1
✅ Scored: 1978Superman chunk 2
✅ Scored: 1978Superman chunk 3
✅ Scored: 1978Superman chunk 4
✅ Scored: 1978Superman chunk 5
✅ Scored: 1978Superman chunk 6
✅ Scored: 1978Superman chunk 7
✅ Scored: 1978Superman chunk 8
✅ Scored: 1978Superman chunk 9
✅ Scored: 1978Superman chunk 10
✅ Scored: 1978Superman chunk 11


Scoring hero type (OpenAI):  35%|█████▌          | 7/20 [04:00<07:27, 34.42s/it]

✅ Scored: 2008The.Hurt.Locker chunk 1
✅ Scored: 2008The.Hurt.Locker chunk 2
✅ Scored: 2008The.Hurt.Locker chunk 3
✅ Scored: 2008The.Hurt.Locker chunk 4
✅ Scored: 2008The.Hurt.Locker chunk 5
✅ Scored: 2008The.Hurt.Locker chunk 6
✅ Scored: 2008The.Hurt.Locker chunk 7
✅ Scored: 2008The.Hurt.Locker chunk 8
✅ Scored: 2008The.Hurt.Locker chunk 9


Scoring hero type (OpenAI):  40%|██████▍         | 8/20 [04:28<06:27, 32.29s/it]

✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 1
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 2
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 3
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 4
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 5
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 6
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 7
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 8
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 9
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 10
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 11
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 12


Scoring hero type (OpenAI):  45%|███████▏        | 9/20 [05:09<06:24, 34.95s/it]

✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 1
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 2
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 3
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 4
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 5
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 6
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 7
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 8
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 9
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 10


Scoring hero type (OpenAI):  50%|███████▌       | 10/20 [05:41<05:42, 34.24s/it]

✅ Scored: 2018Black.Panther chunk 1
✅ Scored: 2018Black.Panther chunk 2
✅ Scored: 2018Black.Panther chunk 3
✅ Scored: 2018Black.Panther chunk 4
✅ Scored: 2018Black.Panther chunk 5
✅ Scored: 2018Black.Panther chunk 6
✅ Scored: 2018Black.Panther chunk 7
✅ Scored: 2018Black.Panther chunk 8
✅ Scored: 2018Black.Panther chunk 9
✅ Scored: 2018Black.Panther chunk 10


Scoring hero type (OpenAI):  55%|████████▎      | 11/20 [06:14<05:03, 33.77s/it]

✅ Scored: 2021Dont.Look.Up chunk 1
✅ Scored: 2021Dont.Look.Up chunk 2
✅ Scored: 2021Dont.Look.Up chunk 3
✅ Scored: 2021Dont.Look.Up chunk 4
✅ Scored: 2021Dont.Look.Up chunk 5
✅ Scored: 2021Dont.Look.Up chunk 6
✅ Scored: 2021Dont.Look.Up chunk 7
✅ Scored: 2021Dont.Look.Up chunk 8
✅ Scored: 2021Dont.Look.Up chunk 9
✅ Scored: 2021Dont.Look.Up chunk 10
✅ Scored: 2021Dont.Look.Up chunk 11
✅ Scored: 2021Dont.Look.Up chunk 12
✅ Scored: 2021Dont.Look.Up chunk 13
✅ Scored: 2021Dont.Look.Up chunk 14
✅ Scored: 2021Dont.Look.Up chunk 15
✅ Scored: 2021Dont.Look.Up chunk 16
✅ Scored: 2021Dont.Look.Up chunk 17


Scoring hero type (OpenAI):  60%|█████████      | 12/20 [07:13<05:31, 41.46s/it]

✅ Scored: 1982First.Blood chunk 1
✅ Scored: 1982First.Blood chunk 2
✅ Scored: 1982First.Blood chunk 3
✅ Scored: 1982First.Blood chunk 4
✅ Scored: 1982First.Blood chunk 5
✅ Scored: 1982First.Blood chunk 6
✅ Scored: 1982First.Blood chunk 7


Scoring hero type (OpenAI):  65%|█████████▊     | 13/20 [07:38<04:14, 36.35s/it]

✅ Scored: 2019Joker chunk 1
✅ Scored: 2019Joker chunk 2
✅ Scored: 2019Joker chunk 3
✅ Scored: 2019Joker chunk 4
✅ Scored: 2019Joker chunk 5
✅ Scored: 2019Joker chunk 6
✅ Scored: 2019Joker chunk 7
✅ Scored: 2019Joker chunk 8
✅ Scored: 2019Joker chunk 9


Scoring hero type (OpenAI):  70%|██████████▌    | 14/20 [08:07<03:24, 34.15s/it]

✅ Scored: 2006Night.at.the.Museum chunk 1
✅ Scored: 2006Night.at.the.Museum chunk 2
✅ Scored: 2006Night.at.the.Museum chunk 3
✅ Scored: 2006Night.at.the.Museum chunk 4
✅ Scored: 2006Night.at.the.Museum chunk 5
✅ Scored: 2006Night.at.the.Museum chunk 6
✅ Scored: 2006Night.at.the.Museum chunk 7
✅ Scored: 2006Night.at.the.Museum chunk 8
✅ Scored: 2006Night.at.the.Museum chunk 9
✅ Scored: 2006Night.at.the.Museum chunk 10


Scoring hero type (OpenAI):  75%|███████████▎   | 15/20 [08:47<03:00, 36.11s/it]

✅ Scored: 1976Rocky.I chunk 1
✅ Scored: 1976Rocky.I chunk 2
✅ Scored: 1976Rocky.I chunk 3
✅ Scored: 1976Rocky.I chunk 4
✅ Scored: 1976Rocky.I chunk 5
✅ Scored: 1976Rocky.I chunk 6
✅ Scored: 1976Rocky.I chunk 7
✅ Scored: 1976Rocky.I chunk 8
✅ Scored: 1976Rocky.I chunk 9
✅ Scored: 1976Rocky.I chunk 10
✅ Scored: 1976Rocky.I chunk 11
✅ Scored: 1976Rocky.I chunk 12
✅ Scored: 1976Rocky.I chunk 13


Scoring hero type (OpenAI):  80%|████████████   | 16/20 [09:35<02:38, 39.54s/it]

✅ Scored: 2005V.for.Vendetta chunk 1
✅ Scored: 2005V.for.Vendetta chunk 2
✅ Scored: 2005V.for.Vendetta chunk 3
✅ Scored: 2005V.for.Vendetta chunk 4
✅ Scored: 2005V.for.Vendetta chunk 5
✅ Scored: 2005V.for.Vendetta chunk 6
✅ Scored: 2005V.for.Vendetta chunk 7
✅ Scored: 2005V.for.Vendetta chunk 8
✅ Scored: 2005V.for.Vendetta chunk 9
✅ Scored: 2005V.for.Vendetta chunk 10
✅ Scored: 2005V.for.Vendetta chunk 11
✅ Scored: 2005V.for.Vendetta chunk 12
✅ Scored: 2005V.for.Vendetta chunk 13
✅ Scored: 2005V.for.Vendetta chunk 14
✅ Scored: 2005V.for.Vendetta chunk 15
✅ Scored: 2005V.for.Vendetta chunk 16


Scoring hero type (OpenAI):  85%|████████████▊  | 17/20 [10:32<02:14, 44.96s/it]

✅ Scored: 2017Paddington.2 chunk 1
✅ Scored: 2017Paddington.2 chunk 2
✅ Scored: 2017Paddington.2 chunk 3
✅ Scored: 2017Paddington.2 chunk 4
✅ Scored: 2017Paddington.2 chunk 5
✅ Scored: 2017Paddington.2 chunk 6
✅ Scored: 2017Paddington.2 chunk 7
✅ Scored: 2017Paddington.2 chunk 8
✅ Scored: 2017Paddington.2 chunk 9
✅ Scored: 2017Paddington.2 chunk 10
✅ Scored: 2017Paddington.2 chunk 11
✅ Scored: 2017Paddington.2 chunk 12


Scoring hero type (OpenAI):  90%|█████████████▌ | 18/20 [11:20<01:31, 45.59s/it]

✅ Scored: 1985Back.To.The.Future chunk 1
✅ Scored: 1985Back.To.The.Future chunk 2
✅ Scored: 1985Back.To.The.Future chunk 3
✅ Scored: 1985Back.To.The.Future chunk 4
✅ Scored: 1985Back.To.The.Future chunk 5
✅ Scored: 1985Back.To.The.Future chunk 6
✅ Scored: 1985Back.To.The.Future chunk 7
✅ Scored: 1985Back.To.The.Future chunk 8
✅ Scored: 1985Back.To.The.Future chunk 9
✅ Scored: 1985Back.To.The.Future chunk 10
✅ Scored: 1985Back.To.The.Future chunk 11


Scoring hero type (OpenAI):  95%|██████████████▎| 19/20 [12:06<00:45, 45.75s/it]

✅ Scored: 2013The.Purge chunk 1
✅ Scored: 2013The.Purge chunk 2
✅ Scored: 2013The.Purge chunk 3
✅ Scored: 2013The.Purge chunk 4
✅ Scored: 2013The.Purge chunk 5
✅ Scored: 2013The.Purge chunk 6
✅ Scored: 2013The.Purge chunk 7


Scoring hero type (OpenAI): 100%|███████████████| 20/20 [12:28<00:00, 37.44s/it]


In [5]:
import os
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict, Counter

# === Paths ===
base_path = Path.home() / "Desktop" / "Benchmark"
scored_dir = base_path / "scored_hero_type_openai"
output_csv_path = base_path / "model_hero_type_output_openai.csv"


def _chunk_sort_key(path):
    """Return a key ordering chunk files by numeric chunk index, then by name."""
    # File names look like "<movie>_chunk<N>_hero_type.json".
    index_text = path.name.rsplit("_chunk", 1)[-1].split("_", 1)[0]
    return (int(index_text) if index_text.isdecimal() else -1, path.name)


# === Aggregate Results ===
aggregated_results = []

# Group chunk files by movie. glob() yields files in arbitrary order, so they
# are sorted first: this keeps explanation order, tie-breaking and CSV row
# order reproducible between runs. rsplit keeps movie ids intact even when
# they contain "_chunk" themselves.
movie_files = defaultdict(list)
for file in sorted(scored_dir.glob("*_chunk*_hero_type.json"), key=_chunk_sort_key):
    movie_id = file.name.rsplit("_chunk", 1)[0]
    movie_files[movie_id].append(file)

# Process each movie's chunks
for movie_id in sorted(movie_files):
    files = movie_files[movie_id]
    label_counts = Counter()
    confidence_sums = defaultdict(float)
    explanations = []

    for file in files:
        try:
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)
            label = data.get("hero_type", "").strip().lower()
            confidence = float(data.get("confidence", 0.0))
            explanation = data.get("explanation", "")
        except (AttributeError, TypeError, ValueError) as e:
            # Malformed JSON (JSONDecodeError is a ValueError), a non-object
            # payload or a non-numeric confidence: report and skip the file.
            print(f"⚠️ Error in file {file.name}: {e}")
            continue

        label_counts[label] += 1
        confidence_sums[label] += confidence
        explanations.append(f"{label} ({confidence:.2f}): {explanation}")

    # Compute average confidence per label (label_counts doubles as the
    # per-label number of confidence values).
    avg_confidences = {
        label: round(confidence_sums[label] / count, 3)
        for label, count in label_counts.items()
    }

    # Weighted vote = label with highest total confidence; on a tie the label
    # seen first in chunk order wins.
    weighted_vote = max(confidence_sums, key=confidence_sums.get) if confidence_sums else None

    aggregated_results.append({
        "subtitle_filename": movie_id,
        "label_counts": dict(label_counts),
        "avg_confidences": avg_confidences,
        "weighted_vote": weighted_vote,
        "explanations": explanations
    })

# === Save to CSV ===
df = pd.DataFrame(aggregated_results)
df.to_csv(output_csv_path, index=False)
print(f"✅ Saved aggregated results to: {output_csv_path}")


✅ Saved aggregated results to: /Users/cedricroetheli/Desktop/Benchmark/model_hero_type_output_openai.csv


In [7]:
import pandas as pd
from pathlib import Path

# === Paths ===
# Benchmark labels and aggregated model votes are read from the benchmark
# folder; the evaluation table is written to the same folder.
base = Path.home().joinpath("Desktop", "Benchmark")
truth_path = base.joinpath("benchmark_final.csv")
model_path = base.joinpath("model_hero_type_output_openai.csv")
output_path = base.joinpath("hero_type_evaluation_openai.csv")

# === Load Data ===
truth_df = pd.read_csv(truth_path)
model_df = pd.read_csv(model_path)

# === Normalize filenames for matching ===
# Surrounding whitespace is removed from the join key of both tables.
for frame in (truth_df, model_df):
    frame["subtitle_filename"] = frame["subtitle_filename"].str.strip()

# === Define Label Normalization ===
# Maps label spellings (already lower-cased and stripped) to canonical names.
# A value containing "|" stands for several acceptable canonical labels.
label_map = {
    "heroic individual": "heroic",
    "ordinary individual": "ordinary",
    "group / ensemble": "group",
    "institution / system": "institution",
    "none": "none",
    "anti-heroic": "anti-hero",
    "none / anti-heroic": "none|anti-hero",
    "none|anti-heroic": "none|anti-hero",
    "anti-hero": "anti-hero",
    "none / anti-hero": "none|anti-hero",
    "group": "group",
    "heroic": "heroic",
    "ordinary": "ordinary"
}

# Apply label map to both sides
def normalize_label(label):
    """Return the canonical form of one label.

    Missing values become ``""``; labels that are not in ``label_map`` are
    returned lower-cased and stripped.
    """
    if pd.isna(label):
        return ""
    label = label.lower().strip()
    return label_map.get(label, label)

# Normalize benchmark labels (which may have multiple correct ones)
def normalize_set(label_str):
    """Return the set of acceptable canonical labels for a benchmark entry.

    ``label_str`` may list several labels separated by ``"|"``. A part whose
    mapped value is itself composite (e.g. ``"none / anti-heroic"`` ->
    ``"none|anti-hero"``) contributes each of its component labels, so a vote
    for either component counts as a match. The unsplit mapped value is kept
    as well, so a vote that normalizes to the same composite still matches.
    Missing input yields an empty set, and empty parts are dropped.
    """
    if pd.isna(label_str):
        return set()
    accepted = set()
    for part in label_str.split("|"):
        mapped = normalize_label(part)
        accepted.add(mapped)
        accepted.update(mapped.split("|"))
    # An empty part (e.g. from a trailing "|") must not match an empty vote.
    accepted.discard("")
    return accepted

# Benchmark entries become sets of acceptable labels; each model vote is
# normalized to a single label.
truth_df["hero_type_set"] = truth_df["hero_type"].apply(normalize_set)
model_df["normalized_vote"] = model_df["weighted_vote"].apply(normalize_label)


def _vote_is_accepted(row):
    """Return True when the row's model vote is among its benchmark labels."""
    return row["normalized_vote"] in row["hero_type_set"]


# === Merge and Evaluate ===
# The inner join keeps only movies present in both tables.
merged_df = model_df.merge(truth_df, on="subtitle_filename", how="inner")
merged_df["is_correct"] = merged_df.apply(_vote_is_accepted, axis=1)

# === Prepare Output CSV ===
evaluation_df = merged_df[
    ["subtitle_filename", "hero_type", "normalized_vote", "is_correct"]
].rename(columns={
    "subtitle_filename": "movie",
    "hero_type": "benchmark_hero_type",
    "normalized_vote": "model_hero_type",
})
evaluation_df.to_csv(output_path, index=False)

# === Summary ===
total = len(evaluation_df)
correct = evaluation_df["is_correct"].sum()
if total:
    accuracy = correct / total
else:
    # Nothing matched; report zero instead of dividing by zero.
    accuracy = 0

print(f"🎯 Evaluation complete:")
print(f"✅ Correct: {correct}/{total}")
print(f"📊 Accuracy: {accuracy:.2%}")
print(f"📁 Saved to: {output_path}")


🎯 Evaluation complete:
✅ Correct: 15/20
📊 Accuracy: 75.00%
📁 Saved to: /Users/cedricroetheli/Desktop/Benchmark/hero_type_evaluation_openai.csv


In [9]:
import os
import json
import time
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === OpenAI API setup ===
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the original placeholder stays as the fallback.
api_key = os.environ.get("OPENAI_API_KEY", "api-key")  # 🔐 Replace this with your OpenAI key
api_url = "https://api.openai.com/v1/chat/completions"
model_name = "gpt-4o"

# Sent with every request made against ``api_url``.
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# === Paths ===
base = Path.home() / "Desktop" / "Benchmark"
subs_dir = base / "json_benchmark"
summaries_dir = base / "summaries"
output_dir = base / "scored_institutions_position_openai"
prompts_dir = base / "prompts_institutions_position_openai"

# Create the result folders up front; existing folders are left as they are.
output_dir.mkdir(parents=True, exist_ok=True)
prompts_dir.mkdir(parents=True, exist_ok=True)

# One row per movie; the main loop reads its "subtitle_filename" column.
matches_df = pd.read_csv(base / "matches_benchmark.csv")

# === Prompt template ===
def create_institutions_position_prompt(summary, chunk):
    """Build the institutions-stance classification prompt for one dialogue chunk.

    The fixed instruction text (task, the three stance definitions and the
    required JSON reply format) is followed by the film summary and the
    dialogue chunk, each wrapped in triple double quotes.

    Args:
        summary: Plot summary text inserted under "Film Summary:".
        chunk: Subtitle text inserted under "Dialogue:".

    Returns:
        The prompt string, without leading or trailing blank lines.
    """
    instruction_lines = [
        "You are analyzing how a film portrays bureaucracy, the state, and other formal institutions.",
        "",
        "Your task is to classify the film’s overall stance toward these institutions into one of three categories:",
        "",
        "- Opposed: Institutions are portrayed as corrupt, repressive, harmful, or adversarial. The narrative or characters resist or challenge them.",
        "- Neutral: Institutions are present but play a limited, mixed, or ambiguous role. They are neither clearly positive nor negative.",
        "- Supported: Institutions are shown as legitimate, effective, or beneficial. The story affirms their role or value.",
        "",
        "Please:",
        '1. Select one of the three stance labels: "opposed", "neutral", or "supported"',
        "2. Provide a brief explanation (1–2 sentences max)",
        "3. Include a confidence score from 0.0 to 1.0",
        "",
        "Strictly return your answer in the following JSON format:",
        "",
        "{",
        '  "institutions_position": "...",',
        '  "confidence": ...,',
        '  "explanation": "..."',
        "}",
    ]
    instructions = "\n".join(instruction_lines)

    # The variable parts go last, each fenced by triple double quotes.
    return f'{instructions}\n\nFilm Summary:\n"""{summary}"""\n\nDialogue:\n"""{chunk}"""'

# === OpenAI API call ===
def call_openai(prompt, max_retries=5, timeout=120):
    """Send ``prompt`` to the chat-completions endpoint and return the parsed JSON reply.

    The request uses the module-level ``api_url``, ``headers`` and ``model_name``.

    Args:
        prompt: User message text.
        max_retries: How many times an HTTP 429 response is retried (after a
            60-second pause each time) before giving up.
        timeout: Seconds ``requests`` waits for the server before aborting.

    Returns:
        The value decoded from the model's JSON reply, or ``None`` when the
        request fails, the API answers with an error status, the retries are
        exhausted, or the reply is not valid JSON.
    """
    payload = {
        "model": model_name,
        "temperature": 0,
        "max_tokens": 500,
        "messages": [
            {"role": "system", "content": "You are a careful film analyst. Always return exactly and only the required JSON."},
            {"role": "user", "content": prompt}
        ]
    }

    # A bounded loop replaces the former self-recursion, which had no upper
    # limit while the API kept answering 429.
    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        except requests.RequestException as exc:
            # Network problems are reported as a failed chunk instead of
            # aborting the whole scoring run.
            print(f"❌ Request failed: {exc}")
            return None

        if response.status_code != 429:
            break

        if attempt == attempts:
            print("❌ Rate limit still in effect after retries. Giving up.")
            return None
        print("⚠️ Rate limit hit. Waiting 60s...")
        time.sleep(60)

    if response.status_code != 200:
        print(f"❌ API Error: {response.status_code}\n{response.text}")
        return None

    try:
        content = response.json()["choices"][0]["message"]["content"].strip()

        # Remove an optional Markdown code fence. str.strip() takes a set of
        # characters, not a prefix, so only backticks are stripped here and
        # the "json" language tag is removed separately.
        if content.startswith("```"):
            content = content.strip("`").strip()
            if content.lower().startswith("json"):
                content = content[4:]

        return json.loads(content)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Covers a missing or None message content, an unexpected response
        # shape and undecodable JSON (JSONDecodeError is a ValueError).
        print(f"⚠️ JSON parsing error:\n{response.text}")
        return None

# === Chunk subtitles ===
def chunk_dialogue(subs, chunk_size=5000, overlap=200):
    """Join subtitle texts and cut the result into overlapping character windows.

    Args:
        subs: Iterable of dict-like subtitle entries; only the ``"text"`` value
            of each entry is used, and entries without text are skipped.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        List of chunk strings in reading order (empty when there is no text).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window could never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    # Strip each cue and drop the ones that end up empty so whitespace-only
    # cues do not add doubled spaces to the joined text.
    stripped = (line.get("text", "").strip() for line in subs if line.get("text"))
    full_text = " ".join(text for text in stripped if text)

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(full_text):
        end = start + chunk_size
        chunks.append(full_text[start:end])
        if end >= len(full_text):
            # The text is exhausted; stepping back by ``overlap`` would only
            # re-send characters that the chunk just emitted already contains.
            break
        start += step
    return chunks

# === Main loop ===
# Score every movie listed in matches_df, one API call per dialogue chunk.
# The prompt is saved next to the result, and chunks whose result file already
# exists are skipped.
progress = tqdm(matches_df.iterrows(), total=len(matches_df), desc="Scoring institutions_position (OpenAI)")
for _, row in progress:
    filename = row["subtitle_filename"]
    json_path = subs_dir / f"{filename}.json"
    summary_path = summaries_dir / f"{filename}.srt_summary.txt"

    # Both the subtitle JSON and the summary are required for a prompt.
    if not (json_path.exists() and summary_path.exists()):
        print(f"⚠️ Missing files for: {filename}")
        continue

    subs = json.loads(json_path.read_text(encoding="utf-8"))
    summary = summary_path.read_text(encoding="utf-8").strip()

    chunks = chunk_dialogue(subs)

    for chunk_no, chunk in enumerate(chunks, start=1):
        output_path = output_dir / f"{filename}_chunk{chunk_no}_institutions_position.json"
        prompt_path = prompts_dir / f"{filename}_chunk{chunk_no}_institutions_position_prompt.txt"

        if output_path.exists():
            print(f"🟡 Already scored: {filename} chunk {chunk_no}")
            continue

        prompt = create_institutions_position_prompt(summary, chunk)
        prompt_path.write_text(prompt, encoding="utf-8")

        result = call_openai(prompt)
        if not result:
            print(f"❌ Failed: {filename} chunk {chunk_no}")
        else:
            output_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
            print(f"✅ Scored: {filename} chunk {chunk_no}")

        # Pause between consecutive API calls.
        time.sleep(1.5)


Scoring institutions_position (OpenAI):   0%|            | 0/20 [00:00<?, ?it/s]

✅ Scored: 2006Blood.Diamond chunk 1
✅ Scored: 2006Blood.Diamond chunk 2
✅ Scored: 2006Blood.Diamond chunk 3
✅ Scored: 2006Blood.Diamond chunk 4
✅ Scored: 2006Blood.Diamond chunk 5
✅ Scored: 2006Blood.Diamond chunk 6
✅ Scored: 2006Blood.Diamond chunk 7
✅ Scored: 2006Blood.Diamond chunk 8
✅ Scored: 2006Blood.Diamond chunk 9
✅ Scored: 2006Blood.Diamond chunk 10
✅ Scored: 2006Blood.Diamond chunk 11


Scoring institutions_position (OpenAI):   5%|▏   | 1/20 [00:40<12:53, 40.73s/it]

✅ Scored: 2005The.Constant.Gardener chunk 1
✅ Scored: 2005The.Constant.Gardener chunk 2
✅ Scored: 2005The.Constant.Gardener chunk 3
✅ Scored: 2005The.Constant.Gardener chunk 4
✅ Scored: 2005The.Constant.Gardener chunk 5
✅ Scored: 2005The.Constant.Gardener chunk 6
✅ Scored: 2005The.Constant.Gardener chunk 7
✅ Scored: 2005The.Constant.Gardener chunk 8
✅ Scored: 2005The.Constant.Gardener chunk 9
✅ Scored: 2005The.Constant.Gardener chunk 10
✅ Scored: 2005The.Constant.Gardener chunk 11
✅ Scored: 2005The.Constant.Gardener chunk 12
✅ Scored: 2005The.Constant.Gardener chunk 13


Scoring institutions_position (OpenAI):  10%|▍   | 2/20 [01:19<11:49, 39.39s/it]

✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 1
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 2
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 3
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 4
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 5
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 6


Scoring institutions_position (OpenAI):  15%|▌   | 3/20 [01:39<08:38, 30.53s/it]

✅ Scored: 2009Avatar chunk 1
✅ Scored: 2009Avatar chunk 2
✅ Scored: 2009Avatar chunk 3
✅ Scored: 2009Avatar chunk 4
✅ Scored: 2009Avatar chunk 5
✅ Scored: 2009Avatar chunk 6
✅ Scored: 2009Avatar chunk 7
✅ Scored: 2009Avatar chunk 8
✅ Scored: 2009Avatar chunk 9
✅ Scored: 2009Avatar chunk 10
✅ Scored: 2009Avatar chunk 11


Scoring institutions_position (OpenAI):  20%|▊   | 4/20 [02:14<08:38, 32.39s/it]

✅ Scored: 2012The.Hunger.Games chunk 1
✅ Scored: 2012The.Hunger.Games chunk 2
✅ Scored: 2012The.Hunger.Games chunk 3
✅ Scored: 2012The.Hunger.Games chunk 4
✅ Scored: 2012The.Hunger.Games chunk 5
✅ Scored: 2012The.Hunger.Games chunk 6
✅ Scored: 2012The.Hunger.Games chunk 7
✅ Scored: 2012The.Hunger.Games chunk 8


Scoring institutions_position (OpenAI):  25%|█   | 5/20 [02:42<07:42, 30.82s/it]

✅ Scored: 1984Ghostbusters chunk 1
✅ Scored: 1984Ghostbusters chunk 2
✅ Scored: 1984Ghostbusters chunk 3
✅ Scored: 1984Ghostbusters chunk 4
✅ Scored: 1984Ghostbusters chunk 5
✅ Scored: 1984Ghostbusters chunk 6
✅ Scored: 1984Ghostbusters chunk 7
✅ Scored: 1984Ghostbusters chunk 8
✅ Scored: 1984Ghostbusters chunk 9
✅ Scored: 1984Ghostbusters chunk 10


Scoring institutions_position (OpenAI):  30%|█▏  | 6/20 [03:14<07:19, 31.38s/it]

✅ Scored: 1978Superman chunk 1
✅ Scored: 1978Superman chunk 2
✅ Scored: 1978Superman chunk 3
✅ Scored: 1978Superman chunk 4
✅ Scored: 1978Superman chunk 5
✅ Scored: 1978Superman chunk 6
✅ Scored: 1978Superman chunk 7
✅ Scored: 1978Superman chunk 8
✅ Scored: 1978Superman chunk 9
✅ Scored: 1978Superman chunk 10
✅ Scored: 1978Superman chunk 11


Scoring institutions_position (OpenAI):  35%|█▍  | 7/20 [03:50<07:05, 32.73s/it]

✅ Scored: 2008The.Hurt.Locker chunk 1
✅ Scored: 2008The.Hurt.Locker chunk 2
✅ Scored: 2008The.Hurt.Locker chunk 3
✅ Scored: 2008The.Hurt.Locker chunk 4
✅ Scored: 2008The.Hurt.Locker chunk 5
✅ Scored: 2008The.Hurt.Locker chunk 6
✅ Scored: 2008The.Hurt.Locker chunk 7
✅ Scored: 2008The.Hurt.Locker chunk 8
✅ Scored: 2008The.Hurt.Locker chunk 9


Scoring institutions_position (OpenAI):  40%|█▌  | 8/20 [04:18<06:16, 31.41s/it]

✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 1
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 2
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 3
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 4
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 5
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 6
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 7
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 8
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 9
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 10
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 11
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 12


Scoring institutions_position (OpenAI):  45%|█▊  | 9/20 [04:55<06:03, 33.01s/it]

✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 1
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 2
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 3
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 4
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 5
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 6
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 7
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 8
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 9
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 10


Scoring institutions_position (OpenAI):  50%|█▌ | 10/20 [05:26<05:23, 32.32s/it]

✅ Scored: 2018Black.Panther chunk 1
✅ Scored: 2018Black.Panther chunk 2
✅ Scored: 2018Black.Panther chunk 3
✅ Scored: 2018Black.Panther chunk 4
✅ Scored: 2018Black.Panther chunk 5
✅ Scored: 2018Black.Panther chunk 6
✅ Scored: 2018Black.Panther chunk 7
✅ Scored: 2018Black.Panther chunk 8
✅ Scored: 2018Black.Panther chunk 9
✅ Scored: 2018Black.Panther chunk 10


Scoring institutions_position (OpenAI):  55%|█▋ | 11/20 [05:55<04:42, 31.41s/it]

✅ Scored: 2021Dont.Look.Up chunk 1
✅ Scored: 2021Dont.Look.Up chunk 2
✅ Scored: 2021Dont.Look.Up chunk 3
✅ Scored: 2021Dont.Look.Up chunk 4
✅ Scored: 2021Dont.Look.Up chunk 5
✅ Scored: 2021Dont.Look.Up chunk 6
✅ Scored: 2021Dont.Look.Up chunk 7
✅ Scored: 2021Dont.Look.Up chunk 8
✅ Scored: 2021Dont.Look.Up chunk 9
✅ Scored: 2021Dont.Look.Up chunk 10
✅ Scored: 2021Dont.Look.Up chunk 11
✅ Scored: 2021Dont.Look.Up chunk 12
✅ Scored: 2021Dont.Look.Up chunk 13
✅ Scored: 2021Dont.Look.Up chunk 14
✅ Scored: 2021Dont.Look.Up chunk 15
✅ Scored: 2021Dont.Look.Up chunk 16
✅ Scored: 2021Dont.Look.Up chunk 17


Scoring institutions_position (OpenAI):  60%|█▊ | 12/20 [06:45<04:57, 37.17s/it]

✅ Scored: 1982First.Blood chunk 1
✅ Scored: 1982First.Blood chunk 2
✅ Scored: 1982First.Blood chunk 3
✅ Scored: 1982First.Blood chunk 4
✅ Scored: 1982First.Blood chunk 5
✅ Scored: 1982First.Blood chunk 6
✅ Scored: 1982First.Blood chunk 7


Scoring institutions_position (OpenAI):  65%|█▉ | 13/20 [07:07<03:46, 32.37s/it]

✅ Scored: 2019Joker chunk 1
✅ Scored: 2019Joker chunk 2
✅ Scored: 2019Joker chunk 3
✅ Scored: 2019Joker chunk 4
✅ Scored: 2019Joker chunk 5
✅ Scored: 2019Joker chunk 6
✅ Scored: 2019Joker chunk 7
✅ Scored: 2019Joker chunk 8
✅ Scored: 2019Joker chunk 9


Scoring institutions_position (OpenAI):  70%|██ | 14/20 [07:36<03:08, 31.39s/it]

✅ Scored: 2006Night.at.the.Museum chunk 1
✅ Scored: 2006Night.at.the.Museum chunk 2
✅ Scored: 2006Night.at.the.Museum chunk 3
✅ Scored: 2006Night.at.the.Museum chunk 4
✅ Scored: 2006Night.at.the.Museum chunk 5
✅ Scored: 2006Night.at.the.Museum chunk 6
✅ Scored: 2006Night.at.the.Museum chunk 7
✅ Scored: 2006Night.at.the.Museum chunk 8
✅ Scored: 2006Night.at.the.Museum chunk 9
✅ Scored: 2006Night.at.the.Museum chunk 10


Scoring institutions_position (OpenAI):  75%|██▎| 15/20 [08:11<02:41, 32.37s/it]

✅ Scored: 1976Rocky.I chunk 1
✅ Scored: 1976Rocky.I chunk 2
✅ Scored: 1976Rocky.I chunk 3
✅ Scored: 1976Rocky.I chunk 4
✅ Scored: 1976Rocky.I chunk 5
✅ Scored: 1976Rocky.I chunk 6
✅ Scored: 1976Rocky.I chunk 7
✅ Scored: 1976Rocky.I chunk 8
✅ Scored: 1976Rocky.I chunk 9
✅ Scored: 1976Rocky.I chunk 10
✅ Scored: 1976Rocky.I chunk 11
✅ Scored: 1976Rocky.I chunk 12
✅ Scored: 1976Rocky.I chunk 13


Scoring institutions_position (OpenAI):  80%|██▍| 16/20 [08:53<02:22, 35.52s/it]

✅ Scored: 2005V.for.Vendetta chunk 1
✅ Scored: 2005V.for.Vendetta chunk 2
✅ Scored: 2005V.for.Vendetta chunk 3
✅ Scored: 2005V.for.Vendetta chunk 4
✅ Scored: 2005V.for.Vendetta chunk 5
✅ Scored: 2005V.for.Vendetta chunk 6
✅ Scored: 2005V.for.Vendetta chunk 7
✅ Scored: 2005V.for.Vendetta chunk 8
✅ Scored: 2005V.for.Vendetta chunk 9
✅ Scored: 2005V.for.Vendetta chunk 10
✅ Scored: 2005V.for.Vendetta chunk 11
✅ Scored: 2005V.for.Vendetta chunk 12
✅ Scored: 2005V.for.Vendetta chunk 13
✅ Scored: 2005V.for.Vendetta chunk 14
✅ Scored: 2005V.for.Vendetta chunk 15
✅ Scored: 2005V.for.Vendetta chunk 16


Scoring institutions_position (OpenAI):  85%|██▌| 17/20 [09:55<02:10, 43.34s/it]

✅ Scored: 2017Paddington.2 chunk 1
✅ Scored: 2017Paddington.2 chunk 2
✅ Scored: 2017Paddington.2 chunk 3
✅ Scored: 2017Paddington.2 chunk 4
✅ Scored: 2017Paddington.2 chunk 5
✅ Scored: 2017Paddington.2 chunk 6
✅ Scored: 2017Paddington.2 chunk 7
✅ Scored: 2017Paddington.2 chunk 8
✅ Scored: 2017Paddington.2 chunk 9
✅ Scored: 2017Paddington.2 chunk 10
✅ Scored: 2017Paddington.2 chunk 11
✅ Scored: 2017Paddington.2 chunk 12


Scoring institutions_position (OpenAI):  90%|██▋| 18/20 [10:34<01:24, 42.08s/it]

✅ Scored: 1985Back.To.The.Future chunk 1
✅ Scored: 1985Back.To.The.Future chunk 2
✅ Scored: 1985Back.To.The.Future chunk 3
✅ Scored: 1985Back.To.The.Future chunk 4
✅ Scored: 1985Back.To.The.Future chunk 5
✅ Scored: 1985Back.To.The.Future chunk 6
✅ Scored: 1985Back.To.The.Future chunk 7
✅ Scored: 1985Back.To.The.Future chunk 8
✅ Scored: 1985Back.To.The.Future chunk 9
✅ Scored: 1985Back.To.The.Future chunk 10
✅ Scored: 1985Back.To.The.Future chunk 11


Scoring institutions_position (OpenAI):  95%|██▊| 19/20 [11:09<00:40, 40.06s/it]

✅ Scored: 2013The.Purge chunk 1
✅ Scored: 2013The.Purge chunk 2
✅ Scored: 2013The.Purge chunk 3
✅ Scored: 2013The.Purge chunk 4
✅ Scored: 2013The.Purge chunk 5
✅ Scored: 2013The.Purge chunk 6
✅ Scored: 2013The.Purge chunk 7


Scoring institutions_position (OpenAI): 100%|███| 20/20 [11:31<00:00, 34.55s/it]


In [11]:
import os
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict, Counter

# === Paths ===
base_path = Path.home() / "Desktop" / "Benchmark"
scored_dir = base_path / "scored_institutions_position_openai"  # per-chunk JSON results
output_csv_path = base_path / "model_institutions_position_output_openai.csv"  # one row per movie

# === Aggregate Results ===
aggregated_results = []

# Group chunk files by movie. The movie id is everything before the first
# "_chunk" in the file name.
movie_files = defaultdict(list)
for file in scored_dir.glob("*_chunk*_institutions_position.json"):
    movie_id = file.name.split("_chunk")[0]
    movie_files[movie_id].append(file)

# Process each movie's chunks. Movies and chunks are visited in sorted order
# because glob order depends on the filesystem; without sorting, the CSV row
# order, the order of the explanations and the winner of a tied vote could
# change from one run to the next.
for movie_id, files in sorted(movie_files.items()):
    # Files of one movie are expected to differ only in their chunk number, so
    # (name length, name) orders them numerically (chunk2 before chunk10).
    files.sort(key=lambda path: (len(path.name), path.name))

    label_counts = Counter()               # number of chunks voting for each label
    confidence_sums = defaultdict(float)   # summed confidence per label
    confidence_counts = defaultdict(int)   # number of confidences summed per label
    explanations = []                      # one "label (conf): text" entry per chunk

    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                label = data.get("institutions_position", "").strip().lower()
                confidence = float(data.get("confidence", 0.0))
                explanation = data.get("explanation", "")
            except Exception as e:
                # Best effort: report the unusable chunk and keep aggregating.
                print(f"⚠️ Error in file {file.name}: {e}")
                continue

        if not label:
            # A chunk without a label must not become an "" vote candidate.
            print(f"⚠️ Missing label in file {file.name}")
            continue

        label_counts[label] += 1
        confidence_sums[label] += confidence
        confidence_counts[label] += 1
        explanations.append(f"{label} ({confidence:.2f}): {explanation}")

    # Compute average confidence per label
    avg_confidences = {
        label: round(confidence_sums[label] / confidence_counts[label], 3)
        for label in label_counts
    }

    # Weighted vote = label with highest total confidence (None when no chunk
    # of this movie produced a usable label). Ties go to the label seen first.
    weighted_vote = max(confidence_sums.items(), key=lambda x: x[1])[0] if confidence_sums else None

    aggregated_results.append({
        "subtitle_filename": movie_id,
        "label_counts": dict(label_counts),
        "avg_confidences": avg_confidences,
        "weighted_vote": weighted_vote,
        "explanations": explanations
    })

# === Save to CSV ===
df = pd.DataFrame(aggregated_results)
df.to_csv(output_csv_path, index=False)
print(f"✅ Saved aggregated results to: {output_csv_path}")


✅ Saved aggregated results to: /Users/cedricroetheli/Desktop/Benchmark/model_institutions_position_output_openai.csv


In [13]:
import pandas as pd
from pathlib import Path

# === Input and output locations for the institutions_position evaluation ===
base = Path.home().joinpath("Desktop", "Benchmark")
truth_path = base.joinpath("benchmark_final.csv")  # benchmark labels
model_path = base.joinpath("model_institutions_position_output_openai.csv")  # aggregated model votes
output_path = base.joinpath("institutions_position_evaluation_openai.csv")  # per-movie evaluation report

# === Read the benchmark table first, then the model table ===
truth_df, model_df = (pd.read_csv(csv_file) for csv_file in (truth_path, model_path))

# === Trim surrounding whitespace from the key column of both tables ===
for frame in (truth_df, model_df):
    frame["subtitle_filename"] = frame["subtitle_filename"].str.strip()

# === Label Mapping for Consistency ===
# Maps every accepted spelling to its canonical label. Currently an identity
# mapping; aliases can be added here.
label_map = {
    "opposed": "opposed",
    "neutral": "neutral",
    "supported": "supported"
}

def normalize_label(label):
    """Return the canonical form of a single label.

    Missing values (None / NaN) become "". Any other value is converted to
    text, lower-cased and stripped, then looked up in ``label_map``; labels
    that are not in the map are returned in their cleaned form.
    """
    if pd.isna(label):
        return ""
    label = str(label).lower().strip()
    return label_map.get(label, label)

def normalize_set(label_str):
    """Split a "|"-separated label string into a set of canonical labels.

    A missing value (None / NaN) yields an empty set. Empty segments, as
    produced by inputs such as "a|" or "a||b", are dropped: otherwise the set
    would contain "", which is also what ``normalize_label`` returns for a
    missing vote, and a missing vote would then be counted as a match.
    """
    if pd.isna(label_str):
        return set()
    cleaned = (normalize_label(part) for part in str(label_str).split("|"))
    return {label for label in cleaned if label}

# Apply normalization: the benchmark column may list several accepted labels
# separated by "|", the model column holds a single label.
truth_df["institutions_position_set"] = truth_df["institutions_position"].apply(normalize_set)
model_df["normalized_vote"] = model_df["weighted_vote"].apply(normalize_label)

# === Merge and Evaluate ===
# Inner join: only movies present in both tables are evaluated, so report any
# model rows that drop out instead of silently shrinking the denominator.
n_unmatched = int((~model_df["subtitle_filename"].isin(truth_df["subtitle_filename"])).sum())
if n_unmatched:
    print(f"⚠️ {n_unmatched} model row(s) have no benchmark entry and are not evaluated")

merged_df = pd.merge(model_df, truth_df, on="subtitle_filename", how="inner")

# A prediction is correct when it is one of the accepted benchmark labels.
# Built with zip rather than DataFrame.apply(axis=1): on an empty merge, apply
# returns a DataFrame and the column assignment fails before the summary runs.
merged_df["is_correct"] = [
    vote in accepted
    for vote, accepted in zip(merged_df["normalized_vote"], merged_df["institutions_position_set"])
]

# === Output CSV ===
evaluation_df = merged_df[[
    "subtitle_filename", "institutions_position", "normalized_vote", "is_correct"
]].copy()
evaluation_df.columns = ["movie", "benchmark_institutions_position", "model_institutions_position", "is_correct"]
evaluation_df.to_csv(output_path, index=False)

# === Summary ===
total = len(evaluation_df)
correct = evaluation_df["is_correct"].sum()
accuracy = correct / total if total else 0  # avoid dividing by zero on an empty merge

print(f"🎯 Evaluation complete:")
print(f"✅ Correct: {correct}/{total}")
print(f"📊 Accuracy: {accuracy:.2%}")
print(f"📁 Saved to: {output_path}")


🎯 Evaluation complete:
✅ Correct: 18/20
📊 Accuracy: 90.00%
📁 Saved to: /Users/cedricroetheli/Desktop/Benchmark/institutions_position_evaluation_openai.csv


In [19]:
import os
import json
import time
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI

# === OpenAI Client Setup (1.x syntax) ===
# The key is taken from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into the notebook. The literal placeholder is kept as
# a fallback so the cell still runs when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "api-key"))


# === Config ===
model_name = "gpt-4o"

# === Paths ===
base = Path.home() / "Desktop" / "Benchmark"
subs_dir = base / "json_benchmark"                 # <movie>.json subtitle files
summaries_dir = base / "summaries"                 # <movie>.srt_summary.txt plot summaries
output_dir = base / "scored_good_vs_evil_openai"   # per-chunk model answers
prompts_dir = base / "prompts_good_vs_evil_openai" # prompts sent for each chunk

# Create the output folders up front; existing folders are left untouched.
output_dir.mkdir(parents=True, exist_ok=True)
prompts_dir.mkdir(parents=True, exist_ok=True)

# One row per movie; the scoring loop reads its "subtitle_filename" column.
matches_df = pd.read_csv(base / "matches_benchmark.csv")

# === Prompt template ===
def create_good_vs_evil_prompt(summary, chunk):
    """Build the Good-vs-Evil classification prompt for one dialogue chunk.

    Args:
        summary: Plot summary text, inserted under "Film Summary:".
        chunk: Dialogue excerpt, inserted under "Dialogue:".

    Returns:
        The complete prompt with leading and trailing whitespace removed.
    """
    prompt_text = f"""
You are analyzing how a film portrays its central moral conflict based on its plot summary and dialogue.

Your task is to classify the film’s depiction of **Good vs Evil** into one of the following three categories:

- **Clear**: There is a strong moral binary. Good and evil are clearly distinguished. One side is righteous, the other is corrupt or villainous.
- **Neutral**: The film does not emphasize good vs evil. There may be conflict, but it is not framed in strong moral terms.
- **Complicated**: The line between good and evil is blurred. Moral ambiguity is central, and characters or institutions cannot be easily classified as good or evil.

Please:
1. Choose the best fitting label: "clear", "neutral", or "complicated"
2. Give a brief justification (1–2 sentences)
3. Provide a confidence score between 0.0 and 1.0

Return your answer in this exact JSON format:

{{
  "good_vs_evil": "...",
  "confidence": ...,
  "explanation": "..."
}}

Film Summary:
\"\"\"{summary}\"\"\"

Dialogue:
\"\"\"{chunk}\"\"\"
"""
    # Drop the blank line that follows the opening quotes and the final newline.
    return prompt_text.strip()

# === OpenAI call with updated SDK ===
def call_openai(prompt):
    """Send *prompt* to the chat model and return its reply parsed as JSON.

    Uses the module-level ``client`` and ``model_name``. The request is made
    with temperature 0 and at most 500 completion tokens.

    Args:
        prompt: The user message to send.

    Returns:
        The decoded JSON value, or None when the request fails or the reply is
        not valid JSON (the error is printed).
    """
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a careful film analyst. Always return exactly and only the required JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=500,
        )
        content = response.choices[0].message.content.strip()

        # Remove wrapping code fences if present. The opening and the closing
        # fence are handled independently, so a reply that has only one of
        # them does not lose three characters of its JSON payload.
        if content.startswith("```"):
            content = content[3:]
            if content[:4].lower() == "json":
                content = content[4:]
        if content.endswith("```"):
            content = content[:-3]

        return json.loads(content.strip())
    except Exception as e:
        # Boundary handler: the scoring loop treats None as "chunk failed".
        print(f"⚠️ OpenAI API error: {e}")
        return None

# === Subtitle chunking ===
def chunk_dialogue(subs, chunk_size=5000, overlap=200):
    """Join subtitle texts and cut them into overlapping character windows.

    Args:
        subs: Iterable of dict-like subtitle entries. Entries whose "text" is
            missing or empty are skipped; the remaining texts are stripped
            and joined with single spaces.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of text chunks in order; empty when there is no text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0 or overlap >= chunk_size:
        raise ValueError("chunk_size must be positive and overlap must be smaller than chunk_size")

    text_blocks = [line.get("text", "").strip() for line in subs if line.get("text")]
    full_text = " ".join(text_blocks)

    chunks = []
    start = 0
    while start < len(full_text):
        end = start + chunk_size
        chunks.append(full_text[start:end])
        if end >= len(full_text):
            # This window already reaches the end of the text. Stepping back
            # by `overlap` would only add a chunk of already-covered text.
            break
        start = end - overlap
    return chunks

# === Main scoring loop ===
# Score every movie listed in matches_df, one model call per dialogue chunk.
for _, match_row in tqdm(matches_df.iterrows(), total=len(matches_df), desc="Scoring good_vs_evil (OpenAI)"):
    filename = match_row["subtitle_filename"]
    json_path = subs_dir / f"{filename}.json"
    summary_path = summaries_dir / f"{filename}.srt_summary.txt"

    # Both the subtitle JSON and the plot summary are needed to build a prompt.
    if not (json_path.exists() and summary_path.exists()):
        print(f"⚠️ Missing files for: {filename}")
        continue

    subs = json.loads(json_path.read_text(encoding="utf-8"))
    summary = summary_path.read_text(encoding="utf-8").strip()

    # Chunk numbers in file names and log lines are 1-based.
    for chunk_no, chunk in enumerate(chunk_dialogue(subs), start=1):
        output_path = output_dir / f"{filename}_chunk{chunk_no}_good_vs_evil.json"
        prompt_path = prompts_dir / f"{filename}_chunk{chunk_no}_good_vs_evil_prompt.txt"

        # An existing result file means this chunk was scored on an earlier run.
        if output_path.exists():
            print(f"🟡 Already scored: {filename} chunk {chunk_no}")
            continue

        prompt = create_good_vs_evil_prompt(summary, chunk)

        # Keep a copy of the exact prompt, written before the model is called.
        prompt_path.write_text(prompt, encoding="utf-8")

        result = call_openai(prompt)
        if not result:
            print(f"❌ Failed: {filename} chunk {chunk_no}")
        else:
            output_path.write_text(json.dumps(result, indent=2), encoding="utf-8")
            print(f"✅ Scored: {filename} chunk {chunk_no}")

        # Pause between requests, whether or not the call succeeded.
        time.sleep(1.5)


Scoring good_vs_evil (OpenAI):   0%|                     | 0/20 [00:00<?, ?it/s]

✅ Scored: 2006Blood.Diamond chunk 1
✅ Scored: 2006Blood.Diamond chunk 2
✅ Scored: 2006Blood.Diamond chunk 3
✅ Scored: 2006Blood.Diamond chunk 4
✅ Scored: 2006Blood.Diamond chunk 5
✅ Scored: 2006Blood.Diamond chunk 6
✅ Scored: 2006Blood.Diamond chunk 7
✅ Scored: 2006Blood.Diamond chunk 8
✅ Scored: 2006Blood.Diamond chunk 9
✅ Scored: 2006Blood.Diamond chunk 10
✅ Scored: 2006Blood.Diamond chunk 11


Scoring good_vs_evil (OpenAI):   5%|▋            | 1/20 [00:41<13:17, 41.99s/it]

✅ Scored: 2005The.Constant.Gardener chunk 1
✅ Scored: 2005The.Constant.Gardener chunk 2
✅ Scored: 2005The.Constant.Gardener chunk 3
✅ Scored: 2005The.Constant.Gardener chunk 4
✅ Scored: 2005The.Constant.Gardener chunk 5
✅ Scored: 2005The.Constant.Gardener chunk 6
✅ Scored: 2005The.Constant.Gardener chunk 7
✅ Scored: 2005The.Constant.Gardener chunk 8
✅ Scored: 2005The.Constant.Gardener chunk 9
✅ Scored: 2005The.Constant.Gardener chunk 10
✅ Scored: 2005The.Constant.Gardener chunk 11
✅ Scored: 2005The.Constant.Gardener chunk 12
✅ Scored: 2005The.Constant.Gardener chunk 13


Scoring good_vs_evil (OpenAI):  10%|█▎           | 2/20 [01:21<12:13, 40.77s/it]

✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 1
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 2
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 3
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 4
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 5
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 6


Scoring good_vs_evil (OpenAI):  15%|█▉           | 3/20 [01:42<08:56, 31.56s/it]

✅ Scored: 2009Avatar chunk 1
✅ Scored: 2009Avatar chunk 2
✅ Scored: 2009Avatar chunk 3
✅ Scored: 2009Avatar chunk 4
✅ Scored: 2009Avatar chunk 5
✅ Scored: 2009Avatar chunk 6
✅ Scored: 2009Avatar chunk 7
✅ Scored: 2009Avatar chunk 8
✅ Scored: 2009Avatar chunk 9
✅ Scored: 2009Avatar chunk 10
✅ Scored: 2009Avatar chunk 11


Scoring good_vs_evil (OpenAI):  20%|██▌          | 4/20 [02:19<08:57, 33.58s/it]

✅ Scored: 2012The.Hunger.Games chunk 1
✅ Scored: 2012The.Hunger.Games chunk 2
✅ Scored: 2012The.Hunger.Games chunk 3
✅ Scored: 2012The.Hunger.Games chunk 4
✅ Scored: 2012The.Hunger.Games chunk 5
✅ Scored: 2012The.Hunger.Games chunk 6
✅ Scored: 2012The.Hunger.Games chunk 7
✅ Scored: 2012The.Hunger.Games chunk 8


Scoring good_vs_evil (OpenAI):  25%|███▎         | 5/20 [02:45<07:44, 31.00s/it]

✅ Scored: 1984Ghostbusters chunk 1
✅ Scored: 1984Ghostbusters chunk 2
✅ Scored: 1984Ghostbusters chunk 3
✅ Scored: 1984Ghostbusters chunk 4
✅ Scored: 1984Ghostbusters chunk 5
✅ Scored: 1984Ghostbusters chunk 6
✅ Scored: 1984Ghostbusters chunk 7
✅ Scored: 1984Ghostbusters chunk 8
✅ Scored: 1984Ghostbusters chunk 9
✅ Scored: 1984Ghostbusters chunk 10


Scoring good_vs_evil (OpenAI):  30%|███▉         | 6/20 [03:16<07:15, 31.09s/it]

✅ Scored: 1978Superman chunk 1
✅ Scored: 1978Superman chunk 2
✅ Scored: 1978Superman chunk 3
✅ Scored: 1978Superman chunk 4
✅ Scored: 1978Superman chunk 5
✅ Scored: 1978Superman chunk 6
✅ Scored: 1978Superman chunk 7
✅ Scored: 1978Superman chunk 8
✅ Scored: 1978Superman chunk 9
✅ Scored: 1978Superman chunk 10
✅ Scored: 1978Superman chunk 11


Scoring good_vs_evil (OpenAI):  35%|████▌        | 7/20 [03:51<06:59, 32.25s/it]

✅ Scored: 2008The.Hurt.Locker chunk 1
✅ Scored: 2008The.Hurt.Locker chunk 2
✅ Scored: 2008The.Hurt.Locker chunk 3
✅ Scored: 2008The.Hurt.Locker chunk 4
✅ Scored: 2008The.Hurt.Locker chunk 5
✅ Scored: 2008The.Hurt.Locker chunk 6
✅ Scored: 2008The.Hurt.Locker chunk 7
✅ Scored: 2008The.Hurt.Locker chunk 8
✅ Scored: 2008The.Hurt.Locker chunk 9


Scoring good_vs_evil (OpenAI):  40%|█████▏       | 8/20 [04:18<06:08, 30.73s/it]

✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 1
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 2
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 3
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 4
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 5
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 6
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 7
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 8
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 9
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 10
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 11
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 12


Scoring good_vs_evil (OpenAI):  45%|█████▊       | 9/20 [04:59<06:11, 33.74s/it]

✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 1
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 2
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 3
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 4
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 5
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 6
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 7
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 8
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 9
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 10


Scoring good_vs_evil (OpenAI):  50%|██████      | 10/20 [05:36<05:47, 34.76s/it]

✅ Scored: 2018Black.Panther chunk 1
✅ Scored: 2018Black.Panther chunk 2
✅ Scored: 2018Black.Panther chunk 3
✅ Scored: 2018Black.Panther chunk 4
✅ Scored: 2018Black.Panther chunk 5
✅ Scored: 2018Black.Panther chunk 6
✅ Scored: 2018Black.Panther chunk 7
✅ Scored: 2018Black.Panther chunk 8
✅ Scored: 2018Black.Panther chunk 9
✅ Scored: 2018Black.Panther chunk 10


Scoring good_vs_evil (OpenAI):  55%|██████▌     | 11/20 [06:08<05:06, 34.05s/it]

✅ Scored: 2021Dont.Look.Up chunk 1
✅ Scored: 2021Dont.Look.Up chunk 2
✅ Scored: 2021Dont.Look.Up chunk 3
✅ Scored: 2021Dont.Look.Up chunk 4
✅ Scored: 2021Dont.Look.Up chunk 5
✅ Scored: 2021Dont.Look.Up chunk 6
✅ Scored: 2021Dont.Look.Up chunk 7
✅ Scored: 2021Dont.Look.Up chunk 8
✅ Scored: 2021Dont.Look.Up chunk 9
✅ Scored: 2021Dont.Look.Up chunk 10
✅ Scored: 2021Dont.Look.Up chunk 11
✅ Scored: 2021Dont.Look.Up chunk 12
✅ Scored: 2021Dont.Look.Up chunk 13
✅ Scored: 2021Dont.Look.Up chunk 14
✅ Scored: 2021Dont.Look.Up chunk 15
✅ Scored: 2021Dont.Look.Up chunk 16
✅ Scored: 2021Dont.Look.Up chunk 17


Scoring good_vs_evil (OpenAI):  60%|███████▏    | 12/20 [07:05<05:28, 41.08s/it]

✅ Scored: 1982First.Blood chunk 1
✅ Scored: 1982First.Blood chunk 2
✅ Scored: 1982First.Blood chunk 3
✅ Scored: 1982First.Blood chunk 4
✅ Scored: 1982First.Blood chunk 5
✅ Scored: 1982First.Blood chunk 6
✅ Scored: 1982First.Blood chunk 7


Scoring good_vs_evil (OpenAI):  65%|███████▊    | 13/20 [07:28<04:07, 35.34s/it]

✅ Scored: 2019Joker chunk 1
✅ Scored: 2019Joker chunk 2
✅ Scored: 2019Joker chunk 3
✅ Scored: 2019Joker chunk 4
✅ Scored: 2019Joker chunk 5
✅ Scored: 2019Joker chunk 6
✅ Scored: 2019Joker chunk 7
✅ Scored: 2019Joker chunk 8
✅ Scored: 2019Joker chunk 9


Scoring good_vs_evil (OpenAI):  70%|████████▍   | 14/20 [07:56<03:19, 33.23s/it]

✅ Scored: 2006Night.at.the.Museum chunk 1
✅ Scored: 2006Night.at.the.Museum chunk 2
✅ Scored: 2006Night.at.the.Museum chunk 3
✅ Scored: 2006Night.at.the.Museum chunk 4
✅ Scored: 2006Night.at.the.Museum chunk 5
✅ Scored: 2006Night.at.the.Museum chunk 6
✅ Scored: 2006Night.at.the.Museum chunk 7
✅ Scored: 2006Night.at.the.Museum chunk 8
✅ Scored: 2006Night.at.the.Museum chunk 9
✅ Scored: 2006Night.at.the.Museum chunk 10


Scoring good_vs_evil (OpenAI):  75%|█████████   | 15/20 [08:26<02:41, 32.38s/it]

✅ Scored: 1976Rocky.I chunk 1
✅ Scored: 1976Rocky.I chunk 2
✅ Scored: 1976Rocky.I chunk 3
✅ Scored: 1976Rocky.I chunk 4
✅ Scored: 1976Rocky.I chunk 5
✅ Scored: 1976Rocky.I chunk 6
✅ Scored: 1976Rocky.I chunk 7
✅ Scored: 1976Rocky.I chunk 8
✅ Scored: 1976Rocky.I chunk 9
✅ Scored: 1976Rocky.I chunk 10
✅ Scored: 1976Rocky.I chunk 11
✅ Scored: 1976Rocky.I chunk 12
✅ Scored: 1976Rocky.I chunk 13


Scoring good_vs_evil (OpenAI):  80%|█████████▌  | 16/20 [09:04<02:15, 33.97s/it]

✅ Scored: 2005V.for.Vendetta chunk 1
✅ Scored: 2005V.for.Vendetta chunk 2
✅ Scored: 2005V.for.Vendetta chunk 3
✅ Scored: 2005V.for.Vendetta chunk 4
✅ Scored: 2005V.for.Vendetta chunk 5
✅ Scored: 2005V.for.Vendetta chunk 6
✅ Scored: 2005V.for.Vendetta chunk 7
✅ Scored: 2005V.for.Vendetta chunk 8
✅ Scored: 2005V.for.Vendetta chunk 9
✅ Scored: 2005V.for.Vendetta chunk 10
✅ Scored: 2005V.for.Vendetta chunk 11
✅ Scored: 2005V.for.Vendetta chunk 12
✅ Scored: 2005V.for.Vendetta chunk 13
✅ Scored: 2005V.for.Vendetta chunk 14
✅ Scored: 2005V.for.Vendetta chunk 15
✅ Scored: 2005V.for.Vendetta chunk 16


Scoring good_vs_evil (OpenAI):  85%|██████████▏ | 17/20 [09:53<01:55, 38.39s/it]

✅ Scored: 2017Paddington.2 chunk 1
✅ Scored: 2017Paddington.2 chunk 2
✅ Scored: 2017Paddington.2 chunk 3
✅ Scored: 2017Paddington.2 chunk 4
✅ Scored: 2017Paddington.2 chunk 5
✅ Scored: 2017Paddington.2 chunk 6
✅ Scored: 2017Paddington.2 chunk 7
✅ Scored: 2017Paddington.2 chunk 8
✅ Scored: 2017Paddington.2 chunk 9
✅ Scored: 2017Paddington.2 chunk 10
✅ Scored: 2017Paddington.2 chunk 11
✅ Scored: 2017Paddington.2 chunk 12


Scoring good_vs_evil (OpenAI):  90%|██████████▊ | 18/20 [10:30<01:15, 37.96s/it]

✅ Scored: 1985Back.To.The.Future chunk 1
✅ Scored: 1985Back.To.The.Future chunk 2
✅ Scored: 1985Back.To.The.Future chunk 3
✅ Scored: 1985Back.To.The.Future chunk 4
✅ Scored: 1985Back.To.The.Future chunk 5
✅ Scored: 1985Back.To.The.Future chunk 6
✅ Scored: 1985Back.To.The.Future chunk 7
✅ Scored: 1985Back.To.The.Future chunk 8
✅ Scored: 1985Back.To.The.Future chunk 9
✅ Scored: 1985Back.To.The.Future chunk 10
✅ Scored: 1985Back.To.The.Future chunk 11


Scoring good_vs_evil (OpenAI):  95%|███████████▍| 19/20 [11:08<00:38, 38.03s/it]

✅ Scored: 2013The.Purge chunk 1
✅ Scored: 2013The.Purge chunk 2
✅ Scored: 2013The.Purge chunk 3
✅ Scored: 2013The.Purge chunk 4
✅ Scored: 2013The.Purge chunk 5
✅ Scored: 2013The.Purge chunk 6
✅ Scored: 2013The.Purge chunk 7


Scoring good_vs_evil (OpenAI): 100%|████████████| 20/20 [11:30<00:00, 34.51s/it]


In [21]:
import os
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict, Counter

# === Paths ===
base_path = Path.home() / "Desktop" / "Benchmark"
scored_dir = base_path / "scored_good_vs_evil_openai"  # per-chunk JSON results
output_csv_path = base_path / "model_good_vs_evil_output_openai.csv"  # one row per movie

# === Aggregate Results ===
aggregated_results = []

# Group chunk files by movie. The movie id is everything before the first
# "_chunk" in the file name.
movie_files = defaultdict(list)
for file in scored_dir.glob("*_chunk*_good_vs_evil.json"):
    movie_id = file.name.split("_chunk")[0]
    movie_files[movie_id].append(file)

# Process each movie's chunks. Movies and chunks are visited in sorted order
# because glob order depends on the filesystem; without sorting, the CSV row
# order, the order of the explanations and the winner of a tied vote could
# change from one run to the next.
for movie_id, files in sorted(movie_files.items()):
    # Files of one movie are expected to differ only in their chunk number, so
    # (name length, name) orders them numerically (chunk2 before chunk10).
    files.sort(key=lambda path: (len(path.name), path.name))

    label_counts = Counter()               # number of chunks voting for each label
    confidence_sums = defaultdict(float)   # summed confidence per label
    confidence_counts = defaultdict(int)   # number of confidences summed per label
    explanations = []                      # one "label (conf): text" entry per chunk

    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            try:
                # === Handle optional triple-backtick wrapping (from OpenAI JSON completions) ===
                raw = f.read().strip()
                if raw.startswith("```json"):
                    raw = raw[7:].strip()
                if raw.startswith("```"):
                    raw = raw[3:].strip()
                if raw.endswith("```"):
                    raw = raw[:-3].strip()

                data = json.loads(raw)

                label = data.get("good_vs_evil", "").strip().lower()
                confidence = float(data.get("confidence", 0.0))
                explanation = data.get("explanation", "")
            except Exception as e:
                # Best effort: report the unusable chunk and keep aggregating.
                print(f"⚠️ Error in file {file.name}: {e}")
                continue

        if not label:
            # A chunk without a label must not become an "" vote candidate.
            print(f"⚠️ Missing label in file {file.name}")
            continue

        label_counts[label] += 1
        confidence_sums[label] += confidence
        confidence_counts[label] += 1
        explanations.append(f"{label} ({confidence:.2f}): {explanation}")

    # Compute average confidence per label
    avg_confidences = {
        label: round(confidence_sums[label] / confidence_counts[label], 3)
        for label in label_counts
    }

    # Weighted vote = label with highest total confidence (None when no chunk
    # of this movie produced a usable label). Ties go to the label seen first.
    weighted_vote = max(confidence_sums.items(), key=lambda x: x[1])[0] if confidence_sums else None

    aggregated_results.append({
        "subtitle_filename": movie_id,
        "label_counts": dict(label_counts),
        "avg_confidences": avg_confidences,
        "weighted_vote": weighted_vote,
        "explanations": explanations
    })

# === Save to CSV ===
df = pd.DataFrame(aggregated_results)
df.to_csv(output_csv_path, index=False)
print(f"✅ Saved aggregated results to: {output_csv_path}")


✅ Saved aggregated results to: /Users/cedricroetheli/Desktop/Benchmark/model_good_vs_evil_output_openai.csv


In [23]:
import pandas as pd
from pathlib import Path

# === Input and output locations for the good_vs_evil evaluation ===
base = Path.home().joinpath("Desktop", "Benchmark")
truth_path = base.joinpath("benchmark_final.csv")  # benchmark labels
model_path = base.joinpath("model_good_vs_evil_output_openai.csv")  # aggregated model votes
output_path = base.joinpath("good_vs_evil_evaluation_openai.csv")  # per-movie evaluation report

# === Read the benchmark table first, then the model table ===
truth_df, model_df = (pd.read_csv(csv_file) for csv_file in (truth_path, model_path))

# === Trim surrounding whitespace from the key column of both tables ===
for frame in (truth_df, model_df):
    frame["subtitle_filename"] = frame["subtitle_filename"].str.strip()

# === Label Mapping for Consistency ===
# Maps every accepted spelling to its canonical label.
label_map = {
    "clear": "clear",
    "complicated": "complicated",
    "complex": "complicated",  # 'complex' is treated as an alias of 'complicated'
    "neutral": "neutral"
}

def normalize_label(label):
    """Return the canonical form of a single label.

    Missing values (None / NaN) become "". Any other value is converted to
    text, lower-cased and stripped, then looked up in ``label_map``; labels
    that are not in the map are returned in their cleaned form.
    """
    if pd.isna(label):
        return ""
    label = str(label).lower().strip()
    return label_map.get(label, label)

def normalize_set(label_str):
    """Split a "|"-separated label string into a set of canonical labels.

    A missing value (None / NaN) yields an empty set. Empty segments, as
    produced by inputs such as "a|" or "a||b", are dropped: otherwise the set
    would contain "", which is also what ``normalize_label`` returns for a
    missing vote, and a missing vote would then be counted as a match.
    """
    if pd.isna(label_str):
        return set()
    cleaned = (normalize_label(part) for part in str(label_str).split("|"))
    return {label for label in cleaned if label}

# === Apply normalization ===
# The benchmark column may list several accepted labels separated by "|",
# the model column holds a single label.
truth_df["good_vs_evil_set"] = truth_df["good_vs_evil"].apply(normalize_set)
model_df["normalized_vote"] = model_df["weighted_vote"].apply(normalize_label)

# === Merge and Evaluate ===
# Inner join: only movies present in both tables are evaluated, so report any
# model rows that drop out instead of silently shrinking the denominator.
n_unmatched = int((~model_df["subtitle_filename"].isin(truth_df["subtitle_filename"])).sum())
if n_unmatched:
    print(f"⚠️ {n_unmatched} model row(s) have no benchmark entry and are not evaluated")

merged_df = pd.merge(model_df, truth_df, on="subtitle_filename", how="inner")

# A prediction is correct when it is one of the accepted benchmark labels.
# Built with zip rather than DataFrame.apply(axis=1): on an empty merge, apply
# returns a DataFrame and the column assignment fails before the summary runs.
merged_df["is_correct"] = [
    vote in accepted
    for vote, accepted in zip(merged_df["normalized_vote"], merged_df["good_vs_evil_set"])
]

# === Output CSV ===
evaluation_df = merged_df[[
    "subtitle_filename", "good_vs_evil", "normalized_vote", "is_correct"
]].copy()
evaluation_df.columns = ["movie", "benchmark_good_vs_evil", "model_good_vs_evil", "is_correct"]
evaluation_df.to_csv(output_path, index=False)

# === Summary ===
total = len(evaluation_df)
correct = evaluation_df["is_correct"].sum()
accuracy = correct / total if total else 0  # avoid dividing by zero on an empty merge

print(f"🎯 Evaluation complete:")
print(f"✅ Correct: {correct}/{total}")
print(f"📊 Accuracy: {accuracy:.2%}")
print(f"📁 Saved to: {output_path}")


🎯 Evaluation complete:
✅ Correct: 15/20
📊 Accuracy: 75.00%
📁 Saved to: /Users/cedricroetheli/Desktop/Benchmark/good_vs_evil_evaluation_openai.csv


In [25]:
import os
import json
import time
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI

# === OpenAI Setup ===
# The key is taken from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into the notebook. The literal placeholder is kept as
# a fallback so the cell still runs when the variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "api-key"))
model_name = "gpt-4o"

# === Paths ===
base = Path.home() / "Desktop" / "Benchmark"
subs_dir = base / "json_benchmark"                       # <movie>.json subtitle files
summaries_dir = base / "summaries"                       # <movie>.srt_summary.txt plot summaries
output_dir = base / "scored_progressive_women_openai"    # per-chunk model answers
prompts_dir = base / "prompts_progressive_women_openai"  # prompts sent for each chunk

# Create the output folders up front; existing folders are left untouched.
output_dir.mkdir(parents=True, exist_ok=True)
prompts_dir.mkdir(parents=True, exist_ok=True)

# One row per movie; the scoring loop reads its "subtitle_filename" column.
matches_df = pd.read_csv(base / "matches_benchmark.csv")

# === Prompt Template ===
def create_progressive_women_prompt(summary, chunk):
    """Build the progressive-women classification prompt for one dialogue chunk.

    Args:
        summary: Plot summary text, inserted under "Film Summary:".
        chunk: Dialogue excerpt, inserted under "Dialogue:".

    Returns:
        The complete prompt with leading and trailing whitespace removed.
    """
    prompt_text = f"""
You are evaluating how progressively women are portrayed in a film based on its plot summary and dialogue.

Your task is to classify the film into one of the following categories based on the roles, agency, and representation of women:

- **Yes**: The film offers a clearly progressive portrayal of women. Female characters are active agents, hold central narrative roles, and are treated with depth and respect.
- **Mixed**: The portrayal includes both progressive and stereotypical elements. Some female characters may show agency, while others may be sidelined or underdeveloped.
- **No**: The film does not offer a progressive portrayal. Female characters are absent, objectified, stereotyped, or lack meaningful agency.

Please:
1. Choose the best label: "yes", "mixed", or "no"
2. Provide a short justification (1–2 sentences)
3. Include a confidence score between 0.0 and 1.0

Return your answer in this exact JSON format:

{{
  "progressive_women": "...",
  "confidence": ...,
  "explanation": "..."
}}

Film Summary:
\"\"\"{summary}\"\"\"

Dialogue:
\"\"\"{chunk}\"\"\"
"""
    # Drop the blank line that follows the opening quotes and the final newline.
    return prompt_text.strip()

# === OpenAI API Call ===
def call_openai(prompt):
    """Send *prompt* to the chat model and return its reply parsed as JSON.

    Uses the module-level ``client`` and ``model_name``. The request is made
    with temperature 0 and at most 500 completion tokens.

    Args:
        prompt: The user message to send.

    Returns:
        The decoded JSON value, or None when the request fails or the reply is
        not valid JSON (the error is printed).
    """
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are a careful film analyst. Always return exactly and only the required JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=500
        )
        content = response.choices[0].message.content.strip()

        # Remove a wrapping Markdown code fence if present. str.strip(chars)
        # treats its argument as a set of characters rather than a prefix, so
        # the fence is matched explicitly with startswith/endswith instead.
        if content.startswith("```"):
            content = content[3:]
            if content[:4].lower() == "json":
                content = content[4:]
        if content.endswith("```"):
            content = content[:-3]

        return json.loads(content.strip())
    except Exception as e:
        # Boundary handler: the scoring loop treats None as "chunk failed".
        print(f"⚠️ OpenAI error: {e}")
        return None

# === Subtitle Chunking ===
def chunk_dialogue(subs, chunk_size=5000, overlap=200):
    """Join subtitle lines into one string and cut it into overlapping chunks.

    Args:
        subs: Iterable of mapping-like subtitle entries; entries whose
            "text" value is missing or empty are ignored.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        List of chunk strings in reading order; empty when there is no text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            in ``[0, chunk_size)``; such values would stop the window from
            advancing (endless loop) or would skip text.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError("chunk_size must be > 0 and 0 <= overlap < chunk_size")

    text_blocks = [line.get("text", "").strip() for line in subs if line.get("text")]
    full_text = " ".join(text_blocks)

    chunks = []
    step = chunk_size - overlap
    start = 0
    while start < len(full_text):
        end = start + chunk_size
        chunks.append(full_text[start:end])
        if end >= len(full_text):
            # This chunk already reaches the end of the text. Stepping back by
            # `overlap` would only emit a tail that is fully contained in it.
            break
        start += step
    return chunks

# === Scoring Loop ===
# Score every dialogue chunk of every film listed in the match table.
# A chunk whose result file already exists is skipped, so the cell can be
# re-run after an interruption.
for _, row in tqdm(matches_df.iterrows(), total=len(matches_df), desc="Scoring progressive_women (OpenAI)"):
    filename = row["subtitle_filename"]
    json_path = subs_dir / f"{filename}.json"
    summary_path = summaries_dir / f"{filename}.srt_summary.txt"

    # Both the subtitle JSON and the summary are needed to build a prompt.
    if not (json_path.exists() and summary_path.exists()):
        print(f"⚠️ Missing files for: {filename}")
        continue

    subs = json.loads(json_path.read_text(encoding="utf-8"))
    summary = summary_path.read_text(encoding="utf-8").strip()

    chunks = chunk_dialogue(subs)

    for chunk_no, chunk in enumerate(chunks, start=1):
        output_path = output_dir / f"{filename}_chunk{chunk_no}_progressive_women.json"
        prompt_path = prompts_dir / f"{filename}_chunk{chunk_no}_progressive_women_prompt.txt"

        if output_path.exists():
            print(f"🟡 Already scored: {filename} chunk {chunk_no}")
            continue

        prompt = create_progressive_women_prompt(summary, chunk)

        # The prompt is stored before the request is made, so it is kept
        # even when the call fails.
        prompt_path.write_text(prompt, encoding="utf-8")

        result = call_openai(prompt)
        if not result:
            print(f"❌ Failed: {filename} chunk {chunk_no}")
        else:
            with output_path.open("w", encoding="utf-8") as out_file:
                json.dump(result, out_file, indent=2)
            print(f"✅ Scored: {filename} chunk {chunk_no}")

        # Pause between requests.
        time.sleep(1.5)


Scoring progressive_women (OpenAI):   0%|                | 0/20 [00:00<?, ?it/s]

✅ Scored: 2006Blood.Diamond chunk 1
✅ Scored: 2006Blood.Diamond chunk 2
✅ Scored: 2006Blood.Diamond chunk 3
✅ Scored: 2006Blood.Diamond chunk 4
✅ Scored: 2006Blood.Diamond chunk 5
✅ Scored: 2006Blood.Diamond chunk 6
✅ Scored: 2006Blood.Diamond chunk 7
✅ Scored: 2006Blood.Diamond chunk 8
✅ Scored: 2006Blood.Diamond chunk 9
✅ Scored: 2006Blood.Diamond chunk 10
✅ Scored: 2006Blood.Diamond chunk 11


Scoring progressive_women (OpenAI):   5%|▍       | 1/20 [00:51<16:21, 51.68s/it]

✅ Scored: 2005The.Constant.Gardener chunk 1
✅ Scored: 2005The.Constant.Gardener chunk 2
✅ Scored: 2005The.Constant.Gardener chunk 3
✅ Scored: 2005The.Constant.Gardener chunk 4
✅ Scored: 2005The.Constant.Gardener chunk 5
✅ Scored: 2005The.Constant.Gardener chunk 6
✅ Scored: 2005The.Constant.Gardener chunk 7
✅ Scored: 2005The.Constant.Gardener chunk 8
✅ Scored: 2005The.Constant.Gardener chunk 9
✅ Scored: 2005The.Constant.Gardener chunk 10
✅ Scored: 2005The.Constant.Gardener chunk 11
✅ Scored: 2005The.Constant.Gardener chunk 12
✅ Scored: 2005The.Constant.Gardener chunk 13


Scoring progressive_women (OpenAI):  10%|▊       | 2/20 [01:49<16:34, 55.23s/it]

✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 1
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 2
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 3
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 4
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 5
✅ Scored: 1981Indiana.Jones.And.The.Raiders.Of.The.Lost.Ark chunk 6


Scoring progressive_women (OpenAI):  15%|█▏      | 3/20 [02:16<11:57, 42.19s/it]

✅ Scored: 2009Avatar chunk 1
✅ Scored: 2009Avatar chunk 2
✅ Scored: 2009Avatar chunk 3
✅ Scored: 2009Avatar chunk 4
✅ Scored: 2009Avatar chunk 5
✅ Scored: 2009Avatar chunk 6
✅ Scored: 2009Avatar chunk 7
✅ Scored: 2009Avatar chunk 8
✅ Scored: 2009Avatar chunk 9
✅ Scored: 2009Avatar chunk 10
✅ Scored: 2009Avatar chunk 11


Scoring progressive_women (OpenAI):  20%|█▌      | 4/20 [02:54<10:48, 40.53s/it]

✅ Scored: 2012The.Hunger.Games chunk 1
✅ Scored: 2012The.Hunger.Games chunk 2
✅ Scored: 2012The.Hunger.Games chunk 3
✅ Scored: 2012The.Hunger.Games chunk 4
✅ Scored: 2012The.Hunger.Games chunk 5
✅ Scored: 2012The.Hunger.Games chunk 6
✅ Scored: 2012The.Hunger.Games chunk 7
✅ Scored: 2012The.Hunger.Games chunk 8


Scoring progressive_women (OpenAI):  25%|██      | 5/20 [03:22<09:03, 36.26s/it]

✅ Scored: 1984Ghostbusters chunk 1
✅ Scored: 1984Ghostbusters chunk 2
✅ Scored: 1984Ghostbusters chunk 3
✅ Scored: 1984Ghostbusters chunk 4
✅ Scored: 1984Ghostbusters chunk 5
✅ Scored: 1984Ghostbusters chunk 6
✅ Scored: 1984Ghostbusters chunk 7
✅ Scored: 1984Ghostbusters chunk 8
✅ Scored: 1984Ghostbusters chunk 9
✅ Scored: 1984Ghostbusters chunk 10


Scoring progressive_women (OpenAI):  30%|██▍     | 6/20 [04:01<08:37, 36.95s/it]

✅ Scored: 1978Superman chunk 1
✅ Scored: 1978Superman chunk 2
✅ Scored: 1978Superman chunk 3
✅ Scored: 1978Superman chunk 4
✅ Scored: 1978Superman chunk 5
✅ Scored: 1978Superman chunk 6
✅ Scored: 1978Superman chunk 7
✅ Scored: 1978Superman chunk 8
✅ Scored: 1978Superman chunk 9
✅ Scored: 1978Superman chunk 10
✅ Scored: 1978Superman chunk 11


Scoring progressive_women (OpenAI):  35%|██▊     | 7/20 [04:40<08:10, 37.73s/it]

✅ Scored: 2008The.Hurt.Locker chunk 1
✅ Scored: 2008The.Hurt.Locker chunk 2
✅ Scored: 2008The.Hurt.Locker chunk 3
✅ Scored: 2008The.Hurt.Locker chunk 4
✅ Scored: 2008The.Hurt.Locker chunk 5
✅ Scored: 2008The.Hurt.Locker chunk 6
✅ Scored: 2008The.Hurt.Locker chunk 7
✅ Scored: 2008The.Hurt.Locker chunk 8
✅ Scored: 2008The.Hurt.Locker chunk 9


Scoring progressive_women (OpenAI):  40%|███▏    | 8/20 [05:16<07:25, 37.15s/it]

✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 1
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 2
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 3
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 4
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 5
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 6
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 7
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 8
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 9
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 10
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 11
✅ Scored: 1977Star.Wars.Episode.IV.-.A.New.Hope chunk 12


Scoring progressive_women (OpenAI):  45%|███▌    | 9/20 [06:07<07:35, 41.45s/it]

✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 1
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 2
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 3
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 4
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 5
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 6
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 7
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 8
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 9
✅ Scored: 2003Pirates.of.the.Caribbean.The.Curse.of.the.Black.Pearl chunk 10


Scoring progressive_women (OpenAI):  50%|███▌   | 10/20 [06:53<07:08, 42.80s/it]

✅ Scored: 2018Black.Panther chunk 1
✅ Scored: 2018Black.Panther chunk 2
✅ Scored: 2018Black.Panther chunk 3
✅ Scored: 2018Black.Panther chunk 4
✅ Scored: 2018Black.Panther chunk 5
✅ Scored: 2018Black.Panther chunk 6
✅ Scored: 2018Black.Panther chunk 7
✅ Scored: 2018Black.Panther chunk 8
✅ Scored: 2018Black.Panther chunk 9
✅ Scored: 2018Black.Panther chunk 10


Scoring progressive_women (OpenAI):  55%|███▊   | 11/20 [07:25<05:55, 39.54s/it]

✅ Scored: 2021Dont.Look.Up chunk 1
✅ Scored: 2021Dont.Look.Up chunk 2
✅ Scored: 2021Dont.Look.Up chunk 3
✅ Scored: 2021Dont.Look.Up chunk 4
✅ Scored: 2021Dont.Look.Up chunk 5
✅ Scored: 2021Dont.Look.Up chunk 6
✅ Scored: 2021Dont.Look.Up chunk 7
✅ Scored: 2021Dont.Look.Up chunk 8
✅ Scored: 2021Dont.Look.Up chunk 9
✅ Scored: 2021Dont.Look.Up chunk 10
✅ Scored: 2021Dont.Look.Up chunk 11
✅ Scored: 2021Dont.Look.Up chunk 12
✅ Scored: 2021Dont.Look.Up chunk 13
✅ Scored: 2021Dont.Look.Up chunk 14
✅ Scored: 2021Dont.Look.Up chunk 15
✅ Scored: 2021Dont.Look.Up chunk 16
✅ Scored: 2021Dont.Look.Up chunk 17


Scoring progressive_women (OpenAI):  60%|████▏  | 12/20 [08:32<06:22, 47.86s/it]

✅ Scored: 1982First.Blood chunk 1
✅ Scored: 1982First.Blood chunk 2
✅ Scored: 1982First.Blood chunk 3
✅ Scored: 1982First.Blood chunk 4
✅ Scored: 1982First.Blood chunk 5
✅ Scored: 1982First.Blood chunk 6
✅ Scored: 1982First.Blood chunk 7


Scoring progressive_women (OpenAI):  65%|████▌  | 13/20 [08:52<04:36, 39.52s/it]

✅ Scored: 2019Joker chunk 1
✅ Scored: 2019Joker chunk 2
✅ Scored: 2019Joker chunk 3
✅ Scored: 2019Joker chunk 4
✅ Scored: 2019Joker chunk 5
✅ Scored: 2019Joker chunk 6
✅ Scored: 2019Joker chunk 7
✅ Scored: 2019Joker chunk 8
✅ Scored: 2019Joker chunk 9


Scoring progressive_women (OpenAI):  70%|████▉  | 14/20 [09:20<03:35, 35.97s/it]

✅ Scored: 2006Night.at.the.Museum chunk 1
✅ Scored: 2006Night.at.the.Museum chunk 2
✅ Scored: 2006Night.at.the.Museum chunk 3
✅ Scored: 2006Night.at.the.Museum chunk 4
✅ Scored: 2006Night.at.the.Museum chunk 5
✅ Scored: 2006Night.at.the.Museum chunk 6
✅ Scored: 2006Night.at.the.Museum chunk 7
✅ Scored: 2006Night.at.the.Museum chunk 8
✅ Scored: 2006Night.at.the.Museum chunk 9
✅ Scored: 2006Night.at.the.Museum chunk 10


Scoring progressive_women (OpenAI):  75%|█████▎ | 15/20 [09:57<03:01, 36.35s/it]

✅ Scored: 1976Rocky.I chunk 1
✅ Scored: 1976Rocky.I chunk 2
✅ Scored: 1976Rocky.I chunk 3
✅ Scored: 1976Rocky.I chunk 4
✅ Scored: 1976Rocky.I chunk 5
✅ Scored: 1976Rocky.I chunk 6
✅ Scored: 1976Rocky.I chunk 7
✅ Scored: 1976Rocky.I chunk 8
✅ Scored: 1976Rocky.I chunk 9
✅ Scored: 1976Rocky.I chunk 10
✅ Scored: 1976Rocky.I chunk 11
✅ Scored: 1976Rocky.I chunk 12
✅ Scored: 1976Rocky.I chunk 13


Scoring progressive_women (OpenAI):  80%|█████▌ | 16/20 [10:36<02:28, 37.13s/it]

✅ Scored: 2005V.for.Vendetta chunk 1
✅ Scored: 2005V.for.Vendetta chunk 2
✅ Scored: 2005V.for.Vendetta chunk 3
✅ Scored: 2005V.for.Vendetta chunk 4
✅ Scored: 2005V.for.Vendetta chunk 5
✅ Scored: 2005V.for.Vendetta chunk 6
✅ Scored: 2005V.for.Vendetta chunk 7
✅ Scored: 2005V.for.Vendetta chunk 8
✅ Scored: 2005V.for.Vendetta chunk 9
✅ Scored: 2005V.for.Vendetta chunk 10
✅ Scored: 2005V.for.Vendetta chunk 11
✅ Scored: 2005V.for.Vendetta chunk 12
✅ Scored: 2005V.for.Vendetta chunk 13
✅ Scored: 2005V.for.Vendetta chunk 14
✅ Scored: 2005V.for.Vendetta chunk 15
✅ Scored: 2005V.for.Vendetta chunk 16


Scoring progressive_women (OpenAI):  85%|█████▉ | 17/20 [11:30<02:07, 42.38s/it]

✅ Scored: 2017Paddington.2 chunk 1
✅ Scored: 2017Paddington.2 chunk 2
✅ Scored: 2017Paddington.2 chunk 3
✅ Scored: 2017Paddington.2 chunk 4
✅ Scored: 2017Paddington.2 chunk 5
✅ Scored: 2017Paddington.2 chunk 6
✅ Scored: 2017Paddington.2 chunk 7
✅ Scored: 2017Paddington.2 chunk 8
✅ Scored: 2017Paddington.2 chunk 9
✅ Scored: 2017Paddington.2 chunk 10
✅ Scored: 2017Paddington.2 chunk 11
✅ Scored: 2017Paddington.2 chunk 12


Scoring progressive_women (OpenAI):  90%|██████▎| 18/20 [12:18<01:27, 43.84s/it]

✅ Scored: 1985Back.To.The.Future chunk 1
✅ Scored: 1985Back.To.The.Future chunk 2
✅ Scored: 1985Back.To.The.Future chunk 3
✅ Scored: 1985Back.To.The.Future chunk 4
✅ Scored: 1985Back.To.The.Future chunk 5
✅ Scored: 1985Back.To.The.Future chunk 6
✅ Scored: 1985Back.To.The.Future chunk 7
✅ Scored: 1985Back.To.The.Future chunk 8
✅ Scored: 1985Back.To.The.Future chunk 9
✅ Scored: 1985Back.To.The.Future chunk 10
✅ Scored: 1985Back.To.The.Future chunk 11


Scoring progressive_women (OpenAI):  95%|██████▋| 19/20 [13:02<00:43, 43.87s/it]

✅ Scored: 2013The.Purge chunk 1
✅ Scored: 2013The.Purge chunk 2
✅ Scored: 2013The.Purge chunk 3
✅ Scored: 2013The.Purge chunk 4
✅ Scored: 2013The.Purge chunk 5
✅ Scored: 2013The.Purge chunk 6
✅ Scored: 2013The.Purge chunk 7


Scoring progressive_women (OpenAI): 100%|███████| 20/20 [13:23<00:00, 40.18s/it]


In [27]:
import os
import json
import pandas as pd
from pathlib import Path
from collections import defaultdict, Counter

# === Paths ===
base_path = Path.home() / "Desktop" / "Benchmark"
scored_dir = base_path / "scored_progressive_women_openai"
output_csv_path = base_path / "model_progressive_women_output_openai.csv"


def _chunk_number(path):
    """Return the chunk number encoded in a scored file name (0 if absent)."""
    tail = path.name.rsplit("_chunk", 1)[-1]      # e.g. "12_progressive_women.json"
    digits = tail.split("_", 1)[0]
    return int(digits) if digits.isdecimal() else 0


# === Aggregate Results ===
# One row per movie: label counts, mean confidence per label, the
# confidence-weighted winning label and the per-chunk explanations.
aggregated_results = []

# Group chunk files by movie. glob() order depends on the file system, so the
# matches are sorted to make the output reproducible. rsplit keeps movie ids
# intact even if they contain "_chunk" themselves.
movie_files = defaultdict(list)
for file in sorted(scored_dir.glob("*_chunk*_progressive_women.json")):
    movie_id = file.name.rsplit("_chunk", 1)[0]
    movie_files[movie_id].append(file)

# Process each movie's chunks
for movie_id, files in movie_files.items():
    label_counts = Counter()
    confidence_sums = defaultdict(float)
    confidence_counts = defaultdict(int)
    explanations = []

    # Numeric chunk order (chunk2 before chunk10) keeps the explanations in
    # reading order and makes tie-breaking of the vote deterministic.
    for file in sorted(files, key=_chunk_number):
        with open(file, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                raw_label = data.get("progressive_women")
                if not isinstance(raw_label, str) or not raw_label.strip():
                    # A reply without a label must not take part in the vote
                    # as an empty-string "label".
                    print(f"⚠️ No label in file {file.name}; skipped")
                    continue
                label = raw_label.strip().lower()
                confidence = float(data.get("confidence", 0.0))
                explanation = data.get("explanation", "")

                label_counts[label] += 1
                confidence_sums[label] += confidence
                confidence_counts[label] += 1
                explanations.append(f"{label} ({confidence:.2f}): {explanation}")

            except Exception as e:
                # Best effort: a malformed chunk file is reported and ignored.
                print(f"⚠️ Error in file {file.name}: {e}")

    # Compute average confidence per label
    avg_confidences = {
        label: round(confidence_sums[label] / confidence_counts[label], 3)
        for label in label_counts
    }

    # Weighted vote = label with highest total confidence (None when no chunk
    # of the movie could be read).
    weighted_vote = max(confidence_sums.items(), key=lambda x: x[1])[0] if confidence_sums else None

    aggregated_results.append({
        "subtitle_filename": movie_id,
        "label_counts": dict(label_counts),
        "avg_confidences": avg_confidences,
        "weighted_vote": weighted_vote,
        "explanations": explanations
    })

# === Save to CSV ===
df = pd.DataFrame(aggregated_results)
df.to_csv(output_csv_path, index=False)
print(f"✅ Saved aggregated results to: {output_csv_path}")


✅ Saved aggregated results to: /Users/cedricroetheli/Desktop/Benchmark/model_progressive_women_output_openai.csv


In [29]:
import pandas as pd
from pathlib import Path

# === Paths ===
base = Path.home() / "Desktop" / "Benchmark"
truth_path = base.joinpath("benchmark_final.csv")                       # benchmark labels
model_path = base.joinpath("model_progressive_women_output_openai.csv")  # aggregated model votes
output_path = base.joinpath("progressive_women_evaluation_openai.csv")   # per-movie comparison

# === Load Data ===
truth_df, model_df = pd.read_csv(truth_path), pd.read_csv(model_path)

# === Normalize filenames ===
# Strip surrounding whitespace from the join key in both tables so the merge
# below matches on clean values.
for frame in (truth_df, model_df):
    frame["subtitle_filename"] = frame["subtitle_filename"].str.strip()

# === Label Mapping for Consistency ===
# Maps accepted spellings onto the three canonical labels.
label_map = {
    "yes": "yes",
    "no": "no",
    "mixed": "mixed",
    "partially": "mixed",
    "partly": "mixed"
}

def normalize_label(label):
    """Return the canonical lower-case form of a single label.

    Args:
        label: Raw label value; may be missing (NaN/None) or a non-string
            value read from a CSV cell.

    Returns:
        "" for a missing value; otherwise the trimmed, lower-cased label,
        mapped through ``label_map`` when it is a known synonym.
    """
    if pd.isna(label):
        return ""
    # str() keeps non-string cell values from raising AttributeError.
    label = str(label).lower().strip()
    return label_map.get(label, label)

def normalize_set(label_str):
    """Split a "|"-separated label string into a set of canonical labels.

    Empty tokens (as in "yes|" or " | ") are dropped. Otherwise "" would be a
    member of the truth set and a missing model vote, which normalises to "",
    would be scored as a match.

    Args:
        label_str: Raw benchmark cell, e.g. "yes|mixed"; may be missing.

    Returns:
        Set of canonical labels; empty for a missing or blank value.
    """
    if pd.isna(label_str):
        return set()
    normalized = (normalize_label(part) for part in str(label_str).split("|"))
    return {label for label in normalized if label}

# Canonical forms used for the comparison: a set of accepted labels per
# benchmark row, a single label per model row.
truth_df["progressive_women_set"] = truth_df["progressive_women"].apply(normalize_set)
model_df["normalized_vote"] = model_df["weighted_vote"].apply(normalize_label)

# === Merge and Evaluate ===
# Inner join: only movies present in both tables are evaluated.
merged_df = model_df.merge(truth_df, on="subtitle_filename", how="inner")


def vote_in_truth(row):
    """Return True when the model's label is one of the accepted labels."""
    return row["normalized_vote"] in row["progressive_women_set"]


merged_df["is_correct"] = merged_df.apply(vote_in_truth, axis=1)

# === Output CSV ===
column_names = {
    "subtitle_filename": "movie",
    "progressive_women": "benchmark_progressive_women",
    "normalized_vote": "model_progressive_women",
}
evaluation_df = merged_df[list(column_names) + ["is_correct"]].rename(columns=column_names)
evaluation_df.to_csv(output_path, index=False)

# === Summary ===
total = len(evaluation_df)
correct = evaluation_df["is_correct"].sum()
if total:
    accuracy = correct / total
else:
    # Nothing matched: report 0 instead of dividing by zero.
    accuracy = 0

print("🎯 Evaluation complete:")
print(f"✅ Correct: {correct}/{total}")
print(f"📊 Accuracy: {accuracy:.2%}")
print(f"📁 Saved to: {output_path}")


🎯 Evaluation complete:
✅ Correct: 15/20
📊 Accuracy: 75.00%
📁 Saved to: /Users/cedricroetheli/Desktop/Benchmark/progressive_women_evaluation_openai.csv
