In [None]:
import os
import json
import time
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === Claude API setup ===
# The key is read from the ANTHROPIC_API_KEY environment variable when it is
# set, so the secret does not have to be stored in the notebook itself. The
# literal below is only a placeholder fallback.
api_key = os.environ.get("ANTHROPIC_API_KEY", "api-key")  # 🔒 Insert your key
api_url = "https://api.anthropic.com/v1/messages"
model_name = "claude-3-haiku-20240307"

# Headers sent with every request to the Messages endpoint.
headers = {
    "x-api-key": api_key,
    "anthropic-version": "2023-06-01",
    "content-type": "application/json",
}

# === Paths ===
# Everything lives under ~/Desktop/Benchmark.
base = Path.home() / "Desktop" / "Benchmark"
subs_dir = base / "json_benchmark"      # input: subtitle JSON files
summaries_dir = base / "summaries"      # input: plot summaries (.txt)
output_dir = base / "scored_themes"     # output: one JSON result per chunk
prompts_dir = base / "prompts_themes"   # output: the exact prompt sent per chunk

# Only the output folders are created; the input folders must already exist.
output_dir.mkdir(parents=True, exist_ok=True)
prompts_dir.mkdir(parents=True, exist_ok=True)

# === Load match file ===
# Read relative to the current working directory; the main loop uses its
# "subtitle_filename" column.
matches_df = pd.read_csv("matches_benchmark.csv")

# === Prompt template ===
def create_theme_prompt(summary, chunk):
    """Build the theme-extraction prompt for one dialogue chunk.

    Args:
        summary: Plot summary text, inserted under "Film Summary:".
        chunk: Dialogue excerpt, inserted under "Dialogue:".

    Returns:
        A single string holding the task description, the three theme
        definitions, the numbered instructions, the required JSON answer
        format, and finally the summary and dialogue, each wrapped in
        triple double quotes.
    """
    pieces = [
        # Task framing.
        "You are analyzing a film based on its dialogue and a summary of its plot.\n\n",
        "Your task is to identify the ideological, societal, or political themes expressed in the film.\n\n",
        # Definitions of the three theme families.
        "Definitions:\n",
        "- Ideological themes refer to ideas or values associated with a worldview or belief system, such as attitudes toward authority, individual rights, or tradition.\n",
        "- Societal themes reflect cultural norms, social roles, or commentary on how people relate to institutions, communities, or each other.\n",
        "- Political themes involve power structures, governance, laws, rights, or critiques of political institutions or ideologies.\n\n",
        # What the answer must contain.
        "Please:\n",
        "1. Provide a list of 3 to 7 themes in the form of keywords or short phrases.\n",
        "2. Include a confidence score from 0.0 to 1.0 indicating how confident you are in your analysis.\n",
        "3. Provide a short explanation (1–2 plain sentences, no new lines or bullet points).\n\n",
        # Required answer format.
        "Return your answer strictly in the following JSON format:\n\n",
        "{\n",
        '  "themes": ["...", "...", "..."],\n',
        '  "confidence": ..., \n',
        '  "explanation": "..."\n',
        "}\n\n",
        "All fields must be JSON-safe and use single-line values only. Do not use line breaks, markdown, or bullet points.\n\n",
        # The material to analyse.
        f'Film Summary:\n"""{summary}"""\n\n',
        f'Dialogue:\n"""{chunk}"""',
    ]
    return "".join(pieces)


# === API call ===
def call_claude(prompt, max_retries=5, timeout=120):
    """Send one prompt to the Messages endpoint and return the parsed JSON answer.

    Args:
        prompt: Full user prompt text.
        max_retries: How many times to retry after an HTTP 429 before giving
            up. The request is always attempted at least once.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot hang the whole run.

    Returns:
        The object decoded from the text of the first content block of the
        response, or ``None`` when the request fails, the rate limit
        persists, the status is not 200, or the reply is not valid JSON.
    """
    payload = {
        "model": model_name,
        "max_tokens": 600,
        "temperature": 0,
        "system": "You are a careful film analyst. Always return exactly and only the required JSON.",
        "messages": [{"role": "user", "content": prompt}]
    }

    # Iterate instead of recursing so a long rate-limit episode cannot grow
    # the call stack or retry forever.
    attempts = max(1, max_retries + 1)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.post(
                api_url, headers=headers, json=payload, timeout=timeout
            )
        except requests.RequestException as err:
            # Connection errors and timeouts count as a failed chunk instead
            # of aborting the whole batch.
            print(f"❌ Request failed: {err}")
            return None

        if response.status_code != 429:
            break
        if attempt == attempts:
            print(f"❌ Rate limit still hit after {max_retries} retries. Giving up.")
            return None
        print("⚠️ Rate limit hit. Waiting...")
        time.sleep(60)

    if response.status_code != 200:
        print(f"❌ API Error: {response.status_code}")
        return None

    try:
        content = response.json()["content"][0]["text"]
        return json.loads(content)
    except (ValueError, KeyError, IndexError, TypeError):
        # ValueError covers JSON decode errors; the others cover a response
        # body that does not have the expected content[0]["text"] shape.
        print(f"⚠️ JSON parsing error:\n{response.text}")
        return None

# === Subtitle chunking ===
def chunk_dialogue(subs, chunk_size=5000, overlap=200):
    """Join subtitle text and split it into overlapping character windows.

    Args:
        subs: Iterable of dict-like subtitle entries; entries whose "text"
            value is missing or empty are skipped.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of strings. Each chunk starts ``chunk_size - overlap``
        characters after the previous one, and the last chunk ends at the
        end of the text. Empty input gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    text_blocks = [line.get("text", "").strip() for line in subs if line.get("text")]
    full_text = " ".join(text_blocks)

    chunks = []
    start = 0
    while start < len(full_text):
        end = start + chunk_size
        chunks.append(full_text[start:end])
        if end >= len(full_text):
            # This chunk already reaches the end of the text. Stepping back
            # by `overlap` would only emit a tail that is a suffix of it.
            break
        start = end - overlap
    return chunks

# === Main loop ===
# Walk every row of the match table, split its subtitles into chunks and
# score each chunk that does not yet have a result file.
progress = tqdm(matches_df.iterrows(), total=len(matches_df), desc="Scoring themes")
for _, row in progress:
    filename = row["subtitle_filename"]
    json_path = subs_dir / f"{filename}.json"
    summary_path = summaries_dir / f"{filename}.srt_summary.txt"

    # Both the subtitle JSON and the summary are required.
    if not (json_path.exists() and summary_path.exists()):
        print(f"⚠️ Missing files for: {filename}")
        continue

    subs = json.loads(json_path.read_text(encoding="utf-8"))
    summary = summary_path.read_text(encoding="utf-8").strip()

    # Chunk numbers in file names and log lines are 1-based.
    for chunk_no, chunk in enumerate(chunk_dialogue(subs), start=1):
        output_path = output_dir / f"{filename}_chunk{chunk_no}_themes.json"
        prompt_path = prompts_dir / f"{filename}_chunk{chunk_no}_themes_prompt.txt"

        # An existing result file means this chunk was handled in an earlier
        # run; skip it without calling the API or sleeping.
        if output_path.exists():
            print(f"🟡 Already scored: {filename} chunk {chunk_no}")
            continue

        prompt = create_theme_prompt(summary, chunk)

        # Save prompt for transparency
        prompt_path.write_text(prompt, encoding="utf-8")

        result = call_claude(prompt)
        if not result:
            print(f"❌ Failed: {filename} chunk {chunk_no}")
        else:
            with open(output_path, "w", encoding="utf-8") as out_file:
                json.dump(result, out_file, indent=2)
            print(f"✅ Scored: {filename} chunk {chunk_no}")

        # Pause between API calls, whether or not this one succeeded.
        time.sleep(1.5)


Scoring themes:   0%|                                    | 0/20 [00:00<?, ?it/s]

✅ Scored: 2006Blood.Diamond chunk 1
✅ Scored: 2006Blood.Diamond chunk 2
✅ Scored: 2006Blood.Diamond chunk 3
✅ Scored: 2006Blood.Diamond chunk 4
