In [3]:
# === IMPORTS ===
import os
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === Paths ===
# Everything this cell reads or writes lives under ~/Desktop/processed_subs.
base_path = Path.home().joinpath("Desktop", "processed_subs")
# Input folder: globbed for *.json scored files further down.
scored_dir = base_path.joinpath("scored_prompts_claude_new")
# Output folder: destination of the aggregated CSVs.
output_dir = base_path.joinpath("aggregated_newprompt")

# Create the output folder (and any missing parents) up front; an already
# existing folder is not an error.
output_dir.mkdir(exist_ok=True, parents=True)

# === Helper functions ===
def extract_year_and_title(filename):
    """Split a movie id such as '2017_Logan' into ``(year, title)``.

    The text before the first underscore is parsed as an integer year; the
    remainder becomes the title, with its underscores turned into spaces.

    Returns:
        ``(year, title)`` on success, or ``(None, filename)`` when there is
        no underscore, the year part is not an integer, or anything else
        goes wrong while parsing.
    """
    try:
        year_text, separator, raw_title = filename.partition("_")
        if not separator:
            # No underscore at all: there is no title part to extract.
            return None, filename
        return int(year_text), raw_title.replace("_", " ")
    except Exception:
        # Best-effort parser: hand the input back unchanged instead of raising.
        return None, filename

def load_scored_file(filepath):
    """Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        filepath: Path of the scored-results JSON file for one movie.

    Returns:
        Whatever the JSON document decodes to.
    """
    with open(filepath, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def compute_unweighted(df):
    """Return the plain (unweighted) mean of each score column.

    Args:
        df: DataFrame holding the ``universalism_score``,
            ``egalitarianism_score`` and ``progress_score`` columns.

    Returns:
        Dict mapping ``universalism``, ``egalitarianism`` and ``progress``
        to the mean of the matching ``*_score`` column.
    """
    score_columns = {
        "universalism": "universalism_score",
        "egalitarianism": "egalitarianism_score",
        "progress": "progress_score",
    }
    averages = {}
    for dimension, column in score_columns.items():
        averages[dimension] = df[column].mean()
    return averages

def compute_weighted(df):
    """Compute confidence-weighted averages of the three score columns.

    Each ``<dimension>_score`` column is averaged using the matching
    ``<dimension>_confidence`` column as weights.

    Args:
        df: DataFrame holding ``*_score`` and ``*_confidence`` columns for
            ``universalism``, ``egalitarianism`` and ``progress``.

    Returns:
        Dict mapping ``universalism``, ``egalitarianism`` and ``progress``
        to their weighted average. When the usable weights of a dimension
        sum to zero, the plain mean of the score column is returned instead.
    """
    def weighted_avg(value_col, conf_col):
        values = df[value_col]
        weights = df[conf_col]

        # Only rows that have BOTH a score and a confidence may contribute.
        # Series.sum() skips NaN, so a row with a missing score but a valid
        # confidence would otherwise vanish from the numerator while still
        # inflating the denominator, dragging the average toward zero.
        usable = values.notna() & weights.notna()
        total_weight = weights[usable].sum()

        if total_weight == 0:
            return values.mean()  # fallback to unweighted
        return (values[usable] * weights[usable]).sum() / total_weight

    return {
        "universalism": weighted_avg("universalism_score", "universalism_confidence"),
        "egalitarianism": weighted_avg("egalitarianism_score", "egalitarianism_confidence"),
        "progress": weighted_avg("progress_score", "progress_confidence"),
    }

# === Main aggregation ===
# One record per movie is collected in each list, then turned into a DataFrame once.
unweighted_records = []
weighted_records = []

# Sort the glob results: directory listing order is arbitrary, and sorting keeps
# the row order of the per-movie CSVs identical from one run to the next.
scored_files = sorted(scored_dir.glob("*.json"))
print(f"Found {len(scored_files)} scored movie files.")

for scored_file in tqdm(scored_files, desc="Aggregating newprompt results"):
    # Strip the "_scored" marker from the stem before parsing year and title.
    movie_id = scored_file.stem.replace("_scored", "")
    year, title = extract_year_and_title(movie_id)

    try:
        data = load_scored_file(scored_file)
        df = pd.DataFrame(data)

        unweighted_scores = compute_unweighted(df)
        weighted_scores = compute_weighted(df)

        unweighted_records.append({
            "year": year,
            "title": title,
            **unweighted_scores
        })

        weighted_records.append({
            "year": year,
            "title": title,
            **weighted_scores
        })

    except Exception as e:
        # Best effort: report the problem file and carry on with the rest.
        print(f"⚠️ Error processing {scored_file.name}: {e}")

# Movies whose filename did not yield a year carry year=None. groupby() drops
# such rows by default, so report it instead of excluding them silently.
n_missing_year = sum(1 for record in unweighted_records if record["year"] is None)
if n_missing_year:
    print(f"⚠️ {n_missing_year} movie(s) have no parsable year and are left out of the per-year averages.")

# === Create DataFrames ===
df_unweighted = pd.DataFrame(unweighted_records)
df_weighted = pd.DataFrame(weighted_records)

# Save per movie
df_unweighted.to_csv(output_dir / "newprompt_unweighted_movie.csv", index=False)
df_weighted.to_csv(output_dir / "newprompt_weighted_movie.csv", index=False)

# === Aggregate per year ===
# Each year's value is the mean of its per-movie averages, so every movie counts
# equally regardless of how many rows its scored file contained.
df_unweighted_year = (
    df_unweighted
    .groupby("year")[["universalism", "egalitarianism", "progress"]]
    .mean()
    .reset_index()
)

df_weighted_year = (
    df_weighted
    .groupby("year")[["universalism", "egalitarianism", "progress"]]
    .mean()
    .reset_index()
)

df_unweighted_year.to_csv(output_dir / "newprompt_unweighted_year.csv", index=False)
df_weighted_year.to_csv(output_dir / "newprompt_weighted_year.csv", index=False)

print("\n🎉 Aggregation complete! CSVs saved into:", output_dir)


Found 442 scored movie files.


Aggregating newprompt results: 100%|████████| 442/442 [00:00<00:00, 1451.89it/s]


🎉 Aggregation complete! CSVs saved into: /Users/cedricroetheli/Desktop/processed_subs/aggregated_newprompt





In [5]:
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# === Paths ===
# Everything this cell reads or writes lives under ~/Desktop/processed_subs.
base_path = Path.home().joinpath("Desktop", "processed_subs")
# Input folder: globbed for *.json scored files further down.
scored_dir = base_path.joinpath("scored_heroes_and_moral_v2")
# Output file: a single CSV with one row per scored file.
output_csv = base_path.joinpath("heroes_and_moral_v2_movie.csv")

# === Helper to extract year and title ===
def parse_filename(filename):
    """Split a name such as '2017_Logan' into ``(year, title)``.

    The text before the first underscore is parsed as an integer year; the
    rest becomes the title, with its underscores replaced by spaces.

    Returns:
        ``(year, title)`` on success, or ``(None, filename)`` when the name
        has no underscore, the year is not an integer, or parsing fails for
        any other reason.
    """
    try:
        pieces = filename.split("_", 1)
        if len(pieces) != 2:
            # No underscore: nothing separates a year from a title.
            return None, filename
        year_text, raw_title = pieces
        parsed_year = int(year_text)
        return parsed_year, raw_title.replace("_", " ")
    except Exception:
        # Best-effort parser: hand the input back unchanged instead of raising.
        return None, filename

# === Load and aggregate ===
# One flat record per scored file; converted to a DataFrame once at the end.
records = []

# Sorted so the CSV row order is reproducible (directory listing order is arbitrary).
files = sorted(scored_dir.glob("*.json"))
print(f"Found {len(files)} scored files.")

for file in tqdm(files, desc="Aggregating v2 results"):
    try:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        print(f"⚠️ Failed to load {file.name}: {e}")
        continue

    # Valid JSON is not necessarily an object; skip such files rather than
    # crash on .get() and lose the whole run.
    if not isinstance(data, dict):
        print(f"⚠️ Skipping {file.name}: expected a JSON object, got {type(data).__name__}")
        continue

    year, title = parse_filename(file.stem.replace("_heroes_and_moral_v2_scored", ""))

    # dict.get() only uses its default when the key is ABSENT. A key that is
    # present but null, too short, or of an unexpected type must be normalised
    # explicitly, otherwise indexing / .get() below would raise outside the try.
    moral = data.get("moral_intervention_vs_isolation")
    moral = list(moral) if isinstance(moral, (list, tuple)) else []
    # Read as a [score, confidence] pair; pad with None so a short list keeps
    # whatever values it does have.
    moral_score, moral_confidence = (moral + [None, None])[:2]

    hero = data.get("hero")
    if not isinstance(hero, dict):
        hero = {}

    villain = data.get("villain")
    if not isinstance(villain, dict):
        villain = {}

    records.append({
        "year": year,
        "title": title,
        "moral_score": moral_score,
        "moral_confidence": moral_confidence,
        "hero_name": hero.get("name", "UNKNOWN"),
        "hero_entity_type": hero.get("entity_type", "UNKNOWN"),
        "hero_affiliation": hero.get("affiliation", "UNKNOWN"),
        "hero_confidence": hero.get("confidence", None),
        "villain_name": villain.get("name", "UNKNOWN"),
        "villain_entity_type": villain.get("entity_type", "UNKNOWN"),
        "villain_affiliation": villain.get("affiliation", "UNKNOWN"),
        "villain_confidence": villain.get("confidence", None),
    })

# === Save to CSV ===
df = pd.DataFrame(records)
df.to_csv(output_csv, index=False)
print(f"\n✅ Aggregated results saved to:\n{output_csv}")


Found 441 scored files.


Aggregating v2 results: 100%|███████████████| 441/441 [00:00<00:00, 5361.77it/s]


✅ Aggregated results saved to:
/Users/cedricroetheli/Desktop/processed_subs/heroes_and_moral_v2_movie.csv



