In [1]:
import os
import re
import json
from pathlib import Path
from tqdm.notebook import tqdm

# === CONFIGURATION ===
# Every path used by this cell hangs off ~/Desktop/Benchmark.
desktop = Path.home().joinpath("Desktop")
input_folder = desktop.joinpath("Benchmark", "srtraw")           # raw .srt subtitle files
output_folder = desktop.joinpath("Benchmark", "json_benchmark")  # one chunked .json per subtitle file
chunk_size = 30  # how many subtitle entries are merged into one chunk

# === ENSURE OUTPUT FOLDER EXISTS ===
# Creates missing parent folders as well; an already existing folder is not an error.
output_folder.mkdir(exist_ok=True, parents=True)

# === FUNCTION TO PARSE AND CHUNK SRT FILE ===
def parse_srt(file_path, chunk_size=30):
    """Read an SRT subtitle file and group its cue texts into fixed-size chunks.

    The file is split into cues on blank lines. For each cue the text is
    everything after the timing line ("HH:MM:SS,mmm --> HH:MM:SS,mmm"). The
    numeric index line in front of the timing line is optional, so files that
    omit it are still parsed without losing text.

    Args:
        file_path: Path of the .srt file to read.
        chunk_size: Number of cues merged into one chunk; must be >= 1.

    Returns:
        A list of dicts ``{"chunk_id": int, "text": str}``. ``chunk_id`` starts
        at 1 and only counts chunks that contain text.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading BOM.
    # Undecodable bytes are still skipped so one bad byte does not lose a file.
    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
        content = f.read()

    timing_line = re.compile(
        r"\d+:\d{2}:\d{2}[,.]\d+\s*-->\s*\d+:\d{2}:\d{2}[,.]\d+"
    )

    raw_blocks = re.split(r"\n\s*\n", content.strip())

    lines = []
    for block in raw_blocks:
        parts = block.strip().split("\n")

        # The timing line is the second line of a regular cue, or the first
        # one when the index line is missing. Only those two are inspected so
        # that dialogue containing "-->" is never mistaken for a timing line.
        timing_at = next(
            (pos for pos, part in enumerate(parts[:2]) if timing_line.search(part)),
            None,
        )

        if timing_at is not None:
            text_parts = parts[timing_at + 1:]
        elif len(parts) >= 3:
            # No recognisable timing line: fall back to the fixed
            # "index / timing / text" layout.
            text_parts = parts[2:]
        else:
            text_parts = []

        if text_parts:
            lines.append("\n".join(text_parts))

    # Group into chunks
    chunks = []
    for i in range(0, len(lines), chunk_size):
        chunk_text = "\n".join(lines[i:i + chunk_size]).strip()
        if chunk_text:
            chunks.append({
                "chunk_id": len(chunks) + 1,
                "text": chunk_text
            })

    return chunks

# === MAIN PROCESSING LOOP ===
# Each .srt file directly inside the input folder becomes a .json file with the same stem.
srt_files = [*input_folder.glob("*.srt")]

for subtitle_path in tqdm(srt_files, desc="Processing subtitles"):
    json_path = output_folder.joinpath(subtitle_path.stem + ".json")

    try:
        parsed_chunks = parse_srt(subtitle_path, chunk_size)
        with json_path.open("w", encoding="utf-8") as out_f:
            json.dump(parsed_chunks, out_f, ensure_ascii=False, indent=2)
    except Exception as exc:
        # Best effort: report the failing file and carry on with the next one.
        print(f"❌ Failed to process {subtitle_path.name}: {exc}")

Processing subtitles:   0%|          | 0/20 [00:00<?, ?it/s]

In [3]:
import os
import re
import pandas as pd
from pathlib import Path

# Folder holding the raw subtitle files
desktop = Path.home().joinpath("Desktop")
input_folder = desktop.joinpath("Benchmark", "srtraw")

# The CSV is written next to the notebook (current working directory)
output_csv_path = Path('.').joinpath('cleaned_filenames.csv')

# Names (not full paths) of every entry in the input folder ending in ".srt"
srt_files = [entry for entry in os.listdir(input_folder) if entry.endswith('.srt')]

# Column 1: the file names exactly as found on disk
original_names = list(srt_files)

# Column 2: extension dropped, leading 4-digit year removed, and every run of
# dots/underscores collapsed into a single space
cleaned_names = [
    re.sub(r'[._]+', ' ', re.sub(r'^\d{4}', '', os.path.splitext(entry)[0])).strip()
    for entry in srt_files
]

# Assemble both columns and write them out without the index column
df = pd.DataFrame({'original_name': original_names, 'cleaned_name': cleaned_names})
df.to_csv(output_csv_path, index=False)

print(f"CSV saved in your Jupyter workspace as: {output_csv_path.name}")



CSV saved in your Jupyter workspace as: cleaned_filenames.csv


In [19]:
import wikipedia
import pandas as pd
from pathlib import Path

# === Load cleaned CSV ===
# Read from the current working directory; columns used below are
# 'original_name' and 'cleaned_name'.
df = pd.read_csv("cleaned_filenames.csv")

# === Output folder for summaries ===
# One text file per film is written here; the folder is created on first use.
summary_output_dir = Path.home().joinpath("Desktop", "Benchmark", "summaries")
summary_output_dir.mkdir(exist_ok=True, parents=True)

# === Helper: title case with exception for "for", "and", etc. ===
# Preserve known title exceptions exactly as needed
def title_case(title):
    """Return ``title`` in title case, suitable for a Wikipedia page lookup.

    Rules, in order:
      * Titles listed in ``special_cases`` (compared lower-cased and stripped)
        are returned exactly as spelled there.
      * Minor words ("for", "and", "of", ...) are lower-cased, except when they
        are the first word of the title, which is always capitalized.
      * Words that are already written as an upper-case Roman numeral
        ("II", "IV", ...) are kept as they are instead of becoming "Ii"/"Iv".
      * Every other word is capitalized.

    Args:
        title: The raw title string.

    Returns:
        The re-cased title with words separated by single spaces.
    """
    import re  # local import so the function does not depend on another cell

    special_cases = {
        "v for vendetta": "V for Vendetta",
    }

    key = title.lower().strip()
    if key in special_cases:
        return special_cases[key]

    minor_words = {"for", "and", "of", "the", "in", "on", "at", "to"}
    # Strict, upper-case-only Roman numeral; ordinary words never match it.
    roman_numeral = re.compile(
        r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"
    )

    result = []
    for position, word in enumerate(title.split()):
        lowered = word.lower()
        if position > 0 and lowered in minor_words:
            result.append(lowered)
        elif len(word) > 1 and roman_numeral.fullmatch(word):
            # str.capitalize() would lower-case everything after the first letter.
            result.append(word)
        else:
            result.append(word.capitalize())
    return " ".join(result)


# === Wikipedia summary fetch function ===
def _extract_plot(content):
    """Return the text of the "Plot" section of ``content``, or None.

    The section ends at the next heading of the same or a higher level, so
    sub-headings inside the plot ("=== Part one ===") stay part of the result.
    None is returned when there is no "Plot" heading or the section is empty.
    """
    import re  # local import so the helper does not depend on another cell

    heading = re.search(r"^(={2,})[ \t]*Plot[ \t]*\1[ \t]*$", content, flags=re.MULTILINE)
    if heading is None:
        return None

    level = len(heading.group(1))
    section = content[heading.end():]
    # A line starting with 2..level "=" followed by a non-"=" is a heading of
    # the same or a higher level than the Plot heading.
    following = re.search(r"^={2,%d}[^=\n]" % level, section, flags=re.MULTILINE)
    if following is not None:
        section = section[:following.start()]
    return section.strip() or None


def fetch_wikipedia_summary(title, year=None):
    """Fetch a plot summary for a film from Wikipedia.

    Candidate page titles are tried from most to least specific
    ("<title> (<year> film)", "<title> (film)", "<title>") so that an ambiguous
    title resolves to the film rather than to an unrelated page. The first
    candidate with a "Plot" section wins. If pages load but none has a plot,
    the first line of the most specific page that loaded is returned. Only when
    no candidate loads at all is a Wikipedia search used, taking the first
    result that yields any text.

    Args:
        title: Film title to look up.
        year: Optional release year used to build the most specific candidate.

    Returns:
        The plot text (or a first-line fallback) as a string, or None when
        nothing could be found.
    """
    from wikipedia import page, search, WikipediaException

    attempts = []
    if year:
        attempts.append(f"{title} ({year} film)")
    attempts.append(f"{title} (film)")
    attempts.append(title)

    lead_fallback = None
    for attempt in attempts:
        try:
            content = page(attempt, auto_suggest=False).content
        except WikipediaException:
            # Missing page, disambiguation page, etc.: try the next candidate.
            continue
        except Exception as e:
            print(f"⚠️ Unexpected error on '{attempt}': {e}")
            continue

        plot = _extract_plot(content)
        if plot:
            return plot
        if lead_fallback is None:
            lead_fallback = content.split('\n')[0]

    if lead_fallback:
        return lead_fallback

    # If all fail, try search
    try:
        results = search(title)
    except Exception as e:
        print(f"⚠️ Search failed for {title}: {e}")
        results = []

    for result in results:
        try:
            content = page(result, auto_suggest=False).content
        except Exception:
            # Best effort: a result that cannot be loaded is skipped.
            continue
        summary = _extract_plot(content) or content.split('\n')[0]
        if summary:
            return summary

    print(f"❌ No summary found for: {title}")
    return None


# === Loop through each entry and save summaries ===
# Titles for which no summary could be fetched; written to a log file below.
failed_titles = []

for _, row in df.iterrows():
    filename = row['original_name']
    raw_title = row['cleaned_name']

    # The file name must be a string starting with the 4-digit release year.
    if not isinstance(filename, str) or not filename[:4].isdigit():
        print(f"⚠️ Skipping row with bad or missing filename: {filename}")
        continue

    # An empty CSV cell is read back by pandas as NaN (a float). Check before
    # title_case(), which calls string methods and would abort the whole loop.
    if not isinstance(raw_title, str) or not raw_title.strip():
        print(f"⚠️ Skipping row with bad or missing title: {filename}")
        continue

    title = title_case(raw_title)
    year = filename[:4]
    # filename still carries its ".srt" extension, so the summary file is
    # named "<name>.srt_summary.txt".
    summary_path = summary_output_dir / f"{filename}_summary.txt"

    # Skip work already done so the cell can be re-run cheaply.
    if summary_path.exists():
        print(f"🟡 Summary already exists for: {title}")
        continue

    summary = fetch_wikipedia_summary(title, year)
    if summary:
        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(summary)
        print(f"✅ Saved summary for: {title}")
    else:
        print(f"❌ No summary found for: {title}")
        failed_titles.append(title)

# === Save list of failed titles ===
if failed_titles:
    failed_log = summary_output_dir / "missing_summaries.txt"
    with open(failed_log, "w", encoding="utf-8") as f:
        for t in failed_titles:
            f.write(t + "\n")
    print(f"❗ Missing summaries saved to: {failed_log}")



🟡 Summary already exists for: the Hurt Locker
🟡 Summary already exists for: the Purge
🟡 Summary already exists for: Black Panther
🟡 Summary already exists for: Night at the Museum
🟡 Summary already exists for: Rocky I
🟡 Summary already exists for: Pirates of the Caribbean the Curse of the Black Pearl
🟡 Summary already exists for: Superman
🟡 Summary already exists for: Paddington 2
🟡 Summary already exists for: Joker
🟡 Summary already exists for: the Constant Gardener
🟡 Summary already exists for: First Blood
🟡 Summary already exists for: the Hunger Games
🟡 Summary already exists for: Ghostbusters
❌ No summary found for: V for Vendetta
🟡 Summary already exists for: Indiana Jones and the Raiders of the Lost Ark
🟡 Summary already exists for: Star Wars Episode Iv - A New Hope
🟡 Summary already exists for: Blood Diamond
🟡 Summary already exists for: Dont Look Up
🟡 Summary already exists for: Avatar
🟡 Summary already exists for: Back to the Future
❗ Missing summaries saved to: /Users/cedricr

In [21]:
import wikipedia

from pathlib import Path

# Manual fetch for "V for Vendetta": request the film page directly by its
# exact Wikipedia title.
page = wikipedia.page("V for Vendetta (film)", auto_suggest=False)
content = page.content

plot_heading = "== Plot =="
heading_at = content.find(plot_heading)
if heading_at == -1:
    # Without this check find() returns -1 and an arbitrary slice of the
    # article would be written as the "summary".
    raise ValueError("No '== Plot ==' section found on page 'V for Vendetta (film)'")

start = heading_at + len(plot_heading)
end = content.find("==", start)
summary = content[start:end].strip() if end != -1 else content[start:].strip()

# Built from the home directory instead of a hard-coded user path. The name
# keeps the ".srt" part ("<name>.srt_summary.txt") so it matches the files the
# batch loop writes and the matching step looks for.
summary_path = (
    Path.home() / "Desktop" / "Benchmark" / "summaries"
    / "2005V.for.Vendetta.srt_summary.txt"
)
with open(summary_path, "w", encoding="utf-8") as f:
    f.write(summary)

print("✅ Manually saved summary for V for Vendetta")


✅ Manually saved summary for V for Vendetta


In [29]:
import pandas as pd
from pathlib import Path

# Define paths
subs_dir = Path.home() / "Desktop" / "Benchmark" / "json_benchmark"
summaries_dir = Path.home() / "Desktop" / "Benchmark" / "summaries"

# List all .json subtitle files. glob() yields them in no particular order,
# so sort to make the CSV reproducible between runs.
json_files = sorted(subs_dir.glob("*.json"))

# Keep every subtitle file that has a summary file. Summary files are named
# after the original subtitle file including its extension,
# e.g. "2021Dont.Look.Up" -> "2021Dont.Look.Up.srt_summary.txt".
matches = [
    {"subtitle_filename": json_file.stem}
    for json_file in json_files
    if (summaries_dir / f"{json_file.stem}.srt_summary.txt").exists()
]

# Name the column explicitly so the CSV has its header even with zero matches.
matches_df = pd.DataFrame(matches, columns=["subtitle_filename"])

# Save to CSV
matches_df.to_csv("matches_benchmark.csv", index=False)

print(f"✅ Created matches_benchmark.csv with {len(matches_df)} matched entries.")


✅ Created matches_benchmark.csv with 20 matched entries.
