In [3]:
%pip install pymupdf gender-guesser spacy pandas matplotlib jinja2


Collecting pymupdf
  Using cached pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting gender-guesser
  Using cached gender_guesser-0.4.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting spacy
  Using cached spacy-3.8.7-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting pandas
  Downloading pandas-2.3.0-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting jinja2
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.13-cp313-cp313-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.11-cp313-cp3

In [4]:
import spacy.cli
import spacy.util

# Fetch the small English model only when it is not installed yet, so that
# re-running the notebook does not repeat the download (and the kernel
# restart the downloader asks for afterwards).
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
import os
import fitz  # PyMuPDF
import re
import spacy
import pandas as pd
from gender_guesser.detector import Detector

# Load spaCy and gender detector
# `nlp` is the small English spaCy pipeline; its PERSON entities are used
# further down to spot character names in a line.
# `gender_detector` turns a first name into a gender label (gender_guesser).
nlp = spacy.load("en_core_web_sm")
gender_detector = Detector()


In [7]:
def extract_script_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, joined by newlines."""
    with fitz.open(pdf_path) as pdf:
        # Pages are read while the document is still open.
        page_texts = (page.get_text() for page in pdf)
        return "\n".join(page_texts)

def load_scripts_from_folder(folder_path):
    """Read every PDF in ``folder_path`` and return a ``{title: text}`` dict.

    The title is the file name without its final extension, so a name such as
    ``draft.pdf.final.pdf`` becomes ``draft.pdf.final``. The extension check
    ignores case (``.pdf`` and ``.PDF`` both match). Files are visited in
    sorted name order so the resulting dict has the same order on every run.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    dict
        Maps each script title to the text returned by ``extract_script_text``.
    """
    scripts = {}
    # os.listdir gives no ordering guarantee; sorting keeps downstream output stable.
    for filename in sorted(os.listdir(folder_path)):
        title, extension = os.path.splitext(filename)
        if extension.lower() != ".pdf":
            continue
        scripts[title] = extract_script_text(os.path.join(folder_path, filename))
    return scripts

# Update path to your local folder
# (a relative path is resolved against the notebook's working directory;
# each *.pdf file in the folder becomes one entry in `scripts`)
scripts = load_scripts_from_folder("scripts-data/")
print(f"✅ Loaded {len(scripts)} scripts.")


✅ Loaded 13 scripts.


In [8]:
def extract_intro_lines(text, min_words=6, max_words=29):
    """Return the lines of ``text`` that look like descriptive sentences.

    Each line is stripped of surrounding whitespace and kept when

    * its word count is between ``min_words`` and ``max_words`` (inclusive), and
    * it is not written entirely in upper case.

    The defaults reproduce the original ``5 < words < 30`` window. Short
    speaker cues such as ``HAIDER:`` need no separate check: they have a
    single word and therefore always fall below ``min_words``.

    Parameters
    ----------
    text : str
        Full script text; lines are separated by ``"\\n"``.
    min_words, max_words : int, optional
        Inclusive bounds on the number of whitespace-separated words.

    Returns
    -------
    list of str
        The kept lines, stripped, in their original order.
    """
    kept = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        word_count = len(line.split())
        if min_words <= word_count <= max_words and not line.isupper():
            kept.append(line)
    return kept


In [9]:
def detect_character_and_gender(line):
    """Find the first PERSON entity in ``line`` and guess its gender.

    Returns ``(name, gender)`` where ``name`` is the entity text and ``gender``
    is whatever the gender detector reports for the first word of that name.
    Returns ``(None, None)`` when the line contains no PERSON entity.
    """
    people = (ent for ent in nlp(line).ents if ent.label_ == "PERSON")
    first_person = next(people, None)
    if first_person is None:
        return None, None

    full_name = first_person.text
    # Only the first token (the given name) is passed to the detector.
    first_name = full_name.split()[0]
    return full_name, gender_detector.get_gender(first_name)


In [10]:
# Fixed column order for the export. Passing it explicitly also guarantees the
# CSV gets a header row even when no line qualifies (an empty list of records
# would otherwise produce a frame with no columns at all).
OUTPUT_COLUMNS = ["script", "year", "character", "gender", "line"]

clean_data = []

for title, text in scripts.items():
    for line in extract_intro_lines(text):
        character, gender = detect_character_and_gender(line)
        # Keep a line only when a character was found and the detector gave a
        # definite 'male' / 'female' label; any other label is dropped.
        if character and gender in ['male', 'female']:
            clean_data.append({
                "script": title,
                "year": None,  # optional: fill manually later
                "character": character,
                "gender": gender,
                "line": line.strip()
            })

df_cleaned = pd.DataFrame(clean_data, columns=OUTPUT_COLUMNS)
df_cleaned.to_csv("cleaned_script_data.csv", index=False)
print("✅ Saved cleaned data to cleaned_script_data.csv")
df_cleaned.head()


✅ Saved cleaned data to cleaned_script_data.csv


Unnamed: 0,script,year,character,gender,line
0,Haider,,Shiraz,male,detained at the Shiraz cinema camp…
1,Haider,,Ghazala,female,Ghazala sees the soldier raising the barricade...
2,Haider,,Ghazala,female,Surgical instruments are boiling in a pot. Gha...
3,Haider,,Ghazala,female,"silent, Ghazala has a sense of paranoia in her..."
4,Haider,,Ghazala,female,in with a bunch of other surgical tools in a t...
