In [None]:
# 01 - Ingest and Parse
### Initial analysis of screenplay
#### Dr John Hughes

In [None]:
# 01a - Statistical Analysis
### Statistical analysis of word counts and flow

In [None]:
import re
import matplotlib.pyplot as plt
from pathlib import Path

# --- CONFIG ---
# Location of the screenplay to analyse; the __main__ block below passes this
# value to load_script().
SCRIPT_PATH = "my_script.txt"  # Path to screenplay text file


def load_script(path):
    """Read the screenplay file at ``path`` and return its full text.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are
    dropped (``errors="ignore"``) instead of raising a decode error.
    """
    script_file = Path(path)
    with script_file.open(encoding="utf-8", errors="ignore") as handle:
        return handle.read()


def split_into_scenes(text):
    """Split a screenplay into ``(heading, body)`` pairs, one per scene.

    A scene heading is a line that begins with ``INT.`` or ``EXT.``
    (matched case-insensitively). Everything before the first heading is
    discarded.

    Args:
        text: The full screenplay as a single string.

    Returns:
        A list of ``(heading, body)`` tuples in document order. ``heading``
        is the matched heading text, including the newline that precedes it
        when there is one. ``body`` is the text between the end of that
        heading and the start of the next heading (or the end of ``text``).
        The list is empty when no heading is found.
    """
    # ``\A`` lets a heading on the very first line of the file count as a
    # scene; requiring a leading ``\n`` alone would silently drop it.
    heading_re = re.compile(r"(?:\A|\n)(?:INT\.|EXT\.)[^\n]*", flags=re.IGNORECASE)

    # One pass with the same compiled pattern finds both the headings and the
    # body boundaries, so the two can never disagree (e.g. on letter case)
    # and get paired up wrongly.
    matches = list(heading_re.finditer(text))

    scenes = []
    for current, following in zip(matches, matches[1:] + [None]):
        body_end = len(text) if following is None else following.start()
        scenes.append((current.group(0), text[current.end():body_end]))
    return scenes


def analyze_scene(scene_text):
    """Compute simple pacing metrics for one scene.

    Returns a dict with three keys:

    * ``"length_words"``: number of whitespace-separated tokens in the scene.
    * ``"avg_sentence_length"``: mean words per sentence, where sentences are
      the non-blank pieces obtained by splitting on ``.``, ``!`` or ``?``.
    * ``"dialogue_ratio"``: fraction of lines that, once stripped, consist
      solely of upper-case ASCII letters and spaces.

    Both ratios are 0.0 for an empty scene.
    """
    # Words per sentence, skipping fragments that are blank after stripping.
    fragments = (piece.strip() for piece in re.split(r"[.!?]", scene_text))
    words_per_sentence = [len(fragment.split()) for fragment in fragments if fragment]
    mean_sentence_words = sum(words_per_sentence) / max(len(words_per_sentence), 1)

    # Count the all-caps lines and compare against the total line count.
    all_lines = scene_text.splitlines()
    caps_line_count = 0
    for raw_line in all_lines:
        if re.match(r"^[A-Z ]+$", raw_line.strip()):
            caps_line_count += 1
    caps_share = caps_line_count / max(len(all_lines), 1)

    result = {}
    result["length_words"] = len(scene_text.split())
    result["avg_sentence_length"] = mean_sentence_words
    result["dialogue_ratio"] = caps_share
    return result


def plot_flow(stats):
    """Chart scene length and dialogue ratio against scene number.

    ``stats`` is a sized sequence of dicts that each carry ``"length_words"``
    and ``"dialogue_ratio"``. Scene length is drawn on the left y-axis and
    dialogue ratio on a twin right y-axis sharing the same x-axis. The
    figure is displayed with ``plt.show()``; nothing is returned.
    """
    x_values = range(1, len(stats) + 1)

    def column(key):
        # Pull one metric out of every per-scene dict.
        return [entry[key] for entry in stats]

    word_counts = column("length_words")
    ratios = column("dialogue_ratio")

    def draw_series(axis, values, y_label, colour, marker, legend_label):
        # Label, plot and tint one y-axis in a single colour.
        axis.set_ylabel(y_label, color=colour)
        axis.plot(x_values, values, color=colour, marker=marker, label=legend_label)
        axis.tick_params(axis="y", labelcolor=colour)

    fig, left_axis = plt.subplots(figsize=(12, 6))
    left_axis.set_xlabel("Scene Number")
    draw_series(left_axis, word_counts, "Scene Length (words)", "blue", "o", "Scene Length")

    # Second y-axis so the two metrics keep their own scales.
    right_axis = left_axis.twinx()
    draw_series(right_axis, ratios, "Dialogue Ratio", "orange", "x", "Dialogue Ratio")

    plt.title("Screenplay Pacing & Dialogue Flow")
    fig.tight_layout()
    plt.show()


if __name__ == "__main__":
    # Load the screenplay and break it into (heading, body) scene pairs.
    text = load_script(SCRIPT_PATH)
    scenes = split_into_scenes(text)

    # Analyse each scene body, echoing its heading and metrics as we go.
    stats = []
    for heading, body in scenes:
        stats.append(analyze_scene(body))
        print(f"{heading.strip()} — {stats[-1]}")

    # Plot inside the guard: `stats` only exists when this runs as a script
    # or notebook cell, so a module-level call would raise NameError on import.
    if stats:
        plot_flow(stats)
    else:
        print(f"No INT./EXT. scene headings found in {SCRIPT_PATH}; nothing to plot.")