# 4) Gender in the Text: Pronouns & Nearby Verbs

**Goal:** Compare the relative frequency of gendered pronouns (*he*, *she*, *him*, *her*) in the two novels, and look at which verbs appear near *he* and *she*.

In [6]:
import re
from pathlib import Path
from collections import Counter

In [7]:

def load_texts(
    pet_path: str = "../data/PetSemetary.txt",
    shining_path: str = "../data/TheShining.txt",
):
    """Read both novels from disk and return ``(pet_text, shining_text)``.

    Both paths are checked (Pet Sematary first, then The Shining) before
    anything is read; a missing file raises ``FileNotFoundError`` with a hint
    naming the expected file. Files are decoded as UTF-8, and bytes that
    cannot be decoded are silently dropped (``errors="ignore"``).
    """
    sources = (
        (Path(pet_path), "PetSemetary.txt"),
        (Path(shining_path), "TheShining.txt"),
    )

    # Validate everything up front so nothing is read if either file is absent.
    for location, expected_name in sources:
        if not location.exists():
            raise FileNotFoundError(
                f"Missing file: {location}\n"
                f"→ Please place '{expected_name}' at this path or update load_texts(...)."
            )

    pet_text, shining_text = (
        location.read_text(encoding="utf-8", errors="ignore")
        for location, _ in sources
    )
    return pet_text, shining_text


def normalize(text: str) -> str:
    """Apply light clean-up to raw text loaded from a plain TXT file.

    Steps, in order:
      1. Curly single quotes become the ASCII apostrophe.
      2. Windows (CRLF) and bare CR line endings become LF.
      3. A word hyphenated across a single line break is re-joined
         ("exam-<newline>ple" -> "example").

    Step 3 only fires when the hyphen sits between two letters with exactly
    one line break in between. A dash before a blank line (paragraph break)
    or a free-standing dash at the end of a line is left alone, so
    paragraphs are never fused together.

    Returns "" for empty or None input.
    """
    if not text:
        return ""
    # normalize curly quotes to ASCII '
    text = text.replace("’", "'").replace("‘", "'")
    # normalize Windows (CRLF) and old-Mac (CR) line endings
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # join hyphenated line breaks: letter, hyphen, one newline (with optional
    # spaces/tabs around it), letter. [^\W\d_] means "any Unicode letter".
    text = re.sub(r"(?<=[^\W\d_])-[ \t]*\n[ \t]*(?=[^\W\d_])", "", text)
    return text


# Read both novels, then clean each one with normalize()
pet_raw, shining_raw = load_texts()
pet_norm, shining_norm = map(normalize, (pet_raw, shining_raw))

print(f"Pet Sematary chars: {len(pet_norm):,} | The Shining chars: {len(shining_norm):,}")

Pet Sematary chars: 812,353 | The Shining chars: 905,869


### Helpers: Tokenization

In [8]:
# ---------- 2. Tokenization ----------

# A word is a run of letters, optionally followed by apostrophe-joined runs
# (don't, o'clock); a bare apostrophe never forms a token on its own.
# [^\W\d_] means "any Unicode letter", so accented words (café, naïve) stay
# in one piece instead of being cut at the first non-ASCII character.
WORD_RE = re.compile(r"[^\W\d_]+(?:'[^\W\d_]+)*")

def words(text: str):
    """Return the lowercased word tokens of ``text`` in order of appearance.

    Digits, underscores, punctuation and whitespace all act as separators;
    apostrophes are kept only when they sit between letters.
    """
    return WORD_RE.findall(text.lower())

def sentences(text: str):
    """Split ``text`` into trimmed, non-empty sentence strings.

    A boundary is any run of whitespace directly preceded by ".", "!" or
    "?"; the punctuation stays attached to the sentence it ends.
    """
    result = []
    for chunk in re.split(r"(?<=[.!?])\s+", text):
        trimmed = chunk.strip()
        if trimmed:
            result.append(trimmed)
    return result

# Tokenize each normalized novel into words and into sentences
pet_words, shining_words = (words(novel) for novel in (pet_norm, shining_norm))
pet_sents, shining_sents = (sentences(novel) for novel in (pet_norm, shining_norm))

print(f"Pet Sematary words   : {len(pet_words):,} | The Shining words   : {len(shining_words):,}")
print(f"Pet Sematary sentences: {len(pet_sents):,} | The Shining sentences: {len(shining_sents):,}")


Pet Sematary words   : 147,144 | The Shining words   : 162,085
Pet Sematary sentences: 9,269 | The Shining sentences: 12,914


### Pronoun Balance

In [9]:
# ---------- 3. Pronoun balance ----------

def pronoun_counts(tokens):
    """Tally the gendered pronouns he / she / him / her in ``tokens``.

    Matching is exact, so tokens are expected to be lowercased already.
    Returns ``(counts, total)``: a Counter holding only the pronouns that
    actually occurred, and the sum of those counts.
    """
    counts = Counter()
    for token in tokens:
        if token in {"he", "she", "him", "her"}:
            counts[token] += 1
    return counts, sum(counts.values())

# Tally gendered pronouns per novel, then report both tallies
(pet_pron, pet_pron_tot), (sh_pron, sh_pron_tot) = (
    pronoun_counts(tokens) for tokens in (pet_words, shining_words)
)

print("Pet Sematary pronouns:", dict(pet_pron), "total:", pet_pron_tot)
print("The Shining pronouns :", dict(sh_pron), "total:", sh_pron_tot)


Pet Sematary pronouns: {'he': 3576, 'her': 1101, 'she': 1040, 'him': 864} total: 6581
The Shining pronouns : {'he': 3616, 'him': 1104, 'her': 1111, 'she': 1208} total: 7039


### Verbs Near Pronouns (very naive)

In [10]:
# ---------- 4. Verbs near 'he' / 'she' (very naive) ----------

def verb_like(word: str) -> bool:
    """Guess, from surface form alone, whether ``word`` might be a verb.

    Deliberately crude: anything ending in "ed", "ing" or "s" counts, as does
    any entry in a short list of common verb forms. Expect false positives
    such as "his", "as" or "this".
    """
    # Suffix test first: a verb-like ending is enough on its own.
    if re.match(r".*(ed|ing|s)$", word) is not None:
        return True

    # Otherwise fall back to a hand-picked list of frequent (mostly
    # irregular) verb forms that the suffix test would miss.
    common_forms = {
        "say", "says", "said",
        "go", "goes", "went",
        "come", "comes", "came",
        "think", "thinks", "thought",
        "see", "sees", "saw",
        "know", "knows", "knew",
        "do", "does", "did",
        "feel", "feels", "felt",
    }
    return word in common_forms

def verbs_near_pronouns(tokens, window: int = 2, top_n: int = 20):
    """Collect verb-like tokens that occur close to "he" and "she".

    For every occurrence of "he" or "she", each token at most ``window``
    positions before or after it (the pronoun itself excluded) is tested
    with ``verb_like``; hits are tallied per pronoun. Windows are clipped at
    the start and end of ``tokens``.

    Args:
        tokens: Sequence of word tokens. Matching is exact, so they should
            already be lowercased.
        window: Number of neighbours inspected on each side of the pronoun.
        top_n: How many of the most frequent neighbours to keep per pronoun.
            Defaults to 20, the previously fixed cut-off; ``None`` keeps
            every neighbour.

    Returns:
        ``{"he": [(token, count), ...], "she": [(token, count), ...]}`` with
        each list ordered from most to least frequent.
    """
    tallies = {"he": Counter(), "she": Counter()}
    size = len(tokens)
    for i, w in enumerate(tokens):
        if w in ("he", "she"):
            # Clip the window so it never runs off either end of the list.
            start, stop = max(0, i - window), min(size, i + window + 1)
            for j in range(start, stop):
                if j != i and verb_like(tokens[j]):
                    tallies[w][tokens[j]] += 1
    return {k: counts.most_common(top_n) for k, counts in tallies.items()}

# Run the naive neighbour scan on both novels and dump the raw top lists
pet_verbs_near, sh_verbs_near = (
    verbs_near_pronouns(tokens) for tokens in (pet_words, shining_words)
)

print("Pet Sematary – verbs near 'he'/'she':", pet_verbs_near, sep="\n")
print("\nThe Shining – verbs near 'he'/'she':", sh_verbs_near, sep="\n")


Pet Sematary – verbs near 'he'/'she':
{'he': [('was', 486), ('said', 317), ('his', 206), ('thought', 143), ('as', 124), ('did', 105), ('felt', 99), ('louis', 87), ('saw', 87), ('looked', 66), ('supposed', 54), ('went', 51), ('this', 49), ('do', 44), ('knew', 40), ('turned', 39), ('asked', 34), ('remembered', 33), ('go', 32), ('going', 29)], 'she': [('said', 189), ('was', 164), ('thought', 29), ('as', 28), ('louis', 27), ('looked', 25), ('did', 21), ('asked', 15), ('is', 14), ('think', 14), ('his', 13), ('saw', 13), ('come', 13), ('going', 13), ('felt', 13), ('know', 12), ('smiled', 12), ('turned', 11), ('wanted', 11), ('this', 11)]}

The Shining – verbs near 'he'/'she':
{'he': [('was', 495), ('said', 243), ('his', 241), ('as', 125), ('looked', 94), ('thought', 87), ('did', 68), ('felt', 65), ('knew', 65), ('went', 54), ('saw', 50), ('turned', 49), ('see', 47), ('is', 36), ('think', 33), ('know', 33), ('asked', 32), ('come', 32), ('go', 31), ('wanted', 31)], 'she': [('was', 162), ('said

**Prompt:** How do these crude patterns line up with character agency and narrative voice? What errors do you notice, and how would part-of-speech (POS) tagging improve this?