# 5) Color Words & Description Density

**Goal:** Count color terms and compare description density.

In [1]:
import re
from pathlib import Path
from collections import Counter

In [2]:

def load_texts(local_pet: str = "../data/PetSemetary.txt",
               local_shining: str = "../data/TheShining.txt"):
    """Load Pet Sematary and The Shining texts from disk."""
    p1, p2 = Path(local_pet), Path(local_shining)

    if not p1.exists():
        raise FileNotFoundError(
            f"Missing file: {p1}\n"
            "→ Please place 'PetSemetary.txt' at this path or update load_texts(...)."
        )
    if not p2.exists():
        raise FileNotFoundError(
            f"Missing file: {p2}\n"
            "→ Please place 'TheShining.txt' at this path or update load_texts(...)."
        )

    pet_text = p1.read_text(encoding="utf-8", errors="ignore")
    shining_text = p2.read_text(encoding="utf-8", errors="ignore")
    return pet_text, shining_text


def normalize(text: str) -> str:
    """Simple normalization for your own TXT files (not Gutenberg)."""
    if not text:
        return ""
    # normalize curly quotes to ASCII '
    text = text.replace("’", "'").replace("‘", "'")
    # normalize Windows line endings
    text = text.replace("\r\n", "\n")
    # join hyphenated line breaks
    text = re.sub(r"-\s*\n", "", text)
    return text


# Load raw texts
pet_raw, shining_raw = load_texts()

# Normalize for tokenization
pet_norm     = normalize(pet_raw)
shining_norm = normalize(shining_raw)

print(f"Pet Sematary chars: {len(pet_norm):,} | The Shining chars: {len(shining_norm):,}")

Pet Sematary chars: 812,353 | The Shining chars: 905,869


### Helpers: Tokenization

In [3]:
# ---------- 2. Tokenization helpers ----------

WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)*")  # keep apostrophes in words (don't)

def words(text: str):
    """Simple word tokenizer (lowercased, ASCII letters + apostrophes)."""
    return WORD_RE.findall(text.lower())

def sentences(text: str):
    """Naive sentence splitter using punctuation boundaries."""
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]

# Tokenize
pet_words     = words(pet_norm)
shining_words = words(shining_norm)

pet_sents     = sentences(pet_norm)
shining_sents = sentences(shining_norm)

print(f"Pet Sematary words   : {len(pet_words):,} | The Shining words   : {len(shining_words):,}")
print(f"Pet Sematary sentences: {len(pet_sents):,} | The Shining sentences: {len(shining_sents):,}")


Pet Sematary words   : 147,144 | The Shining words   : 162,085
Pet Sematary sentences: 9,269 | The Shining sentences: 12,914


### Count Color Terms

In [4]:
# ---------- 3. Color counts ----------

COLOR_LIST = {
    "red","orange","yellow","green","blue","indigo","violet","purple","pink",
    "brown","black","white","gray","grey",
    "scarlet","crimson","emerald","amber","gold","silver","lavender","mauve",
    "ivory","beige","teal","turquoise","magenta","maroon","navy"
}

def count_colors(tokens):
    c = Counter(w for w in tokens if w in COLOR_LIST)
    return c, sum(c.values()), len(tokens)

pet_c, pet_hits, pet_total = count_colors(pet_words)
sh_c,  sh_hits,  sh_total  = count_colors(shining_words)

print("Pet Sematary top:", pet_c.most_common(15), "| rate per 100k:", (pet_hits / pet_total) * 100000)
print("The Shining top :", sh_c.most_common(15),  "| rate per 100k:", (sh_hits  / sh_total)  * 100000)


Pet Sematary top: [('white', 84), ('black', 62), ('blue', 42), ('green', 40), ('red', 37), ('yellow', 31), ('gray', 22), ('brown', 16), ('silver', 9), ('pink', 8), ('gold', 8), ('orange', 7), ('purple', 3), ('maroon', 3), ('navy', 1)] | rate per 100k: 254.85238949600392
The Shining top : [('white', 115), ('black', 82), ('red', 81), ('blue', 67), ('green', 37), ('silver', 32), ('gray', 26), ('yellow', 26), ('pink', 18), ('gold', 16), ('purple', 15), ('brown', 15), ('orange', 7), ('violet', 5), ('ivory', 4)] | rate per 100k: 342.4129314865657


**Discuss:** Where do color bursts cluster in the narrative? What scenes rely on color to signal mood or magic?

In [5]:
# ---------- 4. Rolling windows: color "bursts" ----------

def rolling_color_windows(tokens, window=800, step=200, color_list=COLOR_LIST):
    hits_per_window = []
    for i in range(0, max(1, len(tokens) - window + 1), step):
        chunk = tokens[i:i+window]
        c = sum(1 for w in chunk if w in color_list)
        hits_per_window.append((i, i+window, c, c * (100000 / window)))  # per 100k
    return hits_per_window

def nearest_sentence_span(tokens, sents, start_idx, end_idx):
    """Approximate: map token range to a sentence slice around the window."""
    head = " ".join(tokens[start_idx:start_idx+20])
    for k, s in enumerate(sents):
        if head[:50] in s:
            j0 = max(0, k - 1)
            j1 = min(len(sents), k + 3)
            return j0, j1, " ".join(sents[j0:j1])
    # fallback: just return a raw text preview
    text = " ".join(tokens[start_idx:end_idx])
    return None, None, text[:500]

# run rolling color windows
pet_roll = rolling_color_windows(pet_words, window=800, step=200)
sh_roll  = rolling_color_windows(shining_words, window=800, step=200)

# top 5 bursts by per-100k
pet_top = sorted(pet_roll, key=lambda x: x[3], reverse=True)[:5]
sh_top  = sorted(sh_roll,  key=lambda x: x[3], reverse=True)[:5]

print("=== Pet Sematary color bursts ===")
for a, b, h, rate in pet_top:
    j0, j1, preview = nearest_sentence_span(pet_words, pet_sents, a, b)
    print(f"[tokens {a}-{b}] hits={h} | {rate:.0f} per 100k")
    print(preview[:400], "…\n")

print("=== The Shining color bursts ===")
for a, b, h, rate in sh_top:
    j0, j1, preview = nearest_sentence_span(shining_words, shining_sents, a, b)
    print(f"[tokens {a}-{b}] hits={h} | {rate:.0f} per 100k")
    print(preview[:400], "…\n")


=== Pet Sematary color bursts ===
[tokens 145000-145800] hits=11 | 1375 per 100k
approaching a lot of them they had been called but the would be hero was right the house was going flames probed through half a dozen broken windows now and the front eave had grown an almost transparent membrane of fire over its bright green shingles he turned back then remembering louis but if louis were here wouldn't he be with the others across the street steve caught something then just barel …

[tokens 117400-118200] hits=10 | 1250 per 100k
o great swami louis and in the meantime what do i do about that great big bundle of stuff i threw over the wall pick shovel flashlight you might as well stamp graverobbing equipment on every damn piece of it it landed in the bushes who's going to find it for christ's sake on measure that made sense but this was no sensible errand he was on and his heart told him quietly and absolutely that he coul …

[tokens 144600-145400] hits=10 | 1250 per 100k
a car crash a cou