# 2) Adverbs: Do Great Writers Avoid Them?

**Goal:** Estimate -ly adverb rate and compare across the two texts.

In [1]:
import re
from pathlib import Path
from collections import Counter

# Token regex: at least one letter, optional internal apostrophes
WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)*")


In [2]:
def load_texts(pet_path: str = "../data/PetSemetary.txt",
               shining_path: str = "../data/TheShining.txt"):
    
    """Load Pet Sematary and The Shining from disk."""
    p1, p2 = Path(pet_path), Path(shining_path)

    if not p1.exists():
        raise FileNotFoundError(f"Missing file: {p1}")
    if not p2.exists():
        raise FileNotFoundError(f"Missing file: {p2}")

    pet_text = p1.read_text(encoding="utf-8", errors="ignore")
    shining_text = p2.read_text(encoding="utf-8", errors="ignore")
    return pet_text, shining_text



def normalize(text: str) -> str:
    """Simple normalization for your own TXT files (not Gutenberg)."""
    if not text:
        return ""
    # normalize curly quotes to ASCII '
    text = text.replace("’", "'").replace("‘", "'")
    # normalize Windows endings
    text = text.replace("\r\n", "\n")
    # join hyphenated line breaks: "won-\n derful" → "wonderful"
    text = re.sub(r"-\s*\n", "", text)
    return text

def words(text: str):
    """Tokenize into lowercased word tokens."""
    return WORD_RE.findall(text.lower())

def sentences(text: str):
    """Very simple sentence splitter."""
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]




### Helpers: Tokenization

In [3]:
# Load raw texts from your two files in the current directory
pet_raw, shining_raw = load_texts()

# Normalize
pet_norm     = normalize(pet_raw)
shining_norm = normalize(shining_raw)

print(f"Pet Sematary chars: {len(pet_norm):,}")
print(f"The Shining chars : {len(shining_norm):,}")
# Tokenize
pet_words   = words(pet_norm)
shining_words = words(shining_norm)

pet_sents   = sentences(pet_norm)
shining_sents = sentences(shining_norm)

print(f"Pet Sematary words   : {len(pet_words):,}")
print(f"The Shining words    : {len(shining_words):,}")
print(f"Pet Sematary sentences: {len(pet_sents):,}")
print(f"The Shining sentences : {len(shining_sents):,}")


Pet Sematary chars: 812,353
The Shining chars : 905,869
Pet Sematary words   : 147,144
The Shining words    : 162,085
Pet Sematary sentences: 9,269
The Shining sentences : 12,914


### Estimate -ly Adverb Rate

In [4]:
def adverb_rate(tokens):
    """Crude -ly adverb heuristic."""
    adverbs = [w for w in tokens if w.endswith("ly") and len(w) > 2]
    return len(adverbs), len(tokens), (len(adverbs) / len(tokens)) * 100
pet_adv, pet_total, pet_pct = adverb_rate(pet_words)
sh_adv, sh_total, sh_pct = adverb_rate(shining_words)

print(f"Pet Sematary: {pet_adv}/{pet_total} = {pet_pct:.2f}%")
print(f"The Shining : {sh_adv}/{sh_total} = {sh_pct:.2f}%")


Pet Sematary: 2259/147144 = 1.54%
The Shining : 2354/162085 = 1.45%


**Prompt:** Inspect a sample of detected -ly words. Which are true adverbs vs. adjectives/nouns? How would you refine the rule?

In [5]:
pet_ly = [w for w in pet_words if w.endswith("ly") and len(w) > 2]
sh_ly  = [w for w in shining_words if w.endswith("ly") and len(w) > 2]

print("Pet Sematary -ly tokens (first 30):", pet_ly[:30])
print("The Shining  -ly tokens (first 30):", sh_ly[:30])


Pet Sematary -ly tokens (first 30): ['purely', 'frequently', 'easily', 'probably', 'finally', 'certainly', 'simply', 'only', 'early', 'only', 'only', 'entirely', 'really', 'simply', 'only', 'early', 'completely', 'only', 'early', 'particularly', 'elderly', 'finally', 'only', 'plainly', 'exactly', 'relatively', 'slowly', 'morbidly', 'ceaselessly', 'promptly']
The Shining  -ly tokens (first 30): ['wholly', 'exceedingly', 'fully', 'assembly', 'curtly', 'probably', 'fully', 'completely', 'faintly', 'briskly', 'absolutely', 'actually', 'scholarly', 'only', 'impressively', 'directly', 'simply', 'perfectly', 'tightly', 'certainly', 'insultingly', 'completely', 'completely', 'gravely', 'wiggly', 'equally', 'admittedly', 'fantastically', 'daily', 'constantly']


Let's use a smarter way to find adverbs:

In [6]:
import spacy
from spacy.cli import download

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
    

In [7]:
def true_adverbs_narrow(text: str):
    """Return adverbs that are ADV and function as advmod of VERB or ADJ."""
    doc = nlp(text)
    advs = []
    for t in doc:
        if t.pos_ == "ADV" and t.dep_ == "advmod" and t.head.pos_ in {"VERB", "ADJ"}:
            advs.append(t.text)
    return advs

In [8]:
pet_true_adv = true_adverbs_narrow(pet_norm)
sh_true_adv  = true_adverbs_narrow(shining_norm)


print("Pet Sematary filtered adverbs (first 30):", pet_true_adv[:30])
print("The Shining filtered adverbs (first 30):", sh_true_adv[:30])

print(f"Pet Sematary ADV count (filtered): {len(pet_true_adv)}")
print(f"The Shining  ADV count (filtered): {len(sh_true_adv)}")

Pet Sematary filtered adverbs (first 30): ['purely', 'most', 'ever', 'easily', 'away', 'finally', 'far', 'certainly', 'far', 'simply', 'before', 'also', 'later', 'very', 'just', 'there', 'long', 'sometimes', 'up', 'always', 'then', 'only', 'already', 'long', 'entirely', 'really', 'often', 'simply', 'just', 'only']
The Shining filtered adverbs (first 30): ['most', 'wholly', 'exceedingly', 'so', 'thus', 'more', 'fully', 'once', 'then', 'yet', 'curtly', 'perhaps', 'probably', 'away', 'fully', 'here', 'so', 'quite', 'back', 'completely', 'back', 'much', 'all', 'faintly', 'briskly', 'course', 'again', 'actually', 'around', 'away']
Pet Sematary ADV count (filtered): 6396
The Shining  ADV count (filtered): 6532


In [9]:
pet_true_rate = len(pet_true_adv) / len(pet_words) * 100
sh_true_rate  = len(sh_true_adv)  / len(shining_words) * 100

print(f"Pet Sematary (filtered ADV): {pet_true_rate:.2f}% of tokens")
print(f"The Shining  (filtered ADV): {sh_true_rate:.2f}% of tokens")


Pet Sematary (filtered ADV): 4.35% of tokens
The Shining  (filtered ADV): 4.03% of tokens
