
# 1) Frequent Words = Literary Fingerprints

This notebook compares **word frequency** between our two toy texts:
- *The Fellowship of the Ring* (here referenced as **Fellowship**)
- *The Return of the King* (here referenced as **TheKing**)

We practice simple tokenization and frequency analysis, then discuss
what's **meaningful signal** vs. **noise** in the results, and how to
improve the method (normalization, keyness, etc).


# Setup: Load Texts

This notebook needs **The Fellowship of the Ring** and **The Return of the King** as input texts.

**How to provide the texts:**

1. Download the following books as `.txt` files:

   

2. Place two text files in the "data" folder with names:
   - `Fellowship.txt`  (The Fellowship of the Ring)
   - `TheKing.txt` (The Return of the King)





In [14]:
from pathlib import Path
import re
from collections import Counter
import math

In [19]:
def load_texts(local_fellowship: str = '..\\data\\Fellowship.txt',
               local_return: str = '..\\data\\TheKing.txt'):
    """Load Fellowship and Return of the King texts from disk."""
    
    p1, p2 = Path(local_fellowship), Path(local_return)

    # Fail fast with a clear message if a file is missing
    if not p1.exists():
        raise FileNotFoundError(
            f"Missing file: {p1}\n"
            "→ Please make sure 'Fellowship.txt' is in the 'data' folder."
        )
    if not p2.exists():
        raise FileNotFoundError(
            f"Missing file: {p2}\n"
            "→ Please make sure 'TheKing.txt' is in the 'data' folder."
        )

    # Read the files
    fellowship_text = p1.read_text(encoding='utf-8', errors='ignore')
    return_text = p2.read_text(encoding='utf-8', errors='ignore')
    return fellowship_text, return_text

In [20]:
def normalize(text: str, is_fellowship: bool = False) -> str:
    """Normalize a text for tokenization."""
    if not text:
        return ''
    
    # If it's Fellowship, skip the Foreword and Prologue
    if is_fellowship:
        prologue_end = text.find('Chapter 1\n\nA Long-expected Party')
        if prologue_end != -1:
            text = text[prologue_end:]
    
    # For Return of the King
    if not is_fellowship:
        contents_end = text.find('Book V\n\nChapter 1. Minas Tirith')
        if contents_end != -1:
            text = text[contents_end:]

    return text.replace('\r\n', '\n')

In [21]:
# This new regex finds words like "don't" but skips junk like "'s"
WORD_RE = re.compile(r"\b[A-Za-z][A-Za-z']*\b") 

def words(text: str):
    """Smarter word tokenizer (lowercased, ASCII letters + internal apostrophes)."""
    return WORD_RE.findall(text.lower())


def sentences(text: str):
    """Naive sentence splitter using punctuation boundaries."""
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]


# --- Run the tokenizers ---
fellowship_words = words(fellowship)
return_king_words = words(return_king)

fellowship_sentences = sentences(fellowship)
return_king_sentences = sentences(return_king)

# Save total word counts for later
nF = len(fellowship_words) # Total words in Fellowship
nR = len(return_king_words) # Total words in TheKing

print(f"Fellowship words: {nF:,} | TheKing words: {nR:,}")
print(f"Fellowship sentences: {len(fellowship_sentences):,} | TheKing sentences: {len(return_king_sentences):,}")

NameError: name 'fellowship' is not defined

In [22]:
def top_words(words_list, min_len=4, extra_stop=None, n=30):
    """Return top-N frequent words after lightweight filtering."""
    base_stop = {
        'the','and','to','of','a','i','it','in','that','was','he','you','is','for','on','as',
        'with','his','her','at','be','she','had','not','but','said','they','them','this','so','all','one','very',
        'there','what','were','from','have','would','could','when','been','their','we','my','me','or','by','up','no','out','if',
        
        # --- CUSTOM TOLKIEN STOPWORDS (must be lowercase) ---
        'frodo', 'sam', 'pippin', 'merry', 'gandalf', 'aragorn', 'boromir', 'gimli', 'legolas', 'then', 'bilbo', 'samwise'
    }
    if extra_stop:
        base_stop |= set(extra_stop)

    c = Counter(w for w in words_list if len(w) >= min_len and w not in base_stop)
    return c.most_common(n)

In [23]:
def per_10k(count: int, total_words: int) -> float:
    """Normalize a raw count per 10,000 words for fair comparisons."""
    return (count / max(1, total_words)) * 10000.0


def log_likelihood(k1: int, n1: int, k2: int, n2: int) -> float:
    """Dunning’s log-likelihood (G^2) keyness score for word distinctiveness."""
    E1 = n1 * (k1 + k2) / max(1, (n1 + n2))
    E2 = n2 * (k1 + k2) / max(1, (n1 + n2))

    def term(k, E):
        return 0.0 if k == 0 or E == 0 else k * math.log(k / E)

    return 2.0 * (term(k1, E1) + term(k2, E2))


## Load & Normalize
We load both texts using **inline path checks** and then apply a simple normalization.


In [24]:

# Load raw texts
fellowship_raw, return_raw = load_texts()

# Normalize for tokenization
# We pass is_fellowship=True to tell our new function to skip the prologue
fellowship   = normalize(fellowship_raw, is_fellowship=True)
return_king = normalize(return_raw) 

# Update the print labels
print(f"Fellowship chars: {len(fellowship):,} | Return of the King chars: {len(return_king):,}")


Fellowship chars: 948,198 | Return of the King chars: 709,796



## Tokenize
We use a simple regex tokenizer (letters + apostrophes). For more serious work,
consider spaCy or stanza for tagging and lemmatization.


In [25]:
# --- Run the tokenizers ---
fellowship_words = words(fellowship)
return_king_words = words(return_king)

fellowship_sentences = sentences(fellowship)
return_king_sentences = sentences(return_king)

print(f"Fellowship words: {len(fellowship_words):,} | Return of the King words: {len(return_king_words):,}")
print(f"Fellowship sentences: {len(fellowship_sentences):,} | Return of the King sentences: {len(return_king_sentences):,}")


Fellowship words: 179,144 | Return of the King words: 136,735
Fellowship sentences: 10,880 | Return of the King sentences: 7,449



## Top Words (after basic stopwords)
The list is **partly signal, partly noise**—use it to start discussion.


In [26]:
print("Top Fellowship:", top_words(fellowship_words, n=15))
print("Top Return of the King:", top_words(return_king_words, n=15))


Top Fellowship: [('will', 559), ('into', 449), ('long', 421), ('came', 421), ('down', 418), ('again', 412), ('like', 403), ('more', 388), ('before', 378), ('come', 377), ('your', 347), ('some', 346), ('great', 335), ('back', 334), ('many', 334)]
Top Return of the King: [('will', 588), ('came', 455), ('great', 443), ('come', 388), ('your', 335), ('more', 331), ('like', 302), ('down', 299), ('into', 293), ('before', 291), ('upon', 289), ('long', 281), ('some', 275), ('again', 272), ('still', 269)]


## Optional continution:


## Distinctiveness via Log-Likelihood (Keyness)
Raw frequency is not enough. Compute **G²** to find words that are *distinctive* of each book.


In [27]:
def per_10k(count: int, total_words: int) -> float:
    """Normalize a raw count per 10,000 words for fair comparisons."""
    return (count / max(1, total_words)) * 10000.0


def lolookingglass_likelihood(k1: int, n1: int, k2: int, n2: int) -> float:
    """Dunning’s log-likelihood (G^2) keyness score for word distinctiveness.

    Parameters
    ----------
    k1 : int  Frequency in corpus A
    n1 : int  Total words in corpus A
    k2 : int  Frequency in corpus B
    n2 : int  Total words in corpus B

    Returns
    -------
    float
        G^2 value; larger absolute values indicate stronger distinctiveness.
        Direction should be interpreted by comparing rates (per_10k) or counts.

    Notes
    -----
    - Symmetric measure widely used for corpus comparison.
    - Great classroom upgrade over raw frequency lists.
    """
    E1 = n1 * (k1 + k2) / max(1, (n1 + n2))
    E2 = n2 * (k1 + k2) / max(1, (n1 + n2))

    def term(k, E):
        return 0.0 if k == 0 or E == 0 else k * math.log(k / E)

    return 2.0 * (term(k1, E1) + term(k2, E2))


In [29]:
# Build frequency dictionaries
cf = Counter(fellowship_words)
cr = Counter(return_king_words)
nF, nR = sum(cf.values()), sum(cr.values())

# Compare a candidate set (union of top ~500 from each to keep it fast)
candidates = set([w for w,_ in cf.most_common(500)] + [w for w,_ in cr.most_common(500)])

rows = []
for w in candidates:
    # (filter out any remaining stopwords that slipped through)
    if w in {'frodo', 'sam', 'pippin', 'merry', 'gandalf', 'aragorn', 'boromir', 'gimli', 'legolas', 'then', 'bilbo', 'samwise'}:
        continue
    
    # This line is now corrected (per_10k instead of per_1ci0k)
    g2 = log_likelihood(cf[w], nF, cr[w], nR)
    rows.append((g2, w, per_10k(cf[w], nF), per_10k(cr[w], nR)))

# Sort by distinctiveness (descending)
rows.sort(reverse=True)

print("--- 20 Most Distinctive Words (Log-Likelihood) ---")
print(f"{'WORD':>12}  {'G2_SCORE':>8}  {'FELLOWSHIP_RATE':>16}  {'RETURN_RATE':>12}")
print("-" * 52)

for g2, w, f10k, r10k in rows[:20]:
    # We check which rate is higher to label the "direction"
    if f10k > r10k:
        print(f"{w:>12}  {g2:8.1f}  {f10k:16.2f}  {r10k:12.2f}  (More in Fellowship)")
    else:
        print(f"{w:>12}  {g2:8.1f}  {f10k:16.2f}  {r10k:12.2f}  (More in Return of the King)")

--- 20 Most Distinctive Words (Log-Likelihood) ---
        WORD  G2_SCORE   FELLOWSHIP_RATE   RETURN_RATE
----------------------------------------------------
           s     983.1              0.45         46.29  (More in Return of the King)
           t     499.0              0.00         21.79  (More in Return of the King)
     faramir     271.3              0.00         11.85  (More in Return of the King)
        king     266.8              1.34         17.48  (More in Return of the King)
       don't     239.3             11.78          0.00  (More in Fellowship)
          ll     217.7              0.00          9.51  (More in Return of the King)
         men     203.6              6.25         25.74  (More in Return of the King)
        city     199.4              1.17         13.60  (More in Return of the King)
         don     165.8              0.00          7.24  (More in Return of the King)
        lord     164.0              3.41         17.33  (More in Return of the King)

In [32]:
import pandas as pd
import altair as alt

print("\n--- Combined Character & Pronoun Focus Analysis ---")

# 1. Define all the terms you want to track in the graph
# We'll create "pseudo-words" for the pronoun groups
target_terms = [
    'he/him', 
    'she/her', 
    'gandalf', 
    'aragorn', 
    'frodo', 
    'sam',
    'boromir', 
    'bilbo',
    'faramir',
    'denethor'
]

# 2. Get the counts and rates for each term
combined_data = []

for term in target_terms:
    fellowship_count = 0
    return_count = 0
    
    # Handle the special pronoun groups
    if term == 'he/him':
        fellowship_count = cf['he'] + cf['him']
        return_count = cr['he'] + cr['him']
    elif term == 'she/her':
        fellowship_count = cf['she'] + cf['her']
        return_count = cr['she'] + cr['her']
    # Handle regular names
    else:
        fellowship_count = cf[term]
        return_count = cr[term]
        
    # Add data for Fellowship
    combined_data.append({
        'Book': 'Fellowship',
        'Term': term,
        'Rate (per 10k words)': per_10k(fellowship_count, nF)
    })
    
    # Add data for TheKing
    combined_data.append({
        'Book': 'TheKing',
        'Term': term,
        'Rate (per 10k words)': per_10k(return_count, nR)
    })

# 3. Create a DataFrame
chart_df = pd.DataFrame(combined_data)

# 4. Build the Grouped Bar Chart
chart = alt.Chart(chart_df).mark_bar().encode(
    # Set the x-axis to the Book name
    x=alt.X('Book', axis=None), # axis=None hides the "Fellowship/TheKing" label under each bar
    
    # Set the y-axis to the normalized rate
    y=alt.Y('Rate (per 10k words)'),
    
    # Color the bars based on the Book
    color='Book',
    
    # Create grouped columns, one for each "Term"
    column=alt.Column('Term', sort=target_terms, header=alt.Header(titleOrient="bottom", labelOrient="bottom")),
    
    tooltip=['Book', 'Term', alt.Tooltip('Rate (per 10k words)', format='.2f')]
).properties(
    title='Combined Character & Pronoun Focus: Fellowship vs. TheKing'
).interactive()

# 5. Save and Display the Chart
chart.save('combined_character_focus_chart.json')
print("Graph saved to 'combined_character_focus_chart.json'")

chart


--- Combined Character & Pronoun Focus Analysis ---
Graph saved to 'combined_character_focus_chart.json'
