# 5) Color Words & Description Density

**Goal:** Count color terms and compare description density.

# Setup: Load Texts

This notebook needs **Fellowship of the King** and **The return of the King** as input texts.

**How to provide the texts:**
1. Aquire books through all means necessary

2. Place two text files in the "data" folder with names:
   - `Fellowship.txt`  (Fellowship of the King)
   - `TheKing.txt` (The return of the King)

In [1]:
import re
from pathlib import Path
from collections import Counter

In [8]:

def load_texts(local_fellowship: str = '..\\data\\Fellowship.txt',
               local_theking: str = '..\\data\\TheKing.txt'):
    """Load Fellowship
 and Looking-theking texts from disk.

    Parameters
    ----------
    local_fellowship : str
        Path to Fellowship
     text file. Defaults to '../data/Fellowship
    .txt'.
    local_theking : str
        Path to Looking-theking text file. Defaults to '../data/Looking-theking.txt'.

    Returns
    -------
    tuple[str, str]
        (Fellowship
    _text, theking
    _text).

    Raises
    ------
    FileNotFoundError
        If either file is missing.

    Extra Notes
    -----------
    - Using UTF-8 with `errors='ignore'` avoids codec exceptions on
      older Project Gutenberg dumps or inconsistent encodings.
    """
    p1, p2 = Path(local_fellowship), Path(local_theking)

    # Fail fast with a clear message if a file is missing
    if not p1.exists():
        raise FileNotFoundError(
            f"Missing file: {p1}\n"
            "→ Please place 'Fellowship.txt' at this path or update load_texts(...)."
        )
    if not p2.exists():
        raise FileNotFoundError(
            f"Missing file: {p2}\n"
            "→ Please place 'Looking-theking.txt' at this path or update load_texts(...)."
        )

    # Read the files (UTF-8; ignore undecodable bytes to stay robust)
    fellowship = p1.read_text(encoding='utf-8', errors='ignore')
    theking = p2.read_text(encoding='utf-8', errors='ignore')
    return fellowship, theking


def normalize(text: str, is_fellowship: bool = False) -> str:
    """Normalize a text for tokenization."""
    if not text:
        return ''
    
    # If it's Fellowship, skip the Foreword and Prologue
    if is_fellowship:
        prologue_end = text.find('Chapter 1\n\nA Long-expected Party')
        if prologue_end != -1:
            text = text[prologue_end:]
    
    # For Return of the King
    if not is_fellowship:
        contents_end = text.find('Book V\n\nChapter 1. Minas Tirith')
        if contents_end != -1:
            text = text[contents_end:]

    return text.replace('\r\n', '\n')

# Load raw texts
fellowship_raw, theking_raw = load_texts()

# Normalize for tokenization
fellowship = normalize(fellowship_raw, is_fellowship=True)
theking = normalize(theking_raw, is_fellowship=False) 

print(f"Fellowship chars: {len(fellowship):,} | TheKing chars: {len(theking):,}")

Fellowship chars: 948,198 | TheKing chars: 709,796


### Helpers: Tokenization

In [9]:
# This new regex finds words like "don't" but skips junk like "'s"
WORD_RE = re.compile(r"\b[A-Za-z][A-Za-z']*\b") 

def words(text: str):
    """Smarter word tokenizer (lowercased, ASCII letters + internal apostrophes)."""
    return WORD_RE.findall(text.lower())


def sentences(text: str):
    """Naive sentence splitter using punctuation boundaries."""
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]


# --- Run the tokenizers ---
fellowship_words = words(fellowship)
theking_words = words(theking)

fellowship_sentences = sentences(fellowship)
theking_sentences = sentences(theking)

# Save total word counts for later
nF = len(fellowship_words) # Total words in Fellowship
nR = len(theking_words) # Total words in TheKing

print(f"Fellowship words: {nF:,} | TheKing words: {nR:,}")
print(f"Fellowship sentences: {len(fellowship_sentences):,} | TheKing sentences: {len(theking_sentences):,}")

Fellowship words: 179,144 | TheKing words: 136,735
Fellowship sentences: 10,880 | TheKing sentences: 7,449


### Count Color Terms

In [11]:
COLOR_LIST = {
    'red','orange','yellow','green','blue','indigo','violet','purple','pink','brown','black','white','gray','grey',
    'scarlet','crimson','emerald','amber','gold','silver','lavender','mauve','ivory','beige','teal','turquoise','magenta','maroon','navy'
}
def count_colors(tokens):
    c = Counter(w for w in tokens if w in COLOR_LIST)
    return c, sum(c.values()), len(tokens)

f_c, f_hits, f_total = count_colors(fellowship_words)
r_c, r_hits, r_total = count_colors(theking_words)
print(f"Fellowship top:", f_c.most_common(15), f"| rate per 100k: {(f_hits/f_total)*100000:.2f}")
print(f"TheKing top:", r_c.most_common(15), f"| rate per 100k: {(r_hits/r_total)*100000:.2f}")


Fellowship top: [('black', 170), ('white', 168), ('grey', 133), ('green', 114), ('silver', 82), ('gold', 68), ('red', 53), ('blue', 38), ('yellow', 33), ('brown', 28), ('scarlet', 2), ('emerald', 1)] | rate per 100k: 496.81
TheKing top: [('black', 172), ('white', 123), ('grey', 117), ('red', 71), ('green', 62), ('silver', 57), ('gold', 18), ('blue', 13), ('brown', 9), ('yellow', 4), ('scarlet', 4), ('purple', 1), ('ivory', 1)] | rate per 100k: 476.83


**Discuss:** Where do color bursts cluster in the narrative? What scenes rely on color to signal mood or magic?

In [12]:
def rolling_color_windows(tokens, window=800, step=200, color_list=COLOR_LIST):
    hits_per_window = []
    for i in range(0, max(1, len(tokens)-window+1), step):
        chunk = tokens[i:i+window]
        c = sum(1 for w in chunk if w in color_list)
        hits_per_window.append((i, i+window, c, c * (100000/window)))  # per 100k
    return hits_per_window

def nearest_sentence_span(tokens, sents, start_idx, end_idx):
    # approximate: map token range to a sentence slice of similar length
    # (good enough for previewing passages)
    text = " ".join(tokens[start_idx:end_idx])
    # find a sentence that contains first few words of the window
    head = " ".join(tokens[start_idx:start_idx+20])
    for k, s in enumerate(sents):
        if head[:50] in s:
            j0 = max(0, k-1); j1 = min(len(sents), k+3)
            return j0, j1, " ".join(sents[j0:j1])
    return None, None, text[:500]

# run it
f_roll = rolling_color_windows(fellowship_words, window=800, step=200)
r_roll = rolling_color_windows(theking_words, window=800, step=200)

# top 5 bursts by per-100k
f_top = sorted(f_roll, key=lambda x: x[3], reverse=True)[:5]
r_top = sorted(r_roll, key=lambda x: x[3], reverse=True)[:5]

print("=== Fellowship color bursts ===")
for a,b,h,rate in f_top:
    j0,j1,preview = nearest_sentence_span(fellowship_words, fellowship_sentences, a,b)
    print(f"[tokens {a}-{b}] hits={h} | {rate:.0f} per 100k")
    print(preview[:400], "…\n")

print("=== TheKing color bursts ===")
for a,b,h,rate in r_top:
    j0,j1,preview = nearest_sentence_span(theking_words, theking_sentences, a,b)
    print(f"[tokens {a}-{b}] hits={h} | {rate:.0f} per 100k")
    print(preview[:400], "…\n")

=== Fellowship color bursts ===
[tokens 161200-162000] hits=22 | 2750 per 100k
company the clothes they had brought for each they had provided a hood and cloak made according to his size of the light but warm silken stuff that the galadhrim wove it was hard to say of what colour they were grey with the hue of twilight under the trees they seemed to be and yet if they were moved or set in another light they were green as shadowed leaves or brown as fallow fields by night dusk …

[tokens 161800-162600] hits=20 | 2500 per 100k
out into a narrow tongue between bright margins on the right and west the silverlode flowed glittering on the left and east the great river rolled its broad waters deep and dark on the further shores the woodlands still marched on southwards as far as the eye could see but all the banks were bleak and bare no mallorn lifted its gold hung boughs beyond the land of lorien on the bank of the silverlo …

[tokens 153000-153800] hits=19 | 2375 per 100k
the twilight like l

In [13]:
import pandas as pd
import altair as alt

# This helper function is from Notebook 1
def per_10k(count: int, total_words: int) -> float:
    """Normalize a raw count per 10,000 words for fair comparisons."""
    return (count / max(1, total_words)) * 10000.0

# Define the colors from your research question
# We add 'fire' to the color list for this analysis
target_colors = ['green', 'gold', 'black', 'red', 'fire', 'white']

# Create a list of data for the DataFrame
chart_data = []
for color in target_colors:
    chart_data.append({
        'Book': 'Fellowship', 
        'Color': color, 
        # Use per_10k with the nF (total fellowship words) from Cell 8
        'Rate': per_10k(f_c[color], nF) 
    })
    chart_data.append({
        'Book': 'TheKing', 
        'Color': color, 
        # Use per_10k with the nR (total theking words) from Cell 8
        'Rate': per_10k(r_c[color], nR)
    })

chart_df = pd.DataFrame(chart_data)

# Build a grouped bar chart
chart = alt.Chart(chart_df).mark_bar().encode(
    # On the x-axis, show the Color, grouped by Book
    x=alt.X('Book', axis=None),
    # On the y-axis, show the Rate
    y=alt.Y('Rate', title='Rate (per 10k words)'),
    # Use color to distinguish the books
    color='Book',
    # Create a separate column for each color word
    column=alt.Column('Color', header=alt.Header(titleOrient="bottom", labelOrient="bottom")),
    tooltip=['Book', 'Color', 'Rate']
).properties(
    title='Color & Mood: Nature vs. War in Tolkien'
).interactive()

# Save the chart as a JSON file
chart.save('color_comparison_chart.json')
print("Graph saved to 'color_comparison_chart.json'")

# Display the chart
chart

Graph saved to 'color_comparison_chart.json'
