# Text Analysis of Harry Potter Saga

This notebook implements the "Trial of J.K. Rowling" challenge: analyzing character appearances and frequencies in the Harry Potter books.

## Import Libraries

In [24]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/nimzero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Define Characters

In [25]:
characters = [
    'Harry Potter',
    'Hermione Granger',
    'Ron Weasley',
    'Albus Dumbledore',
    'Severus Rogue',
    'Voldemort',
    'Drago Malefoy',
    'Neville Londubat',
    'Luna Lovegood',
    'Ginny Weasley'
]

# Build regex patterns for each character to capture common French variants
import re, unicodedata
character_patterns = {
    'Harry Potter': re.compile(r"\b(harry|harry\s+potter|potter)\b", re.IGNORECASE),
    'Hermione Granger': re.compile(r"\b(hermione|hermione\s+granger|granger)\b", re.IGNORECASE),
    'Ron Weasley': re.compile(r"\b(ron|ron\s+weasley|weasley)\b", re.IGNORECASE),
    'Albus Dumbledore': re.compile(r"\b(dumbledore|albus|albus\s+dumbledore)\b", re.IGNORECASE),
    'Severus Rogue': re.compile(r"\b(rogue|severus|severus\s+rogue|snape|severus\s+snape)\b", re.IGNORECASE),
    'Voldemort': re.compile(r"\b(voldemort|tom\s+jedusor|tom\s+elvis\s+jedusor|celui[-\s]?dont[-\s]?on[-\s]?ne[-\s]?doit[-\s]?pas[-\s]?prononcer[-\s]?le[-\s]?nom|vous[-\s]?savez[-\s]?qui)\b", re.IGNORECASE),
    # For Draco/Drago: avoid matching 'Draconis' and capture French forms (Drago Malefoy / Malefoy / Malfoy)
    'Drago Malefoy': re.compile(r"\b(drago(?![a-z])|drago\s+malefoy|draco(?![a-z])|draco\s+malfoy|(?<!lucius\s)(malefoy|malfoy|mal[eé]foy)|m\.?\s*malefoy|m\.?\s*malfoy)\b", re.IGNORECASE),
    'Neville Londubat': re.compile(r"\b(neville|neville\s+londubat|londubat|longbottom)\b", re.IGNORECASE),
    'Luna Lovegood': re.compile(r"\b(luna|luna\s+lovegood|lovegood)\b", re.IGNORECASE),
    'Ginny Weasley': re.compile(r"\b(ginny|ginny\s+weasley|weasley)\b", re.IGNORECASE),
}

## Load Texts

In [26]:
text_dir = '../datasets/texts'
books = {}
os.makedirs(text_dir, exist_ok=True)

# Prefer PDF files if present; fall back to .txt files
pdf_files = [f for f in os.listdir(text_dir) if f.lower().endswith('.pdf')]
txt_files = [f for f in os.listdir(text_dir) if f.lower().endswith('.txt')]

if pdf_files:
    from PyPDF2 import PdfReader
    for file in pdf_files:
        path = os.path.join(text_dir, file)
        try:
            reader = PdfReader(path)
            text = ''
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + '\n'
            books[file] = text
        except Exception as e:
            print(f'Failed to read PDF {file}: {e}')
elif txt_files:
    for file in txt_files:
        with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
            books[file] = f.read()
else:
    raise FileNotFoundError('No PDF or TXT files found in datasets/texts/. Please add the book PDFs or .txt files.')

In [27]:
import unicodedata

def normalize_text(s):
    if not s:
        return ''
    # Normalize unicode, remove accents, and collapse whitespace
    s = unicodedata.normalize('NFKD', s)
    s = ''.join(c for c in s if not unicodedata.combining(c))
    s = re.sub(r'\s+', ' ', s)
    return s

# Per book analysis using regex patterns
book_data = []
for book_name, text in books.items():
    book_mentions = {}
    norm_text = normalize_text(text).lower() if text else ''
    for char, pattern in character_patterns.items():
        try:
            count = len(pattern.findall(norm_text))
        except Exception:
            count = 0
        book_mentions[char] = count
    book_mentions['Book'] = book_name
    book_data.append(book_mentions)

df_books = pd.DataFrame(book_data)
if 'Book' in df_books.columns:
    df_books = df_books.set_index('Book')
else:
    df_books.index = list(books.keys())

# Total mentions
total_mentions = df_books.sum()
print('Total mentions computed for characters:\n', total_mentions)

# Visualizations
import matplotlib
matplotlib.use('Agg')
plt.figure(figsize=(12, 6))
total_mentions.sort_values(ascending=False).plot(kind='bar')
plt.title('Total Character Mentions Across All Books')
plt.xlabel('Character')
plt.ylabel('Mentions')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('../visualizations/total_mentions.png')
plt.close()

# Heatmap per book (if multiple books)
if df_books.shape[0] > 0:
    plt.figure(figsize=(12, 8))
    sns.heatmap(df_books.T, annot=True, fmt='d', cmap='YlOrRd')
    plt.title('Character Mentions Per Book')
    plt.xlabel('Book')
    plt.ylabel('Character')
    plt.tight_layout()
    plt.savefig('../visualizations/mentions_per_book.png')
    plt.close()

# If books are ordered, plot trends
book_order = list(books.keys())
if df_books.shape[0] > 0:
    df_ordered = df_books.loc[book_order] if set(book_order) == set(df_books.index) else df_books
    plt.figure(figsize=(12, 6))
    for char in characters:
        if char in df_ordered.columns:
            plt.plot(df_ordered.index, df_ordered[char], label=char, marker='o')
    plt.title('Character Mention Trends Across Books')
    plt.xlabel('Book')
    plt.ylabel('Mentions')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig('../visualizations/mention_trends.png')
    plt.close()

Total mentions computed for characters:
 Harry Potter        18218
Hermione Granger     5381
Ron Weasley          7632
Albus Dumbledore     3369
Severus Rogue        1892
Voldemort            1326
Drago Malefoy        1459
Neville Londubat      790
Luna Lovegood         421
Ginny Weasley        2382
dtype: int64


In [28]:
# Diagnostic: show contexts where Draco or Malfoy appear
for name in ['Draco', 'Malfoy']:
    print('---', name, '---')
    for book, text in books.items():
        matches = [m.start() for m in re.finditer(name, text, re.IGNORECASE)]
        print(book, len(matches), 'matches')
        for i, pos in enumerate(matches[:5]):
            start = max(0, pos-30)
            end = min(len(text), pos+30)
            print('...', text[start:end].replace('\n', ' '))
        print()

--- Draco ---
harry-potter-3-le-prisonnier-dazkaban.pdf 0 matches

harry-potter-1-lecole-des-sorciers.pdf 1 matches
... asse	?	demanda-t-elle. —Caput	Draconis,	dit	Percy	et	le	tabl

harry-potter-2-la-chambre-des-secrets.pdf 0 matches

harry-potter-4-la-coupe-de-feu.pdf 0 matches

harry-potter-7-les-reliques-de-la-mort.pdf 0 matches

harry-potter-6-le-prince-de-sang-mecc82lecc81.pdf 0 matches

harry-potter-5-lordre-du-phoenix.pdf 0 matches

--- Malfoy ---
harry-potter-3-le-prisonnier-dazkaban.pdf 0 matches

harry-potter-1-lecole-des-sorciers.pdf 0 matches

harry-potter-2-la-chambre-des-secrets.pdf 0 matches

harry-potter-4-la-coupe-de-feu.pdf 0 matches

harry-potter-7-les-reliques-de-la-mort.pdf 0 matches

harry-potter-6-le-prince-de-sang-mecc82lecc81.pdf 0 matches

harry-potter-5-lordre-du-phoenix.pdf 0 matches

