# Text Analysis of Harry Potter Saga

This notebook implements the "Trial of J.K. Rowling" challenge: analyzing character appearances and frequencies in the Harry Potter books.

In [1]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
import unicodedata
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/nimzero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Define Characters

In [2]:
characters = [
    'Harry Potter',
    'Hermione Granger',
    'Ron Weasley',
    'Albus Dumbledore',
    'Severus Rogue',
    'Voldemort',
    'Drago Malefoy',
    'Neville Londubat',
    'Luna Lovegood',
    'Ginny Weasley'
]

# Build regex patterns for each character (French/English variants)
character_patterns = {
    'Harry Potter': re.compile(r"\b(harry|harry\s+potter|potter)\b", re.IGNORECASE),
    'Hermione Granger': re.compile(r"\b(hermione|hermione\s+granger|granger)\b", re.IGNORECASE),
    'Ron Weasley': re.compile(r"\b(ron|ron\s+weasley|weasley)\b", re.IGNORECASE),
    'Albus Dumbledore': re.compile(r"\b(dumbledore|albus|albus\s+dumbledore)\b", re.IGNORECASE),
    'Severus Rogue': re.compile(r"\b(rogue|snape|severus)\b", re.IGNORECASE),
    'Voldemort': re.compile(r"\b(voldemort|tom\s+jedusor|vous\s+savez\s+qui)\b", re.IGNORECASE),
    'Drago Malefoy': re.compile(r"\b(drago(?![a-z])|draco(?![a-z])|malefoy|malfoy|mal[eé]foy)\b", re.IGNORECASE),
    'Neville Londubat': re.compile(r"\b(neville|neville\s+londubat|londubat|longbottom)\b", re.IGNORECASE),
    'Luna Lovegood': re.compile(r"\b(luna|luna\s+lovegood|lovegood)\b", re.IGNORECASE),
    'Ginny Weasley': re.compile(r"\b(ginny|ginny\s+weasley|weasley)\b", re.IGNORECASE),
}

## Load Texts

In [3]:
text_dir = '../datasets/texts'
books = {}
os.makedirs(text_dir, exist_ok=True)
pdf_files = [f for f in os.listdir(text_dir) if f.lower().endswith('.pdf')]
txt_files = [f for f in os.listdir(text_dir) if f.lower().endswith('.txt')]
if pdf_files:
    try:
        from PyPDF2 import PdfReader
    except Exception:
        PdfReader = None
    for file in pdf_files:
        path = os.path.join(text_dir, file)
        if PdfReader is None:
            print('PyPDF2 not available; skipping PDF', file)
            continue
        try:
            reader = PdfReader(path)
            text = ''
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + '\n'
            books[file] = text
        except Exception as e:
            print(f'Failed to read PDF {file}: {e}')
elif txt_files:
    for file in txt_files:
        with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
            books[file] = f.read()
else:
    print('No PDF or TXT files found in datasets/texts. Add book texts to run full analysis.')

In [4]:
# Normalize helper
def normalize_text(s):
    if not s:
        return ''
    s = unicodedata.normalize('NFKD', s)
    s = ''.join(c for c in s if not unicodedata.combining(c))
    s = re.sub(r'\s+', ' ', s)
    return s

# Per book analysis using character_patterns
book_data = []
for book_name, text in books.items():
    book_mentions = {}
    norm_text = normalize_text(text).lower() if text else ''
    for char, pattern in character_patterns.items():
        try:
            count = len(pattern.findall(norm_text))
        except Exception:
            count = 0
        book_mentions[char] = count
    book_mentions['Book'] = book_name
    book_data.append(book_mentions)

df_books = pd.DataFrame(book_data)
if 'Book' in df_books.columns:
    df_books = df_books.set_index('Book')
else:
    df_books.index = list(books.keys())

# Total mentions
total_mentions = df_books.sum() if not df_books.empty else pd.Series()
print('Total mentions computed for characters:\n', total_mentions)

# Save basic visualizations
import matplotlib
matplotlib.use('Agg')
os.makedirs('../visualizations', exist_ok=True)
if not total_mentions.empty:
    plt.figure(figsize=(12, 6))
    total_mentions.sort_values(ascending=False).plot(kind='bar')
    plt.title('Total Character Mentions Across All Books')
    plt.xlabel('Character')
    plt.ylabel('Mentions')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('../visualizations/total_mentions.png')
    plt.close()

# Heatmap per book (if multiple books)
if df_books.shape[0] > 0:
    plt.figure(figsize=(12, 8))
    sns.heatmap(df_books.T, annot=True, fmt='d', cmap='YlOrRd')
    plt.title('Character Mentions Per Book')
    plt.xlabel('Book')
    plt.ylabel('Character')
    plt.tight_layout()
    plt.savefig('../visualizations/mentions_per_book.png')
    plt.close()

Total mentions computed for characters:
 Harry Potter        18218
Hermione Granger     5381
Ron Weasley          7632
Albus Dumbledore     3369
Severus Rogue        1892
Voldemort            1212
Drago Malefoy        1542
Neville Londubat      790
Luna Lovegood         421
Ginny Weasley        2382
dtype: int64


In [5]:
# Diagnostic: show contexts where Draco or Malfoy appear
for name in ['Draco', 'Malfoy']:
    print('---', name, '---')
    for book, text in books.items():
        matches = [m.start() for m in re.finditer(name, text, re.IGNORECASE)]
        print(book, len(matches), 'matches')
        for i, pos in enumerate(matches[:5]):
            start = max(0, pos-30)
            end = min(len(text), pos+30)
            print('...', text[start:end].replace('\n', ' '))
        print()

--- Draco ---
harry-potter-3-le-prisonnier-dazkaban.pdf 0 matches

harry-potter-1-lecole-des-sorciers.pdf 1 matches
... asse	?	demanda-t-elle. —Caput	Draconis,	dit	Percy	et	le	tabl

harry-potter-2-la-chambre-des-secrets.pdf 0 matches

harry-potter-4-la-coupe-de-feu.pdf 0 matches

harry-potter-7-les-reliques-de-la-mort.pdf 0 matches

harry-potter-6-le-prince-de-sang-mecc82lecc81.pdf 0 matches

harry-potter-5-lordre-du-phoenix.pdf 0 matches

--- Malfoy ---
harry-potter-3-le-prisonnier-dazkaban.pdf 0 matches

harry-potter-1-lecole-des-sorciers.pdf 0 matches

harry-potter-2-la-chambre-des-secrets.pdf 0 matches

harry-potter-4-la-coupe-de-feu.pdf 0 matches

harry-potter-7-les-reliques-de-la-mort.pdf 0 matches

harry-potter-6-le-prince-de-sang-mecc82lecc81.pdf 0 matches

harry-potter-5-lordre-du-phoenix.pdf 0 matches



## Advanced requested analyses

This section computes:\n
- How often Harry's scar hurts (heuristic)\n
- Number of times Hermione says a line starting with 'Mais'\n
- Heuristic for Dumbledore decisive actions\n
- Comparison of speaking turns for Harry, Hermione and Ron\n
- Rogue mysterious/dark occurrences\n
- Counts of morally/legally questionable acts\n
- Per-book breakdown and normalization per 100 pages

In [6]:
# Advanced analysis implementation (cleaned)
import os, re
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Patterns/heuristics
scar_p = re.compile(r'\b(scar|cicatrice)\b.*?(hurt|hurts|pain|ache|stings|sting|brul|fait mal|douleur)', re.IGNORECASE)
scar_alt = re.compile(r'(hurt|pain|ache|stings|sting|brul|fait mal|douleur).*?(scar|cicatrice)', re.IGNORECASE)
hermione_mais_p = re.compile(r'(^|\n)\s*Mais\b', re.IGNORECASE | re.MULTILINE)
dumbledore_act_p = re.compile(r'\b(dumbledore|albus)\b(.{0,120}?)\b(decid|choose|order|arrang|turn|change|influence|intervene|ordonner|decide|choisit|intervient)', re.IGNORECASE)
rogue_dark_p = re.compile(r'\b(snape|rogue|severus)\b(.{0,80}?)\b(mysterious|mysterieux|sinister|sombre|dark|etrange|ambig)', re.IGNORECASE)
crime_keywords = [r'kill', r'murder', r'steal', r'torture', r'assault', r'attack', r'poison', r'vol', r'voler', r'assassin', r'assassinat', r'agression', r'attaque', r'meurtr', r'sacrifice', r'kidnap', r'enlevement']
crime_p = re.compile('|'.join(crime_keywords), re.IGNORECASE)

results = defaultdict(dict)
for book_name, text in books.items():
    raw = text or ''
    norm = normalize_text(raw)
    lower = norm.lower()
    chars = len(raw)
    pages_est = max(1, int(chars / 1800))
    scar_count = len(scar_p.findall(lower)) + len(scar_alt.findall(lower))
    hermione_mais_count = len(hermione_mais_p.findall(raw))
    dumbledore_actions = len(dumbledore_act_p.findall(lower))
    rogue_dark = len(rogue_dark_p.findall(lower))
    crimes = len(crime_p.findall(lower))
    # Speaking turns (crude)
    speakers = {'Harry Potter':0, 'Hermione Granger':0, 'Ron Weasley':0}
    for qm in re.finditer(r'(["])(.*?)(["])', raw, re.DOTALL):
        qstart = qm.start()
        qend = qm.end()
        window = raw[max(0,qstart-200):min(len(raw), qend+200)].lower()
        if 'harry' in window or 'potter' in window:
            speakers['Harry Potter'] += 1
        if 'hermione' in window or 'granger' in window:
            speakers['Hermione Granger'] += 1
        if re.search(r'\bron\b', window) or 'weasley' in window:
            speakers['Ron Weasley'] += 1
    results[book_name]['scar_count'] = scar_count
    results[book_name]['hermione_mais'] = hermione_mais_count
    results[book_name]['dumbledore_actions'] = dumbledore_actions
    results[book_name]['rogue_dark'] = rogue_dark
    results[book_name]['crime_events'] = crimes
    results[book_name]['pages_est'] = pages_est
    factor = 100.0 / pages_est if pages_est else 1.0
    for k in ['scar_count','hermione_mais','dumbledore_actions','rogue_dark','crime_events']:
        results[book_name][k + '_per_100_pages'] = round(results[book_name][k] * factor, 2)
    for s,v in speakers.items():
        results[book_name][s + '_speeches'] = v
        results[book_name][s + '_speeches_per_100_pages'] = round(v * factor, 2)

df_adv = pd.DataFrame.from_dict(results, orient='index').fillna(0)
os.makedirs('../reports', exist_ok=True)
df_adv.to_csv('../reports/advanced_analysis.csv')
print('Advanced analysis saved to ../reports/advanced_analysis.csv')

# Simple visualizations
os.makedirs('../visualizations', exist_ok=True)
if not df_adv.empty:
    totals = df_adv[[c for c in df_adv.columns if c.endswith('_per_100_pages')]].sum().sort_values(ascending=False)
    plt.figure(figsize=(10,6))
    totals.plot(kind='bar')
    plt.title('Totals per 100 pages (all books)')
    plt.tight_layout()
    plt.savefig('../visualizations/advanced_totals_per_100_pages.png')
    plt.close()
    speaker_cols = [c for c in df_adv.columns if c.endswith('_speeches') and ('Harry' in c or 'Hermione' in c or 'Ron' in c)]
    if speaker_cols:
        df_adv[speaker_cols].plot(kind='bar', figsize=(12,6))
        plt.title('Speaking turns: Harry vs Hermione vs Ron (per book)')
        plt.tight_layout()
        plt.savefig('../visualizations/speakers_per_book.png')
        plt.close()
print('Visualizations saved to ../visualizations/')

Advanced analysis saved to ../reports/advanced_analysis.csv
Visualizations saved to ../visualizations/


### Graphes supplémentaires

Ce bloc crée des graphiques détaillés pour chaque métrique demandée (comptages bruts et normalisés par 100 pages), des tendances par livre, ainsi qu'une heatmap résumant les métriques par livre. Les fichiers PNG sont sauvegardés dans `../visualizations/`.

In [7]:
# Additional plotting: raw counts, per-100-pages, trends and heatmap
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import os
# ensure df_adv exists (from previous cell). If not, try loading the CSV
if 'df_adv' not in globals():
    try:
        df_adv = pd.read_csv('../reports/advanced_analysis.csv', index_col=0)
        print('Loaded ../reports/advanced_analysis.csv')
    except Exception as e:
        raise RuntimeError('df_adv not found and failed to load CSV: ' + str(e))
os.makedirs('../visualizations', exist_ok=True)
# Define metric groups
metrics = ['scar_count','hermione_mais','dumbledore_actions','rogue_dark','crime_events']
metrics_per100 = [m + '_per_100_pages' for m in metrics]
# 1) Raw counts per book (bar chart for each metric)
for m in metrics:
    if m in df_adv.columns:
        plt.figure(figsize=(10,4))
        df_adv[m].sort_values(ascending=False).plot(kind='bar', color='C0')
        plt.title(f'Raw counts: {m} per book')
        plt.ylabel('Count')
        plt.xlabel('Book')
        plt.tight_layout()
        fname = f'../visualizations/{m}_per_book.png'
        plt.savefig(fname)
        plt.close()
        print('Saved', fname)
# 2) Per-100-pages normalized metrics (grouped bar)
available = [c for c in metrics_per100 if c in df_adv.columns]
if available:
    plt.figure(figsize=(12,6))
    df_adv[available].plot(kind='bar')
    plt.title('Metrics per 100 pages (per book)')
    plt.ylabel('Count per 100 pages')
    plt.xlabel('Book')
    plt.tight_layout()
    fname = '../visualizations/metrics_per_100_pages_per_book.png'
    plt.savefig(fname)
    plt.close()
    print('Saved', fname)
# 3) Trends across books: line plots normalized by book order
# Try to infer book order from filenames by sorting; if index is numeric, use it
try:
    order = list(df_adv.index)
    # simple heuristic: sort by filename if they contain numbers 1-7
    import re as _re
    def book_key(name):
        m = _re.search(r'(+)', name)
        return int(m.group(1)) if m else name
    order_sorted = sorted(order, key=book_key)
    df_sorted = df_adv.loc[order_sorted]
except Exception:
    df_sorted = df_adv
# plot trends for raw and per-100
for col in [c for c in df_sorted.columns if (c in metrics) or (c in metrics_per100)]:
    plt.figure(figsize=(9,3))
    plt.plot(df_sorted.index, df_sorted[col], marker='o')
    plt.title(f'Trend: {col} by book (sorted)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    fname = f'../visualizations/trend_{col}.png'
    plt.savefig(fname)
    plt.close()
    print('Saved', fname)
# 4) Heatmap summarizing selected raw metrics per book
heat_df = df_adv[metrics].copy() if all([m in df_adv.columns for m in metrics]) else df_adv[[c for c in metrics if c in df_adv.columns]]
if not heat_df.empty:
    plt.figure(figsize=(10,6))
    sns.heatmap(heat_df, annot=True, fmt='g', cmap='coolwarm')
    plt.title('Heatmap: selected raw metrics per book')
    plt.ylabel('Book')
    plt.xlabel('Metric')
    plt.tight_layout()
    fname = '../visualizations/heatmap_selected_metrics.png'
    plt.savefig(fname)
    plt.close()
    print('Saved', fname)
print('All additional visualizations created in ../visualizations')

  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()


Saved ../visualizations/scar_count_per_book.png
Saved ../visualizations/hermione_mais_per_book.png
Saved ../visualizations/dumbledore_actions_per_book.png


  plt.tight_layout()
  plt.tight_layout()


Saved ../visualizations/rogue_dark_per_book.png
Saved ../visualizations/crime_events_per_book.png
Saved ../visualizations/metrics_per_100_pages_per_book.png
Saved ../visualizations/trend_scar_count.png
Saved ../visualizations/trend_hermione_mais.png


  plt.tight_layout()


Saved ../visualizations/trend_dumbledore_actions.png
Saved ../visualizations/trend_rogue_dark.png
Saved ../visualizations/trend_crime_events.png
Saved ../visualizations/trend_scar_count_per_100_pages.png
Saved ../visualizations/trend_hermione_mais_per_100_pages.png
Saved ../visualizations/trend_dumbledore_actions_per_100_pages.png
Saved ../visualizations/trend_rogue_dark_per_100_pages.png
Saved ../visualizations/trend_crime_events_per_100_pages.png
Saved ../visualizations/heatmap_selected_metrics.png
All additional visualizations created in ../visualizations
