### FIX: Chunked n-gram extraction and plotting for very large books (War and Peace)

Drop this cell **after** the cells that define `nlp`, `read_text`, `strip_gutenberg_headers`, `war_file`, `anna_file`, and `OUTDIR`.

This cell replaces the single in-memory `nlp(text)` call with a chunked `nlp.pipe` pass over paragraphs, so spaCy never has to build one enormous `Doc` (a single call on a text this large can exceed `nlp.max_length` or exhaust memory). It then re-runs the n-gram plots and saves them as PNGs.

In [None]:

# Chunked n-gram extraction + plotting cell
import matplotlib.pyplot as plt
from collections import Counter
from pathlib import Path

def get_paragraph_chunks(text: str, max_chunk_chars: int = 20000) -> list:
    """Split ``text`` into non-empty, stripped chunks small enough to process one at a time.

    Paragraphs are delimited by blank lines (two consecutive newlines) after
    line endings have been normalised to ``\\n``. A paragraph longer than
    ``max_chunk_chars`` is cut into consecutive windows of at most that many
    characters; each cut is moved back to the last space inside the window so
    a word is not split, unless the window contains no usable space.

    Args:
        text: The full text to split.
        max_chunk_chars: Upper bound on the length of a chunk, in characters.

    Returns:
        A list of chunks in document order. Every chunk is stripped of
        surrounding whitespace and is never empty.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1 (the windowing
            loop could not make progress).
    """
    if max_chunk_chars < 1:
        raise ValueError(f"max_chunk_chars must be >= 1, got {max_chunk_chars!r}")

    # Normalise Windows (\r\n) and bare-\r line endings so that blank-line
    # detection below works regardless of how the file was saved.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    chunks = []
    for raw_paragraph in text.split('\n\n'):
        paragraph = raw_paragraph.strip()
        if not paragraph:
            continue
        if len(paragraph) <= max_chunk_chars:
            chunks.append(paragraph)
            continue

        # Oversized paragraph: walk it in windows of at most max_chunk_chars.
        total = len(paragraph)
        start = 0
        while start < total:
            end = min(start + max_chunk_chars, total)
            if end < total:
                # Back the cut up to the last space in the window so we do not
                # break a word. A space at `start` itself is ignored, otherwise
                # the window would be empty and the loop would never advance.
                cut = paragraph.rfind(' ', start, end)
                if cut > start:
                    end = cut
            piece = paragraph[start:end].strip()
            # A window that falls entirely inside a long run of whitespace
            # strips down to nothing; do not emit it as a chunk.
            if piece:
                chunks.append(piece)
            start = end
    return chunks


def get_top_ngrams_from_text_chunked(text: str, n: int=2, top_k: int=20):
    """Count word n-grams in a very large text, chunk by chunk, and return the most common.

    The text is split with ``get_paragraph_chunks`` and the chunks are streamed
    through ``nlp.pipe`` so that no single enormous ``Doc`` is ever built.
    Only alphabetic tokens (``token.is_alpha``) are kept, lowercased. N-grams
    are formed within a chunk only; they never span a chunk boundary.

    Args:
        text: The full text to analyse.
        n: Number of consecutive tokens per n-gram; must be at least 1.
        top_k: How many of the most frequent n-grams to return.

    Returns:
        A list of ``(ngram, count)`` tuples, most frequent first, where
        ``ngram`` is the tokens joined by single spaces. Empty when the text
        yields no chunks.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Validate first: n == 0 would otherwise count empty-string "n-grams".
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    chunks = [chunk for chunk in get_paragraph_chunks(text) if chunk]
    if not chunks:
        return []

    counts = Counter()
    # Only token.text and token.is_alpha are read below, and both are set by
    # the tokenizer. Disabling the pipeline components (tagger, parser, NER,
    # ...) for this pass avoids their cost on every chunk.
    # NOTE(review): assumes `nlp` is a spaCy Language object and that no
    # component in its pipeline re-tokenizes the Doc -- confirm.
    for doc in nlp.pipe(chunks, batch_size=50, disable=nlp.pipe_names):
        tokens = [token.text.lower() for token in doc if token.is_alpha]
        # range() is empty when the chunk has fewer than n tokens, so short
        # chunks contribute nothing.
        counts.update(
            " ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
        )
    return counts.most_common(top_k)


def _draw_ngram_panel(ax, ngrams, title: str) -> None:
    """Draw one horizontal bar panel on ``ax`` for ``ngrams``, a list of (label, count) pairs."""
    labels = [gram for gram, _ in ngrams]
    values = [count for _, count in ngrams]
    y_pos = range(len(labels))
    ax.barh(y_pos, values)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels, fontsize=9)
    ax.set_xlabel("Frequency")
    ax.set_title(title)
    # `ngrams` arrives most-frequent-first, so position 0 is the top entry.
    # barh puts position 0 at the bottom by default; inverting the axis moves
    # it to the top. (The list must NOT also be reversed, or the two
    # reversals cancel and the least frequent entry ends up on top.)
    ax.invert_yaxis()


def plot_bigrams_trigrams_for_book_chunked(book_title: str, text: str, top_k: int=15, savepath=None):
    """Plot the most frequent trigrams and bigrams of a book as two stacked bar charts.

    The trigram panel is drawn above the bigram panel; within each panel the
    most frequent n-gram is at the top. N-grams are computed with
    ``get_top_ngrams_from_text_chunked``.

    Args:
        book_title: Title used as the prefix of both panel titles.
        text: Full text of the book.
        top_k: Number of n-grams requested for each panel.
        savepath: Optional output path for the figure. When truthy, its parent
            directory is created if needed and the figure is written there.

    Returns:
        A ``(savepath, fig)`` tuple: the ``savepath`` argument unchanged and
        the matplotlib figure that was drawn.
    """
    bigrams = get_top_ngrams_from_text_chunked(text, n=2, top_k=top_k)
    trigrams = get_top_ngrams_from_text_chunked(text, n=3, top_k=top_k)

    fig, axes = plt.subplots(2, 1, figsize=(12, 10), constrained_layout=True)

    # Trigrams on top, bigrams on bottom.
    _draw_ngram_panel(axes[0], trigrams, f"{book_title} — Top {top_k} Trigrams")
    _draw_ngram_panel(axes[1], bigrams, f"{book_title} — Top {top_k} Bigrams")

    if savepath:
        # Save before plt.show() so the file is written while the figure is
        # still the current, fully drawn figure.
        Path(savepath).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(savepath, bbox_inches='tight')
    plt.show()
    return savepath, fig

# --- Produce the chunked n-gram plots for each book whose file is available ---
# Relies on names defined by earlier notebook cells: `read_text`,
# `strip_gutenberg_headers`, `war_file`, `anna_file`, and `OUTDIR`.
# A book is skipped (with a message) when its path variable is missing, is None,
# or points to a file that does not exist.

if 'war_file' not in globals() or war_file is None or not war_file.exists():
    print('war_file not found in globals or file does not exist; skipping War and Peace chunked plot.')
else:
    print('Processing War and Peace (chunked) ...')
    war_text = strip_gutenberg_headers(read_text(war_file))
    war_png = OUTDIR / 'War_and_Peace_bi_tri_ngrams_chunked.png'
    plot_bigrams_trigrams_for_book_chunked(
        book_title='War and Peace',
        text=war_text,
        top_k=15,
        savepath=str(war_png),
    )
    print('Saved:', war_png)

if 'anna_file' not in globals() or anna_file is None or not anna_file.exists():
    print('anna_file not found in globals or file does not exist; skipping Anna Karenina chunked plot.')
else:
    print('Processing Anna Karenina (chunked) ...')
    anna_text = strip_gutenberg_headers(read_text(anna_file))
    anna_png = OUTDIR / 'Anna_Karenina_bi_tri_ngrams_chunked.png'
    plot_bigrams_trigrams_for_book_chunked(
        book_title='Anna Karenina',
        text=anna_text,
        top_k=15,
        savepath=str(anna_png),
    )
    print('Saved:', anna_png)

# Closing troubleshooting hint, printed regardless of which books were processed.
print('\nIf War and Peace still fails to produce a plot, check the printed messages and verify the file path and that spaCy model is available.')
