In [None]:
https://www.gutenberg.org/cache/epub/2600/pg2600-images.html

In [22]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import inflect

def clean_text(text):
    """Clean text while preserving structure"""
    # Remove excessive whitespace but keep paragraph breaks
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Multiple line breaks to double
    text = re.sub(r'[ \t]+', ' ', text)             # Multiple spaces to single
    text = text.strip()
    return text

def extract_full_book_text(soup):
    """Extract all text content from the book"""
    print("Extracting full book text...")

    # Remove navigation, headers, footers, and other non-content elements
    for element in soup.find_all(['script', 'style', 'nav', 'header', 'footer']):
        element.decompose()

    # Look for the main content area - usually in a div or body
    main_content = soup.find('body') or soup

    # Extract all text, preserving some structure
    full_text = main_content.get_text(separator='\n', strip=True)

    # Clean up the text
    full_text = clean_text(full_text)

    print(f"Extracted {len(full_text)} characters of text")
    return full_text

def split_into_chapters(full_text):
    """Split the full text into chapters using regex patterns"""
    print("Splitting text into chapters...")

    # Define chapter patterns - look for various chapter heading formats
    chapter_patterns = [
        r'\n\s*CHAPTER\s+[IVXLCDM]+\s*\.?\s*\n',           # CHAPTER I, II, III etc
        r'\n\s*Chapter\s+[IVXLCDM]+\s*\.?\s*\n',           # Chapter I, II, III etc
        r'\n\s*CHAPTER\s+\d+\s*\.?\s*\n',                  # CHAPTER 1, 2, 3 etc
        r'\n\s*Chapter\s+\d+\s*\.?\s*\n',                  # Chapter 1, 2, 3 etc
        r'\n\s*BOOK\s+[IVXLCDM]+\s*\.?\s*\n',             # BOOK I, II, III etc
        r'\n\s*Book\s+[IVXLCDM]+\s*\.?\s*\n',             # Book I, II, III etc
    ]

    # Combine all patterns
    combined_pattern = '|'.join(f'({pattern})' for pattern in chapter_patterns)

    # Find all chapter boundaries
    chapter_splits = list(re.finditer(combined_pattern, full_text, re.IGNORECASE))

    print(f"Found {len(chapter_splits)} potential chapter boundaries")

    chapters = []

    if not chapter_splits:
        print("No chapter patterns found, trying alternative approach...")
        # Try simpler pattern
        simple_pattern = r'\n\s*(CHAPTER|Chapter)\s+[IVXLCDM\d]+[^\n]*\n'
        chapter_splits = list(re.finditer(simple_pattern, full_text, re.IGNORECASE))
        print(f"Found {len(chapter_splits)} chapters with simple pattern")

    if chapter_splits:
        for i, match in enumerate(chapter_splits):
            # Get chapter title
            chapter_title = match.group().strip()

            # Get chapter start position
            start_pos = match.end()

            # Get chapter end position (start of next chapter or end of text)
            if i + 1 < len(chapter_splits):
                end_pos = chapter_splits[i + 1].start()
            else:
                end_pos = len(full_text)

            # Extract chapter content
            chapter_content = full_text[start_pos:end_pos].strip()

            if len(chapter_content) > 100:  # Only include chapters with substantial content
                chapters.append((chapter_title, chapter_content))
                print(f"Chapter {i+1}: {chapter_title[:50]}... ({len(chapter_content)} chars)")

    if not chapters:
        print("No chapters found with automatic splitting. Manual inspection needed.")
        # Return the full text as a single "chapter" for manual review
        chapters.append(("FULL TEXT", full_text))

    return chapters

def assign_book_labels(chapters):
    """Assign book and epilogue labels to chapters"""
    print("Assigning book and epilogue labels...")

    p = inflect.engine()
    number_to_word = {
        1: 'ONE', 2: 'TWO', 3: 'THREE', 4: 'FOUR', 5: 'FIVE',
        6: 'SIX', 7: 'SEVEN', 8: 'EIGHT', 9: 'NINE', 10: 'TEN',
        11: 'ELEVEN', 12: 'TWELVE', 13: 'THIRTEEN', 14: 'FOURTEEN', 15: 'FIFTEEN'
    }

    labeled_chapters = []
    book_count = 1
    epilogue_count = 0
    in_epilogue = False

    for i, (chapter_title, chapter_content) in enumerate(chapters):
        # Normalize chapter title for comparison
        normalized_title = re.sub(r'\s+', ' ', chapter_title.upper().strip())

        # Check if this is the start of a new book (Chapter I appearing again)
        if i > 0 and ('CHAPTER I' in normalized_title or 'CHAPTER 1' in normalized_title):
            if not in_epilogue and book_count < 15:
                book_count += 1
                label = f"BOOK {number_to_word[book_count]}"
            else:
                in_epilogue = True
                epilogue_count += 1
                ordinal = p.ordinal(p.number_to_words(epilogue_count)).upper()
                label = f"{ordinal} EPILOGUE"
        else:
            # Check if we're in epilogue section
            if 'EPILOGUE' in normalized_title or in_epilogue:
                in_epilogue = True
                if epilogue_count == 0:
                    epilogue_count = 1
                ordinal = p.ordinal(p.number_to_words(epilogue_count)).upper()
                label = f"{ordinal} EPILOGUE"
            else:
                label = f"BOOK {number_to_word[book_count]}"

        full_index = f"{label}, {normalized_title}"
        labeled_chapters.append((full_index, chapter_content))

    return labeled_chapters

# Main execution
print("Starting War and Peace text extraction...")
print("="*60)

# Step 1: Load HTML
url = "https://www.gutenberg.org/cache/epub/2600/pg2600-images.html"
print(f"Loading from: {url}")

try:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    print(f"✓ Successfully loaded HTML ({len(response.text):,} characters)")
except Exception as e:
    print(f"✗ Error loading HTML: {e}")
    exit(1)

# Step 2: Extract full text
full_text = extract_full_book_text(soup)

if len(full_text) < 100000:  # War and Peace should be much longer
    print(f"⚠ Warning: Extracted text seems too short ({len(full_text)} chars)")
    print("First 500 characters:")
    print(full_text[:500])
    print("\nLast 500 characters:")
    print(full_text[-500:])

# Step 3: Split into chapters
chapters = split_into_chapters(full_text)

if not chapters:
    print("✗ No chapters found!")
    exit(1)

print(f"✓ Successfully split into {len(chapters)} chapters")

# Step 4: Assign book labels
labeled_chapters = assign_book_labels(chapters)

# Step 5: Create DataFrame and save
print("Creating final dataset...")
data = []

for index_label, chapter_content in labeled_chapters:
    word_count = len(chapter_content.split())
    char_count = len(chapter_content)

    data.append({
        'index': index_label,
        'text': chapter_content,
        'word_count': word_count,
        'char_count': char_count
    })

    print(f"  {index_label}: {word_count:,} words")

# Save to CSV
df = pd.DataFrame(data)
csv_path = "war_and_peace_full_chapters.csv"
df.to_csv(csv_path, index=False, encoding='utf-8')

# Print final summary
print("\n" + "="*60)
print("EXTRACTION COMPLETE!")
print("="*60)
print(f"File saved: {csv_path}")
print(f"Total chapters: {len(df)}")
print(f"Total words: {df['word_count'].sum():,}")
print(f"Average words per chapter: {df['word_count'].mean():.0f}")
print(f"Longest chapter: {df['word_count'].max():,} words")
print(f"Shortest chapter: {df['word_count'].min():,} words")

# Show first few chapters as sample
print(f"\nFirst 3 chapters:")
for i in range(min(3, len(df))):
    row = df.iloc[i]
    print(f"\n{i+1}. {row['index']}")
    print(f"   Words: {row['word_count']:,}")
    print(f"   Preview: {row['text'][:150]}...")

print(f"\nData saved to: {csv_path}")
print("You can now open this CSV file to see the complete text of each chapter!")

Starting War and Peace text extraction...
Loading from: https://www.gutenberg.org/cache/epub/2600/pg2600-images.html
✓ Successfully loaded HTML (3,812,034 characters)
Extracting full book text...
Extracted 3287376 characters of text
Splitting text into chapters...
Found 551 potential chapter boundaries
Chapter 187: CHAPTER I... (11673 chars)
Chapter 188: CHAPTER II... (7953 chars)
Chapter 189: CHAPTER III... (8740 chars)
Chapter 190: CHAPTER IV... (8136 chars)
Chapter 191: CHAPTER V... (11190 chars)
Chapter 192: CHAPTER VI... (7831 chars)
Chapter 193: CHAPTER VII... (5767 chars)
Chapter 194: CHAPTER VIII... (6894 chars)
Chapter 195: CHAPTER IX... (12182 chars)
Chapter 196: CHAPTER X... (9855 chars)
Chapter 197: CHAPTER XI... (5060 chars)
Chapter 198: CHAPTER XII... (7808 chars)
Chapter 199: CHAPTER XIII... (4105 chars)
Chapter 200: CHAPTER XIV... (8595 chars)
Chapter 201: CHAPTER XV... (8647 chars)
Chapter 202: CHAPTER XVI... (10468 chars)
Chapter 203: CHAPTER XVII... (4086 chars)
Chap