In [9]:
import pandas as pd
import re
import requests
import os

# --- Configuration ---

# Books to fetch, grouped by author name. Each entry carries the display
# title and the numeric ID that is substituted into the download URLs.
BOOKS_TO_DOWNLOAD = {
    'Fyodor Dostoevsky': [
        dict(title='Crime and Punishment', id=2554),
        dict(title='The Brothers Karamazov', id=28054),
    ],
    'Charles Dickens': [
        dict(title='A Tale of Two Cities', id=98),
        dict(title='Great Expectations', id=1400),
        dict(title='Oliver Twist', id=730),
    ],
}

# Destination CSV for the processed corpus.
PROCESSED_DATA_PATH = 'final_corpus.csv'


# --- Helper Functions ---

def download_gutenberg_text(book_id, timeout=30):
    """Download the plain-text version of a book from Project Gutenberg.

    Several URL layouts are tried in order; the body of the first one that
    answers with HTTP 200 is returned.

    Args:
        book_id: Project Gutenberg ebook ID, substituted into each URL.
        timeout: Seconds to wait for each request before giving up on that
            URL and moving on to the next candidate.

    Returns:
        The decoded text of the book, or None if every URL failed.
    """
    url_patterns = [
        f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt',
        f'https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt',
        f'https://www.gutenberg.org/files/{book_id}/{book_id}.txt'
    ]
    for url in url_patterns:
        try:
            # Without an explicit timeout, requests waits indefinitely on a
            # stalled connection and the whole pipeline would hang.
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            # Network error or timeout for this URL: fall through to the next.
            continue
        if response.status_code == 200:
            print(f"  - Successfully downloaded from {url}")
            # 'utf-8-sig' decodes like UTF-8 but drops a leading byte-order
            # mark, so it cannot end up inside the first text chunk.
            return response.content.decode('utf-8-sig', errors='ignore')
    print(f"  - ERROR: Failed to download book with ID {book_id}.")
    return None

def remove_table_of_contents(text):
    """Drop every line that looks like a Table of Contents entry.

    The filter is applied line by line to the whole text, so a matching
    line is removed wherever it occurs, not only inside a contents section.
    A line is removed when, after stripping surrounding whitespace, it is:

    * a heading keyword (chapter/part/book/stave/epilogue) followed by
      whitespace and, optionally, a single number/numeral/word plus dots;
    * a line ending in a dot leader ("...") or underscores, optionally
      followed by a page number;
    * exactly "contents" or "table of contents" (case-insensitive).

    Returns the remaining lines joined with "\\n".
    """
    heading_re = re.compile(
        r'^\s*(chapter|part|book|stave|epilogue)\s+([ivx\d]+|[a-zA-Z]+)?\s*(\.|\s)*\s*$',
        re.IGNORECASE,
    )
    leader_re = re.compile(r'.*\s\.{3,}\s*\d*\s*$|.*\s_+\s*\d*\s*$')
    contents_titles = ["contents", "table of contents"]

    def looks_like_toc(raw_line):
        # All checks operate on the stripped form of the line.
        stripped = raw_line.strip()
        if heading_re.match(stripped) or leader_re.match(stripped):
            return True
        return stripped.lower() in contents_titles

    # Kept lines are emitted unmodified (original indentation preserved).
    return "\n".join(
        raw_line for raw_line in text.splitlines() if not looks_like_toc(raw_line)
    )


def clean_gutenberg_text(text, book_title):
    """Strip Project Gutenberg boilerplate and contents lines from a book.

    Steps, in order: cut everything from the "*** END OF ... ***" marker
    onward, cut everything up to and including the "*** START OF ... ***"
    marker, remove Table-of-Contents-like lines, then collapse runs of
    three or more line breaks into a single blank line and trim the result.

    `book_title` is accepted for the caller's convenience but is not used
    by the cleaning logic.
    """
    footer_pattern = r'\*\*\* END OF (THIS|THE) PROJECT GUTENBERG EBOOK .* \*\*\*'
    header_pattern = r'\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK .* \*\*\*'

    # The footer is cut first, so the header search runs on the shortened text.
    footer = re.search(footer_pattern, text, re.IGNORECASE)
    if footer is not None:
        text = text[:footer.start()]

    header = re.search(header_pattern, text, re.IGNORECASE)
    if header is not None:
        text = text[header.end():]

    without_toc = remove_table_of_contents(text)

    collapsed = re.sub(r'(\r\n|\n|\r){3,}', '\n\n', without_toc)
    return collapsed.strip()

def segment_text(text, min_chunk_length=50):
    """Break a text into paragraph chunks separated by blank lines.

    The text is split wherever two or more consecutive newlines occur.
    Each piece is stripped, and pieces shorter than `min_chunk_length`
    characters are discarded. Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []

    paragraphs = []
    for piece in re.split(r'\n{2,}', text):
        trimmed = piece.strip()
        if len(trimmed) >= min_chunk_length:
            paragraphs.append(trimmed)
    return paragraphs

def normalize_text(text):
    """Lower-case the text and squeeze whitespace.

    Every run of whitespace (spaces, tabs, newlines) becomes one space,
    and leading/trailing whitespace is removed.
    """
    return re.sub(r'\s+', ' ', text.lower()).strip()

# --- Main Processing Pipeline ---

def main():
    """Download, clean, segment and normalize every configured book.

    For each author/book in BOOKS_TO_DOWNLOAD the text is downloaded,
    cleaned, split into chunks and normalized; one row per chunk
    (author, book_title, text_chunk) is collected. Books that fail to
    download are skipped. If any rows were produced they are written to
    PROCESSED_DATA_PATH as CSV; progress is reported on stdout throughout.
    """
    print("Starting FINAL data acquisition and cleaning pipeline...")
    rows = []

    for author, books in BOOKS_TO_DOWNLOAD.items():
        print(f"\nProcessing books for: {author}")
        for entry in books:
            title = entry['title']
            book_id = entry['id']
            print(f"- Downloading '{title}' (ID: {book_id})...")

            raw_text = download_gutenberg_text(book_id)
            if raw_text:
                chunks = segment_text(clean_gutenberg_text(raw_text, title))
                rows.extend(
                    {
                        'author': author,
                        'book_title': title,
                        'text_chunk': normalize_text(chunk),
                    }
                    for chunk in chunks
                )
                print(f"  - Finished processing '{title}', created {len(chunks)} text chunks.")

    if rows:
        processed_df = pd.DataFrame(rows)
        processed_df.to_csv(PROCESSED_DATA_PATH, index=False)

        print("\n--------------------")
        print("Pipeline Complete!")
        print(f"Total processed text chunks: {len(processed_df)}")
        print(f"Final, cleaned data has been saved to '{PROCESSED_DATA_PATH}'")
        print("--------------------")
    else:
        print("\nPipeline finished, but no data was processed.")


# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()


Starting FINAL data acquisition and cleaning pipeline...

Processing books for: Fyodor Dostoevsky
- Downloading 'Crime and Punishment' (ID: 2554)...
  - Successfully downloaded from https://www.gutenberg.org/files/2554/2554-0.txt
  - Finished processing 'Crime and Punishment', created 3127 text chunks.
- Downloading 'The Brothers Karamazov' (ID: 28054)...
  - Successfully downloaded from https://www.gutenberg.org/files/28054/28054-0.txt
  - Finished processing 'The Brothers Karamazov', created 4895 text chunks.

Processing books for: Charles Dickens
- Downloading 'A Tale of Two Cities' (ID: 98)...
  - Successfully downloaded from https://www.gutenberg.org/files/98/98-0.txt
  - Finished processing 'A Tale of Two Cities', created 2575 text chunks.
- Downloading 'Great Expectations' (ID: 1400)...
  - Successfully downloaded from https://www.gutenberg.org/files/1400/1400-0.txt
  - Finished processing 'Great Expectations', created 3107 text chunks.
- Downloading 'Oliver Twist' (ID: 730)...
