# In [None]:
import pandas as pd
import re
import nltk
from transformers import AutoTokenizer
import os

# --- Configuration ---

# Source CSV.
# TODO: Point this at your local copy of the Kaggle dataset. The
# "Comprehensive Literary Greats Dataset" is the suggested source, and its
# main file is assumed to be 'all_books_with_summaries.csv'. With a different
# dataset, the column names used further down may need adjusting.
DATASET_PATH = "path/to/your/all_books_with_summaries.csv"

# Authors whose books make up the corpus. Rows are selected by exact match
# against the dataset's author column.
AUTHORS = [
    "Fyodor Dostoevsky",
    "Charles Dickens",
]

# Pre-trained model whose tokenizer is used to prepare the text
# (roberta-base, as suggested by the project ladder).
MODEL_NAME = "roberta-base"

# Destination CSV for the cleaned, chunked corpus.
PROCESSED_DATA_PATH = "processed_literary_corpus.csv"


# --- Helper Functions ---

def load_data(path):
    """
    Reads the CSV file at `path` into a pandas DataFrame.

    Args:
        path (str): Location of the CSV file to read.

    Returns:
        pandas.DataFrame or None: The loaded data, or None when the path does
        not exist or reading fails. The reason is printed in both cases.
    """
    print(f"Attempting to load data from: {path}")

    if os.path.exists(path):
        try:
            # The dataset might be large; pandas infers the column dtypes here.
            frame = pd.read_csv(path)
        except Exception as e:
            # Best-effort loader: report the problem and signal failure with None.
            print(f"An error occurred while loading the data: {e}")
            frame = None
        return frame

    # Missing file: explain how to obtain the dataset, framed by separators.
    missing_file_help = (
        "---",
        f"ERROR: File not found at '{path}'",
        "Please download the 'Comprehensive Literary Greats Dataset' from Kaggle",
        "or another suitable dataset and update the DATASET_PATH variable in this script.",
        "---",
    )
    for message in missing_file_help:
        print(message)
    return None

def segment_text(text, min_chunk_length=50):
    """
    Breaks a long text (such as a full book) into smaller chunks.

    The text is cut at every run of one or more newline characters. Each
    piece is stripped of surrounding whitespace, and pieces shorter than
    `min_chunk_length` characters after stripping are dropped.

    Args:
        text (str): The full text of the book. Any non-string value (for
            example a missing cell) produces an empty list.
        min_chunk_length (int): The minimum character length for a chunk to be kept.

    Returns:
        list: The surviving chunks, in their original order.
    """
    if not isinstance(text, str):
        return []

    kept = []
    for piece in re.split(r'\n+', text):
        trimmed = piece.strip()
        # Length is measured after stripping, so whitespace-only lines vanish.
        if len(trimmed) >= min_chunk_length:
            kept.append(trimmed)
    return kept

def normalize_text(text):
    """
    Applies basic normalization to a piece of text.

    The text is lowercased, every run of whitespace is collapsed into a single
    space, and leading/trailing whitespace is removed.

    Punctuation is kept on purpose: it can be a stylistic feature, and the
    transformer's tokenizer is designed to handle it.
    """
    lowered = text.lower()
    return re.sub(r'\s+', ' ', lowered).strip()

# --- Main Processing Pipeline ---

def main():
    """
    Main function to run the data acquisition and preprocessing pipeline.

    Steps:
        1. Load the CSV at DATASET_PATH.
        2. Keep only the rows whose author is listed in AUTHORS.
        3. Split each book's text into chunks and normalize every chunk.
        4. Collect one row per chunk (author, book_title, text_chunk).
        5. Save the result to PROCESSED_DATA_PATH.
        6. Print a tokenization example for the first chunk using MODEL_NAME.

    The function returns early, after printing an explanation, when the data
    cannot be loaded, when a required column is missing, or when no book by
    the selected authors is found. If no chunk survives segmentation, the
    output file is still written (header only) and the tokenization example
    is skipped.
    """
    # Columns of the output CSV. Passing them explicitly guarantees a header
    # (and a 'text_chunk' column) even when no chunk was produced.
    output_columns = ['author', 'book_title', 'text_chunk']

    # 1. Load the dataset
    df = load_data(DATASET_PATH)
    if df is None:
        return  # Stop execution if data loading failed

    print("Dataset loaded successfully. Columns:", df.columns.tolist())

    # 2. Filter for the chosen authors
    # We'll assume the author information is in a column named 'author'.
    # Adjust 'author' to the correct column name if it's different in your file.
    author_column = 'author'
    if author_column not in df.columns:
        print(f"ERROR: Column '{author_column}' not found in the dataset.")
        print("Please check the CSV file and update the 'author_column' variable.")
        return

    print(f"\nFiltering for authors: {AUTHORS}")
    corpus_df = df[df[author_column].isin(AUTHORS)].copy()
    print(f"Found {len(corpus_df)} books by the selected authors.")

    if corpus_df.empty:
        print("No books by the specified authors were found. Please check the author names and the dataset.")
        return

    # 3. Process each book
    text_column = 'text'  # Assuming the book content is in a column named 'text'
    if text_column not in corpus_df.columns:
        print(f"ERROR: Column '{text_column}' not found in the dataset.")
        print("Please check the CSV file and update the 'text_column' variable.")
        return

    print("\nStarting text segmentation and normalization for each book...")
    processed_data = []
    for _, row in corpus_df.iterrows():
        # 'book_title' is optional in the source data; fall back to a placeholder.
        book_title = row.get('book_title', 'Unknown Title')
        author = row[author_column]

        # Segment the book's text into chunks; a non-string cell yields none.
        chunks = segment_text(row[text_column])

        # Normalize each chunk and add it to our processed list
        processed_data.extend(
            {
                'author': author,
                'book_title': book_title,
                'text_chunk': normalize_text(chunk),
            }
            for chunk in chunks
        )
        print(f"  - Processed '{book_title}' by {author}, created {len(chunks)} chunks.")

    # 4. Create a new DataFrame with the processed data
    processed_df = pd.DataFrame(processed_data, columns=output_columns)
    print(f"\nTotal processed text chunks: {len(processed_df)}")

    # 5. Save the processed data to a new CSV file
    processed_df.to_csv(PROCESSED_DATA_PATH, index=False)
    print(f"\nProcessed data has been saved to '{PROCESSED_DATA_PATH}'")

    # 6. Demonstrate tokenization on a sample
    print("\n--- Tokenization Example ---")
    if processed_df.empty:
        # Without this guard the sample lookup below would fail and be
        # misreported as a tokenizer problem.
        print("No text chunks were produced, so there is nothing to tokenize.")
        return

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        sample_text = processed_df['text_chunk'].iloc[0]
        print(f"Sample Text:\n'{sample_text}'")

        tokens = tokenizer.tokenize(sample_text)
        print(f"\nTokens produced by '{MODEL_NAME}' tokenizer:\n{tokens}")

        # return_tensors='pt' asks for PyTorch tensors, so PyTorch must be installed.
        encoded_input = tokenizer(sample_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        print(f"\nEncoded input (for model input):\n{encoded_input}")

    except Exception as e:
        # The demo is best-effort: the processed CSV has already been saved.
        print(f"Could not demonstrate tokenization. Error: {e}")


if __name__ == "__main__":
    # NLTK's 'punkt' data is only needed if sentence tokenization is added
    # later; enable the download below in that case.
    # nltk.download('punkt')
    main()