In [1]:
import pandas as pd
import re
import nltk
from transformers import AutoTokenizer
import os
import subprocess
import zipfile

# --- Configuration ---

# 1. KAGGLE DATASET IDENTIFIER
# Dataset ID in '<owner>/<dataset-name>' form; it is passed to
# `kaggle datasets download -d` and its last part names the downloaded zip.
KAGGLE_DATASET_ID = 'thedevastator/comprehensive-literary-greats-dataset'

# The specific file we need from the downloaded zip archive.
# It must match the archive member name exactly; when it does not, the
# download step prints the names that the archive actually contains.
DATASET_FILENAME = 'books_1.Best_Books_Ever.csv'

# 2. DEFINE AUTHORS FOR THE CORPUS
# Only rows whose 'author' value equals one of these names exactly are kept.
AUTHORS = ['Fyodor Dostoevsky', 'Charles Dickens']

# 3. CHOOSE PRE-TRAINED MODEL FOR TOKENIZATION
# Model name handed to AutoTokenizer.from_pretrained for the tokenization demo.
MODEL_NAME = 'roberta-base'

# 4. OUTPUT FILE
# CSV path where the normalized text chunks are written (without the index).
PROCESSED_DATA_PATH = 'processed_literary_corpus.csv'


# --- Helper Functions ---

def download_dataset_from_kaggle(dataset_id, destination_file):
    """
    Downloads a Kaggle dataset archive and extracts one file from it.

    This function is designed for a Google Colab environment: it looks for the
    user's 'kaggle.json' API token in the current working directory, installs
    it under '/root/.kaggle', and then invokes the `kaggle` command-line tool.
    A token that was installed by a previous run is reused, so the user does
    not have to upload it again after a failed attempt.

    Args:
        dataset_id: Kaggle dataset identifier in '<owner>/<dataset-name>' form.
        destination_file: Name of the archive member to extract into the
            current directory. If a file with this path already exists, the
            download is skipped.

    Returns:
        True if `destination_file` already existed or was extracted
        successfully; False if credentials are missing, the download failed,
        or the file could not be extracted.
    """
    # If the file already exists, we don't need to download it again.
    if os.path.exists(destination_file):
        print(f"Dataset '{destination_file}' already exists. Skipping download.")
        return True

    print("--- Setting up Kaggle API for dataset download ---")
    uploaded_token = 'kaggle.json'
    kaggle_dir = '/root/.kaggle'
    installed_token = os.path.join(kaggle_dir, 'kaggle.json')

    if os.path.exists(uploaded_token):
        # A freshly uploaded token takes precedence: move it into place.
        os.makedirs(kaggle_dir, exist_ok=True)
        os.rename(uploaded_token, installed_token)
    elif not os.path.exists(installed_token):
        # Neither a new upload nor a previously installed token is available.
        print("\n[ACTION REQUIRED]")
        print("ERROR: 'kaggle.json' not found in the current Colab session.")
        print("To proceed, please do the following:")
        print("1. Go to your Kaggle account page, navigate to the 'API' section.")
        print("2. Click on 'Create New API Token'. This will download a 'kaggle.json' file.")
        print("3. In Google Colab, click the 'Files' icon on the left sidebar and upload the 'kaggle.json' file you just downloaded.")
        print("4. Rerun this cell after uploading.")
        return False

    # Restrict the token to owner read/write. The mode must be written in
    # octal: decimal 600 would set an unrelated combination of bits.
    os.chmod(installed_token, 0o600)

    print(f"Downloading dataset: {dataset_id}...")
    # The name of the downloaded zip file is typically the dataset name part of the ID.
    zip_file_name = f"{dataset_id.split('/')[-1]}.zip"

    # Execute the Kaggle API command to download the dataset.
    command = ['kaggle', 'datasets', 'download', '-d', dataset_id]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        # subprocess raises this when the executable itself cannot be found.
        print("\nERROR: Kaggle download failed.")
        print("The 'kaggle' command-line tool was not found. Install it with 'pip install kaggle' and rerun.")
        return False

    if result.returncode != 0:
        print("\nERROR: Kaggle download failed.")
        print("Please ensure your 'kaggle.json' is valid and you have accepted the dataset's terms on the Kaggle website.")
        print("Stderr:", result.stderr)
        return False

    print("Download successful. Unzipping...")
    # Unzip the downloaded archive to extract our target file.
    try:
        with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
            members = zip_ref.namelist()
            if destination_file not in members:
                print(f"ERROR: The expected file '{destination_file}' was not in the archive.")
                print(f"Files found in zip: {members}")
                return False
            zip_ref.extract(destination_file, '.')
            print(f"Successfully extracted '{destination_file}'")
    except (OSError, zipfile.BadZipFile, RuntimeError) as e:
        # Missing/unreadable archive, corrupt zip, or an encrypted member.
        print(f"An error occurred during unzipping: {e}")
        return False
    finally:
        # Clean up by removing the downloaded zip file.
        if os.path.exists(zip_file_name):
            os.remove(zip_file_name)

    return True

def load_data(path):
    """
    Reads the CSV at `path` into a pandas DataFrame.

    Returns the DataFrame on success. If reading fails for any reason, the
    error is printed and None is returned instead.
    """
    print(f"\nAttempting to load data from: {path}")
    frame = None
    try:
        frame = pd.read_csv(path)
    except Exception as load_error:
        print(f"An error occurred while loading the data: {load_error}")
    return frame

def segment_text(text, min_chunk_length=50):
    """
    Breaks a text into paragraph chunks at runs of newlines.

    Each piece is stripped of surrounding whitespace and kept only when its
    stripped length is at least `min_chunk_length`. Non-string input yields
    an empty list.
    """
    if not isinstance(text, str):
        return []

    paragraphs = []
    for piece in re.split(r'\n+', text):
        trimmed = piece.strip()
        if len(trimmed) >= min_chunk_length:
            paragraphs.append(trimmed)
    return paragraphs

def normalize_text(text):
    """
    Lowercases the text and collapses every whitespace run to one space.

    Leading and trailing whitespace is removed from the result.
    """
    lowered = text.lower()
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()

# --- Main Processing Pipeline ---

def main():
    """
    Runs the data acquisition and preprocessing pipeline end to end.

    Steps:
      1. Acquire the dataset file via download_dataset_from_kaggle.
      2. Load it with load_data.
      3. Keep only rows whose 'author' value is in AUTHORS.
      4. Split each row's 'text' into chunks and normalize every chunk.
      5. Save the chunks to PROCESSED_DATA_PATH as CSV.
      6. Print the tokenization of the first chunk using MODEL_NAME.

    Returns None. Each failure prints a message and stops the pipeline early
    instead of raising.
    """
    # 1. Download the dataset from Kaggle if it doesn't exist locally.
    if not download_dataset_from_kaggle(KAGGLE_DATASET_ID, DATASET_FILENAME):
        print("\nPipeline stopped: Could not acquire dataset.")
        return

    # 2. Load the dataset
    df = load_data(DATASET_FILENAME)
    if df is None:
        return

    print("Dataset loaded successfully. Columns:", df.columns.tolist())

    # 3. Filter for the chosen authors
    author_column = 'author'
    if author_column not in df.columns:
        print(f"ERROR: Column '{author_column}' not found. Please check the dataset.")
        return

    print(f"\nFiltering for authors: {AUTHORS}")
    corpus_df = df[df[author_column].isin(AUTHORS)].copy()
    print(f"Found {len(corpus_df)} books by the selected authors.")

    if corpus_df.empty:
        print("No books by the specified authors were found.")
        return

    # 4. Process each book
    processed_data = []
    text_column = 'text'
    if text_column not in corpus_df.columns:
        print(f"ERROR: Column '{text_column}' not found. Please check the dataset.")
        return

    print("\nStarting text segmentation and normalization...")
    for _, row in corpus_df.iterrows():
        # Falls back to a placeholder when the dataset has no 'book_title' column.
        book_title = row.get('book_title', 'Unknown Title')
        author = row[author_column]
        book_text = row[text_column]

        chunks = segment_text(book_text)
        for chunk in chunks:
            normalized_chunk = normalize_text(chunk)
            processed_data.append({
                'author': author,
                'book_title': book_title,
                'text_chunk': normalized_chunk
            })
        print(f"  - Processed '{book_title}' by {author}, created {len(chunks)} chunks.")

    # 5. Create and save the final processed DataFrame
    # Naming the columns explicitly keeps the CSV header intact even when no
    # chunk was produced (an empty list would otherwise give a column-less frame).
    output_columns = ['author', 'book_title', 'text_chunk']
    processed_df = pd.DataFrame(processed_data, columns=output_columns)
    processed_df.to_csv(PROCESSED_DATA_PATH, index=False)
    print(f"\nTotal processed text chunks: {len(processed_df)}")
    print(f"Processed data has been saved to '{PROCESSED_DATA_PATH}'")

    # 6. Demonstrate tokenization on a sample
    print("\n--- Tokenization Example ---")
    if processed_df.empty:
        # Without this guard the missing sample would surface as a misleading
        # tokenizer error below.
        print("No text chunks were produced, so there is no sample to tokenize.")
        return

    sample_text = processed_df['text_chunk'].iloc[0]
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        print(f"Sample Text:\n'{sample_text}'")
        tokens = tokenizer.tokenize(sample_text)
        print(f"\nTokens from '{MODEL_NAME}':\n{tokens}")
    except Exception as e:
        # The demo is best-effort: loading the tokenizer can fail for many
        # reasons, and the processed CSV has already been saved at this point.
        print(f"Could not demonstrate tokenization. Error: {e}")


# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()


--- Setting up Kaggle API for dataset download ---
Downloading dataset: thedevastator/comprehensive-literary-greats-dataset...
Download successful. Unzipping...
ERROR: The expected file 'all_books_with_summaries.csv' was not in the archive.
Files found in zip: ['books_1.Best_Books_Ever.csv']

Pipeline stopped: Could not acquire dataset.
