In [1]:
# --- Cell 1: Setup and Imports ---
import pandas as pd
import re
import requests
import os
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
import os

# Mount your Google Drive
# (`drive` is google.colab.drive; the paths configured below live under this mount point.)
drive.mount('/content/drive')

# --- Configuration ---
# Folder on Drive holding the saved artifacts, and the two files read by this cell.
project_folder = '/content/drive/MyDrive/NLP_Project'
corpus_path = os.path.join(project_folder, 'final_corpus.csv')
embeddings_path = os.path.join(project_folder, 'stylistic_embeddings.npz')

# --- Load Data ---
# NOTE: on FileNotFoundError this cell only prints a message, so `df`, `data`,
# `embeddings` and `labels` may be left undefined for any cell that runs afterwards.
try:
    # Load the cleaned text data
    df = pd.read_csv(corpus_path)
    print(f"✅ Successfully loaded '{os.path.basename(corpus_path)}' from Google Drive.")
    print(f"Corpus shape: {df.shape}")

    # Load the embeddings and labels
    # allow_pickle=True lets NumPy unpickle object arrays stored in the archive
    # (generate_embeddings() saves `author_mapping` as a dict). Unpickling can run
    # arbitrary code, so only load archives you created yourself.
    data = np.load(embeddings_path, allow_pickle=True)
    embeddings = data['embeddings']
    labels = data['labels']
    print(f"✅ Successfully loaded '{os.path.basename(embeddings_path)}' from Google Drive.")
    print(f"Embeddings shape: {embeddings.shape}")

    print("\nAll data loaded. You can now proceed with analysis and visualization.")

except FileNotFoundError as e:
    # e.filename is the path that could not be opened (either the CSV or the .npz).
    print(f"ERROR: A required file was not found. Please ensure both files were saved correctly.")
    print(f"Missing file: {e.filename}")

In [2]:
# --- Configuration ---

# Corpus definition: maps each author name to the list of books to fetch.
# Every book entry has a human-readable 'title' (written to the 'book_title'
# column by main()) and an 'id', which download_gutenberg_text() interpolates
# into the gutenberg.org download URLs.
BOOKS_TO_DOWNLOAD = {
    'Fyodor Dostoevsky': [
        {'title': 'Crime and Punishment', 'id': 2554},
        {'title': 'The Brothers Karamazov', 'id': 28054},
    ],
    'Charles Dickens': [
        {'title': 'A Tale of Two Cities', 'id': 98},
        {'title': 'Great Expectations', 'id': 1400},
        {'title': 'Oliver Twist', 'id': 730},
    ]
}

# Path of the CSV written by main(): one row per text chunk with the columns
# 'author', 'book_title' and 'text_chunk'.
PROCESSED_DATA_PATH = 'final_corpus.csv'


# --- Helper Functions ---

def download_gutenberg_text(book_id, timeout=30):
    """Downloads the plain text version of a book from Project Gutenberg.

    Several known URL layouts are tried in order; the first one that answers
    with HTTP 200 wins.

    Args:
        book_id: Project Gutenberg book id, interpolated into the URLs.
        timeout: Seconds to wait for each HTTP request before giving up on
            that URL. Without a timeout a stalled connection would block the
            whole pipeline indefinitely.

    Returns:
        The decoded text of the book, or None if every URL failed.
    """
    url_patterns = [
        f'https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt',
        f'https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt',
        f'https://www.gutenberg.org/files/{book_id}/{book_id}.txt'
    ]
    for url in url_patterns:
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            # Report the failure instead of hiding it, then fall through to the next URL.
            print(f"  - WARNING: request to {url} failed ({exc}); trying next URL.")
            continue
        if response.status_code == 200:
            print(f"  - Successfully downloaded from {url}")
            # 'utf-8-sig' behaves like 'utf-8' but also drops a leading byte-order
            # mark, so the BOM never ends up inside the text. Undecodable bytes
            # are still ignored, as before.
            return response.content.decode('utf-8-sig', errors='ignore')
    print(f"  - ERROR: Failed to download book with ID {book_id}.")
    return None

def remove_table_of_contents(text):
    """
    Remove lines that look like Table of Contents entries or bare section headings.

    A line is dropped when, after stripping surrounding whitespace, it is:
      * a section keyword (chapter/part/book/stave/epilogue) followed by at most
        one number, roman numeral or word, plus optional dots, or the bare word
        "epilogue" with optional dots;
      * a line ending in a run of three or more dots, or in underscores, each
        preceded by whitespace and optionally followed by a page number;
      * exactly "contents" or "table of contents" (case-insensitive).

    The filter is applied to every line of the text, not only to a leading TOC
    section. Remaining lines are returned unchanged, joined with "\\n".

    Args:
        text: The book text to filter.

    Returns:
        The text with the matching lines removed.
    """
    # Section headings such as "Chapter I", "PART 2." or "Book One".
    # Lines are stripped before matching, so a bare "EPILOGUE" has no trailing
    # whitespace; it needs its own alternative because the general form requires
    # whitespace after the keyword.
    heading_pattern = re.compile(
        r'^(?:(?:chapter|part|book|stave|epilogue)\s+(?:[ivx\d]+|[a-zA-Z]+)?|epilogue)[.\s]*$',
        re.IGNORECASE,
    )

    # TOC entries that end in dot leaders or underscores, optionally followed by a page number.
    leader_pattern = re.compile(r'.*\s\.{3,}\s*\d*\s*$|.*\s_+\s*\d*\s*$')

    contents_titles = ("contents", "table of contents")

    kept_lines = []
    for line in text.splitlines():
        stripped = line.strip()  # strip once; every check below works on this
        if heading_pattern.match(stripped) or leader_pattern.match(stripped):
            continue
        if stripped.lower() in contents_titles:
            continue
        kept_lines.append(line)

    return "\n".join(kept_lines)


def clean_gutenberg_text(text, book_title):
    """
    Strip Project Gutenberg boilerplate and TOC-like lines from a raw book text.

    Steps, in order: cut everything from the "*** END OF ... ***" marker onward,
    cut everything up to and including the "*** START OF ... ***" marker, filter
    the result through remove_table_of_contents(), then collapse runs of three or
    more line breaks into one blank line and trim the ends.

    Args:
        text: Raw text as downloaded.
        book_title: Title of the book; accepted for the caller's convenience but
            not used by the cleaning logic.

    Returns:
        The cleaned text.
    """
    marker_flags = re.IGNORECASE

    # Footer first: drop the end marker and everything after it.
    footer = re.search(r'\*\*\* END OF (THIS|THE) PROJECT GUTENBERG EBOOK .* \*\*\*', text, marker_flags)
    if footer is not None:
        text = text[:footer.start()]

    # Header next: keep only what follows the start marker.
    header = re.search(r'\*\*\* START OF (THIS|THE) PROJECT GUTENBERG EBOOK .* \*\*\*', text, marker_flags)
    if header is not None:
        text = text[header.end():]

    # Remove Table-of-Contents style lines.
    without_toc = remove_table_of_contents(text)

    # Squash 3+ consecutive line breaks down to a single blank line.
    collapsed = re.sub(r'(\r\n|\n|\r){3,}', '\n\n', without_toc)
    return collapsed.strip()

def segment_text(text, min_chunk_length=50):
    """Break a text into paragraph chunks separated by blank lines.

    The text is split wherever two or more consecutive newlines occur. Each piece
    is stripped of surrounding whitespace and kept only if the stripped piece has
    at least `min_chunk_length` characters. Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []

    kept = []
    for piece in re.split(r'\n{2,}', text):
        trimmed = piece.strip()
        if len(trimmed) >= min_chunk_length:
            kept.append(trimmed)
    return kept

def normalize_text(text):
    """Lower-case the text and reduce every run of whitespace to a single space.

    Leading and trailing whitespace is removed.
    """
    # Splitting on whitespace and re-joining with one space collapses the runs
    # and trims both ends in a single pass.
    words = text.lower().split()
    return ' '.join(words)

# --- Main Processing Pipeline ---

def main():
    """Download, clean, segment and normalize every configured book, then save the corpus.

    Iterates over BOOKS_TO_DOWNLOAD, skipping any book whose download returns
    nothing. Each surviving text chunk becomes one row with the keys 'author',
    'book_title' and 'text_chunk'. If at least one row was produced, the rows are
    written to PROCESSED_DATA_PATH as CSV (without the index); otherwise a notice
    is printed and nothing is written.
    """
    rows = []
    print("Starting FINAL data acquisition and cleaning pipeline...")

    for author, books in BOOKS_TO_DOWNLOAD.items():
        print(f"\nProcessing books for: {author}")
        for book in books:
            title = book['title']
            book_id = book['id']
            print(f"- Downloading '{title}' (ID: {book_id})...")

            raw_text = download_gutenberg_text(book_id)
            if raw_text:
                chunks = segment_text(clean_gutenberg_text(raw_text, title))
                rows.extend(
                    {
                        'author': author,
                        'book_title': title,
                        'text_chunk': normalize_text(chunk),
                    }
                    for chunk in chunks
                )
                print(f"  - Finished processing '{title}', created {len(chunks)} text chunks.")

    if len(rows) == 0:
        print("\nPipeline finished, but no data was processed.")
        return

    processed_df = pd.DataFrame(rows)
    processed_df.to_csv(PROCESSED_DATA_PATH, index=False)

    separator = "--------------------"
    print("\n" + separator)
    print("Pipeline Complete!")
    print(f"Total processed text chunks: {len(processed_df)}")
    print(f"Final, cleaned data has been saved to '{PROCESSED_DATA_PATH}'")
    print(separator)


# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == '__main__':
    main()


Starting FINAL data acquisition and cleaning pipeline...

Processing books for: Fyodor Dostoevsky
- Downloading 'Crime and Punishment' (ID: 2554)...
  - Successfully downloaded from https://www.gutenberg.org/files/2554/2554-0.txt
  - Finished processing 'Crime and Punishment', created 3127 text chunks.
- Downloading 'The Brothers Karamazov' (ID: 28054)...
  - Successfully downloaded from https://www.gutenberg.org/files/28054/28054-0.txt
  - Finished processing 'The Brothers Karamazov', created 4895 text chunks.

Processing books for: Charles Dickens
- Downloading 'A Tale of Two Cities' (ID: 98)...
  - Successfully downloaded from https://www.gutenberg.org/files/98/98-0.txt
  - Finished processing 'A Tale of Two Cities', created 2575 text chunks.
- Downloading 'Great Expectations' (ID: 1400)...
  - Successfully downloaded from https://www.gutenberg.org/files/1400/1400-0.txt
  - Finished processing 'Great Expectations', created 3107 text chunks.
- Downloading 'Oliver Twist' (ID: 730)...


In [3]:
# --- Cell 3: Text Embedding ---
# This cell loads the cleaned text data and uses a pre-trained transformer model
# to generate a numerical vector (embedding) for each text chunk.

# --- Configuration ---
# Input CSV produced by the data acquisition and cleaning cell (main()).
# NOTE: this repeats the PROCESSED_DATA_PATH assignment from that cell; keep the two in sync.
PROCESSED_DATA_PATH = 'final_corpus.csv'
# Model identifier passed to AutoTokenizer/AutoModel.from_pretrained, as per the project plan
MODEL_NAME = 'roberta-base'
# Output .npz archive for the generated embeddings, labels and author mapping
EMBEDDINGS_OUTPUT_PATH = 'stylistic_embeddings.npz'
# Number of text chunks tokenized and sent through the model per forward pass
BATCH_SIZE = 32

def generate_embeddings():
    """
    Embed every text chunk of the processed corpus and save the result.

    Reads PROCESSED_DATA_PATH (columns 'text_chunk' and 'author'), runs the
    chunks through the pre-trained MODEL_NAME model in batches of BATCH_SIZE,
    and uses the final-layer hidden state of the first token of each sequence
    as that chunk's embedding. Authors are encoded as integer labels in order
    of first appearance.

    The embeddings, labels and the label -> author-name mapping are written to
    EMBEDDINGS_OUTPUT_PATH as a compressed .npz archive, which is then re-opened
    once to print the stored shapes as a sanity check.

    Returns:
        None. Problems (missing input file, empty corpus) are reported with a
        printed message and an early return.
    """
    # 1. Verify that the input file from the previous step exists
    if not os.path.exists(PROCESSED_DATA_PATH):
        print(f"ERROR: Input file not found at '{PROCESSED_DATA_PATH}'")
        print("Please run the Data Acquisition and Cleaning cell (Cell 2) first.")
        return

    # 2. Set up the device (use GPU if available in Colab)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # 3. Load the pre-trained model and tokenizer from Hugging Face
    print(f"Loading tokenizer and model: '{MODEL_NAME}'...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME)
    model.to(device)  # Move model to the selected device
    model.eval()      # Set model to evaluation mode
    print("Model and tokenizer loaded successfully.")

    # 4. Load the processed text data
    df = pd.read_csv(PROCESSED_DATA_PATH)
    texts = df['text_chunk'].tolist()
    if not texts:
        # np.vstack() raises on an empty list, so stop with a clear message instead.
        print(f"ERROR: '{PROCESSED_DATA_PATH}' contains no text chunks; nothing to embed.")
        return
    # Create numerical labels for authors, numbered in order of first appearance
    labels, author_names = pd.factorize(df['author'])
    author_mapping = {i: name for i, name in enumerate(author_names)}

    print(f"\nFound {len(texts)} text chunks to embed.")
    print(f"Author mapping: {author_mapping}")

    # 5. Generate embeddings in batches for efficiency
    all_embeddings = []
    print(f"\nGenerating embeddings in batches of {BATCH_SIZE}...")

    # tqdm provides a helpful progress bar
    for i in tqdm(range(0, len(texts), BATCH_SIZE)):
        batch_texts = texts[i:i + BATCH_SIZE]

        # Tokenize the batch; chunks longer than 512 tokens are truncated
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Move tokenized inputs to the GPU/CPU
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Perform inference without calculating gradients to save resources
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the hidden state at position 0 as the representation of the whole
        # chunk. For RoBERTa that position holds the `<s>` start token, the
        # counterpart of BERT's [CLS].
        first_token_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(first_token_embeddings)

    # Combine the embeddings from all batches into one large array
    final_embeddings = np.vstack(all_embeddings)

    print(f"\nEmbedding generation complete. Shape of embeddings array: {final_embeddings.shape}")

    # 6. Save the embeddings and corresponding labels to a compressed .npz file.
    # `author_mapping` is a dict, so NumPy stores it as a pickled 0-d object
    # array; readers must pass allow_pickle=True and call .item() to get it back.
    np.savez_compressed(
        EMBEDDINGS_OUTPUT_PATH,
        embeddings=final_embeddings,
        labels=labels,
        author_mapping=author_mapping
    )
    print(f"Embeddings and labels saved to '{EMBEDDINGS_OUTPUT_PATH}'")

    # 7. Verification step: re-open the archive we just wrote. The `with` block
    # closes the underlying file handle once the checks are done.
    print("\n--- Verification ---")
    with np.load(EMBEDDINGS_OUTPUT_PATH, allow_pickle=True) as loaded_data:
        print(f"Loaded embeddings shape: {loaded_data['embeddings'].shape}")
        print(f"Loaded labels shape: {loaded_data['labels'].shape}")
        print(f"Loaded author mapping: {loaded_data['author_mapping'].item()}")
    print("Verification successful. You are ready for analysis.")

# --- Run the Embedding Generation ---
# Executes the full embedding pipeline when this cell runs; results are written to
# EMBEDDINGS_OUTPUT_PATH rather than returned.
generate_embeddings()

Using device: cuda
Loading tokenizer and model: 'roberta-base'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded successfully.

Found 17100 text chunks to embed.
Author mapping: {0: 'Fyodor Dostoevsky', 1: 'Charles Dickens'}

Generating embeddings in batches of 32...


  0%|          | 0/535 [00:00<?, ?it/s]


Embedding generation complete. Shape of embeddings array: (17100, 768)
Embeddings and labels saved to 'stylistic_embeddings.npz'

--- Verification ---
Loaded embeddings shape: (17100, 768)
Loaded labels shape: (17100,)
Loaded author mapping: {0: 'Fyodor Dostoevsky', 1: 'Charles Dickens'}
Verification successful. You are ready for analysis.


In [5]:
from google.colab import drive
import shutil
import os

# Mount your Google Drive
# (`drive` is google.colab.drive; the destination folder below lives under this mount point.)
drive.mount('/content/drive')

# --- Configuration ---
# Define the project folder in your Drive (created if it does not exist yet)
project_folder = '/content/drive/MyDrive/NLP_Project'
os.makedirs(project_folder, exist_ok=True)

# List of files to save: the corpus CSV and the embeddings archive
files_to_save = ['final_corpus.csv', 'stylistic_embeddings.npz']

# --- Save Files ---
# Each file is looked up under '/content/' and copied into the Drive project folder.
# A missing file is reported and skipped rather than treated as an error.
for file_name in files_to_save:
    source_path = os.path.join('/content/', file_name)
    destination_path = os.path.join(project_folder, file_name)

    if os.path.exists(source_path):
        # copyfile copies the file contents only and overwrites an existing destination.
        shutil.copyfile(source_path, destination_path)
        print(f"✅ Successfully saved '{file_name}' to your Google Drive!")
    else:
        print(f"⚠️ WARNING: File '{file_name}' not found in the Colab session. Skipping.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Successfully saved 'final_corpus.csv' to your Google Drive!
✅ Successfully saved 'stylistic_embeddings.npz' to your Google Drive!
