# Project: CTRL + Style (P10)
**Course:** Natural Language Processing
**Student:** [Your Name]

## AI Usage Disclaimer
Parts of this project's code and its conceptual outline were developed with the assistance of **Google Gemini**. The AI was used for:
1.  **Drafting Code:** Support in writing boilerplate code for data loading and plotting (matplotlib/sklearn).
2.  **Debugging:** Troubleshooting dimension mismatch errors in the embedding layer.
3.  **Ideation:** Refining the choice of authors for the style comparison.

All outputs have been modified, verified, and integrated into the final workflow by me. I take full responsibility for the final content.

In [3]:
# Install dependencies
!pip install -q transformers torch scikit-learn matplotlib seaborn nltk accelerate bitsandbytes google-generativeai

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import os
from nltk.corpus import gutenberg
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, confusion_matrix
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModel

# Configuration
# One seed is shared by every stochastic component in the notebook.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Device: {device}")

# Data Prep: NLTK resources requested by this notebook
for nltk_resource in ('gutenberg', 'punkt'):
    nltk.download(nltk_resource, quiet=True)

def create_dataset(file_id, label, chunk_size=128, max_samples=300,
                   min_chars=200, seed=None):
    """Split one NLTK Gutenberg text into fixed-size word chunks.

    The raw text is split on whitespace (no real tokenization), grouped into
    consecutive chunks of ``chunk_size`` words, and chunks that are too short
    in characters are dropped. If more than ``max_samples`` chunks remain, a
    reproducible random subset of exactly ``max_samples`` is drawn so that the
    classes end up balanced.

    Args:
        file_id: Gutenberg corpus file id, e.g. ``'austen-emma.txt'``.
        label: Value stored in the ``label`` column for every chunk.
        chunk_size: Number of whitespace-separated words per chunk.
        max_samples: Upper bound on the number of chunks returned.
        min_chars: A chunk is kept only if it is longer than this many
            characters (filters short trailing chunks).
        seed: Seed for the subsampling RNG. ``None`` uses the notebook-level
            ``SEED`` constant.

    Returns:
        pandas.DataFrame with the columns ``text`` and ``label``.
    """
    # Local import: `random` is not part of the notebook's top-level imports.
    import random

    words = gutenberg.raw(file_id).split()
    candidates = (
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    )
    chunks = [chunk for chunk in candidates if len(chunk) > min_chars]

    # Subsample to balance classes.
    # A private Random instance draws the same sample as seeding the global
    # generator would, but leaves the process-wide `random` state untouched.
    if len(chunks) > max_samples:
        rng = random.Random(SEED if seed is None else seed)
        chunks = rng.sample(chunks, max_samples)

    return pd.DataFrame({'text': chunks, 'label': label})

# Load Corpus
# One (file id, author label) pair per class; each is chunked independently
# and the per-author frames are stacked into a single frame.
corpus_sources = [
    ('austen-emma.txt', 'Austen'),
    ('melville-moby_dick.txt', 'Melville'),
]
author_frames = [create_dataset(file_id, author) for file_id, author in corpus_sources]
df = pd.concat(author_frames).reset_index(drop=True)

print(f"Dataset loaded: {len(df)} samples")
print(df['label'].value_counts())

Device: cuda
Dataset loaded: 600 samples
label
Austen      300
Melville    300
Name: count, dtype: int64


In [4]:
# Semantic encoder used for clustering: BAAI/bge-small-en-v1.5
# (MTEB leaderboard standard for retrieval/similarity)
SEM_MODEL_NAME = "BAAI/bge-small-en-v1.5"
print(f"Loading {SEM_MODEL_NAME}...")

sem_tokenizer = AutoTokenizer.from_pretrained(SEM_MODEL_NAME)

# Load the weights first, then move them to the selected device.
sem_model = AutoModel.from_pretrained(SEM_MODEL_NAME)
sem_model = sem_model.to(device)

def get_semantic_embeddings(text_list, batch_size=32):
    """Embed texts with the semantic model as L2-normalized [CLS] vectors.

    Args:
        text_list: Iterable of strings to embed.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        numpy.ndarray with one row per input text. An empty input yields an
        array of shape ``(0, hidden_size)`` instead of failing inside
        ``np.vstack``.
    """
    sem_model.eval()

    # Materialize once so any iterable (not just a list) can be batch-sliced.
    texts = list(text_list)
    if not texts:
        return np.empty((0, sem_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = sem_tokenizer(batch, padding=True, truncation=True,
                               max_length=512, return_tensors="pt").to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            out = sem_model(**inputs)

        # BGE uses the CLS token (position 0), L2-normalized
        cls_emb = out.last_hidden_state[:, 0]
        cls_emb = torch.nn.functional.normalize(cls_emb, p=2, dim=1)
        embeddings.append(cls_emb.cpu().numpy())

    return np.vstack(embeddings)

print("Generating semantic embeddings...")
# Embed every corpus chunk; rows follow the order of df.
corpus_texts = df['text'].tolist()
vectors_sem = get_semantic_embeddings(corpus_texts)
print(f"Output shape: {vectors_sem.shape}")

Loading BAAI/bge-small-en-v1.5...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


KeyboardInterrupt: 

In [None]:
# Dimensionality Reduction
# Both projections are 2-D views of the same embedding matrix, used for plotting only.
pca = PCA(n_components=2, random_state=SEED)
pca_res = pca.fit_transform(vectors_sem)

# The iteration count is left at the library default (1000). Passing it as
# `n_iter` is rejected by recent scikit-learn releases, where the argument
# was renamed to `max_iter`.
tsne = TSNE(n_components=2, perplexity=30, random_state=SEED)
tsne_res = tsne.fit_transform(vectors_sem)

# Clustering Metrics
kmeans = KMeans(n_clusters=2, random_state=SEED, n_init=10)
clusters = kmeans.fit_predict(vectors_sem)
# Silhouette is computed against the true author labels (not the K-Means
# assignments): it measures how well the two authors separate in this space.
sil_score = silhouette_score(vectors_sem, df['label'])

# Plots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# PCA
sns.scatterplot(ax=axes[0], x=pca_res[:,0], y=pca_res[:,1], hue=df['label'], alpha=0.7)
axes[0].set_title('PCA: Austen vs Melville')
axes[0].set_xlabel('PC 1')
axes[0].set_ylabel('PC 2')

# t-SNE
sns.scatterplot(ax=axes[1], x=tsne_res[:,0], y=tsne_res[:,1], hue=df['label'], alpha=0.7)
axes[1].set_title('t-SNE: Austen vs Melville')
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')

plt.show()

print(f"Silhouette Score: {sil_score:.4f}")
print("\nConfusion Matrix (Cluster vs Label):")
# Rows are the true authors, columns the unsupervised K-Means cluster ids.
print(pd.crosstab(df['label'], clusters))

In [None]:
import google.generativeai as genai
from google.colab import userdata

# API Setup
# Ensure GOOGLE_API_KEY is set in Colab Secrets
try:
    api_key = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=api_key)
except Exception as e:
    # Best-effort: keep the notebook running, but surface the underlying cause
    # so a missing secret can be told apart from any other failure.
    print("Error loading API key. Please set GOOGLE_API_KEY in Colab Secrets.")
    print(f"Details: {type(e).__name__}: {e}")

# Using Gemini 2.5 Flash for speed
gen_model = genai.GenerativeModel('gemini-2.5-flash')

def style_transfer(text, target_author, style_example=None):
    """
    Ask the Gemini model to rewrite `text` in the style of `target_author`.

    When `style_example` is given, it is embedded in the prompt as a style
    demonstration ('In-Context Learning', Pan et al., 2024).

    Returns the stripped response text. Any exception raised while calling the
    API or reading the response is caught and reported as the string
    "GenAI Error: <message>" rather than being raised.
    """

    # Prompt sections are collected in order and concatenated at the end.
    sections = [f"""
    You are an expert literary editor.
    TASK: Rewrite the text below to strictly mimic the writing style, vocabulary, and syntax of {target_author}.
    CONSTRAINTS: Keep original meaning/narrative. Do not add intro/outro text.
    """]

    # Optional demonstration (ICL)
    if style_example:
        sections.append(f'\nSTYLE REFERENCE (Mimic this tone):\n"{style_example}"\n')

    sections.append(f'\nORIGINAL TEXT:\n"{text}"\n\nREWRITTEN TEXT:')
    full_prompt = "".join(sections)

    try:
        sampling = genai.types.GenerationConfig(temperature=0.7)
        reply = gen_model.generate_content(full_prompt, generation_config=sampling)
        # Reading `.text` stays inside the try so failures there are reported too.
        return reply.text.strip()
    except Exception as err:
        return f"GenAI Error: {str(err)}"

In [None]:
# 1. Select Control Sample (Austen)
sample_idx = 5
sample_austen = df[df['label'] == 'Austen'].iloc[sample_idx]['text']

# 2. Select Style Demonstration (Melville)
# A fixed, arbitrarily chosen chunk from the target author guides the LLM (Pan et al., 2024)
demo_idx = 10 # Arbitrary index
style_demo = df[df['label'] == 'Melville'].iloc[demo_idx]['text']

print(f"--- ORIGINAL (Austen) ---\n{sample_austen[:200]}...\n")
print(f"--- STYLE DEMO (Melville) ---\n{style_demo[:200]}...\n")

# 3. Generate Experimental Sample
# Passing the demo helps the model disentangle style from content
rewritten_melville = style_transfer(sample_austen, "Herman Melville", style_example=style_demo)

# style_transfer reports API failures as a "GenAI Error: ..." string instead of
# raising. Embedding that message would produce a meaningless SUCCESS/FAILURE
# verdict below, so stop here.
if rewritten_melville.startswith("GenAI Error:"):
    raise RuntimeError(f"Style transfer failed; aborting experiment. {rewritten_melville}")

print(f"--- REWRITTEN (Target: Melville) ---\n{rewritten_melville[:200]}...\n")

# 4. Embed both using Semantic Model (BGE)
# Row 0 is the original Austen sample, row 1 the rewritten text.
exp_vectors = get_semantic_embeddings([sample_austen, rewritten_melville])

# 5. Centroid Analysis
# Each centroid is the mean corpus embedding of one author.
c_austen = vectors_sem[df['label'] == 'Austen'].mean(axis=0)
c_melville = vectors_sem[df['label'] == 'Melville'].mean(axis=0)

# Cosine distances: smaller means closer to that author's centroid.
d_orig_austen = cosine(exp_vectors[0], c_austen)
d_new_melville = cosine(exp_vectors[1], c_melville)
d_new_austen = cosine(exp_vectors[1], c_austen)

print("-" * 30)
print("SEMANTIC SPACE (BGE) RESULTS:")
print(f"Original -> Austen Centroid:   {d_orig_austen:.4f}")
print(f"Rewritten -> Melville Centroid: {d_new_melville:.4f}")
print(f"Rewritten -> Austen Centroid:   {d_new_austen:.4f}")

# The transfer counts as successful when the rewritten text lies closer to the
# target author's centroid than to the source author's.
if d_new_melville < d_new_austen:
    print("\nResult: SUCCESS. Vector shifted to Target Cluster.")
else:
    print("\nResult: FAILURE. Vector remained in Source Cluster (Content dominance).")

In [None]:
# Reload BGE configuration to output hidden states
# We use the same model weights but need access to intermediate layers
from transformers import AutoConfig

print(f"Configuring {SEM_MODEL_NAME} for layer access...")
layer_tokenizer = AutoTokenizer.from_pretrained(SEM_MODEL_NAME)

# Same checkpoint as the semantic model, configured to also return the
# hidden states of every layer.
config = AutoConfig.from_pretrained(SEM_MODEL_NAME)
config.output_hidden_states = True

layer_model = AutoModel.from_pretrained(SEM_MODEL_NAME, config=config)
layer_model = layer_model.to(device)

def get_layer_embeddings(text_list, layer_idx=2, batch_size=32):
    """Embed texts by mean-pooling one intermediate hidden layer.

    Args:
        text_list: Iterable of strings to embed.
        layer_idx: Index into the model's ``hidden_states`` tuple. Index 0 is
            the embedding output; higher indices are the successive
            transformer blocks.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        numpy.ndarray with one L2-normalized row per input text. An empty
        input yields an array of shape ``(0, hidden_size)`` instead of failing
        inside ``np.vstack``.
    """
    layer_model.eval()

    # Materialize once so any iterable (not just a list) can be batch-sliced.
    texts = list(text_list)
    if not texts:
        return np.empty((0, layer_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = layer_tokenizer(batch, padding=True, truncation=True,
                                 max_length=512, return_tensors="pt").to(device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            out = layer_model(**inputs)

        # Extract the requested hidden layer
        hidden_state = out.hidden_states[layer_idx]

        # Mean pooling over real tokens only: the attention mask zeroes out
        # padding positions before summing.
        mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden_state.size()).float()
        sum_emb = torch.sum(hidden_state * mask, 1)
        # Clamp guards against division by zero for an all-padding row.
        sum_mask = torch.clamp(mask.sum(1), min=1e-9)
        mean_emb = sum_emb / sum_mask

        # L2 Normalize
        mean_emb = torch.nn.functional.normalize(mean_emb, p=2, dim=1)
        embeddings.append(mean_emb.cpu().numpy())

    return np.vstack(embeddings)

# 1. Generate corpus vectors for Layer 2
print("Generating Layer 2 embeddings...")
vectors_layer2 = get_layer_embeddings(df['text'].tolist(), layer_idx=2)

# 2. Calculate Layer 2 Centroids (mean corpus vector per author)
austen_rows = df['label'] == 'Austen'
melville_rows = df['label'] == 'Melville'
c_austen_l2 = vectors_layer2[austen_rows].mean(axis=0)
c_melville_l2 = vectors_layer2[melville_rows].mean(axis=0)

# 3. Run Experiment
# Needs the original sample and its rewrite from the earlier experiment cell.
if 'sample_austen' not in locals() or 'rewritten_melville' not in locals():
    print("Skipping experiment: input variables not found (Run Cell 5 first).")
else:
    exp_vecs_l2 = get_layer_embeddings([sample_austen, rewritten_melville], layer_idx=2)
    # Row 0: original Austen sample; row 1: rewritten text.
    orig_vec_l2, new_vec_l2 = exp_vecs_l2

    d_orig = cosine(orig_vec_l2, c_austen_l2)
    d_new_mel = cosine(new_vec_l2, c_melville_l2)
    d_new_aus = cosine(new_vec_l2, c_austen_l2)

    print("-" * 30)
    print("LAYER 2 RESULTS (Syntax/Structure Focus):")
    print(f"Original -> Austen Centroid:   {d_orig:.4f}")
    print(f"Rewritten -> Melville Centroid: {d_new_mel:.4f}")
    print(f"Rewritten -> Austen Centroid:   {d_new_aus:.4f}")

    # Success means the rewrite is closer to the target centroid than to the source one.
    verdict = (
        "\nResult: SUCCESS. Layer 2 vector shifted to Target Cluster."
        if d_new_mel < d_new_aus
        else "\nResult: FAILURE. Layer 2 vector remained in Source Cluster."
    )
    print(verdict)