This notebook will convert every tweet into a 768-dimensional vector that captures its semantic meaning.

# Cell 1: Setup & Load Model

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Imports
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

# Check GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define paths
BASE_PATH = '/content/drive/MyDrive/same_words_different_worlds'

PATHS = {
    'raw': os.path.join(BASE_PATH, 'data/raw'),
    'processed': os.path.join(BASE_PATH, 'data/processed'),
    'outputs': os.path.join(BASE_PATH, 'data/outputs'),
    'models': os.path.join(BASE_PATH, 'models'),
    'figures': os.path.join(BASE_PATH, 'figures'),
}

# Load fine-tuned model and tokenizer
model_path = os.path.join(PATHS['models'], 'fine_tuned_roberta')
print(f"\nLoading fine-tuned model from: {model_path}")

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path).to(device)
model.eval()  # Set to evaluation mode

print(f"✓ Model loaded successfully")
print(f"  Hidden size: {model.config.hidden_size}")

Mounted at /content/drive
Using device: cuda

Loading fine-tuned model from: /content/drive/MyDrive/same_words_different_worlds/models/fine_tuned_roberta


Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/same_words_different_worlds/models/fine_tuned_roberta and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded successfully
  Hidden size: 768


# Cell 2: Load Data

In [2]:
# Load cleaned data
df = pd.read_csv(os.path.join(PATHS['processed'], '01_ai_tweets_clean.csv'))

# Ensure clean_text is string type
df['clean_text'] = df['clean_text'].fillna("").astype(str)

print(f"Loaded {len(df):,} tweets")
print(f"Columns: {df.columns.tolist()}")
print(f"\nParty distribution:")
print(df['party'].value_counts())

Loaded 3,201 tweets
Columns: ['party', 'text', 'clean_text', 'posted_date', 'year', 'name', 'chamber']

Party distribution:
party
D    2022
R    1179
Name: count, dtype: int64


# Cell 3: Define Embedding Extraction Function

In [3]:
def get_embedding(text, tokenizer, model, device, max_length=128):
    """
    Extract a 768-dimensional embedding for a text using mean pooling.

    Mean pooling averages all token embeddings (excluding padding),
    giving a single vector representation of the entire text.
    """
    if not isinstance(text, str) or len(text.strip()) == 0:
        return np.zeros(768)

    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length
    ).to(device)

    # Get model output (no gradient computation needed)
    with torch.no_grad():
        outputs = model(**inputs)

    # outputs.last_hidden_state shape: (batch_size, seq_len, hidden_dim=768)
    hidden_states = outputs.last_hidden_state

    # Mean pooling: average over sequence length, ignoring padding tokens
    attention_mask = inputs['attention_mask'].unsqueeze(-1)  # (1, seq_len, 1)
    attention_mask = attention_mask.expand(hidden_states.size()).float()

    # Sum of embeddings weighted by attention mask
    sum_embeddings = torch.sum(hidden_states * attention_mask, dim=1)

    # Number of non-padding tokens
    sum_mask = torch.clamp(attention_mask.sum(dim=1), min=1e-9)

    # Mean embedding
    mean_embedding = sum_embeddings / sum_mask

    return mean_embedding.cpu().numpy()[0]

# Test the function
test_text = "AI safety regulation is essential for protecting Americans."
test_embedding = get_embedding(test_text, tokenizer, model, device)

print("✓ Embedding function defined")
print(f"\nTest embedding:")
print(f"  Input: '{test_text}'")
print(f"  Output shape: {test_embedding.shape}")
print(f"  Sample values: [{test_embedding[0]:.4f}, {test_embedding[1]:.4f}, ... {test_embedding[-1]:.4f}]")
print(f"  Embedding norm: {np.linalg.norm(test_embedding):.4f}")

✓ Embedding function defined

Test embedding:
  Input: 'AI safety regulation is essential for protecting Americans.'
  Output shape: (768,)
  Sample values: [-0.0016, 0.1230, ... 0.0581]
  Embedding norm: 12.6817


# Cell 4: Extract Embeddings for All Tweets

In [4]:
# Extract embeddings for all tweets
# This will take a few minutes (~3-5 min for 3,201 tweets)

print("Extracting embeddings for all tweets...")
print("="*60)

embeddings = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    emb = get_embedding(row['clean_text'], tokenizer, model, device)
    embeddings.append(emb)

# Add embeddings to dataframe
df['embedding'] = embeddings

print("="*60)
print(f"✓ Extraction complete!")
print(f"  Total embeddings: {len(embeddings)}")
print(f"  Embedding shape: {df['embedding'].iloc[0].shape}")

Extracting embeddings for all tweets...


Processing:   0%|          | 0/3201 [00:00<?, ?it/s]

✓ Extraction complete!
  Total embeddings: 3201
  Embedding shape: (768,)


# Cell 5: Verify Embeddings

In [5]:
# Verify embeddings are valid and vary across tweets
print("="*60)
print("EMBEDDING VERIFICATION")
print("="*60)

# Stack all embeddings into a matrix
embedding_matrix = np.stack(df['embedding'].values)
print(f"\nEmbedding matrix shape: {embedding_matrix.shape}")

# Check for any zero vectors (would indicate failed extraction)
zero_vectors = np.sum(np.all(embedding_matrix == 0, axis=1))
print(f"Zero vectors (failed extractions): {zero_vectors}")

# Check embedding statistics
print(f"\nEmbedding statistics:")
print(f"  Mean: {embedding_matrix.mean():.4f}")
print(f"  Std:  {embedding_matrix.std():.4f}")
print(f"  Min:  {embedding_matrix.min():.4f}")
print(f"  Max:  {embedding_matrix.max():.4f}")

# Check variance across embeddings (should be non-zero)
embedding_variance = np.var(embedding_matrix, axis=0).mean()
print(f"  Mean variance across dimensions: {embedding_variance:.4f}")

# Quick semantic sanity check: similar texts should have similar embeddings
from sklearn.metrics.pairwise import cosine_similarity

text1 = "AI regulation is important for safety"
text2 = "Artificial intelligence rules matter for security"
text3 = "I had pizza for lunch today"

emb1 = get_embedding(text1, tokenizer, model, device).reshape(1, -1)
emb2 = get_embedding(text2, tokenizer, model, device).reshape(1, -1)
emb3 = get_embedding(text3, tokenizer, model, device).reshape(1, -1)

sim_12 = cosine_similarity(emb1, emb2)[0][0]
sim_13 = cosine_similarity(emb1, emb3)[0][0]

print(f"\nSemantic sanity check:")
print(f"  '{text1[:40]}...'")
print(f"  vs '{text2[:40]}...' → similarity: {sim_12:.3f}")
print(f"  vs '{text3[:40]}...' → similarity: {sim_13:.3f}")
print(f"  ✓ Similar texts more similar: {sim_12 > sim_13}")

EMBEDDING VERIFICATION

Embedding matrix shape: (3201, 768)
Zero vectors (failed extractions): 0

Embedding statistics:
  Mean: 0.0199
  Std:  0.4560
  Min:  -7.8555
  Max:  11.2109
  Mean variance across dimensions: 0.0045

Semantic sanity check:
  'AI regulation is important for safety...'
  vs 'Artificial intelligence rules matter for...' → similarity: 0.989
  vs 'I had pizza for lunch today...' → similarity: 0.956
  ✓ Similar texts more similar: True


# Cell 6: Save Embeddings

In [6]:
# Save dataframe with embeddings as pickle
# (CSV can't properly store numpy arrays)

output_path = os.path.join(PATHS['processed'], '02_tweets_with_embeddings.pkl')

print(f"Saving embeddings to: {output_path}")
df.to_pickle(output_path)

# Verify save
file_size = os.path.getsize(output_path) / 1e6
print(f"✓ Saved successfully!")
print(f"  File size: {file_size:.1f} MB")

# Test reload
df_test = pd.read_pickle(output_path)
print(f"  Reload test: {df_test['embedding'].iloc[0].shape} ✓")

Saving embeddings to: /content/drive/MyDrive/same_words_different_worlds/data/processed/02_tweets_with_embeddings.pkl
✓ Saved successfully!
  File size: 11.6 MB
  Reload test: (768,) ✓


# Cell 7: Notebook 04 Summary

In [7]:
print("="*60)
print("NOTEBOOK 04 COMPLETE ✓")
print("="*60)
print("""
EMBEDDING EXTRACTION RESULTS:

1. PROCESS:
   - Loaded fine-tuned RoBERTa model
   - Applied mean pooling over token embeddings
   - Extracted 768-dimensional vector per tweet

2. OUTPUT:
   - 3,201 tweets with embeddings
   - Matrix shape: (3201, 768)
   - Zero failed extractions
   - File: 02_tweets_with_embeddings.pkl (11.6 MB)

3. VERIFICATION:
   - Semantic sanity check passed
   - Similar texts have higher cosine similarity
   - Embeddings show appropriate variance

WHAT THESE EMBEDDINGS CAPTURE:
   - Each tweet is now a point in 768-dimensional space
   - Semantically similar tweets are closer together
   - We can now measure the "distance" between how
     Democrats and Republicans use the same words

NEXT STEPS:
   → Notebook 05: Semantic Divergence Analysis
      - Visualize partisan semantic spaces (PCA/UMAP)
      - Train classifier probe
      - Measure word-level semantic distance
      - This is where we prove the thesis!
""")

NOTEBOOK 04 COMPLETE ✓

EMBEDDING EXTRACTION RESULTS:

1. PROCESS:
   - Loaded fine-tuned RoBERTa model
   - Applied mean pooling over token embeddings
   - Extracted 768-dimensional vector per tweet
   
2. OUTPUT:
   - 3,201 tweets with embeddings
   - Matrix shape: (3201, 768)
   - Zero failed extractions
   - File: 02_tweets_with_embeddings.pkl (11.6 MB)
   
3. VERIFICATION:
   - Semantic sanity check passed
   - Similar texts have higher cosine similarity
   - Embeddings show appropriate variance

WHAT THESE EMBEDDINGS CAPTURE:
   - Each tweet is now a point in 768-dimensional space
   - Semantically similar tweets are closer together
   - We can now measure the "distance" between how
     Democrats and Republicans use the same words
   
NEXT STEPS:
   → Notebook 05: Semantic Divergence Analysis
      - Visualize partisan semantic spaces (PCA/UMAP)
      - Train classifier probe
      - Measure word-level semantic distance
      - This is where we prove the thesis!

