# Week 2 Lab: Word Embeddings and Word2Vec

## Learning Objectives
- Build Word2Vec from scratch in PyTorch
- Train embeddings on real text data
- Explore semantic relationships
- Visualize embedding spaces
- Compare with pre-trained models

---

## Part 1: Setup and Data Preparation

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import random
import re
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Fix the seed of every RNG this lab touches so that runs are repeatable
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
print(f"PyTorch version: {torch.__version__}")
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Built-in toy corpus used throughout the lab.
# Any other text can be substituted here.
sample_text = """
The cat sat on the mat. The dog played in the garden.
Cats and dogs are common pets. They live with humans.
The king ruled the kingdom. The queen lived in the palace.
Paris is the capital of France. Rome is the capital of Italy.
Berlin is the capital of Germany. London is the capital of England.
Scientists study nature. Researchers conduct experiments.
Students learn in schools. Teachers educate students.
Books contain knowledge. Libraries store books.
Computers process information. Programmers write code.
Artists create paintings. Musicians compose songs.
"""

# For a larger corpus, uncomment and use:
# with open('your_text_file.txt', 'r', encoding='utf-8') as f:
#     sample_text = f.read()

# Report the corpus size and show its first 100 characters
corpus_chars = len(sample_text)
corpus_preview = sample_text[:100]
print(f"Corpus size: {corpus_chars} characters")
print(f"Sample: {corpus_preview}...")

In [None]:
# Text preprocessing
def preprocess_text(text):
    """Normalize raw text and split it into word tokens.

    The text is lowercased, every character other than ``a``-``z`` or
    whitespace is deleted (so punctuation and digits vanish without leaving
    a gap), and the remainder is split on whitespace.

    Args:
        text: Raw input string.

    Returns:
        List of lowercase tokens; empty when nothing survives the cleanup.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return letters_only.split()

# Tokenize the sample corpus and print a few basic statistics
words = preprocess_text(sample_text)
total_words = len(words)
unique_words = len(set(words))
print(f"Total words: {total_words}")
print(f"Unique words: {unique_words}")
print(f"First 10 words: {words[:10]}")

In [None]:
# Build vocabulary
class Vocabulary:
    """Two-way mapping between words and integer indices.

    Attributes:
        word2idx: dict mapping each kept word to its index.
        idx2word: dict mapping each index back to its word.
        vocab_size: number of kept words.
        word_counts: Counter over ALL input words, including those that
            fell below ``min_count`` and therefore have no index.
    """

    def __init__(self, words, min_count=1):
        """Count ``words`` and index those seen at least ``min_count`` times.

        Indices are assigned in order of first appearance in ``words``.
        """
        self.word_counts = Counter(words)

        # Hand out consecutive indices to the words that pass the threshold
        self.word2idx = {}
        for token, freq in self.word_counts.items():
            if freq >= min_count:
                self.word2idx[token] = len(self.word2idx)

        self.idx2word = {index: token for token, index in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)

    def __len__(self):
        """Return the number of indexed words."""
        return self.vocab_size

    def encode(self, word):
        """Return the index of ``word``, or -1 when it is not in the vocabulary."""
        if word in self.word2idx:
            return self.word2idx[word]
        return -1

    def decode(self, idx):
        """Return the word stored at ``idx``, or ``'<UNK>'`` for an unknown index."""
        if idx in self.idx2word:
            return self.idx2word[idx]
        return '<UNK>'

# Index every word of the corpus (min_count=1 keeps all of them)
vocab = Vocabulary(words, min_count=1)
top_words = vocab.word_counts.most_common(10)
print(f"Vocabulary size: {len(vocab)}")
print(f"Most common words: {top_words}")

## Part 2: Build Word2Vec Model

In [None]:
class Word2Vec(nn.Module):
    """Skip-gram Word2Vec model trained with negative sampling.

    Two embedding tables are kept: one used when a word is the center word
    and one used when it appears as a (positive or negative) context word.
    """

    def __init__(self, vocab_size, embed_dim=100):
        """
        Word2Vec Skip-gram model
        Args:
            vocab_size: Size of vocabulary
            embed_dim: Dimension of embeddings
        """
        super().__init__()

        # Two embedding matrices
        self.center_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)

        # Initialize weights uniformly in [-0.5, 0.5)
        self.center_embeddings.weight.data.uniform_(-0.5, 0.5)
        self.context_embeddings.weight.data.uniform_(-0.5, 0.5)

    def forward(self, center_words, context_words, neg_words=None):
        """
        Forward pass with negative sampling
        Args:
            center_words: Batch of center word indices, shape (B,)
            context_words: Batch of context word indices, shape (B,)
            neg_words: Batch of negative sample indices, shape (B, K),
                or None to use only the positive term
        Returns:
            Scalar tensor: the loss averaged over the batch.
        """
        # Get embeddings
        center_embeds = self.center_embeddings(center_words)
        context_embeds = self.context_embeddings(context_words)

        # Positive score (should be high).
        # logsigmoid is used instead of log(sigmoid(x)) because the latter
        # underflows to log(0) = -inf for strongly negative scores.
        pos_score = torch.sum(center_embeds * context_embeds, dim=1)
        loss = -nn.functional.logsigmoid(pos_score)

        # Negative sampling loss
        if neg_words is not None:
            neg_embeds = self.context_embeddings(neg_words)  # (B, K, D)
            # (B, K, D) @ (B, D, 1) -> (B, K, 1). Squeeze ONLY the trailing
            # axis: a bare squeeze() would also drop the batch axis when
            # B == 1 (or the K axis when K == 1) and break sum(dim=1).
            neg_score = torch.bmm(neg_embeds, center_embeds.unsqueeze(2)).squeeze(2)
            loss = loss - nn.functional.logsigmoid(-neg_score).sum(dim=1)

        return loss.mean()

    def get_embedding(self, word_idx):
        """Get the center-embedding vector for a word as a NumPy array"""
        return self.center_embeddings.weight[word_idx].detach().cpu().numpy()

    def similarity(self, word_idx1, word_idx2):
        """Compute cosine similarity between the center embeddings of two words"""
        vec1 = self.get_embedding(word_idx1)
        vec2 = self.get_embedding(word_idx2)

        cos_sim = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
        return cos_sim

# Build the model; a small embedding size is used to suit visualization
embed_dim = 50
model = Word2Vec(vocab.vocab_size, embed_dim)
model = model.to(device)

# Report the total number of trainable values and the layer layout
param_total = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {param_total}")
print(model)

## Part 3: Prepare Training Data

In [None]:
def create_skipgram_dataset(words, vocab, window_size=2, neg_samples=5):
    """Build (center, context, negatives) training triples for skip-gram.

    Every in-vocabulary word is paired with each in-vocabulary word at most
    ``window_size`` positions to its left or right. For every such pair,
    ``neg_samples`` negative indices are drawn from the unigram distribution
    raised to the 0.75 power.

    Args:
        words: List of words in corpus
        vocab: Vocabulary object (needs ``encode``, ``decode``, ``__len__``
            and ``word_counts``)
        window_size: Context window size
        neg_samples: Number of negative samples

    Returns:
        List of ``(center_idx, context_idx, neg_indices)`` tuples, where
        ``neg_indices`` is a NumPy array of length ``neg_samples``.
    """
    vocab_len = len(vocab)

    # Negative-sampling distribution: smoothed, normalized word frequencies
    raw_counts = np.array([vocab.word_counts[vocab.decode(k)] for k in range(vocab_len)])
    smoothed = raw_counts ** 0.75
    sampling_probs = smoothed / smoothed.sum()

    # Encode the corpus once; -1 marks out-of-vocabulary positions
    encoded = [vocab.encode(token) for token in words]
    n_tokens = len(encoded)

    dataset = []
    for pos, center_idx in enumerate(encoded):
        if center_idx == -1:
            continue

        window_start = max(0, pos - window_size)
        window_stop = min(n_tokens, pos + window_size + 1)

        for ctx_pos in range(window_start, window_stop):
            context_idx = encoded[ctx_pos]
            # Skip the center position itself and unknown context words
            if ctx_pos == pos or context_idx == -1:
                continue

            negatives = np.random.choice(vocab_len, size=neg_samples, p=sampling_probs)
            dataset.append((center_idx, context_idx, negatives))

    return dataset

# Build the skip-gram training triples from the tokenized corpus
train_data = create_skipgram_dataset(words, vocab, window_size=2, neg_samples=5)
print(f"Training samples: {len(train_data)}")

# Decode the first triple back into words to show what one instance looks like
sample = train_data[0]
center_id, context_id, negative_ids = sample
negative_words = [vocab.decode(idx) for idx in negative_ids]
print(f"\nSample training instance:")
print(f"Center word: {vocab.decode(center_id)}")
print(f"Context word: {vocab.decode(context_id)}")
print(f"Negative samples: {negative_words}")

## Part 4: Train the Model

In [None]:
def train_word2vec(model, train_data, vocab, epochs=100, lr=0.01, batch_size=32):
    """
    Train Word2Vec model with Adam on mini-batches of skip-gram triples.

    Args:
        model: Module called as ``model(center, context, negatives)`` that
            returns a scalar loss.
        train_data: Sequence of ``(center_idx, context_idx, neg_indices)``
            triples; every ``neg_indices`` must have the same length. The
            sequence itself is left untouched (a private copy is shuffled).
        vocab: Unused; kept so existing callers keep working.
        epochs: Number of passes over the data.
        lr: Adam learning rate.
        batch_size: Number of triples per optimization step.

    Returns:
        List with one entry per epoch: the mean of the per-batch losses.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Send batches to wherever the model lives instead of relying on a
    # module-level ``device`` variable.
    model_device = next(model.parameters()).device

    # Shuffle a copy so the caller's list keeps its order.
    samples = list(train_data)

    # Ceiling division: a trailing partial batch is still one optimizer
    # step, and data smaller than one batch must not yield zero batches.
    num_batches = (len(samples) + batch_size - 1) // batch_size

    losses = []
    for epoch in range(epochs):
        epoch_loss = 0.0
        random.shuffle(samples)

        # Process in batches
        for start in range(0, len(samples), batch_size):
            batch = samples[start:start + batch_size]

            # Prepare batch tensors
            center_words = torch.tensor(
                [item[0] for item in batch], dtype=torch.long, device=model_device
            )
            context_words = torch.tensor(
                [item[1] for item in batch], dtype=torch.long, device=model_device
            )
            # Stack into one ndarray first; building a tensor straight from
            # a list of ndarrays goes through a slow element-wise path.
            neg_words = torch.tensor(
                np.array([item[2] for item in batch]), dtype=torch.long, device=model_device
            )

            # Forward pass
            loss = model(center_words, context_words, neg_words)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # max(..., 1) only matters for empty training data (loss stays 0.0)
        avg_loss = epoch_loss / max(num_batches, 1)
        losses.append(avg_loss)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    return losses

# Fit the embeddings on the skip-gram triples
print("Training Word2Vec model...")
losses = train_word2vec(model, train_data, vocab, epochs=100, lr=0.01)

# Draw the per-epoch average loss curve
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Word2Vec Training Loss')
ax.grid(True, alpha=0.3)
plt.show()

## Part 5: Explore Semantic Relationships

In [None]:
def find_nearest_neighbors(model, vocab, word, k=5):
    """Return the ``k`` vocabulary words most cosine-similar to ``word``.

    Args:
        model: Object exposing ``get_embedding(idx)`` returning a NumPy vector.
        vocab: Vocabulary providing ``encode``, ``decode`` and ``__len__``.
        word: Query word.
        k: Maximum number of neighbors to return.

    Returns:
        List of ``(neighbor_word, cosine_similarity)`` pairs, best first.
        The query word itself is excluded. When ``word`` is unknown a
        message is printed and an empty list is returned.
    """
    word_idx = vocab.encode(word)
    if word_idx == -1:
        print(f"Word '{word}' not in vocabulary")
        return []

    target_embed = model.get_embedding(word_idx)
    target_norm = np.linalg.norm(target_embed)

    def cosine_to_target(other):
        # Cosine similarity between the query vector and another vector
        return np.dot(target_embed, other) / (target_norm * np.linalg.norm(other))

    # Score every other word in the vocabulary
    scored = [
        (vocab.decode(idx), cosine_to_target(model.get_embedding(idx)))
        for idx in range(len(vocab))
        if idx != word_idx
    ]

    # Highest similarity first; ties keep vocabulary order
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:k]

# Probe the trained embeddings with a handful of query words
test_words = ['cat', 'king', 'paris', 'student']

for word in test_words:
    # Silently skip query words that never occurred in the corpus
    if vocab.encode(word) == -1:
        continue
    print(f"\nNearest neighbors to '{word}':")
    neighbors = find_nearest_neighbors(model, vocab, word, k=5)
    for neighbor, sim in neighbors:
        print(f"  {neighbor}: {sim:.3f}")

In [None]:
def analogy(model, vocab, a, b, c, k=5):
    """Answer the analogy ``a : b :: c : ?`` by vector arithmetic.

    The query vector ``b - a + c`` is compared by cosine similarity against
    every vocabulary word other than ``a``, ``b`` and ``c``.
    Example: king:queen :: man:woman

    Args:
        model: Object exposing ``get_embedding(idx)`` returning a NumPy vector.
        vocab: Vocabulary providing ``encode``, ``decode`` and ``__len__``.
        a, b, c: The three known words of the analogy.
        k: Maximum number of candidates to return.

    Returns:
        List of ``(word, cosine_similarity)`` pairs, best first. When any of
        the three words is unknown a message is printed and an empty list is
        returned.
    """
    query_ids = [vocab.encode(term) for term in (a, b, c)]
    if -1 in query_ids:
        print("One or more words not in vocabulary")
        return []

    a_vec, b_vec, c_vec = (model.get_embedding(idx) for idx in query_ids)

    # Query vector: b - a + c
    target = b_vec - a_vec + c_vec
    target_norm = np.linalg.norm(target)

    candidates = []
    for idx in range(len(vocab)):
        # The three input words are never valid answers
        if idx in query_ids:
            continue
        vec = model.get_embedding(idx)
        score = np.dot(target, vec) / (target_norm * np.linalg.norm(vec))
        candidates.append((vocab.decode(idx), score))

    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    return ranked[:k]

# Try a couple of analogies (with such a small corpus, results may vary)
print("Testing analogies:")

analogy_demos = [
    ("\nking : queen :: paris : ?", ('king', 'queen', 'paris')),
    ("\ncat : cats :: dog : ?", ('cat', 'cats', 'dog')),
]

for header, (first, second, third) in analogy_demos:
    print(header)
    results = analogy(model, vocab, first, second, third, k=3)
    for word, sim in results:
        print(f"  {word}: {sim:.3f}")

## Part 6: Visualize Embeddings

In [None]:
def visualize_embeddings(model, vocab, method='tsne', n_words=30):
    """
    Visualize word embeddings in 2D

    Args:
        model: Object exposing ``get_embedding(idx)`` returning a NumPy vector.
        vocab: Vocabulary providing ``word_counts`` (a Counter) and ``encode``.
        method: ``'tsne'`` for t-SNE; any other value selects PCA.
        n_words: Maximum number of most frequent words to plot.

    Fewer than ``n_words`` points are drawn when the vocabulary is smaller.
    With fewer than two plottable words a message is printed and nothing is
    drawn.
    """
    # Collect the most frequent words that actually have an embedding row.
    # ``word_counts`` also holds words dropped by ``min_count``; those encode
    # to -1, which would silently index the LAST embedding row.
    selected = []
    for word, _ in vocab.word_counts.most_common():
        if len(selected) >= n_words:
            break
        idx = vocab.encode(word)
        if idx != -1:
            selected.append((word, idx))

    if len(selected) < 2:
        print("Need at least two in-vocabulary words to visualize embeddings")
        return

    # Get embeddings
    embeddings = np.array([model.get_embedding(idx) for _, idx in selected])

    # Reduce dimensions
    if method == 'tsne':
        # t-SNE needs perplexity < number of samples, so base it on the
        # number of points really being plotted, not on the requested n_words.
        perplexity = min(5, len(selected) - 1)
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    else:  # PCA
        reducer = PCA(n_components=2)

    embeddings_2d = reducer.fit_transform(embeddings)

    # Plot
    plt.figure(figsize=(12, 8))
    plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)

    # Add labels
    for i, (word, _) in enumerate(selected):
        plt.annotate(word, xy=(embeddings_2d[i, 0], embeddings_2d[i, 1]),
                    xytext=(5, 2), textcoords='offset points',
                    fontsize=9, alpha=0.7)

    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title(f'Word Embeddings Visualization ({method.upper()})')
    plt.grid(True, alpha=0.3)
    plt.show()

# Project the 25 most frequent words to 2D, first with t-SNE and then with PCA
projection_runs = (
    ("t-SNE visualization:", 'tsne'),
    ("\nPCA visualization:", 'pca'),
)
for banner, projection in projection_runs:
    print(banner)
    visualize_embeddings(model, vocab, method=projection, n_words=25)

## Part 7: Compare with Pre-trained Embeddings (Optional)

In [None]:
# This section requires gensim
# !pip install gensim

# Optional comparison against pre-trained vectors. The whole cell is
# best-effort: a missing gensim install, a failed download or a failed query
# prints a message instead of stopping the notebook.
try:
    from gensim.models import KeyedVectors
    import gensim.downloader as api

    # The identifier below names GloVe vectors (not Word2Vec), so the
    # messages say "GloVe" to avoid mislabelling what is being compared.
    print("Loading pre-trained GloVe vectors (this may take a moment)...")
    # Load a smaller pre-trained model
    pretrained = api.load('glove-wiki-gigaword-50')

    # Compare nearest neighbors
    test_word = 'king'
    if test_word in pretrained:
        print(f"\nPre-trained model - Nearest neighbors to '{test_word}':")
        for word, sim in pretrained.most_similar(test_word, topn=5):
            print(f"  {word}: {sim:.3f}")

    # Test analogy
    print("\nPre-trained model - king : queen :: man : ?")
    result = pretrained.most_similar(positive=['queen', 'man'], negative=['king'], topn=3)
    for word, sim in result:
        print(f"  {word}: {sim:.3f}")

except ImportError:
    print("Gensim not installed. To compare with pre-trained models, install with:")
    print("pip install gensim")
except Exception as e:
    # Covers both the download and the queries above, hence "load or query"
    print(f"Could not load or query pre-trained model: {e}")

## Part 8: Save and Load Model

In [None]:
# Save the trained model
model_path = 'word2vec_model.pt'
torch.save({
    'model_state_dict': model.state_dict(),
    'vocab_size': vocab.vocab_size,
    'embed_dim': embed_dim,
    # Store the vocabulary as plain dicts rather than the Vocabulary
    # instance. Pickling a custom class makes torch.load fail under its
    # weights_only=True default (PyTorch 2.6+), and ties the file to this
    # notebook's class definition.
    'word2idx': dict(vocab.word2idx),
    'word_counts': dict(vocab.word_counts),
}, model_path)

print(f"Model saved to {model_path}")

# Load the model
checkpoint = torch.load(model_path, map_location=device)
loaded_model = Word2Vec(checkpoint['vocab_size'], checkpoint['embed_dim']).to(device)
loaded_model.load_state_dict(checkpoint['model_state_dict'])
loaded_model.eval()

print("Model loaded successfully!")

# Verify loaded model works (uses the in-memory vocab for word lookups)
test_word = 'cat'
if vocab.encode(test_word) != -1:
    print(f"\nTesting loaded model - Nearest neighbors to '{test_word}':")
    neighbors = find_nearest_neighbors(loaded_model, vocab, test_word, k=3)
    for neighbor, sim in neighbors:
        print(f"  {neighbor}: {sim:.3f}")

## Exercises

1. **Experiment with hyperparameters:**
   - Try different embedding dimensions (50, 100, 200)
   - Vary the window size (1, 3, 5)
   - Change the number of negative samples

2. **Use a larger corpus:**
   - Download a book from Project Gutenberg
   - Train on Wikipedia articles
   - Compare results with the small corpus

3. **Implement CBOW:**
   - Modify the model to implement CBOW instead of Skip-gram
   - Compare the results

4. **Explore bias:**
   - Test for gender bias in embeddings
   - Try debiasing techniques

5. **Advanced visualizations:**
   - Create interactive plots with plotly
   - Visualize semantic clusters
   - Show analogy vectors

## Summary

In this lab, you have:
- Built Word2Vec from scratch in PyTorch
- Implemented the Skip-gram architecture with negative sampling
- Trained embeddings on text data
- Explored semantic relationships and analogies
- Visualized embedding spaces
- Learned how to save and load trained models

Key takeaways:
- Word embeddings capture semantic relationships
- Similar words have similar vectors
- Vector arithmetic can encode analogies
- Quality depends on corpus size and diversity

Next week: Using these embeddings in Recurrent Neural Networks!