# Word Embeddings Discovery Notebook

This notebook accompanies the pre-class discovery handout. Use it to experiment with word embeddings and complete the activities.

**Author**: NLP Course Team  
**Prerequisites**: Basic Python, NumPy

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from typing import List, Dict, Tuple

# Set up visualization
%matplotlib inline
plt.style.use('seaborn-v0_8-whitegrid')

# Educational color scheme
# Hex RGB strings shared by the plotting cells below so that every figure in
# the notebook draws from the same palette.
COLOR_CURRENT = '#FF6B6B'  # Red: sparsity bars, vehicles cluster, first operand / static 'bank'
COLOR_CONTEXT = '#4ECDC4'  # Teal: animals cluster, second and third analogy operands
COLOR_PREDICT = '#95E77E'  # Green: emotions cluster, nearest-word answer, contextual 'bank'
COLOR_NEUTRAL = '#E0E0E0'  # Gray: zero entries in the example one-hot vector

## Activity 1: String Similarity vs Semantic Similarity

Let's explore why character-based similarity doesn't capture meaning.

In [None]:
def string_similarity(w1: str, w2: str) -> float:
    """Calculate character-based similarity between two words.

    Characters are compared position by position; the number of matching
    positions is divided by the length of the longer word, so the score
    lies in ``[0.0, 1.0]``.

    Args:
        w1: First word.
        w2: Second word.

    Returns:
        Fraction of positions (relative to the longer word) holding the same
        character. Two empty strings are treated as identical and score 1.0.
    """
    longest = max(len(w1), len(w2))
    if longest == 0:
        # Both words are empty: avoid dividing by zero and report a perfect
        # match, consistent with any other pair of identical strings.
        return 1.0

    # zip stops at the shorter word, so surplus characters of the longer
    # word can never match and only enlarge the denominator.
    matches = sum(c1 == c2 for c1, c2 in zip(w1, w2))
    return matches / longest

# Word pairs to score; they mix look-alikes with related-meaning pairs
word_pairs = [
    ('cat', 'dog'),
    ('cat', 'car'),
    ('cat', 'kitten'),
    ('happy', 'joyful'),
    ('bank', 'tank'),  # Similar spelling, different meaning
]

# Header and rule are emitted by a single print call, one item per line
print("Character-based Similarity:", "-" * 40, sep="\n")
for w1, w2 in word_pairs:
    sim = string_similarity(w1, w2)
    print(f"{w1:8} vs {w2:8} = {sim:.3f}")

### Reflection Question
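Notice how 'bank' and 'tank' score 0.75 on character similarity yet have completely different meanings, while the synonyms 'happy' and 'joyful' score 0. Spelling tells us very little about meaning, which is why we need better representations!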

## Activity 2: One-Hot Encoding

Let's implement one-hot encoding and see its limitations.

In [None]:
class OneHotEncoder:
    """Turn vocabulary words into one-hot vectors.

    Every word owns one dimension: its vector is all zeros except for a
    single 1 at the word's index in the vocabulary.
    """

    def __init__(self, vocabulary: List[str]):
        self.vocabulary = vocabulary
        self.vocab_size = len(vocabulary)
        # Position of each word in the vocabulary (a repeated word keeps its
        # last position).
        self.word_to_idx = dict(zip(vocabulary, range(self.vocab_size)))

    def encode(self, word: str) -> np.ndarray:
        """Return the one-hot vector for ``word``.

        A word that is not in the vocabulary yields an all-zero vector.
        """
        one_hot = np.zeros(self.vocab_size)
        position = self.word_to_idx.get(word)
        if position is not None:
            one_hot[position] = 1
        return one_hot

    def similarity(self, word1: str, word2: str) -> float:
        """Return the dot product of the two words' one-hot vectors."""
        return np.dot(self.encode(word1), self.encode(word2))

# Build an encoder over a tiny seven-word vocabulary
vocab = ['cat', 'dog', 'mat', 'sat', 'hat', 'kitten', 'puppy']
encoder = OneHotEncoder(vocab)

# Show the raw vectors for a few words
print("One-Hot Encodings:", "-" * 40, sep="\n")
for word in ('cat', 'dog', 'kitten'):
    vec = encoder.encode(word)
    print(f"{word:8} = {vec}")

# Dot-product similarity: only a word paired with itself scores above zero
print("\nSimilarities (using dot product):", "-" * 40, sep="\n")
test_pairs = [('cat', 'dog'), ('cat', 'kitten'), ('cat', 'cat')]
for w1, w2 in test_pairs:
    sim = encoder.similarity(w1, w2)
    print(f"{w1:8} vs {w2:8} = {sim:.1f}")

In [None]:
# Visualize the sparsity problem
def visualize_one_hot_sparsity(vocab_size: int):
    """Show how sparse one-hot vectors become with large vocabularies.

    Draws a two-panel figure. The left panel is a bar chart of the fraction
    of zero entries in a one-hot vector for a series of vocabulary sizes:
    every power of ten below ``vocab_size`` followed by ``vocab_size``
    itself (for 50000 that is 10, 100, 1,000, 10,000 and 50,000). The right
    panel shows one example one-hot vector over a 50-word vocabulary.

    Args:
        vocab_size: Largest vocabulary size to include; must be at least 1.

    Raises:
        ValueError: If ``vocab_size`` is smaller than 1.
    """
    if vocab_size < 1:
        raise ValueError("vocab_size must be a positive integer")

    # Powers of ten strictly below vocab_size, then vocab_size itself, so the
    # chart always ends at the size the caller asked about.
    sizes = []
    power = 10
    while power < vocab_size:
        sizes.append(power)
        power *= 10
    sizes.append(vocab_size)

    # A one-hot vector of length s has exactly one non-zero entry.
    sparsity = [1 - 1/s for s in sizes]
    
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.bar(range(len(sizes)), sparsity, color=COLOR_CURRENT)
    plt.xticks(range(len(sizes)), [f'{s:,}' for s in sizes])
    plt.ylabel('Percentage of Zeros')
    plt.xlabel('Vocabulary Size')
    plt.title('Sparsity of One-Hot Vectors')
    plt.ylim(0, 1.05)
    
    # Add percentage labels just above each bar
    for i, s in enumerate(sparsity):
        plt.text(i, s + 0.02, f'{s*100:.2f}%', ha='center', fontweight='bold')
    
    plt.subplot(1, 2, 2)
    # Show a single one-hot vector for a fixed 50-word example vocabulary
    vec = np.zeros(50)
    vec[5] = 1  # Word at index 5
    # Highlight the single active dimension; all zero entries use the neutral color
    plt.bar(range(50), vec, color=np.where(vec == 1, COLOR_CURRENT, COLOR_NEUTRAL))
    plt.xlabel('Dimension')
    plt.ylabel('Value')
    plt.title('Example: One-Hot Vector (50-word vocabulary)')
    plt.ylim(0, 1.2)
    
    plt.tight_layout()
    plt.show()

# Render the two-panel sparsity figure defined above.
visualize_one_hot_sparsity(50000)

## Activity 3: Dense Embeddings - 2D Word Space

Now let's work with dense vectors where each word is represented by just a few numbers.

In [None]:
# Simple 2D embeddings (normally these would be learned from data)
# Each word maps to a hand-picked [dimension 1, dimension 2] coordinate; the
# values are chosen so that related words sit close together in the plane.
word_embeddings = {
    # Animals cluster (around x = 1.5-3.5, y = 2.5-3.5)
    'cat': [2.0, 3.0],
    'dog': [3.0, 3.0],
    'kitten': [1.5, 2.5],
    'puppy': [3.5, 2.5],
    'pet': [2.5, 3.5],
    
    # Vehicles cluster (around x = 8-9, y = 0.5-2)
    'car': [8.0, 1.0],
    'truck': [9.0, 1.5],
    'vehicle': [8.5, 0.5],
    'bus': [8.5, 2.0],
    
    # Emotions cluster (around x = 4-5.5, y = 6-8.5)
    'happy': [5.0, 8.0],
    'joyful': [5.5, 8.5],
    'sad': [5.0, 6.0],
    'angry': [4.0, 6.5]
}

def euclidean_distance(vec1: List[float], vec2: List[float]) -> float:
    """Return the straight-line (L2) distance between ``vec1`` and ``vec2``.

    Components are paired positionally with ``zip``, so any extra trailing
    components of the longer vector are ignored.
    """
    squared_gaps = [(p - q)**2 for p, q in zip(vec1, vec2)]
    return np.sqrt(sum(squared_gaps))

def cosine_similarity(vec1: List[float], vec2: List[float]) -> float:
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    The dot product is divided by the product of the two vector lengths.
    When that product is not positive (for example, one vector is all
    zeros) the angle is undefined and 0 is returned instead.
    """
    dot_product = sum(p * q for p, q in zip(vec1, vec2))
    length1 = np.sqrt(sum(p**2 for p in vec1))
    length2 = np.sqrt(sum(q**2 for q in vec2))
    scale = length1 * length2

    if scale > 0:
        return dot_product / scale
    return 0

# Compare a handful of word pairs under both distance measures
print("Dense Embedding Distances:", "-" * 50, sep="\n")
test_pairs = [
    ('cat', 'dog'),      # Same category
    ('cat', 'kitten'),   # Very similar
    ('cat', 'car'),      # Different categories
    ('happy', 'joyful'), # Synonyms
    ('happy', 'sad'),    # Antonyms
]

for w1, w2 in test_pairs:
    # Look up both coordinates at once, then score the pair two ways
    vec1, vec2 = word_embeddings[w1], word_embeddings[w2]
    dist = euclidean_distance(vec1, vec2)
    cos_sim = cosine_similarity(vec1, vec2)
    print(f"{w1:8} vs {w2:8}: distance={dist:.2f}, cosine_sim={cos_sim:.3f}")

In [None]:
def plot_word_space(embeddings: Dict[str, List[float]], highlight_pairs=None):
    """Visualize words in 2D space.

    Each word is drawn as a labelled dot, colored by a built-in category
    (animals, vehicles, emotions). Words that belong to none of these
    categories are still drawn, in the neutral color, and listed under an
    'Other' legend entry, so the function works for any 2D embedding dict.

    Args:
        embeddings: Mapping from word to its ``[x, y]`` coordinate.
        highlight_pairs: Optional iterable of ``(word1, word2)`` tuples. Each
            pair whose words are both in ``embeddings`` is joined by a dashed
            line labelled with the Euclidean distance; other pairs are
            skipped.
    """
    from matplotlib.patches import Patch

    plt.figure(figsize=(12, 8))
    
    # Define categories for coloring
    categories = {
        'animals': ['cat', 'dog', 'kitten', 'puppy', 'pet'],
        'vehicles': ['car', 'truck', 'vehicle', 'bus'],
        'emotions': ['happy', 'joyful', 'sad', 'angry']
    }
    
    colors = {
        'animals': COLOR_CONTEXT,
        'vehicles': COLOR_CURRENT,
        'emotions': COLOR_PREDICT
    }

    def draw_word(word, color):
        """Draw one labelled dot for `word` in the given color."""
        x, y = embeddings[word]
        plt.scatter(x, y, s=200, c=color, 
                  alpha=0.7, edgecolors='black', linewidths=2)
        plt.annotate(word, (x, y), fontsize=10, fontweight='bold',
                   ha='center', va='center')
    
    # Plot words by category, remembering which ones were drawn
    categorized = set()
    for category, words in categories.items():
        for word in words:
            if word in embeddings:
                draw_word(word, colors[category])
                categorized.add(word)

    # Words outside the built-in categories would otherwise vanish from the
    # plot; draw them in the neutral color instead.
    uncategorized = [word for word in embeddings if word not in categorized]
    for word in uncategorized:
        draw_word(word, COLOR_NEUTRAL)
    
    # Highlight specific pairs if requested
    if highlight_pairs:
        for w1, w2 in highlight_pairs:
            if w1 in embeddings and w2 in embeddings:
                x1, y1 = embeddings[w1]
                x2, y2 = embeddings[w2]
                plt.plot([x1, x2], [y1, y2], 'k--', alpha=0.3, linewidth=2)
                
                # Add distance label at the midpoint of the connecting line
                mid_x, mid_y = (x1 + x2) / 2, (y1 + y2) / 2
                dist = euclidean_distance(embeddings[w1], embeddings[w2])
                plt.text(mid_x, mid_y, f'{dist:.1f}', fontsize=8,
                        bbox=dict(boxstyle='round,pad=0.3', 
                                 facecolor='yellow', alpha=0.7))
    
    plt.xlabel('Dimension 1', fontsize=12)
    plt.ylabel('Dimension 2', fontsize=12)
    plt.title('Word Embeddings in 2D Space - Similar Words Cluster Together',
             fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    
    # Add legend: one swatch per category, plus 'Other' only when needed
    legend_elements = [Patch(facecolor=colors[cat], label=cat.capitalize())
                      for cat in categories]
    if uncategorized:
        legend_elements.append(Patch(facecolor=COLOR_NEUTRAL, label='Other'))
    plt.legend(handles=legend_elements, loc='upper right')
    
    plt.tight_layout()
    plt.show()

# Visualize the word space, linking one within-cluster pair (cat/kitten) and
# one cross-cluster pair (cat/car) so their distances can be compared
plot_word_space(word_embeddings, highlight_pairs=[('cat', 'kitten'), ('cat', 'car')])

## Activity 4: Word Arithmetic

One of the most fascinating properties of word embeddings is that they can capture analogies through vector arithmetic.

In [None]:
# Simplified embeddings that demonstrate relationships
# Within each group, every pair of related words is separated by the same
# offset vector; that shared offset is what makes `a - b + c` land exactly on
# the fourth word of an analogy.
analogy_embeddings = {
    # Gender relationship: male -> female is the offset (0, -1)
    'king': [5.0, 3.0],
    'queen': [5.0, 2.0],
    'man': [4.0, 3.0],
    'woman': [4.0, 2.0],
    
    # Country-Capital relationship: capital -> country is the offset (+1, +1)
    'Paris': [2.0, 5.0],
    'France': [3.0, 6.0],
    'Berlin': [6.0, 5.0],
    'Germany': [7.0, 6.0],
    'London': [10.0, 5.0],
    'England': [11.0, 6.0],
    
    # Size relationship: base -> comparative is the offset (+0.5, +0.5).
    # 'smaller' must use the same offset as 'bigger'; otherwise
    # small - big + bigger does not land on it.
    'small': [1.0, 1.0],
    'smaller': [1.5, 1.5],
    'big': [3.0, 3.0],
    'bigger': [3.5, 3.5]
}

def word_arithmetic(embeddings: Dict, word1: str, word2: str, word3: str) -> np.ndarray:
    """
    Combine three word vectors as word1 - word2 + word3.

    Subtracting word2 from word1 isolates the relationship between them;
    adding word3 applies that relationship to a new word.
    Example: king - man + woman = queen
    """
    # Look the three words up in order and convert each to an array
    base, removed, added = (
        np.array(embeddings[word]) for word in (word1, word2, word3)
    )
    return base - removed + added

def find_nearest_word(embeddings: Dict, target_vec: np.ndarray, 
                     exclude_words: List[str] = None) -> Tuple[str, float]:
    """Find the word whose embedding is closest to the target vector.

    Args:
        embeddings: Mapping from word to its coordinate list.
        target_vec: Point to search around; a NumPy array or a plain
            sequence of numbers.
        exclude_words: Words to leave out of the search (typically the
            operands of the analogy). ``None`` means nothing is excluded.

    Returns:
        ``(word, distance)`` for the closest remaining word by Euclidean
        distance; on ties the word that appears first in ``embeddings`` wins.
        If no candidate remains, returns ``(None, inf)``.
    """
    # A set gives constant-time membership checks inside the loop.
    excluded = set(exclude_words) if exclude_words else set()

    # Convert the target once, outside the loop; np.asarray also lets callers
    # pass a plain list instead of an array.
    target = np.asarray(target_vec).tolist()
    
    best_word = None
    best_distance = float('inf')
    
    for word, vec in embeddings.items():
        if word in excluded:
            continue
        dist = euclidean_distance(vec, target)
        # Strict comparison keeps the first word seen when distances tie
        if dist < best_distance:
            best_distance = dist
            best_word = word
    
    return best_word, best_distance

# Test word arithmetic
print("Word Arithmetic Examples:")
print("=" * 50)

# Each entry is (word1, word2, word3, expected answer) for the query
# word1 - word2 + word3 = ?
analogy_examples = [
    ('king', 'man', 'woman', 'queen'),
    ('Paris', 'France', 'Germany', 'Berlin'),
    ('small', 'big', 'bigger', 'smaller'),
]

for position, (first, second, third, expected) in enumerate(analogy_examples):
    if position:
        print()  # blank line between examples, none after the last

    result = word_arithmetic(analogy_embeddings, first, second, third)
    # The operands themselves are excluded so they cannot be the answer
    nearest, dist = find_nearest_word(analogy_embeddings, result,
                                     exclude_words=[first, second, third])
    print(f"{first} - {second} + {third} = {result}")
    print(f"Nearest word: {nearest} (distance: {dist:.2f})")
    print(f"Expected: {expected}")

In [None]:
def visualize_word_arithmetic(embeddings: Dict, w1: str, w2: str, w3: str):
    """Plot the analogy w1 - w2 + w3 as points and arrows in 2D.

    The three operand words, the computed result point, and the nearest
    vocabulary word (operands excluded) are drawn as markers; dashed arrows
    trace the path w1 -> w2 -> w3 and a solid arrow leads from w3 to the
    result.
    """
    plt.figure(figsize=(10, 8))

    # Operand vectors and the arithmetic result
    start, removed, added = (np.array(embeddings[w]) for w in (w1, w2, w3))
    result = start - removed + added

    # Closest real word to the result point
    nearest, _ = find_nearest_word(embeddings, result, exclude_words=[w1, w2, w3])
    answer = np.array(embeddings[nearest])

    # Markers: (point, size, color, shape), drawn in this order
    outline = dict(edgecolors='black', linewidths=2, zorder=5)
    markers = [
        (start, 300, COLOR_CURRENT, '*'),
        (removed, 200, COLOR_CONTEXT, 'o'),
        (added, 200, COLOR_CONTEXT, 'o'),
        (result, 300, 'yellow', 'D'),
        (answer, 300, COLOR_PREDICT, '*'),
    ]
    for point, size, colour, shape in markers:
        plt.scatter(*point, s=size, c=colour, marker=shape, **outline)

    # Text labels next to each marker
    bold = dict(fontsize=12, fontweight='bold')
    for label, point in ((w1, start), (w2, removed), (w3, added)):
        plt.annotate(label, point, xytext=(5, 5), textcoords='offset points',
                     **bold)
    plt.annotate('result', result, xytext=(5, -15), textcoords='offset points',
                 fontsize=11, style='italic', color='orange')
    plt.annotate(nearest, answer, xytext=(5, 5), textcoords='offset points',
                 color=COLOR_PREDICT, **bold)

    def draw_arrow(tail, tip, **style):
        """Draw an arrow that starts at `tail` and ends at `tip`."""
        plt.arrow(tail[0], tail[1], tip[0] - tail[0], tip[1] - tail[1], **style)

    # w1 -> w2 (the part being subtracted)
    draw_arrow(start, removed, head_width=0.1, head_length=0.1,
               fc='red', ec='red', linestyle='--', alpha=0.5)
    # w2 -> w3 (the part being added)
    draw_arrow(removed, added, head_width=0.1, head_length=0.1,
               fc='blue', ec='blue', linestyle='--', alpha=0.5)
    # w3 -> result
    draw_arrow(added, result, head_width=0.15, head_length=0.15,
               fc='orange', ec='orange', linewidth=2)

    plt.title(f'Word Arithmetic: {w1} - {w2} + {w3} = {nearest}',
              fontsize=14, fontweight='bold')
    plt.xlabel('Dimension 1', fontsize=12)
    plt.ylabel('Dimension 2', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

# Visualize the king-queen analogy: plots king - man + woman and the nearest
# remaining word in analogy_embeddings
visualize_word_arithmetic(analogy_embeddings, 'king', 'man', 'woman')

## Activity 5: Context Matters - Static vs Contextual Embeddings

Let's explore why modern NLP uses contextual embeddings.

In [None]:
# Simulate contextual embeddings
class ContextualEmbedding:
    """Simulate how contextual embeddings work.

    Every word has a fixed (static) 2D position. The ambiguous word 'bank'
    is the exception: its position is moved toward the financial words or
    toward the nature words depending on which kind of cue word dominates
    the surrounding context.
    """

    # Cue words that pull 'bank' toward one sense or the other. Defined once
    # on the class instead of being rebuilt on every lookup.
    FINANCIAL_WORDS = frozenset({'money', 'account', 'deposit', 'loan', 'savings'})
    NATURE_WORDS = frozenset({'river', 'water', 'fish', 'boat', 'shore'})
    
    def __init__(self):
        # Base embeddings (static)
        self.static_embeddings = {
            'bank': [5.0, 5.0],  # Ambiguous position
            'money': [8.0, 8.0],
            'river': [2.0, 2.0],
            'water': [1.0, 3.0],
            'account': [9.0, 7.0],
            'fish': [2.0, 1.0]
        }
        
    def get_contextual_embedding(self, word: str, context: List[str]) -> List[float]:
        """Get embedding based on context.

        Args:
            word: The word to embed.
            context: Tokens of the sentence the word appears in. Cue words
                are matched case-insensitively, so a sentence-initial
                'Money' counts the same as 'money'.

        Returns:
            For any word other than 'bank': a copy of its static embedding,
            or ``[0, 0]`` if the word is unknown. For 'bank': a position near
            the financial words, near the nature words, or the ambiguous
            default when neither kind of cue is in the majority.
        """
        if word != 'bank':
            # Return a copy so callers cannot mutate the stored embedding.
            return list(self.static_embeddings.get(word, [0, 0]))
        
        # Adjust 'bank' embedding based on context
        tokens = [token.lower() for token in context]
        financial_score = sum(token in self.FINANCIAL_WORDS for token in tokens)
        nature_score = sum(token in self.NATURE_WORDS for token in tokens)
        
        if financial_score > nature_score:
            return [8.0, 7.5]  # Near financial terms
        if nature_score > financial_score:
            return [2.0, 2.5]  # Near nature terms
        return [5.0, 5.0]  # Default ambiguous position

# Test contextual embeddings
embedder = ContextualEmbedding()

# Three sentences containing 'bank', each already split into tokens
contexts = [
    ['I', 'deposited', 'money', 'in', 'the', 'bank'],
    ['We', 'sat', 'by', 'the', 'river', 'bank'],
    ['The', 'bank', 'has', 'many', 'branches'],  # Ambiguous!
]

print("Contextual Embeddings for 'bank':")
print("=" * 50)

for i, context in enumerate(contexts, 1):
    embedding = embedder.get_contextual_embedding('bank', context)
    print(f"Context {i}: {' '.join(context)}")
    print(f"  Embedding: {embedding}")

    # Distance from this sense of 'bank' to every other static word
    distances = {
        word: euclidean_distance(embedding, vec)
        for word, vec in embedder.static_embeddings.items()
        if word != 'bank'
    }

    nearest = min(distances, key=distances.get)
    print(f"  Nearest word: {nearest} (distance: {distances[nearest]:.2f})")
    print()

In [None]:
def visualize_contextual_embeddings():
    """Draw a side-by-side comparison of static and contextual 'bank'.

    Left panel: every static embedding, with the single 'bank' point
    highlighted and a '?' marking its ambiguity. Right panel: the same
    context words plus two context-dependent 'bank' points, each joined by a
    dashed line to a word from its own sense ('money' and 'river').
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    embedder = ContextualEmbedding()
    static = embedder.static_embeddings
    outline = dict(edgecolors='black', linewidths=2, zorder=5)

    def label_word(ax, word, pos):
        """Write the word next to its point."""
        ax.annotate(word, pos, xytext=(5, 5), textcoords='offset points',
                    fontsize=10)

    def draw_context_word(ax, word, pos):
        """Draw a non-'bank' word: gold for financial terms, light blue otherwise."""
        colour = 'gold' if word in ['money', 'account'] else 'lightblue'
        ax.scatter(*pos, s=200, c=colour, marker='o', **outline)
        label_word(ax, word, pos)

    def draw_sense(ax, pos, caption):
        """Draw one context-specific 'bank' star with its caption."""
        ax.scatter(*pos, s=400, c=COLOR_PREDICT, marker='*', **outline)
        ax.annotate(caption, pos,
                    xytext=(5, -15), textcoords='offset points',
                    fontsize=10, fontweight='bold', ha='center')

    def link(ax, start, end):
        """Join two points with a dashed green line."""
        ax.plot([start[0], end[0]], [start[1], end[1]],
                'g--', alpha=0.5, linewidth=2)

    # Left panel: the static embedding problem
    ax1.set_title('Problem: Static Embedding for "bank"', fontsize=12, fontweight='bold')
    for word, pos in static.items():
        if word == 'bank':
            ax1.scatter(*pos, s=400, c=COLOR_CURRENT, marker='*', **outline)
            label_word(ax1, word, pos)
        else:
            draw_context_word(ax1, word, pos)
    ax1.text(5, 6.5, '?', fontsize=30, fontweight='bold', color='red')

    # Right panel: the contextual embedding solution
    ax2.set_title('Solution: Contextual Embeddings', fontsize=12, fontweight='bold')
    for word, pos in static.items():
        if word != 'bank':
            draw_context_word(ax2, word, pos)

    bank_financial = embedder.get_contextual_embedding('bank', ['money', 'account'])
    bank_river = embedder.get_contextual_embedding('bank', ['river', 'water'])

    draw_sense(ax2, bank_financial, 'bank\n(financial)')
    draw_sense(ax2, bank_river, 'bank\n(river)')

    link(ax2, bank_financial, static['money'])
    link(ax2, bank_river, static['river'])

    # Both panels share the same axis limits, grid and labels
    for ax in (ax1, ax2):
        ax.set_xlim(0, 10)
        ax.set_ylim(0, 10)
        ax.grid(True, alpha=0.3)
        ax.set_xlabel('Dimension 1')
        ax.set_ylabel('Dimension 2')

    plt.suptitle('Context Matters: Same Word, Different Meanings',
                 fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Render the static-vs-contextual comparison figure defined above.
visualize_contextual_embeddings()

## Summary and Next Steps

### What You've Discovered

1. **Character similarity ≠ Semantic similarity**: Words that are spelled similarly may have completely different meanings.

2. **One-hot encoding limitations**:
   - All distinct words are mutually orthogonal (dot product 0) and therefore equidistant
   - Extremely sparse representations
   - No semantic relationships captured

3. **Dense embeddings advantages**:
   - Similar words cluster together
   - Compact representations (e.g., 100-300 dimensions vs 50,000+)
   - Can measure meaningful similarities

4. **Word arithmetic magic**:
   - Embeddings capture analogies
   - Relationships are encoded as vector differences
   - king - man + woman ≈ queen

5. **Context matters**:
   - Static embeddings can't handle polysemy (multiple meanings)
   - Contextual embeddings (BERT, GPT) solve this by creating different vectors based on context

### Questions for Class Discussion

1. How do you think computers actually *learn* these embeddings from text?
2. What's the optimal number of dimensions for embeddings? (Hint: it's usually 100-300, but why?)
3. Can we use embeddings for things other than words? (sentences? documents? images?)
4. What are the ethical implications of word embeddings learning from biased data?

### Try This Before Class

Think of other word relationships that might work with arithmetic:
- puppy - dog + cat = ?
- Tokyo - Japan + France = ?
- walked - walk + run = ?

### Resources for Further Exploration

- [The Illustrated Word2vec](https://jalammar.github.io/illustrated-word2vec/)
- [The Illustrated BERT](https://jalammar.github.io/illustrated-bert/)
- [GloVe: Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/)
