# Week 4: Sequence-to-Sequence Models - Interactive Lab

## Learning Objectives
- Build a basic encoder-decoder model from scratch
- Visualize the information bottleneck problem
- Implement an attention mechanism step by step
- See seq2seq models in action with translation


In [None]:
# Setup and imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple, Dict
import warnings
# Silence library warnings so the lab output stays readable.
warnings.filterwarnings('ignore')

# Set style. Matplotlib 3.6 renamed its bundled seaborn styles with a
# 'seaborn-v0_8-' prefix; fall back to the pre-3.6 name so the very first
# cell does not fail on an older installation.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
sns.set_palette('husl')

# Helper function to create clean visualizations
def plot_matrix(matrix, xlabel='', ylabel='', title='', cmap='Blues', figsize=(8, 6)):
    """Draw ``matrix`` as an annotated seaborn heatmap and show it.

    A new figure of size ``figsize`` is opened, every cell is annotated with
    its value formatted to two decimals, the axis labels and a bold title are
    applied, and the figure is displayed with ``plt.show()``.
    """
    heatmap_options = {
        'annot': True,
        'fmt': '.2f',
        'cmap': cmap,
        'cbar': True,
        'square': True,
        'linewidths': 1,
        'linecolor': 'gray',
    }

    plt.figure(figsize=figsize)
    sns.heatmap(matrix, **heatmap_options)

    # Both axis labels share the same font size.
    for set_axis_label, text in ((plt.xlabel, xlabel), (plt.ylabel, ylabel)):
        set_axis_label(text, fontsize=12)

    plt.title(title, fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

print("Setup complete! Let's explore sequence-to-sequence models.")

## Part 1: The Variable-Length Challenge

Let's start by understanding why translation is hard for neural networks.

In [None]:
# The same English phrase rendered in five target languages:
# (english, translation, language name)
translations = [
    ("I love you", "Je t'aime", "French"),
    ("I love you", "Ich liebe dich", "German"),
    ("I love you", "Aishiteru", "Japanese"),
    ("I love you", "Te amo", "Spanish"),
    ("I love you", "Wo ai ni", "Chinese"),
]

# One figure, two panels: grouped word-count bars and a length scatter plot
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# --- Left panel: word counts (whitespace-separated tokens) per language ---
languages = [language for _, _, language in translations]
english_lengths = [len(english.split()) for english, _, _ in translations]
target_lengths = [len(translated.split()) for _, translated, _ in translations]

x = np.arange(len(languages))
width = 0.35

# Two bars per language, centred on the tick by shifting half a bar width
ax1.bar(x - width/2, english_lengths, width, label='English', color='#4ECDC4')
ax1.bar(x + width/2, target_lengths, width, label='Target Language', color='#FF6B6B')
ax1.set(
    xlabel='Language',
    ylabel='Number of Words',
    title='Word Count Comparison: "I love you"',
)
ax1.set_xticks(x)
ax1.set_xticklabels(languages)
ax1.legend()
ax1.grid(True, alpha=0.3)

# --- Right panel: hand-picked (english_length, french_length) pairs ---
# The seed is not consumed in this cell, but it fixes the global NumPy random
# state for any later code that draws from np.random.
np.random.seed(42)
sample_sentences = [
    (3, 2), (5, 7), (10, 8), (7, 12), (15, 10),
    (4, 6), (8, 5), (12, 14), (6, 9), (20, 15)
]
eng_lens = [english_len for english_len, _ in sample_sentences]
fr_lens = [french_len for _, french_len in sample_sentences]

ax2.scatter(eng_lens, fr_lens, s=100, alpha=0.6, color='#FF6B6B')
# Dashed diagonal marks where both sentences would have equal length
ax2.plot([0, 20], [0, 20], 'k--', alpha=0.3, label='1:1 mapping')
ax2.set(
    xlabel='English Sentence Length (words)',
    ylabel='French Sentence Length (words)',
    title='No Fixed Length Relationship',
)
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Key Insight: Different languages use different numbers of words!")
print("This means we can't use a simple one-to-one neural network mapping.")

## Part 2: Building an Encoder-Decoder

Let's build a simple encoder-decoder model to understand how it works.

In [None]:
class SimpleEncoder:
    """A simplified encoder that processes input sequences.

    Words are looked up in a small fixed table of toy embeddings and folded
    into a running hidden state, imitating an RNN without learned weights.
    """

    def __init__(self, hidden_size=4):
        self.hidden_size = hidden_size
        # Simulate word embeddings. Every vector has 4 entries, which matches
        # the default hidden_size.
        self.embeddings = {
            'the': np.array([0.1, 0.2, 0.3, 0.1]),
            'cat': np.array([0.8, 0.1, 0.05, 0.05]),
            'sat': np.array([0.2, 0.7, 0.05, 0.05]),
            'on': np.array([0.1, 0.1, 0.4, 0.4]),
            'mat': np.array([0.3, 0.2, 0.2, 0.3]),
        }

    def encode(self, sentence: List[str]) -> Tuple[np.ndarray, List[np.ndarray]]:
        """Encode a sentence into a fixed-size vector.

        Args:
            sentence: Tokens to encode. Tokens that are not in
                ``self.embeddings`` are skipped and contribute no entry to
                the returned list of states.

        Returns:
            A ``(hidden, states)`` pair: ``hidden`` is the final hidden state
            (the context vector) and ``states`` holds a copy of the hidden
            state after each known word, in input order.
        """
        # Initialize hidden state
        hidden = np.zeros(self.hidden_size)

        states = []
        for word in sentence:
            embedding = self.embeddings.get(word)
            if embedding is None:
                # Out-of-vocabulary token: leave the hidden state untouched.
                continue
            # Simplified RNN update: equal blend of previous state and input
            hidden = 0.5 * hidden + 0.5 * embedding
            # Copy so later updates cannot alias an earlier snapshot
            states.append(hidden.copy())

        return hidden, states

class SimpleDecoder:
    """Toy decoder that emits vocabulary words from a decaying hidden state."""

    def __init__(self, hidden_size=4):
        self.hidden_size = hidden_size
        # Simulate output vocabulary
        self.vocab = ['le', 'chat', 'sur', 'tapis', 'est', 'assis']

    def decode(self, context: np.ndarray, max_length: int = 5) -> List[str]:
        """Generate ``max_length`` words starting from ``context``.

        At every step the word whose index equals the position of the largest
        hidden value (modulo the vocabulary size) is emitted, after which the
        state is shrunk by 0.9 and perturbed with Gaussian noise drawn from
        the global NumPy random state. ``context`` itself is not modified.
        """
        state = context.copy()
        vocab_size = len(self.vocab)
        words = []

        for _ in range(max_length):
            # Pick the word indexed by the strongest hidden unit
            strongest_unit = np.argmax(state)
            words.append(self.vocab[strongest_unit % vocab_size])

            # Decay the state and add a little noise (simplified update)
            noise = np.random.randn(self.hidden_size) * 0.1
            state = state * 0.9 + noise

        return words

# Instantiate the toy encoder/decoder pair
encoder = SimpleEncoder()
decoder = SimpleDecoder()

# Run the English tokens through the encoder; keep the per-step states too
input_sentence = ['the', 'cat', 'sat', 'on', 'the', 'mat']
context_vector, encoder_states = encoder.encode(input_sentence)

# Let the decoder generate words from the final context vector only
output_sentence = decoder.decode(context_vector)

print("Input (English):", ' '.join(input_sentence))
print("Context Vector:", context_vector)
print("Output (French):", ' '.join(output_sentence))

# Two side-by-side panels: hidden-state trajectory and the final context
fig, (states_ax, context_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left: one column per processed word, one row per hidden unit
states_matrix = np.array(encoder_states).T
states_image = states_ax.imshow(states_matrix, aspect='auto', cmap='RdBu_r')
fig.colorbar(states_image, ax=states_ax, label='Activation')
states_ax.set_xlabel('Time Steps (Words)')
states_ax.set_ylabel('Hidden Units')
states_ax.set_title('Encoder Hidden States Over Time')
states_ax.set_xticks(range(len(input_sentence)))
states_ax.set_xticklabels(input_sentence, rotation=45)

# Right: the single vector the decoder actually receives
context_ax.bar(range(len(context_vector)), context_vector, color='#4ECDC4')
context_ax.set_xlabel('Hidden Dimension')
context_ax.set_ylabel('Value')
context_ax.set_title('Final Context Vector (Bottleneck)')
context_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## Part 3: The Information Bottleneck Problem

Let's visualize why compressing everything into a single vector is problematic.

In [None]:
def measure_information_loss(sentence_length: int, context_size: int = 256) -> float:
    """Return the fraction of a sentence's bits that do not fit in the context.

    The model is deliberately crude: every word is assumed to carry 10 bits
    and every context dimension is assumed to store 1 bit. The result is 0.0
    when the sentence fits and otherwise the share of bits that overflow.
    """
    bits_per_word = 10  # assumed information content of one word
    required_bits = sentence_length * bits_per_word
    available_bits = context_size  # one bit per context dimension

    # Guard clause: everything fits, so nothing is lost
    if required_bits <= available_bits:
        return 0.0

    overflow_bits = required_bits - available_bits
    return overflow_bits / required_bits

# Evaluate the loss model on sentence lengths 1, 6, 11, ..., 96 for three
# context sizes
sentence_lengths = range(1, 101, 5)
losses_256 = [measure_information_loss(n_words, 256) for n_words in sentence_lengths]
losses_512 = [measure_information_loss(n_words, 512) for n_words in sentence_lengths]
losses_1024 = [measure_information_loss(n_words, 1024) for n_words in sentence_lengths]

# Visualization
plt.figure(figsize=(12, 6))

# Left panel: loss curves for the three context sizes
plt.subplot(1, 2, 1)
plt.plot(sentence_lengths, losses_256, label='256-dim context', linewidth=2)
plt.plot(sentence_lengths, losses_512, label='512-dim context', linewidth=2)
plt.plot(sentence_lengths, losses_1024, label='1024-dim context', linewidth=2)
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.5, label='50% loss threshold')
plt.xlabel('Sentence Length (words)')
# measure_information_loss returns a fraction in [0, 1] (the threshold line
# above sits at 0.5), so the axis is labelled as a fraction, not a percentage.
plt.ylabel('Information Loss (fraction)')
plt.title('Information Bottleneck Problem')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: simulated context vectors for sentences of growing length
plt.subplot(1, 2, 2)

# Compress sentences of different lengths
test_sentences = [
    "Hello world",
    "The quick brown fox jumps",
    "The International Conference on Machine Learning accepted our paper",
    "The International Conference on Machine Learning, which is one of the premier venues for presenting research in machine learning and attracts submissions from researchers around the world"
]

lengths = [len(s.split()) for s in test_sentences]
context_dims = 8  # Use small dimension for visualization

contexts = []
for sent in test_sentences:
    # Simulate compression: the vector is random noise (drawn from the global
    # NumPy random state), not a real encoding of the words.
    words = sent.split()
    context = np.random.randn(context_dims)
    # Averaging effect: magnitudes shrink like 1/sqrt(number of words)
    context = context / np.sqrt(len(words))
    contexts.append(context)

# Plot contexts as heatmap (one column per sentence)
contexts_matrix = np.array(contexts)
im = plt.imshow(contexts_matrix.T, aspect='auto', cmap='RdBu_r')
plt.colorbar(im, label='Value')
plt.xlabel('Sentence')
plt.ylabel('Context Dimension')
plt.title('Context Vectors Get More "Averaged Out" with Length')
plt.xticks(range(len(lengths)), [f"{n_words} words" for n_words in lengths])

plt.tight_layout()
plt.show()

print("Key Insight: Longer sentences lose more information when compressed!")
print("This is why vanilla seq2seq models fail on long sentences.")

print("Key Insight: Longer sentences lose more information when compressed!")
print("This is why vanilla seq2seq models fail on long sentences.")

## Part 4: Introducing Attention

Now let's implement a simple attention mechanism to solve the bottleneck problem.

In [None]:
class AttentionModule:
    """Simple attention mechanism."""

    def compute_attention(self,
                         query: np.ndarray,
                         keys: List[np.ndarray]) -> Tuple[np.ndarray, np.ndarray]:
        """Compute attention weights and context vector.

        Args:
            query: Current decoder state (hidden_size,)
            keys: List of encoder states [(hidden_size,), ...]

        Returns:
            attention_weights: Attention distribution over keys
            context: Weighted sum of keys

        Raises:
            ValueError: If ``keys`` is empty, since a softmax over zero
                scores is undefined.
        """
        if len(keys) == 0:
            raise ValueError("keys must contain at least one encoder state")

        # Stack the keys into one (num_keys, hidden_size) float matrix.
        # Forcing float keeps integer-valued keys from producing an integer
        # accumulator that cannot hold the weighted sum.
        key_matrix = np.asarray(keys, dtype=float)
        query_vector = np.asarray(query, dtype=float)

        # Compute similarity scores (dot product attention), one per key
        scores = key_matrix @ query_vector

        # Apply softmax to get attention weights
        exp_scores = np.exp(scores - np.max(scores))  # Numerical stability
        attention_weights = exp_scores / np.sum(exp_scores)

        # Compute weighted sum (context vector)
        context = attention_weights @ key_matrix

        return attention_weights, context

# Attention helper used for every query below
attention = AttentionModule()

# Re-encode the example sentence; only the per-word states are needed here
encoder = SimpleEncoder()
input_sentence = ['the', 'cat', 'sat', 'on', 'the', 'mat']
_, encoder_states = encoder.encode(input_sentence)

# Hand-written decoder states, each equal to one word's toy embedding
decoder_queries = [
    np.array([0.8, 0.1, 0.05, 0.05]),  # embedding of "cat"
    np.array([0.2, 0.7, 0.05, 0.05]),  # embedding of "sat"
    np.array([0.3, 0.2, 0.2, 0.3]),    # embedding of "mat"
]

query_labels = ['Generating "chat" (cat)', 'Generating "assis" (sat)', 'Generating "tapis" (mat)']

# One bar chart of attention weights per query
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
word_positions = range(len(input_sentence))

for idx, ax in enumerate(axes):
    query = decoder_queries[idx]
    label = query_labels[idx]
    weights, context = attention.compute_attention(query, encoder_states)

    # Semi-transparent bars for the whole distribution
    ax.bar(word_positions, weights, color='#FF6B6B', alpha=0.7)
    ax.set(xlabel='Source Words', ylabel='Attention Weight', title=label)
    ax.set_xticks(word_positions)
    ax.set_xticklabels(input_sentence, rotation=45)
    ax.set_ylim([0, 1])
    ax.grid(True, alpha=0.3)

    # Redraw the largest bar fully opaque so it stands out
    max_idx = np.argmax(weights)
    ax.bar(max_idx, weights[max_idx], color='#FF6B6B')

plt.tight_layout()
plt.show()

print("Key Insight: Attention allows the decoder to focus on relevant source words!")
print("This solves the information bottleneck problem.")

## Part 5: Visualizing Attention in Translation

Let's create a complete attention visualization for a translation example.

In [None]:
# Build a hand-crafted (not learned) attention pattern for one translation
source_sentence = "The black cat sat on the mat"
target_sentence = "Le chat noir s'est assis sur le tapis"

source_words = source_sentence.split()
target_words = target_sentence.split()

# Rows are target words, columns are source words
attention_matrix = np.zeros((len(target_words), len(source_words)))

# Manual alignment based on translation: (target index, source index)
alignments = [
    (0, 0),   # Le -> The
    (1, 2),   # chat -> cat
    (2, 1),   # noir -> black
    (3, 3),   # s'est -> sat
    (4, 3),   # assis -> sat
    (5, 4),   # sur -> on
    (6, 5),   # le -> the
    (7, 6),   # tapis -> mat
]

# Create attention weights with some noise. The seed and the one-draw-per-row
# order are kept so the figure is reproducible.
np.random.seed(42)
for tgt_idx in range(len(target_words)):
    # Add small random weights everywhere
    attention_matrix[tgt_idx, :] = np.random.random(len(source_words)) * 0.1

# Add a strong weight for each aligned pair in a single pass over the list
for align_tgt, align_src in alignments:
    attention_matrix[align_tgt, align_src] += 0.7

# Normalize every row to sum to 1 so each row is a distribution over the source
attention_matrix /= attention_matrix.sum(axis=1, keepdims=True)

# Create attention heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(attention_matrix,
            xticklabels=source_words,
            yticklabels=target_words,
            cmap='Blues',
            cbar_kws={'label': 'Attention Weight'},
            linewidths=0.5,
            linecolor='gray',
            square=True)

plt.xlabel('Source (English)', fontsize=12)
plt.ylabel('Target (French)', fontsize=12)
plt.title('Attention Weights in Translation', fontsize=14, fontweight='bold')

# Add text annotations for high attention values only; heatmap cell (i, j)
# is centred at (j + 0.5, i + 0.5) in data coordinates
for i in range(len(target_words)):
    for j in range(len(source_words)):
        if attention_matrix[i, j] > 0.3:
            plt.text(j + 0.5, i + 0.5, f'{attention_matrix[i, j]:.2f}',
                    ha='center', va='center', color='white', fontweight='bold')

plt.tight_layout()
plt.show()

print("Notice how:")
print("1. 'chat' (cat) strongly attends to 'cat' in the source")
print("2. 'noir' (black) attends to 'black'")
# In the alignment above two target words (s'est, assis) share one source
# word (sat), so the note describes that direction.
print("3. Several target words can attend to the same source word (s'est, assis -> sat)")
print("4. The attention pattern roughly follows diagonal alignment")

## Part 6: Comparing Attention Types

Let's compare different attention mechanisms.

In [None]:
def dot_product_attention(query, key):
    """Dot-product (Luong-style multiplicative) attention score.

    Returns the unnormalised similarity ``query . key``. Bahdanau attention
    is the additive form, not this one.
    """
    return np.dot(query, key)

def additive_attention(query, key, W1=None, W2=None, v=None):
    """Additive attention (Bahdanau style with learnable parameters).

    Computes the unnormalised score ``v . tanh(W1 @ query + W2 @ key)``.

    Args:
        query: 1-D query vector.
        key: 1-D key vector.
        W1: Optional ``(hidden, len(query))`` projection of the query.
        W2: Optional ``(hidden, len(key))`` projection of the key.
        v: Optional ``(hidden,)`` scoring vector.

    Any parameter left as ``None`` is replaced by small Gaussian noise drawn
    from the global NumPy random state, so unseeded calls with defaults give
    a different score each time. Pass the same W1/W2/v to every call when
    scores have to be comparable across keys.

    Returns:
        The scalar attention score.
    """
    # Width of the hidden layer: taken from whichever parameter was supplied,
    # otherwise the query length (4 for the 4-d vectors used in this lab).
    if v is not None:
        hidden_dim = np.shape(v)[0]
    elif W1 is not None:
        hidden_dim = np.shape(W1)[0]
    elif W2 is not None:
        hidden_dim = np.shape(W2)[0]
    else:
        hidden_dim = len(query)

    # Simulate learned parameters. The draw order (W1, W2, v) is fixed so a
    # seeded run stays reproducible.
    if W1 is None:
        W1 = np.random.randn(hidden_dim, len(query)) * 0.1
    if W2 is None:
        W2 = np.random.randn(hidden_dim, len(key)) * 0.1
    if v is None:
        v = np.random.randn(hidden_dim) * 0.1

    # Compute attention score
    combined = np.tanh(W1 @ query + W2 @ key)
    return np.dot(v, combined)

def scaled_dot_product_attention(query, key):
    """Dot-product score divided by sqrt(len(key)), as used in Transformers."""
    raw_score = np.dot(query, key)
    scale = np.sqrt(len(key))  # sqrt(d_k)
    return raw_score / scale

# Compare attention mechanisms on one query and four keys
query = np.array([0.5, 0.3, 0.1, 0.1])
keys = [
    np.array([0.1, 0.2, 0.3, 0.4]),
    np.array([0.4, 0.3, 0.2, 0.1]),
    np.array([0.5, 0.3, 0.1, 0.1]),  # Identical to query
    np.array([0.2, 0.2, 0.3, 0.3]),
]

# Additive attention needs ONE set of parameters shared by every key;
# otherwise each key is scored by a different random projection and the
# scores cannot be compared. A local, seeded generator keeps the figure
# reproducible without touching the global NumPy random state.
param_rng = np.random.default_rng(0)
dim = len(query)
W1 = param_rng.standard_normal((dim, dim)) * 0.1
W2 = param_rng.standard_normal((dim, dim)) * 0.1
v = param_rng.standard_normal(dim) * 0.1

attention_types = [
    ('Dot Product', [dot_product_attention(query, k) for k in keys]),
    ('Scaled Dot Product', [scaled_dot_product_attention(query, k) for k in keys]),
    ('Additive', [additive_attention(query, k, W1, W2, v) for k in keys]),
]

# Visualize different attention scores
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for idx, (name, scores) in enumerate(attention_types):
    # Convert to probabilities with a numerically stable softmax
    scores = np.array(scores)
    exp_scores = np.exp(scores - np.max(scores))
    probs = exp_scores / np.sum(exp_scores)

    ax = axes[idx]
    bars = ax.bar(range(len(probs)), probs, color=['#FF6B6B', '#4ECDC4', '#95E77E', '#FAB563'])
    ax.set_xlabel('Key Index')
    ax.set_ylabel('Attention Weight')
    ax.set_title(f'{name} Attention')
    ax.set_ylim([0, 1])
    ax.grid(True, alpha=0.3)

    # Highlight the highest attention with a black outline
    max_idx = np.argmax(probs)
    bars[max_idx].set_edgecolor('black')
    bars[max_idx].set_linewidth(2)

    # Add value labels just above each bar
    for bar, prob in zip(bars, probs):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.02,
                f'{prob:.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print("Key Observations:")
# Only the two dot-product variants are guaranteed to favour key 2; the
# additive scorer uses untrained random weights here.
print("1. Dot-product and scaled dot-product both rank key 2 (identical to the query) highest")
print("2. Scaled dot product prevents values from getting too large")
print("3. Additive attention can learn more complex patterns (its weights here are random, not trained)")

## Part 7: Beam Search Visualization

Let's visualize how beam search explores multiple translation paths.

In [None]:
# Simulate beam search for translation
import networkx as nx
from matplotlib.patches import FancyBboxPatch

# Reference table of next-word probabilities for each prefix. The figure
# below does not read it directly: the edge probabilities are written out by
# hand, with the product that produced each one noted beside it.
beam_tree = {
    'START': [
        ('Le', 0.8),
        ('Un', 0.15),
        ('La', 0.05)
    ],
    'Le': [
        ('chat', 0.9),
        ('chien', 0.08),
        ('animal', 0.02)
    ],
    'Un': [
        ('chat', 0.7),
        ('animal', 0.3)
    ],
    'Le-chat': [
        ('noir', 0.85),
        ('blanc', 0.1),
        ('est', 0.05)
    ],
    'Le-chien': [
        ('noir', 0.6),
        ('court', 0.4)
    ],
    'Un-chat': [
        ('noir', 0.8),
        ('dort', 0.2)
    ]
}

# Create visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Left: Beam search tree
ax1.set_xlim(-0.5, 3.5)
ax1.set_ylim(-0.5, 4.5)
ax1.set_aspect('equal')
ax1.axis('off')
ax1.set_title('Beam Search Tree (beam_size=2)', fontsize=14, fontweight='bold')

# Node coordinates: x is the decoding step, y separates the branches.
# 'noir2' sits on the same row as its parent 'chat2' so the "Un chat noir"
# path is not mistaken for a continuation of 'chien'.
positions = {
    'START': (0, 2),
    'Le': (1, 3),
    'Un': (1, 1),
    'chat1': (2, 3.5),
    'chien': (2, 2.5),
    'chat2': (2, 1),
    'noir1': (3, 3.5),
    'noir2': (3, 1),
}

# Edges: (parent, child, cumulative probability, kept in the beam?)
# With beam size 2 the survivors per step are:
#   step 1: Le (0.8), Un (0.15)
#   step 2: Le chat (0.72), Un chat (0.105)            -> Le chien (0.064) pruned
#   step 3: Le chat noir (0.612), Un chat noir (0.084) -> Le chat blanc (0.072) pruned
edges = [
    ('START', 'Le', 0.8, True),
    ('START', 'Un', 0.15, True),
    ('Le', 'chat1', 0.72, True),  # 0.8 * 0.9
    ('Le', 'chien', 0.064, False),  # 0.8 * 0.08
    ('Un', 'chat2', 0.105, True),  # 0.15 * 0.7
    ('chat1', 'noir1', 0.612, True),  # 0.72 * 0.85
    ('chat2', 'noir2', 0.084, True),  # 0.105 * 0.8
]

for start, end, prob, in_beam in edges:
    start_pos = positions[start]
    end_pos = positions[end]

    color = '#4ECDC4' if in_beam else '#E0E0E0'
    width = 2 if in_beam else 1
    alpha = 1.0 if in_beam else 0.4

    # Shorten the arrow horizontally so its head stops short of the node
    ax1.arrow(start_pos[0], start_pos[1],
             end_pos[0] - start_pos[0] - 0.15,
             end_pos[1] - start_pos[1],
             head_width=0.08, head_length=0.1,
             fc=color, ec=color, linewidth=width, alpha=alpha)

    # Add probability label at the midpoint of the edge
    mid_x = (start_pos[0] + end_pos[0]) / 2
    mid_y = (start_pos[1] + end_pos[1]) / 2
    ax1.text(mid_x, mid_y + 0.1, f'{prob:.3f}',
            fontsize=9, ha='center',
            color='black' if in_beam else 'gray')

# Draw nodes
node_labels = {
    'START': 'START',
    'Le': 'Le',
    'Un': 'Un',
    'chat1': 'chat',
    'chien': 'chien',
    'chat2': 'chat',
    'noir1': 'noir',
    'noir2': 'noir',
}

# A node is in the beam exactly when the edge leading to it was kept, so
# derive the set from `edges` instead of maintaining a second list by hand.
beam_nodes = {'START'} | {child for _, child, _, kept in edges if kept}

for node, pos in positions.items():
    # Check if node is in beam
    in_beam = node in beam_nodes

    color = '#FF6B6B' if in_beam else '#E0E0E0'

    circle = plt.Circle(pos, 0.15, color=color, alpha=0.8 if in_beam else 0.4)
    ax1.add_patch(circle)
    ax1.text(pos[0], pos[1], node_labels[node],
            ha='center', va='center', fontsize=10,
            fontweight='bold' if in_beam else 'normal',
            color='white' if in_beam else 'gray')

# Add beam size indicator (right-aligned so it stays inside the axes)
ax1.text(3.5, 4, 'Beam Size = 2', fontsize=11, ha='right',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
ax1.text(3.5, 3.7, 'Keep top 2 paths', fontsize=9, style='italic', ha='right')

# Right: Score comparison. The first two paths are the ones kept in the beam.
paths = [
    'Le chat noir',
    'Un chat noir',
    'Le chien noir',
    'Le chat blanc',
]
scores = [0.612, 0.084, 0.038, 0.072]
colors = ['#4ECDC4', '#FF6B6B', '#E0E0E0', '#E0E0E0']

bars = ax2.barh(range(len(paths)), scores, color=colors)
ax2.set_yticks(range(len(paths)))
ax2.set_yticklabels(paths)
# barh puts index 0 at the bottom; flip the axis so the list reads top-down
# and the kept paths appear first.
ax2.invert_yaxis()
# Leave room on the right for the value labels and the annotations below
ax2.set_xlim(0, 0.9)
ax2.set_xlabel('Cumulative Score')
ax2.set_title('Path Scores', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3, axis='x')

# Add value labels
for bar, score in zip(bars, scores):
    ax2.text(score + 0.01, bar.get_y() + bar.get_height()/2,
            f'{score:.3f}', va='center')

# Mark selected paths (rows 0-1) and pruned paths (rows 2-3)
ax2.text(0.7, 0.5, 'Selected', fontsize=10, va='center',
        bbox=dict(boxstyle='round', facecolor='lightgreen', alpha=0.5))
ax2.text(0.7, 2.5, 'Pruned', fontsize=10, va='center',
        bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.5))

plt.tight_layout()
plt.show()

print("Beam Search Strategy:")
print("1. At each step, expand all possible next words")
print("2. Score each path by cumulative probability")
print("3. Keep only top-k paths (beam size)")
print("4. Continue until reaching end token or max length")
print("\nThis avoids getting stuck with early bad choices!")

## Summary and Key Takeaways

Let's summarize what we've learned about sequence-to-sequence models.

In [None]:
# Create a summary visualization: a 2x2 grid of panels
fig = plt.figure(figsize=(14, 8))

# Timeline of improvements
# NOTE(review): the "performance" values look illustrative rather than
# measured - confirm before quoting them.
ax1 = plt.subplot(2, 2, 1)
years = [2014, 2015, 2016, 2017]
models = ['Basic\nSeq2Seq', 'Attention\nMechanism', 'Google\nNMT', 'Transformer']
improvements = [1, 3.5, 7, 10]

ax1.plot(years, improvements, 'o-', linewidth=2, markersize=10, color='#4ECDC4')
for year, model, imp in zip(years, models, improvements):
    # Label each point with the model name, 10 points above the marker
    ax1.annotate(model, (year, imp),
                textcoords="offset points", xytext=(0,10), ha='center',
                fontsize=9, bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

ax1.set_xlabel('Year')
ax1.set_ylabel('Performance (relative)')
ax1.set_title('Evolution of Seq2Seq Models')
ax1.grid(True, alpha=0.3)

# Key components
ax2 = plt.subplot(2, 2, 2)
components = ['Encoder', 'Decoder', 'Attention', 'Beam\nSearch']
importance = [8, 8, 10, 6]
colors_comp = ['#4ECDC4', '#FF6B6B', '#95E77E', '#FAB563']

ax2.bar(components, importance, color=colors_comp)
ax2.set_ylabel('Importance Score')
ax2.set_title('Key Components')
ax2.set_ylim([0, 12])
ax2.grid(True, alpha=0.3, axis='y')

# Problem solved: one grey row per problem with its solution written on top
ax3 = plt.subplot(2, 2, 3)
problems = ['Variable\nLength', 'Information\nBottleneck', 'Long\nDependencies', 'Search\nStrategy']
solutions = ['Encoder-\nDecoder', 'Attention', 'Attention', 'Beam\nSearch']
y_pos = np.arange(len(problems))

ax3.barh(y_pos, [1, 1, 1, 1], color='#E0E0E0', alpha=0.3)
ax3.set_yticks(y_pos)
ax3.set_yticklabels(problems)
ax3.set_xlabel('Solution')
ax3.set_title('Problems Solved')
ax3.set_xlim([0, 1.2])

for i, sol in enumerate(solutions):
    ax3.text(0.5, i, sol, ha='center', va='center',
            fontsize=10, fontweight='bold', color='#2C3E50')

# Applications
# The adoption figures are independent percentages, not parts of a whole. A
# pie chart with autopct would renormalise them to shares of their sum
# (95 -> 31%), so they are drawn as bars labelled with the actual values.
ax4 = plt.subplot(2, 2, 4)
applications = ['Translation', 'Summarization', 'Q&A', 'Code\nGeneration']
adoption = [95, 70, 80, 60]

adoption_bars = ax4.bar(applications, adoption,
                        color=['#4ECDC4', '#FF6B6B', '#95E77E', '#FAB563'])
for adoption_bar, value in zip(adoption_bars, adoption):
    ax4.text(adoption_bar.get_x() + adoption_bar.get_width() / 2, value + 2,
            f'{value}%', ha='center', va='bottom')
ax4.set_ylabel('Adoption (%)')
ax4.set_ylim([0, 110])
ax4.grid(True, alpha=0.3, axis='y')
ax4.set_title('Industry Adoption (%)')

plt.suptitle('Sequence-to-Sequence Models: Complete Picture',
            fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

print("\n" + "="*50)
print("KEY TAKEAWAYS FROM WEEK 4")
print("="*50)
print()
print("1. PROBLEM: Variable-length sequences don't map 1-to-1")
print("   SOLUTION: Encoder-Decoder architecture")
print()
print("2. PROBLEM: Fixed-size context vectors lose information")
print("   SOLUTION: Attention mechanism")
print()
print("3. KEY INSIGHT: Let the model learn what to focus on")
print("   - Don't compress everything")
print("   - Keep all encoder states")
print("   - Use attention to select relevant information")
print()
print("4. MODERN IMPACT:")
print("   - Google Translate")
print("   - GitHub Copilot")
print("   - ChatGPT (uses attention at its core)")
print()
print("Next Week: We'll see how Transformers take attention")
print("           to the extreme - 'Attention is All You Need'!")
print("="*50)

## Exercises for Practice

Try these exercises to deepen your understanding:

1. **Modify the SimpleEncoder** to use different aggregation methods (max pooling, mean pooling)
2. **Implement multiplicative attention** and compare with dot-product attention
3. **Visualize attention** for a longer sentence and observe the patterns
4. **Experiment with beam size** - how does it affect translation quality vs speed?
5. **Build a toy translator** using the concepts learned

Remember: The key insight of seq2seq models is that we don't need to map inputs to outputs directly - we can use an intermediate representation and attention to handle variable-length sequences!