# Week 3: Build Your Own Memory Network
## From Simple RNNs to LSTMs - A Hands-on Journey

In this lab, you'll build RNNs from scratch and watch them learn (and fail!) on real tasks.

In [None]:
# Setup and imports
# All notebook dependencies are imported here, once, so a fresh
# "Restart Kernel -> Run All" starts from a known state.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): Tuple/List/Dict are not referenced by any cell shown in this
# notebook — presumably kept for student exercises; confirm before removing.
from typing import Tuple, List, Dict
import warnings
# Silences *every* warning for the rest of the session. This also hides
# NumPy RuntimeWarnings (e.g. overflow inside np.exp), so numerical problems
# will surface as NaN values rather than as warning messages.
warnings.filterwarnings('ignore')

# Set style for beautiful plots
# NOTE(review): assumes a matplotlib release that ships the
# 'seaborn-v0_8-*' style names — TODO confirm the minimum supported version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# For reproducibility
# Seeds NumPy's global generator; the weight initialisation (np.random.randn)
# and text sampling (np.random.choice) in later cells all draw from it, so
# results depend on cells being executed in order.
np.random.seed(42)

print("Ready to build memory networks!")

## Part 1: Understanding Sequential Memory

Let's start with a simple task: predicting the next character in a sequence.

In [None]:
# Toy corpus used by every experiment in this lab
text = "hello world! how are you doing today? hello again!"

# Vocabulary = the distinct characters of the corpus, in sorted order,
# plus the two lookup tables that translate between characters and ids
chars = sorted(set(text))
char_to_idx = dict(zip(chars, range(len(chars))))
idx_to_char = dict(enumerate(chars))

vocab_size = len(chars)
print(f"Vocabulary size: {vocab_size}")
print(f"Characters: {chars}")

# Encode the whole corpus as a list of integer ids
data = [char_to_idx[symbol] for symbol in text]
print(f"\nFirst 20 characters as indices: {data[:20]}")

## Part 2: Building a Simple RNN from Scratch

Let's implement the core RNN equations:
- $h_t = \tanh(W_{hh} \cdot h_{t-1} + W_{xh} \cdot x_t + b_h)$
- $y_t = W_{hy} \cdot h_t + b_y$

In [None]:
class SimpleRNN:
    """Minimal character-level vanilla RNN (forward pass only).

    Implements
        h_t = tanh(Whh @ h_{t-1} + Wxh @ x_t + bh)
        y_t = Why @ h_t + by
        p_t = softmax(y_t)
    using column vectors: x_t is (vocab_size, 1) and h_t is (hidden_size, 1).
    """

    def __init__(self, vocab_size, hidden_size=10):
        """Create the weight matrices and zero-initialised biases.

        Args:
            vocab_size: Number of distinct input/output symbols.
            hidden_size: Dimension of the hidden state vector.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # Small random weights: standard-normal samples scaled by a fixed 0.01.
        # (This is a plain small-Gaussian init, not Xavier/Glorot scaling.)
        # The draw order Wxh -> Whh -> Why is kept so a given seed reproduces
        # the same weights.
        self.Wxh = np.random.randn(hidden_size, vocab_size) * 0.01
        self.Whh = np.random.randn(hidden_size, hidden_size) * 0.01
        self.Why = np.random.randn(vocab_size, hidden_size) * 0.01
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((vocab_size, 1))

    @staticmethod
    def _softmax(logits):
        """Numerically stable softmax over all entries of `logits`."""
        # Subtracting the maximum leaves the result mathematically unchanged
        # but keeps np.exp from overflowing to inf (which would yield NaN).
        exps = np.exp(logits - np.max(logits))
        return exps / np.sum(exps)

    def forward_step(self, x, h_prev):
        """Advance the network by one time step.

        Args:
            x: One-hot input column vector, shape (vocab_size, 1).
            h_prev: Previous hidden state, shape (hidden_size, 1).

        Returns:
            (h, p): the new hidden state, shape (hidden_size, 1), and the
            softmax distribution over the next symbol, shape (vocab_size, 1).
        """
        # Update hidden state
        h = np.tanh(self.Whh @ h_prev + self.Wxh @ x + self.bh)

        # Project the hidden state to unnormalised scores, then normalise
        y = self.Why @ h + self.by
        p = self._softmax(y)

        return h, p

    def forward_sequence(self, inputs):
        """Run the network over a sequence of symbol indices.

        Args:
            inputs: Iterable of integer symbol ids in [0, vocab_size).

        Returns:
            (outputs, hidden_states): `outputs` holds one probability vector
            per input; `hidden_states` holds the initial all-zero state
            followed by one state per input (so it is one element longer).
        """
        h = np.zeros((self.hidden_size, 1))
        outputs = []
        hidden_states = [h]

        for x in inputs:
            # One-hot encode input
            x_vec = np.zeros((self.vocab_size, 1))
            x_vec[x] = 1

            h, p = self.forward_step(x_vec, h)
            outputs.append(p)
            hidden_states.append(h)

        return outputs, hidden_states

# Create and test our RNN
# `rnn` is a notebook-level object: the visualisation, text-generation and
# parameter-count cells further down all reuse this same instance.
rnn = SimpleRNN(vocab_size, hidden_size=16)

# Process first 5 characters
outputs, hidden_states = rnn.forward_sequence(data[:5])
print("RNN successfully created!")
# hidden_states[0] is the all-zero initial state that forward_sequence
# prepends, so this prints (hidden_size, 1).
print(f"Hidden state shape: {hidden_states[0].shape}")
# outputs[0] is the probability vector produced for the first character,
# so this prints (vocab_size, 1).
print(f"Output shape: {outputs[0].shape}")

## Part 3: Visualizing Hidden States

Let's see how hidden states evolve as the RNN processes text.

In [None]:
# Run the RNN over the first 20 characters of the corpus
sequence_length = 20
outputs, hidden_states = rnn.forward_sequence(data[:sequence_length])

# Stack the per-step column vectors side by side (one column per time step);
# index 0 is the all-zero initial state, so it is left out
H = np.hstack(hidden_states[1:])

# Two stacked panels, both labelled with the characters that were fed in
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

# Top panel: heat map of every hidden unit at every time step
im1 = ax1.imshow(H, aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1)
ax1.set(xlabel='Time Step', ylabel='Hidden Unit', title='Hidden State Evolution')
ax1.set_xticks(range(sequence_length))
ax1.set_xticklabels(list(text[:sequence_length]), rotation=45)
plt.colorbar(im1, ax=ax1)

# Bottom panel: Euclidean norm of the hidden state at each time step
hidden_magnitudes = np.linalg.norm(H, axis=0)
ax2.plot(hidden_magnitudes, 'b-', linewidth=2)
ax2.set(xlabel='Time Step', ylabel='Hidden State Magnitude',
        title='Memory Strength Over Time')
ax2.grid(True, alpha=0.3)
ax2.set_xticks(range(sequence_length))
ax2.set_xticklabels(list(text[:sequence_length]), rotation=45)

plt.tight_layout()
plt.show()

print(f"Average hidden state magnitude: {np.mean(hidden_magnitudes):.3f}")

## Part 4: The Vanishing Gradient Problem

Let's demonstrate why simple RNNs struggle with long sequences.

In [None]:
def simulate_gradient_flow(sequence_length, decay_rate=0.9):
    """Model a unit gradient shrinking by a constant factor at every step.

    Returns a list with `sequence_length` entries: it starts at 1.0 and each
    later entry is the previous one multiplied by `decay_rate`.
    """
    trace = []
    for _ in range(sequence_length):
        # First entry is the undecayed gradient; every later entry is derived
        # from its predecessor by one more multiplication.
        trace.append(trace[-1] * decay_rate if trace else 1.0)
    return trace

# Simulate for different decay rates
sequence_lengths = np.arange(1, 51)
decay_rates = [0.99, 0.95, 0.9, 0.8, 0.5]

plt.figure(figsize=(12, 6))

for rate in decay_rates:
    # One simulation of the longest sequence already contains the final
    # gradient of every shorter sequence (entry `length - 1`), so there is no
    # need to re-run the simulation from scratch for each length.
    decay_curve = simulate_gradient_flow(int(sequence_lengths.max()), rate)
    gradients = [decay_curve[length - 1] for length in sequence_lengths]
    plt.semilogy(sequence_lengths, gradients,
                 label=f'Decay rate = {rate}', linewidth=2)

# Add threshold line: below this the gradient is treated as effectively zero
plt.axhline(y=1e-5, color='r', linestyle='--',
            label='Effective zero (1e-5)')

plt.xlabel('Sequence Length', fontsize=12)
plt.ylabel('Gradient Magnitude (log scale)', fontsize=12)
plt.title('Vanishing Gradient Problem in RNNs', fontsize=14)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Calculate specific examples: closed-form decay after 20 and 50 steps
for rate in [0.9, 0.5]:
    gradient_20 = rate ** 20
    gradient_50 = rate ** 50
    print(f"Decay rate {rate}:")
    print(f"  After 20 steps: {gradient_20:.6f}")
    print(f"  After 50 steps: {gradient_50:.10f}")
    print()

## Part 5: Building an LSTM from Scratch

Now let's implement LSTM with its clever gating mechanism.

In [None]:
class SimpleLSTM:
    """Minimal character-level LSTM (forward pass only).

    All four gate pre-activations are computed with a single matrix multiply
    on the stacked vector [x_t; h_{t-1}]. Rows of `W` / `b` are laid out as
    [input gate; forget gate; output gate; candidate], `hidden_size` rows each.
    """

    def __init__(self, vocab_size, hidden_size=10):
        """Create the combined gate weights, output weights and zero biases.

        Args:
            vocab_size: Number of distinct input/output symbols.
            hidden_size: Dimension of the hidden and cell state vectors.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # Combined weight matrix for all gates
        # [input_gate; forget_gate; output_gate; candidate]
        concat_size = vocab_size + hidden_size

        # Small random weights (standard normal scaled by 0.01). The draw
        # order W -> Why is kept so a given seed reproduces the same weights.
        self.W = np.random.randn(4 * hidden_size, concat_size) * 0.01
        self.b = np.zeros((4 * hidden_size, 1))

        # Output weights
        self.Why = np.random.randn(vocab_size, hidden_size) * 0.01
        self.by = np.zeros((vocab_size, 1))

    def sigmoid(self, x):
        """Logistic sigmoid; the input is clipped to [-500, 500] before exp."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    @staticmethod
    def _softmax(logits):
        """Numerically stable softmax over all entries of `logits`."""
        # Subtracting the maximum leaves the result mathematically unchanged
        # but keeps np.exp from overflowing to inf (which would yield NaN).
        exps = np.exp(logits - np.max(logits))
        return exps / np.sum(exps)

    def forward_step(self, x, h_prev, c_prev):
        """Advance the LSTM by one time step.

        Args:
            x: One-hot input column vector, shape (vocab_size, 1).
            h_prev: Previous hidden state, shape (hidden_size, 1).
            c_prev: Previous cell state, shape (hidden_size, 1).

        Returns:
            (h, c, p, gates_dict): new hidden state, new cell state, softmax
            distribution over the next symbol, and a dict with the gate
            activations under the keys 'input', 'forget', 'output' and
            'candidate'.
        """
        n = self.hidden_size

        # Concatenate input and previous hidden state
        concat = np.vstack([x, h_prev])

        # Compute all gate pre-activations at once
        gates = self.W @ concat + self.b

        # Split into individual gates (row blocks of size hidden_size)
        i_gate = self.sigmoid(gates[:n])            # Input gate
        f_gate = self.sigmoid(gates[n:2 * n])       # Forget gate
        o_gate = self.sigmoid(gates[2 * n:3 * n])   # Output gate
        c_candidate = np.tanh(gates[3 * n:])        # Candidate values

        # Cell state: keep a gated share of the old state, add a gated share
        # of the candidate
        c = f_gate * c_prev + i_gate * c_candidate

        # Hidden state: gated, squashed view of the cell state
        h = o_gate * np.tanh(c)

        # Project to unnormalised scores, then normalise
        y = self.Why @ h + self.by
        p = self._softmax(y)

        # Return everything for visualization
        gates_dict = {
            'input': i_gate,
            'forget': f_gate,
            'output': o_gate,
            'candidate': c_candidate
        }

        return h, c, p, gates_dict

    def forward_sequence(self, inputs):
        """Run the LSTM over a sequence of symbol indices.

        Args:
            inputs: Iterable of integer symbol ids in [0, vocab_size).

        Returns:
            (outputs, hidden_states, cell_states, all_gates): `outputs` and
            `all_gates` hold one entry per input; `hidden_states` and
            `cell_states` additionally start with the all-zero initial state,
            so they are one element longer.
        """
        h = np.zeros((self.hidden_size, 1))
        c = np.zeros((self.hidden_size, 1))

        outputs = []
        hidden_states = [h]
        cell_states = [c]
        all_gates = []

        for x in inputs:
            # One-hot encode input
            x_vec = np.zeros((self.vocab_size, 1))
            x_vec[x] = 1

            h, c, p, gates = self.forward_step(x_vec, h, c)

            outputs.append(p)
            hidden_states.append(h)
            cell_states.append(c)
            all_gates.append(gates)

        return outputs, hidden_states, cell_states, all_gates

# Create and test our LSTM
# `lstm` is a notebook-level object: the gate-visualisation, text-generation
# and parameter-count cells further down reuse this same instance.
lstm = SimpleLSTM(vocab_size, hidden_size=16)

# Process first 20 characters
outputs, hidden_states, cell_states, gates = lstm.forward_sequence(data[:20])
print("LSTM successfully created!")
# Parameter count = total number of elements across every weight and bias
# array of the model. The RNN line reads the `rnn` instance created in the
# Part 2 cell, so that cell must have been run first.
print(f"Number of parameters in LSTM: {lstm.W.size + lstm.b.size + lstm.Why.size + lstm.by.size}")
print(f"Number of parameters in RNN: {rnn.Wxh.size + rnn.Whh.size + rnn.Why.size + rnn.bh.size + rnn.by.size}")

## Part 6: Visualizing LSTM Gates

Let's see how LSTM gates control information flow.

In [None]:
# Run the LSTM over the first 30 characters and keep the per-step gate values
sequence_length = 30
outputs, hidden_states, cell_states, all_gates = lstm.forward_sequence(data[:sequence_length])

# One matrix per gate: each column is that gate's activation at one time step
forget_gates, input_gates, output_gates = (
    np.hstack([step[gate_name] for step in all_gates])
    for gate_name in ('forget', 'input', 'output')
)

# One heat-map row per gate
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

gates_data = [
    (forget_gates, 'Forget Gate', 'Reds'),
    (input_gates, 'Input Gate', 'Greens'),
    (output_gates, 'Output Gate', 'Blues')
]

for ax, (gate_data, title, cmap) in zip(axes, gates_data):
    # Sigmoid outputs live in [0, 1], so the colour scale is pinned to that range
    im = ax.imshow(gate_data, aspect='auto', cmap=cmap, vmin=0, vmax=1)
    ax.set_title(f'{title} Activations', fontsize=12)
    ax.set(xlabel='Time Step', ylabel='Hidden Unit')
    # Label every fifth time step with the character that was fed in
    ax.set_xticks(range(0, sequence_length, 5))
    ax.set_xticklabels(list(text[0:sequence_length:5]), rotation=45)
    plt.colorbar(im, ax=ax)

plt.suptitle('LSTM Gate Activations Over Time', fontsize=14)
plt.tight_layout()
plt.show()

# Mean activation per gate, averaged over all units and time steps
print("Gate Statistics:")
print(f"Forget gate mean: {np.mean(forget_gates):.3f} (closer to 1 = remembering)")
print(f"Input gate mean: {np.mean(input_gates):.3f} (closer to 1 = storing new info)")
print(f"Output gate mean: {np.mean(output_gates):.3f} (closer to 1 = outputting info)")

## Part 7: Comparing RNN vs LSTM Memory

Let's compare how well RNNs and LSTMs maintain information over time.

In [None]:
def memory_retention_test(model_type, sequence_length=50):
    """Measure the state norm of a fresh random model along a probe sequence.

    The probe is symbol 0 followed by `sequence_length - 1` copies of
    symbol 1. For `model_type == 'RNN'` a new SimpleRNN is built and its
    hidden states are measured; for any other value a new SimpleLSTM is built
    and its cell states are measured. Returns one Euclidean norm per step.
    """
    # First char is 0, rest are 1
    test_sequence = [0] + [1] * (sequence_length - 1)

    if model_type != 'RNN':
        # LSTM: the cell state is the long-term memory, so that is measured
        net = SimpleLSTM(vocab_size, hidden_size=16)
        _, _, cell_trace, _ = net.forward_sequence(test_sequence)
        trajectory = cell_trace[1:]
    else:
        net = SimpleRNN(vocab_size, hidden_size=16)
        _, hidden_trace = net.forward_sequence(test_sequence)
        trajectory = hidden_trace[1:]

    # Index 0 (the initial zero state) was dropped above; stack the remaining
    # column vectors and take the norm of each column
    return np.linalg.norm(np.hstack(trajectory), axis=0)

# Test both models (each call builds a fresh, randomly initialised network)
sequence_length = 40
rnn_memory = memory_retention_test('RNN', sequence_length)
lstm_memory = memory_retention_test('LSTM', sequence_length)

# Normalize for comparison: express every step relative to the first step,
# unless the first step is exactly zero (which would divide by zero)
rnn_memory = rnn_memory / rnn_memory[0] if rnn_memory[0] != 0 else rnn_memory
lstm_memory = lstm_memory / lstm_memory[0] if lstm_memory[0] != 0 else lstm_memory

# Plot comparison
plt.figure(figsize=(12, 6))
x_axis = np.arange(sequence_length)

plt.plot(x_axis, rnn_memory, 'r-', linewidth=2, label='Simple RNN', alpha=0.7)
plt.plot(x_axis, lstm_memory, 'b-', linewidth=2, label='LSTM', alpha=0.7)

plt.xlabel('Time Steps', fontsize=12)
plt.ylabel('Relative Memory Strength', fontsize=12)
plt.title('Memory Retention: RNN vs LSTM', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True, alpha=0.3)

# Add annotations: horizontal guide at half of the initial strength
plt.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
plt.text(sequence_length * 0.7, 0.52, '50% retention', fontsize=10)

plt.tight_layout()
plt.show()

# Calculate retention metrics: indices of every step below 50% of the start
rnn_half_life = np.where(rnn_memory < 0.5)[0]
lstm_half_life = np.where(lstm_memory < 0.5)[0]

# Report at step 20, or at the last step when the sequence is shorter, so the
# cell keeps working if `sequence_length` above is changed.
report_step = min(20, sequence_length)

print("Memory Retention Analysis:")
print(f"RNN memory at step {report_step}: {rnn_memory[report_step - 1]:.3f}")
print(f"LSTM memory at step {report_step}: {lstm_memory[report_step - 1]:.3f}")
# The "never dropped below 50%" message quotes the actual sequence length
# instead of a hard-coded 40.
if len(rnn_half_life) > 0:
    print(f"RNN half-life: {rnn_half_life[0]} steps")
else:
    print(f"RNN half-life: > {sequence_length} steps")
if len(lstm_half_life) > 0:
    print(f"LSTM half-life: {lstm_half_life[0]} steps")
else:
    print(f"LSTM half-life: > {sequence_length} steps")

## Part 8: Text Generation Challenge

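Let's sample text from our models and compare the results. Note that neither model has been trained yet, so with random weights both will produce gibberish — the point here is to see the sampling loop in action.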

In [None]:
def generate_text(model, seed_char, length=50, temperature=1.0):
    """Sample `length` characters from `model`, starting from `seed_char`.

    At every step the current character is one-hot encoded and fed through
    `model.forward_step`; the returned distribution is reshaped by
    `temperature` and the next character is sampled from it.

    Args:
        model: A SimpleRNN instance, or an LSTM-style model whose
            `forward_step(x, h, c)` returns `(h, c, p, gates)`. Anything that
            is not a SimpleRNN is driven through the LSTM-style interface.
        seed_char: First character; must be a key of `char_to_idx`.
        length: Number of characters to sample after the seed.
        temperature: Positive sampling temperature. Probabilities are raised
            to the power 1/temperature and renormalised, so values below 1
            sharpen the distribution and values above 1 flatten it.

    Returns:
        The seed character followed by the `length` sampled characters.

    Raises:
        ValueError: If `temperature` is not strictly positive.
        KeyError: If `seed_char` is not in the vocabulary.
    """
    if temperature <= 0:
        # Zero would divide by zero below; a negative value would invert the
        # distribution. Fail early with a clear message instead.
        raise ValueError(f"temperature must be positive, got {temperature}")

    x = char_to_idx[seed_char]
    pieces = [seed_char]

    is_rnn = isinstance(model, SimpleRNN)
    h = np.zeros((model.hidden_size, 1))
    # Only the LSTM-style interface carries a separate cell state
    c = None if is_rnn else np.zeros((model.hidden_size, 1))

    for _ in range(length):
        # One-hot encode the current character
        x_vec = np.zeros((model.vocab_size, 1))
        x_vec[x] = 1

        if is_rnn:
            h, p = model.forward_step(x_vec, h)
        else:
            h, c, p, _gates = model.forward_step(x_vec, h, c)

        # Sample with temperature
        p = np.power(p, 1 / temperature)
        p = p / np.sum(p)
        x = np.random.choice(range(model.vocab_size), p=p.ravel())
        pieces.append(idx_to_char[x])

    # Join once at the end instead of growing a string inside the loop
    return "".join(pieces)

# Generate text with both models
# `rnn` and `lstm` are the instances created in the Part 2 and Part 5 cells.
# No cell in this notebook trains them, so their weights are still the random
# initial values.
print("=" * 60)
print("Text Generation Comparison")
print("=" * 60)

# Both models start from the same character so their outputs are comparable
seed_char = 'h'
print(f"\nSeed character: '{seed_char}'\n")

# RNN generation
# temperature=0.8 raises each probability to the power 1/0.8 before
# renormalising, which makes sampling slightly more peaked than the raw
# softmax output.
print("Simple RNN generated text:")
rnn_text = generate_text(rnn, seed_char, length=50, temperature=0.8)
print(f"  {rnn_text}")

print("\nLSTM generated text:")
lstm_text = generate_text(lstm, seed_char, length=50, temperature=0.8)
print(f"  {lstm_text}")

print("\n" + "=" * 60)
print("Note: These are untrained models with random weights.")
print("With training, LSTM would show much better coherence!")

## Part 9: Building a GRU (Simplified LSTM)

Let's implement a GRU, which achieves similar performance to LSTM with fewer parameters.

In [None]:
class SimpleGRU:
    """Minimal character-level GRU (forward pass only).

    Uses a reset gate `r`, an update gate `z` and a candidate state, each
    computed from the stacked vector [x_t; h_{t-1}] (the candidate uses
    [x_t; r * h_{t-1}]). The new state is (1 - z) * h_{t-1} + z * candidate,
    so z near 1 adopts the candidate and z near 0 keeps the old state.
    """

    def __init__(self, vocab_size, hidden_size=10):
        """Create the three gate weight matrices, output weights and biases.

        Args:
            vocab_size: Number of distinct input/output symbols.
            hidden_size: Dimension of the hidden state vector.
        """
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # GRU has 3 weight matrices: reset, update, candidate
        concat_size = vocab_size + hidden_size

        # Small random weights (standard normal scaled by 0.01). The draw
        # order Wr -> Wz -> Wh -> Why is kept so a given seed reproduces the
        # same weights.
        self.Wr = np.random.randn(hidden_size, concat_size) * 0.01  # Reset gate
        self.Wz = np.random.randn(hidden_size, concat_size) * 0.01  # Update gate
        self.Wh = np.random.randn(hidden_size, concat_size) * 0.01  # Candidate

        self.br = np.zeros((hidden_size, 1))
        self.bz = np.zeros((hidden_size, 1))
        self.bh = np.zeros((hidden_size, 1))

        # Output weights
        self.Why = np.random.randn(vocab_size, hidden_size) * 0.01
        self.by = np.zeros((vocab_size, 1))

    def sigmoid(self, x):
        """Logistic sigmoid; the input is clipped to [-500, 500] before exp."""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    @staticmethod
    def _softmax(logits):
        """Numerically stable softmax over all entries of `logits`."""
        # Subtracting the maximum leaves the result mathematically unchanged
        # but keeps np.exp from overflowing to inf (which would yield NaN).
        exps = np.exp(logits - np.max(logits))
        return exps / np.sum(exps)

    def forward_step(self, x, h_prev):
        """Advance the GRU by one time step.

        Args:
            x: One-hot input column vector, shape (vocab_size, 1).
            h_prev: Previous hidden state, shape (hidden_size, 1).

        Returns:
            (h, p, gates): new hidden state, softmax distribution over the
            next symbol, and a dict with the gate activations under the keys
            'reset' and 'update'.
        """
        # Concatenate input and previous hidden state
        concat = np.vstack([x, h_prev])

        # Reset gate
        r = self.sigmoid(self.Wr @ concat + self.br)

        # Update gate
        z = self.sigmoid(self.Wz @ concat + self.bz)

        # Candidate hidden state (with reset gate applied to the old state)
        concat_reset = np.vstack([x, r * h_prev])
        h_candidate = np.tanh(self.Wh @ concat_reset + self.bh)

        # Final hidden state (interpolation between old state and candidate)
        h = (1 - z) * h_prev + z * h_candidate

        # Project to unnormalised scores, then normalise
        y = self.Why @ h + self.by
        p = self._softmax(y)

        return h, p, {'reset': r, 'update': z}

    def forward_sequence(self, inputs):
        """Run the GRU over a sequence of symbol indices.

        Mirrors the `forward_sequence` helpers of the RNN and LSTM classes.

        Args:
            inputs: Iterable of integer symbol ids in [0, vocab_size).

        Returns:
            (outputs, hidden_states, all_gates): `outputs` and `all_gates`
            hold one entry per input; `hidden_states` additionally starts
            with the all-zero initial state, so it is one element longer.
        """
        h = np.zeros((self.hidden_size, 1))
        outputs = []
        hidden_states = [h]
        all_gates = []

        for x in inputs:
            # One-hot encode input
            x_vec = np.zeros((self.vocab_size, 1))
            x_vec[x] = 1

            h, p, gates = self.forward_step(x_vec, h)
            outputs.append(p)
            hidden_states.append(h)
            all_gates.append(gates)

        return outputs, hidden_states, all_gates

# Instantiate a GRU with the same hidden width as the RNN and LSTM above
gru = SimpleGRU(vocab_size, hidden_size=16)

# A model's parameter count is the total number of elements across all of
# its weight and bias arrays
lstm_params = sum(arr.size for arr in (lstm.W, lstm.b, lstm.Why, lstm.by))
gru_params = sum(
    arr.size
    for arr in (gru.Wr, gru.Wz, gru.Wh, gru.br, gru.bz, gru.bh, gru.Why, gru.by)
)
rnn_params = sum(
    arr.size for arr in (rnn.Wxh, rnn.Whh, rnn.Why, rnn.bh, rnn.by)
)

print("Model Comparison:")
print(f"Simple RNN parameters: {rnn_params}")
print(f"LSTM parameters: {lstm_params}")
print(f"GRU parameters: {gru_params}")
print(f"\nGRU has {lstm_params - gru_params} fewer parameters than LSTM")
print(f"GRU has {gru_params - rnn_params} more parameters than simple RNN")

## Part 10: Summary and Key Takeaways

Let's visualize what we've learned about RNNs, LSTMs, and GRUs.

In [None]:
# 2x2 summary figure: complexity, retention, speed, and usage guidance
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Shared model order and colours for both bar charts
models = ['Simple RNN', 'GRU', 'LSTM']
params = [rnn_params, gru_params, lstm_params]
colors = ['#FF6B6B', '#4ECDC4', '#95E77E']

# 1. Model Complexity (real parameter counts computed earlier)
axes[0, 0].bar(models, params, color=colors)
axes[0, 0].set(ylabel='Number of Parameters', title='Model Complexity')
axes[0, 0].grid(True, alpha=0.3)

# 2. Memory Retention (simulated): hand-picked exponential decay curves,
#    not measurements taken from the models above
x = np.linspace(0, 50, 100)
rnn_retention = np.exp(-x/10)
lstm_retention = np.exp(-x/40)
gru_retention = np.exp(-x/35)

axes[0, 1].plot(x, rnn_retention, 'r-', label='RNN', linewidth=2)
axes[0, 1].plot(x, gru_retention, 'g-', label='GRU', linewidth=2)
axes[0, 1].plot(x, lstm_retention, 'b-', label='LSTM', linewidth=2)
axes[0, 1].set(xlabel='Sequence Length', ylabel='Memory Strength',
               title='Memory Retention Capability')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Training Speed (relative): fixed illustrative percentages
speed = [100, 70, 60]  # Relative speed
axes[1, 0].barh(models, speed, color=colors)
axes[1, 0].set(xlabel='Relative Training Speed (%)', title='Training Efficiency')
axes[1, 0].grid(True, alpha=0.3)

# 4. Use Case Recommendations (text-only panel)
use_cases_text = """
When to use each model:

Simple RNN:
• Short sequences (< 10 steps)
• Simple patterns
• Speed is critical

GRU:
• Medium sequences (10-100 steps)
• Good balance of performance/speed
• Limited computational resources

LSTM:
• Long sequences (> 100 steps)
• Complex dependencies
• Best accuracy needed
"""

axes[1, 1].text(0.1, 0.5, use_cases_text, fontsize=10,
                verticalalignment='center', family='monospace')
axes[1, 1].set(xlim=(0, 1), ylim=(0, 1))
axes[1, 1].axis('off')
axes[1, 1].set_title('Use Case Recommendations')

plt.suptitle('RNN vs GRU vs LSTM: Complete Comparison', fontsize=16)
plt.tight_layout()
plt.show()

# Closing banner and recap
print("\n" + "="*60)
print("Lab Complete!")
print("="*60)
print("\nYou've successfully:")
print("\n".join([
    "1. Built RNN, LSTM, and GRU from scratch",
    "2. Visualized hidden states and gate activations",
    "3. Demonstrated the vanishing gradient problem",
    "4. Compared memory retention capabilities",
    "5. Generated text with different architectures",
]))
print("\nNext steps: Train these models on real data!")