# RNN Step-by-Step: Processing "I love NLP"

## Setup
- Sentence: **"I love NLP"**
- Hidden units: **4**
- Embedding dimension: **3**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global RNG so reruns of the notebook behave the same
np.random.seed(42)

# Toy problem setup: word -> integer id, plus the two layer sizes
vocab = {token: index for index, token in enumerate(["I", "love", "NLP"])}
embedding_dim, hidden_units = 3, 4

# Echo the configuration so the reader can see it next to the outputs
print("Vocabulary:", vocab)
print(f"Embedding dimension: {embedding_dim}")
print(f"Hidden units: {hidden_units}")

## Step 1: Create Embedding Matrix

Each word is mapped to a fixed vector: its row in the embedding matrix.

In [None]:
# Lookup table with one row per vocabulary entry (vocab_size × embedding_dim)
embedding_matrix = np.array(
    [
        [0.5, 0.2, 0.1],  # row 0 -> "I"
        [0.8, 0.6, 0.3],  # row 1 -> "love"
        [0.1, 0.9, 0.7],  # row 2 -> "NLP"
    ]
)

print("Embedding Matrix:")
print(embedding_matrix)
print(f"\nShape: {embedding_matrix.shape}")

# Translate the sentence to ids, then pull the matching rows in one indexing step
sentence = ["I", "love", "NLP"]
sentence_ids = list(map(vocab.__getitem__, sentence))
embeddings = embedding_matrix[sentence_ids]

print("\nWord Embeddings:")
for word, vector in zip(sentence, embeddings):
    print(f"{word:6s}: {vector}")

## Step 2: Initialize RNN Weights

The RNN cell has three parameters: two weight matrices and one bias vector.
- **W_xh**: Input to hidden (3 × 4)
- **W_hh**: Hidden to hidden (4 × 4)
- **b_h**: Bias (4)

In [None]:
# Hand-picked parameters (small round numbers keep the arithmetic easy to follow)

# Input -> hidden weights, shape (3, 4): one row per embedding dimension,
# one column per hidden unit
W_xh = np.array(
    [
        [0.1, 0.2, 0.3, 0.4],
        [0.5, 0.1, 0.2, 0.3],
        [0.4, 0.3, 0.1, 0.2],
    ]
)

# Hidden -> hidden (recurrent) weights, shape (4, 4)
W_hh = np.array(
    [
        [0.2, 0.1, 0.3, 0.2],
        [0.1, 0.3, 0.2, 0.1],
        [0.3, 0.2, 0.1, 0.3],
        [0.2, 0.3, 0.2, 0.1],
    ]
)

# Bias, one entry per hidden unit, shape (4,)
b_h = np.array([0.1, 0.1, 0.1, 0.1])

# Both matrices are reported the same way: heading, values, shape + blank line
for title, matrix in (
    ("Weight Matrix W_xh (input → hidden):", W_xh),
    ("Weight Matrix W_hh (hidden → hidden):", W_hh),
):
    print(title)
    print(matrix)
    print(f"Shape: {matrix.shape}\n")

print("Bias b_h:")
print(b_h)
print(f"Shape: {b_h.shape}")

## Step 3: Process Each Time Step

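### RNN Formula:
```
h_t = tanh(W_xh.T @ x_t + W_hh.T @ h_{t-1} + b_h)
```

Where:
- `x_t` = input at time t (embedding vector)
- `h_{t-1}` = previous hidden state
- `h_t` = new hidden state (4-dimensional vector)
- `.T` = transpose; it is needed because the weights are stored as (input × hidden), e.g. `W_xh` is 3 × 4, so `W_xh.T` (4 × 3) maps a 3-dim embedding onto the 4 hidden units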

In [None]:
# Before any word has been read there is nothing to remember: h_0 is the zero vector
h_0 = np.zeros(hidden_units)
print(
    "Initial hidden state h_0:",
    h_0,
    f"Shape: {h_0.shape}\n",
    "=" * 70,
    sep="\n",
)

### Time Step 1: Process "I"

In [None]:
# Time step 1: "I"
# Inputs to the recurrence: the embedding of "I" and the all-zero initial state.
x_1 = embeddings[0]  # [0.5, 0.2, 0.1]
h_prev = h_0         # [0, 0, 0, 0]

print("TIME STEP 1: Processing 'I'")
print("="*70)
print(f"Input x_1 ('I'):        {x_1}")
print(f"Previous hidden h_0:    {h_prev}")
print()

# Calculate components.
# W_xh has shape (3, 4), so it is transposed to (4, 3) to map the 3-dim input onto
# the 4 hidden units; W_hh is transposed the same way (it is not symmetric, so the
# transpose changes the result).
input_contrib = W_xh.T @ x_1
hidden_contrib = W_hh.T @ h_prev  # zero vector here because h_0 is all zeros
combined = input_contrib + hidden_contrib + b_h
h_1 = np.tanh(combined)  # squashes every pre-activation into (-1, 1)

print("Step-by-step calculation:")
# Labels spell out the transpose so the printed formula matches the computation.
print(f"1. W_xh.T @ x_1        = {input_contrib}")
print(f"2. W_hh.T @ h_0        = {hidden_contrib}")
print(f"3. Sum + bias          = {combined}")
print(f"4. tanh(...)           = {h_1}")
print()
print(f"New hidden state h_1:   {h_1}")
print(f"Shape: {h_1.shape}")
print("="*70)
print()

### Time Step 2: Process "love"

In [None]:
# Time step 2: "love"
# Inputs to the recurrence: the embedding of "love" and the state left by "I".
x_2 = embeddings[1]  # [0.8, 0.6, 0.3]
h_prev = h_1         # Use h_1 from previous step

print("TIME STEP 2: Processing 'love'")
print("="*70)
print(f"Input x_2 ('love'):     {x_2}")
print(f"Previous hidden h_1:    {h_prev}")
print()

# Calculate components.
# W_xh has shape (3, 4), so it is transposed to (4, 3) to map the 3-dim input onto
# the 4 hidden units; W_hh is transposed the same way (it is not symmetric, so the
# transpose changes the result).
input_contrib = W_xh.T @ x_2
hidden_contrib = W_hh.T @ h_prev  # the only term through which "I" influences h_2
combined = input_contrib + hidden_contrib + b_h
h_2 = np.tanh(combined)  # squashes every pre-activation into (-1, 1)

print("Step-by-step calculation:")
# Labels spell out the transpose so the printed formula matches the computation.
print(f"1. W_xh.T @ x_2        = {input_contrib}")
print(f"2. W_hh.T @ h_1        = {hidden_contrib}")
print("   ↑ Carries info from 'I'")
print(f"3. Sum + bias          = {combined}")
print(f"4. tanh(...)           = {h_2}")
print()
print(f"New hidden state h_2:   {h_2}")
print(f"Shape: {h_2.shape}")
print("="*70)
print()

### Time Step 3: Process "NLP"

In [None]:
# Time step 3: "NLP"
# Inputs to the recurrence: the embedding of "NLP" and the state left by "I love".
x_3 = embeddings[2]  # [0.1, 0.9, 0.7]
h_prev = h_2         # Use h_2 from previous step

print("TIME STEP 3: Processing 'NLP'")
print("="*70)
print(f"Input x_3 ('NLP'):      {x_3}")
print(f"Previous hidden h_2:    {h_prev}")
print()

# Calculate components.
# W_xh has shape (3, 4), so it is transposed to (4, 3) to map the 3-dim input onto
# the 4 hidden units; W_hh is transposed the same way (it is not symmetric, so the
# transpose changes the result).
input_contrib = W_xh.T @ x_3
hidden_contrib = W_hh.T @ h_prev  # the only term through which earlier words influence h_3
combined = input_contrib + hidden_contrib + b_h
h_3 = np.tanh(combined)  # squashes every pre-activation into (-1, 1)

print("Step-by-step calculation:")
# Labels spell out the transpose so the printed formula matches the computation.
print(f"1. W_xh.T @ x_3        = {input_contrib}")
print(f"2. W_hh.T @ h_2        = {hidden_contrib}")
print("   ↑ Carries info from 'I love'")
print(f"3. Sum + bias          = {combined}")
print(f"4. tanh(...)           = {h_3}")
print()
print(f"Final hidden state h_3: {h_3}")
print(f"Shape: {h_3.shape}")
print("="*70)

## Summary: Hidden State Evolution

In [None]:
# Stack the four states row-wise: row t is the hidden state after t words
hidden_states = np.array([h_0, h_1, h_2, h_3])

rule = "=" * 70

print("\nHidden State Evolution:")
print(rule)
print("Time Step | Word   | Hidden State (4 dimensions)")
print("-" * 70)
# One table row per time step; the word column is padded to 6 characters
for step, (word, state) in enumerate(
    zip(("(init)", "I", "love", "NLP"), (h_0, h_1, h_2, h_3))
):
    print(f"t={step}       | {word:<6} | {state}")
print(rule)

print(
    "\nKey Points:",
    "• Each hidden state is a 4-dimensional vector",
    "• h_3 contains information from ALL previous words",
    "• This final state h_3 would be passed to output layer for prediction",
    sep="\n",
)

## Visualization: Hidden State Evolution

In [None]:
# Visualize hidden state evolution
# The number of hidden units is read from the data instead of being hard-coded,
# so tick labels, plotted lines and the title stay in sync with `hidden_states`.
# (The time-step labels below name the words of this sentence and stay explicit.)
n_units = hidden_states.shape[1]
unit_labels = [f'Unit {k + 1}' for k in range(n_units)]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Heatmap of hidden states (rows = time steps, columns = hidden units)
ax1 = axes[0]
sns.heatmap(hidden_states, annot=True, fmt='.3f', cmap='RdYlGn',
            center=0, cbar_kws={'label': 'Activation'},
            xticklabels=unit_labels,
            yticklabels=['h_0 (init)', 'h_1 (I)', 'h_2 (love)', 'h_3 (NLP)'],
            ax=ax1)
ax1.set_title(f'Hidden State Evolution\n({n_units} hidden units across time)',
              fontsize=14, fontweight='bold')
ax1.set_xlabel('Hidden Unit', fontsize=12)
ax1.set_ylabel('Time Step', fontsize=12)

# Plot 2: Line plot showing each unit's evolution (one line per column)
ax2 = axes[1]
time_steps = ['h_0\n(init)', 'h_1\n(I)', 'h_2\n(love)', 'h_3\n(NLP)']
for i, unit_label in enumerate(unit_labels):
    ax2.plot(time_steps, hidden_states[:, i], marker='o', linewidth=2,
             markersize=8, label=unit_label)
ax2.set_xlabel('Time Step', fontsize=12)
ax2.set_ylabel('Activation Value', fontsize=12)
ax2.set_title('Hidden Unit Activations Over Time', fontsize=14, fontweight='bold')
ax2.legend(loc='best')
ax2.grid(True, alpha=0.3)
# Dashed zero line as a visual reference for the sign of each activation
ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8, alpha=0.5)

plt.tight_layout()
# Written to the current working directory
plt.savefig('rnn_hidden_state_evolution.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualization saved as 'rnn_hidden_state_evolution.png'")

## Architecture Diagram

In [None]:
# Blank 14 × 6 canvas in data coordinates; the axes frame is hidden because the
# diagram below is drawn entirely from patches, arrows and text
fig, ax = plt.subplots(figsize=(14, 6))
ax.set(xlim=(0, 14), ylim=(0, 6))
ax.axis('off')

# Helper function to draw circles
def draw_circle(ax, x, y, r, color, label=''):
    """Add a circle of radius ``r`` centred at ``(x, y)`` to ``ax``.

    Args:
        ax: Matplotlib axes that receives the patch (and the text, if any).
        x, y: Centre of the circle in data coordinates.
        r: Radius in data coordinates.
        color: Colour passed to the circle patch.
        label: Optional text drawn centred on the circle; nothing is drawn
            when it is empty.
    """
    ax.add_patch(plt.Circle((x, y), r, color=color, ec='black', linewidth=2))
    if not label:
        return
    ax.text(x, y, label, ha='center', va='center', fontsize=10, fontweight='bold')

# Horizontal slot of each time step and the two vertical bands of the diagram
x_positions = [2, 5.5, 9, 12.5]
y_input, y_hidden = 1.5, 3.5

words = ['(init)', 'I', 'love', 'NLP']
colors = ['lightgray', '#E8F4F8', '#B8E6F0', '#7DD3E8']

# Geometry shared by every time step
unit_heights = [y_hidden + (j - 1.5) * 0.3 for j in range(4)]  # 4 stacked units
arrow_head = {'head_width': 0.15, 'head_length': 0.15}
last_step = len(x_positions) - 1

# Draw each time step (draw order is kept: input, hidden units, label, arrows)
for step, (x_pos, token, fill) in enumerate(zip(x_positions, words, colors)):
    has_input = step > 0  # step 0 is the initial state: no input word feeds it

    # Input node and its caption
    if has_input:
        draw_circle(ax, x_pos, y_input, 0.3, fill)
        ax.text(x_pos, y_input - 0.6, f'x_{step}\n({token})',
                ha='center', fontsize=10, fontweight='bold')

    # Hidden state: one small circle per hidden unit
    for unit_y in unit_heights:
        draw_circle(ax, x_pos, unit_y, 0.12, '#FFB84D')

    # Name of the hidden state above the stack
    ax.text(x_pos, y_hidden + 1.2, f'h_{step}', ha='center', fontsize=12, fontweight='bold')

    # Vertical arrow: input -> hidden
    if has_input:
        ax.arrow(x_pos, y_input + 0.35, 0, y_hidden - y_input - 1.5,
                 fc='black', ec='black', linewidth=2, **arrow_head)

    # Horizontal arrow: this hidden state -> the next one (none after the last)
    if step < last_step:
        ax.arrow(x_pos + 0.4, y_hidden, x_positions[step + 1] - x_pos - 0.8, 0,
                 fc='red', ec='red', linewidth=2.5, **arrow_head)

# Title
ax.text(7, 5.5, 'RNN Processing "I love NLP" (4 Hidden Units)',
        ha='center', fontsize=16, fontweight='bold')

# Legend (plain text placed in the lower-left corner)
ax.text(1, 0.5, 'Each hidden state = 4-dimensional vector', fontsize=10, style='italic')
ax.text(1, 0.2, 'Red arrows = information flow through time',
        fontsize=10, style='italic', color='red')

plt.tight_layout()
# Written to the current working directory
plt.savefig('rnn_architecture_diagram.png', dpi=300, bbox_inches='tight')
plt.show()

print("Architecture diagram saved as 'rnn_architecture_diagram.png'")

## Key Takeaways

### What is the Hidden State?
**The hidden state is a 4-dimensional vector** (one value per hidden unit)

```python
h_3 = [value1, value2, value3, value4]
       ↑       ↑       ↑       ↑
    unit 1  unit 2  unit 3  unit 4
```

### How Information Flows
1. **t=1**: Process "I" → produces h_1
2. **t=2**: Process "love" + h_1 (memory of "I") → produces h_2
3. **t=3**: Process "NLP" + h_2 (memory of "I love") → produces h_3

### Final Hidden State
- **h_3 encodes the entire sentence "I love NLP"**
- This vector can be used for:
  - Sentiment classification
  - Next word prediction
  - Sequence generation
  
### Important Points
- Number of hidden units = dimension of hidden state vector
- Hidden state carries information from ALL previous time steps
- Same weights (W_xh, W_hh) used at every time step