In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import re
import csv
from collections import Counter
import torch.nn.functional as F

# Original Simple RNN Implementation
This is the basic RNN implementation we started with, for reference.

In [2]:
# Original RNN Class
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        
        # input to hidden (i2h) weights 
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        # hidden to output (h2o) weights
        self.h2o = nn.Linear(input_size + hidden_size, output_size)

    def forward(self, input, hidden):
        # input: (batch_size, input_size)
        # hidden: (batch_size, hidden_size)
        # why combine? 
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.h2o(combined)
        return output, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(batch_size, self.hidden_size)

# Educational Demo: Sentiment Analysis Step-by-Step
We demonstrate how the RNN processes a sentence for sentiment classification.

In [3]:
# 1. Data Preparation
sentence = "this movie was absolutely amazing and wonderful"
words = sentence.split()
vocab_demo = list(set(words))
word_to_ix_demo = {word: i for i, word in enumerate(vocab_demo)}
input_size = len(vocab_demo)
hidden_size = 10
output_size = 2 # Positive/Negative

def make_one_hot(word, word_to_ix):
    vec = torch.zeros(1, input_size)
    vec[0][word_to_ix[word]] = 1
    return vec

# 2. Model Initialization
rnn_demo = RNN(input_size, hidden_size, output_size)
hidden_demo = rnn_demo.init_hidden(1) # Batch size 1

# 3. Step-by-Step Forward Pass
print("Step-by-Step Forward Pass:")
print("-" * 50)
for word in words:
    input_tensor = make_one_hot(word, word_to_ix_demo)
    output_demo, hidden_demo = rnn_demo(input_tensor, hidden_demo)
    print(f"Word: {word:<15} | Hidden (first 3): {hidden_demo[0][:3].detach().numpy()} ...")
    print(f"Logits: {output_demo.detach().numpy()} ")

# 4. Loss Calculation
target_class = torch.tensor([0], dtype=torch.long) # 0 = Positive (assuming)
criterion_demo = nn.CrossEntropyLoss()
loss_demo = criterion_demo(output_demo, target_class)
print("-" * 50)
print(f"Final Output Logits: {output_demo.detach().numpy()}")
print(f"Loss: {loss_demo.item():.4f}")

# 5. Backpropagation
rnn_demo.zero_grad()
loss_demo.backward()
print("-" * 50)
print("Backpropagation complete.")
print(f"Gradient norm for i2h weights: {rnn_demo.i2h.weight.grad.norm().item():.4f}")

# 6. Optimization
optimizer_demo = optim.SGD(rnn_demo.parameters(), lr=0.1)
optimizer_demo.step()
print("Optimization step complete.")

Step-by-Step Forward Pass:
--------------------------------------------------
Word: this            | Hidden (first 3): [ 0.20034426 -0.08665498  0.11625531] ...
Logits: [[-0.12855901 -0.14238012]] 
Word: movie           | Hidden (first 3): [ 0.20630811 -0.01775336  0.12262754] ...
Logits: [[-0.08699305 -0.04649505]] 
Word: was             | Hidden (first 3): [ 0.28001308  0.17004642 -0.19548813] ...
Logits: [[-0.05168187  0.08595321]] 
Word: absolutely      | Hidden (first 3): [ 0.43885353  0.04597702 -0.03268138] ...
Logits: [[-0.07313797 -0.12640046]] 
Word: amazing         | Hidden (first 3): [ 0.49027127  0.19018902 -0.01289213] ...
Logits: [[-0.39773023  0.29555368]] 
Word: and             | Hidden (first 3): [ 0.7256163  -0.11263353 -0.06685778] ...
Logits: [[-0.12968136 -0.01403646]] 
Word: wonderful       | Hidden (first 3): [0.53845525 0.22406152 0.20921092] ...
Logits: [[-0.4114157   0.29998404]] 
--------------------------------------------------
Final Output Logits: [[-0.4

# 1. Load IMDB Dataset
We load a subset of the IMDB dataset for sentiment classification (Positive vs Negative).

In [23]:
reviews = []
labels = []

with open('data/imdb_subset.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader) # Skip header
    for row in reader:
        if len(row) >= 2:
            reviews.append(row[0])
            # Label: positive -> 1, negative -> 0
            labels.append(1 if row[1].strip().lower() == 'positive' else 0)

print(f"Loaded {len(reviews)} reviews.")
print(f"Sample Review: {reviews[0][:100]}...")
print(f"Sample Label: {labels[0]}")

# Preprocessing
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text.split()

tokenized_reviews = [preprocess(r) for r in reviews]

# Build Vocabulary
all_words = [word for review in tokenized_reviews for word in review]
vocab_count = Counter(all_words)
vocab = sorted(vocab_count, key=vocab_count.get, reverse=True)
# Limit vocab size for speed
vocab = vocab[:5000]
word_to_ix = {word: i+1 for i, word in enumerate(vocab)} # 0 is padding
ix_to_word = {i+1: word for i, word in enumerate(vocab)}
ix_to_word[0] = '<PAD>'
vocab_size = len(word_to_ix) + 1

print(f"Vocabulary size: {vocab_size}")

Loaded 2000 reviews.
Sample Review: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. The...
Sample Label: 1
Vocabulary size: 5001


# 2. Load GloVe Embeddings
We use pretrained GloVe embeddings.

In [24]:
def load_glove_embeddings(file_path, word_to_ix, embedding_dim=50):
    embeddings = np.zeros((len(word_to_ix) + 1, embedding_dim))
    found = 0
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            if word in word_to_ix:
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word_to_ix[word]] = vector
                found += 1
    print(f"Found embeddings for {found} / {len(word_to_ix)} words.")
    return torch.tensor(embeddings, dtype=torch.float32)

embedding_dim = 50
pretrained_embeddings = load_glove_embeddings('data/glove.6B.50d.txt', word_to_ix, embedding_dim)

Found embeddings for 4897 / 5000 words.


# 3. Prepare Dataset (Padding)
We pad sequences to a fixed length.

In [25]:
seq_length = 100 # Truncate/Pad to 100 words
dataX = []
dataY = labels

for review in tokenized_reviews:
    # Convert to indices
    idxs = [word_to_ix.get(w, 0) for w in review if w in word_to_ix]
    # Pad or Truncate
    if len(idxs) < seq_length:
        idxs = idxs + [0] * (seq_length - len(idxs))
    else:
        idxs = idxs[:seq_length]
    dataX.append(idxs)

X = torch.tensor(dataX, dtype=torch.long)
y = torch.tensor(dataY, dtype=torch.long)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: torch.Size([2000, 100])
y shape: torch.Size([2000])


In [26]:
# divide into train and test
# change torch random seed
torch.manual_seed(1234)

train_size = int(len(reviews) * 0.8)
# shuffle
perm = torch.randperm(len(reviews))
X = X[perm]
y = y[perm]
X_train = X[:train_size]
y_train = y[:train_size]
X_test = X[train_size:]
y_test = y[train_size:]

# 4. Unrolled RNN Classifier
We use the UnrolledRNN for classification (Many-to-One).

In [27]:
class UnrolledRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=1, dropout_prob=0.5, pretrained_embeddings=None):
        super(UnrolledRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False, padding_idx=0)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # self.dropout = nn.Dropout(dropout_prob)
        
        self.rnn_cells = nn.ModuleList()
        for i in range(num_layers):
            input_dim = embedding_dim if i == 0 else hidden_size
            self.rnn_cells.append(nn.ModuleDict({
                'i2h': nn.Linear(input_dim, hidden_size),
                'h2h': nn.Linear(hidden_size, hidden_size)
            }))
            
        self.fc_out = nn.Linear(hidden_size, output_size)
        
    def forward(self, x, hidden, return_all_outputs=False):
        # x: (batch_size, seq_length)
        batch_size = x.size(0)
        seq_length = x.size(1)
        
        embedded = self.embedding(x)
        #embedded = self.dropout(embedded)
        all_outputs = []
        current_hidden = hidden
        
        for t in range(seq_length):
            input_t = embedded[:, t, :]
            next_hidden_states = []
            
            for layer_idx in range(self.num_layers):
                cell = self.rnn_cells[layer_idx]
                h_prev = current_hidden[layer_idx]
                
                layer_input = input_t if layer_idx == 0 else next_hidden_states[-1]
                
                h_t = torch.tanh(cell['i2h'](layer_input) + cell['h2h'](h_prev))
                
                # if layer_idx < self.num_layers - 1:
                #     h_t = self.dropout(h_t)
                    
                next_hidden_states.append(h_t)
            
            current_hidden = torch.stack(next_hidden_states)
            # We only care about the final output for classification, 
            # but we compute it at every step for the demo if needed.
            output_t = self.fc_out(current_hidden[-1])
            all_outputs.append(output_t)
            
        if return_all_outputs:
            return torch.stack(all_outputs, dim=1), current_hidden
        else:
            # Return only the last output (batch_size, output_size)
            return all_outputs[-1], current_hidden

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size)

# 5. Training Loop
We train the RNN on the IMDB subset.

In [28]:
hidden_size = 64
learning_rate = 0.001
epochs = 5
batch_size = 32
num_layers = 2
output_size = 2 # Positive/Negative

model = UnrolledRNN(vocab_size, embedding_dim, hidden_size, output_size, num_layers=num_layers, pretrained_embeddings=pretrained_embeddings)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

print("Starting training...")
for epoch in range(epochs):
    total_loss = 0
    correct = 0
    total = 0
    
    # Shuffle
    perm = torch.randperm(len(X_train))
    X_shuffled = X_train[perm]
    y_shuffled = y_train[perm]
    
    for i in range(0, len(X_train), batch_size):
        inputs = X_shuffled[i:i+batch_size]
        targets = y_shuffled[i:i+batch_size]
        
        if len(inputs) != batch_size: continue
        
        hidden = model.init_hidden(batch_size)
        model.zero_grad()
        
        output, hidden = model(inputs, hidden)
        
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
        
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / (len(X)/batch_size):.4f}, Acc: {100 * correct / total:.2f}%")


Starting training...
Epoch 1/5, Loss: 0.5553, Acc: 51.50%
Epoch 2/5, Loss: 0.5353, Acc: 58.38%
Epoch 3/5, Loss: 0.5017, Acc: 65.31%
Epoch 4/5, Loss: 0.4471, Acc: 70.19%
Epoch 5/5, Loss: 0.3728, Acc: 76.50%


In [29]:
# Evaluate on Test Set
model.eval()
correct = 0
total = 0
test_loss = 0

with torch.no_grad():
    for i in range(0, len(X_test), batch_size):
        inputs = X_test[i:i+batch_size]
        targets = y_test[i:i+batch_size]
        
        if len(inputs) == 0: continue

        hidden = model.init_hidden(len(inputs))
        output, _ = model(inputs, hidden)
        loss = criterion(output, targets)
        test_loss += loss.item()
        
        _, predicted = torch.max(output.data, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print(f"Test Loss: {test_loss / (len(X_test)/batch_size):.4f}")
print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Loss: 0.9648
Test Accuracy: 52.25%


# 6. Inference
Test the model on a new review.

In [30]:
def predict_sentiment(model, review, word_to_ix):
    # Enable gradient tracking
    model.eval()
    words = preprocess(review)
    idxs = [word_to_ix.get(w, 0) for w in words]
    # Pad/Truncate
    if len(idxs) < seq_length:
        idxs = idxs + [0] * (seq_length - len(idxs))
    else:
        idxs = idxs[:seq_length]
        
    input_tensor = torch.tensor([idxs], dtype=torch.long)
    hidden = model.init_hidden(1)
    
    model.zero_grad()
    output, _ = model(input_tensor, hidden)
    probs = F.softmax(output, dim=1)
    top_prob, top_ix = torch.max(probs, 1)
    
    # Compute gradients based on the prediction to see the norm
    loss = nn.CrossEntropyLoss()(output, top_ix)
    loss.backward()
    grad_norm = model.rnn_cells[0]['h2h'].weight.grad.norm().item()
    
    sentiment = "Positive" if top_ix.item() == 1 else "Negative"
    print(f"Review: '{review}'")
    print(f"Prediction: {sentiment} (Confidence: {top_prob.item():.4f})")
    print(f"Gradient Norm: {grad_norm:.4f}")

predict_sentiment(model, "This movie was fantastic and I loved it!", word_to_ix)
predict_sentiment(model, "bad " * 1000, word_to_ix)

Review: 'This movie was fantastic and I loved it!'
Prediction: Negative (Confidence: 0.5489)
Gradient Norm: 1.4757
Review: 'bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad bad 

# 8. Exploding Gradients and Clipping Fix

In [21]:
exploding_model = UnrolledRNN(vocab_size, embedding_dim, hidden_size, output_size, num_layers=1, pretrained_embeddings=pretrained_embeddings)

with torch.no_grad():
    exploding_model.rnn_cells[0]['h2h'].weight.data.normal_(0.0, 2.0)
seq_len = 50
input_seq = torch.randint(1, vocab_size, (1, seq_len))
target = torch.tensor([0], dtype=torch.long)
hidden = exploding_model.init_hidden(1)
exploding_model.zero_grad()
output, _ = exploding_model(input_seq, hidden)
loss = criterion(output, target)
loss.backward()
grad_norm = exploding_model.rnn_cells[0]['h2h'].weight.grad.norm().item()
print(f"Gradient Norm (Before Clipping): {grad_norm:.4f}")

max_norm = 5.0
torch.nn.utils.clip_grad_norm_(exploding_model.parameters(), max_norm)

grad_norm_clipped = exploding_model.rnn_cells[0]['h2h'].weight.grad.norm().item()
print(f"Gradient Norm (After Clipping): {grad_norm_clipped:.4f}")

Gradient Norm (Before Clipping): 1679682306048.0000
Gradient Norm (After Clipping): 0.2699


# 8. Vanishing Gradients Demo

In [None]:
vanishing_model = UnrolledRNN(vocab_size, embedding_dim, hidden_size, output_size, num_layers=1, pretrained_embeddings=pretrained_embeddings)

with torch.no_grad():
    # initialize with small weights to cause vanishing gradients
    vanishing_model.rnn_cells[0]['h2h'].weight.data.normal_(0.0, 0.1)

# long sequence
long_input = torch.randint(1, vocab_size, (1, 100))
target = torch.tensor([0], dtype=torch.long)

hidden = vanishing_model.init_hidden(1)
vanishing_model.zero_grad()
output, _ = vanishing_model(long_input, hidden)
loss = criterion(output, target)
loss.backward()

# get gradient of embedding for first token
first_token_grad = vanishing_model.embedding.weight.grad[long_input[0, 0]].norm().item()
print(f"First Token Gradient (Long Sequence): {first_token_grad:.6f}")

# Short sequence
vanishing_model.zero_grad()
short_input = torch.randint(1, vocab_size, (1, 4))
output, _ = vanishing_model(short_input, hidden)
loss = criterion(output, target)
loss.backward()

first_token_grad_short = vanishing_model.embedding.weight.grad[short_input[0, 0]].norm().item()
print(f"First Token Gradient (Short Sequence): {first_token_grad_short:.6f}")

First Token Gradient (Long Sequence): 0.000000
First Token Gradient (Short Sequence): 0.042213
