# Recurrent Neural Network

Recurrent neural networks (RNNs) are a class of artificial neural networks designed for processing sequential data, such as text, speech, and time series, where the order of elements is important. Unlike feedforward neural networks, which process inputs independently, RNNs utilize recurrent connections, where the output of a neuron at one time step is fed back as input to the network at the next time step. This enables RNNs to capture temporal dependencies and patterns within sequences.
The fundamental building block of RNNs is the recurrent unit, which maintains a hidden state—a form of memory that is updated at each time step based on the current input and the previous hidden state. This feedback mechanism allows the network to learn from past inputs and incorporate that knowledge into its current processing. RNNs have been successfully applied to tasks such as unsegmented, connected handwriting recognition, speech recognition, natural language processing, and neural machine translation.


## Simple Form

In [1]:
# import Libraries

import numpy as np

In [2]:
# Vocabulary and index lookup tables

chars = "helo"
# Character -> integer index, in order of appearance in `chars`.
char_to_idx = {ch: i for i, ch in enumerate(chars)}
# Integer index -> character. `items()` yields (character, index) pairs, so
# they must be unpacked in that order to build the inverse mapping.
idx_to_char = {i: ch for ch, i in char_to_idx.items()}

In [3]:
# Input and output sequences (the target is the input shifted by one character)

sequence = "hello"
X_seq = list(map(char_to_idx.__getitem__, sequence[:-1]))  # Indices of "hell"
Y_seq = list(map(char_to_idx.__getitem__, sequence[1:]))  # Indices of "ello"


In [5]:
# One-hot encode the input sequence

def one_hot(i, vocab_size):
    """Return a float vector of length ``vocab_size`` that is 1 at ``i`` and 0 elsewhere."""
    encoded = np.zeros(vocab_size)
    encoded[i] = 1.0
    return encoded

# Stack one one-hot row per input index: shape (len(X_seq), len(chars)).
X = np.array([one_hot(idx, len(chars)) for idx in X_seq])

In [6]:
# RNN parameters

hidden_size = 4  # Number of hidden units
input_size = output_size = len(chars)  # Input and output layers are both vocabulary-sized

# Weights: standard-normal draws scaled down by 0.01.
Wxh = 0.01 * np.random.randn(hidden_size, input_size)  # Input to hidden weights
Whh = 0.01 * np.random.randn(hidden_size, hidden_size)  # Hidden to hidden weights
Why = 0.01 * np.random.randn(output_size, hidden_size)  # Hidden to output weights

# Biases start at zero.
bh = np.zeros(hidden_size)  # Hidden bias
by = np.zeros(output_size)  # Output bias

In [7]:
# Forward pass through the RNN

h = np.zeros(hidden_size)  # Hidden state starts at zero
outputs = []  # Output vector produced at each time step

for x_t in X:
    # The new hidden state combines the current input with the previous hidden state.
    h = np.tanh(np.dot(Wxh, x_t) + np.dot(Whh, h) + bh)
    outputs.append(np.dot(Why, h) + by)  # Unnormalized output for this step
    

In [8]:
# Print softmax probabilities

def softmax(x):
    """Normalize ``x`` into probabilities that sum to 1 over all of its elements."""
    values = np.asarray(x)
    # Shifting by the maximum leaves the result unchanged but avoids overflow in exp.
    shifted = values - values.max()
    weights = np.exp(shifted)
    return weights / weights.sum()

for step, logits in enumerate(outputs):
    probs = softmax(logits)  # Turn this step's output into probabilities
    print(f"Step {step} prediction:", probs.round(2))

Step 0 prediction: [0.25 0.25 0.25 0.25]
Step 1 prediction: [0.25 0.25 0.25 0.25]
Step 2 prediction: [0.25 0.25 0.25 0.25]
Step 3 prediction: [0.25 0.25 0.25 0.25]


## PyTorch Form

In [12]:
# Import Libraries

import torch
import torch.nn as nn


In [13]:
# Prepare data

sequence = "hello"
chars = "helo"  # Vocabulary: the distinct characters of the sequence
char_to_idx = dict(zip(chars, range(len(chars))))  # Character -> index
idx_to_char = dict(enumerate(chars))  # Index -> character

In [15]:
# Encode input and target sequences as vocabulary indices
# (the one-hot conversion happens when the tensors are built)

input_seq = [char_to_idx[c] for c in sequence[:-1]]  # Indices of "hell"
target_seq = [char_to_idx[c] for c in sequence[1:]]  # Indices of "ello"


In [16]:
# convert to PyTorch tensors

# Row i of the identity matrix is the one-hot vector for index i; the added
# middle axis gives the shape (seq_len, batch_size, input_size).
input_tensor = torch.eye(len(chars))[torch.tensor(input_seq)].unsqueeze(dim=1)
target_tensor = torch.tensor(target_seq)  # Target character indices

In [18]:
# Define RNN model

class CharRNN(nn.Module):
    """Recurrent layer followed by a linear layer applied to every time step."""

    def __init__(self, input_size, hidden_size, output_size):
        """Create the recurrent layer and the hidden-to-output projection."""
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h=None):
        """Run ``x`` through the RNN, starting from hidden state ``h`` if given.

        Returns the projected outputs together with the final hidden state.
        """
        rnn_out, h_last = self.rnn(x, h)
        # squeeze(1) drops axis 1 only when it has size 1 (a single-sequence batch).
        logits = self.fc(rnn_out.squeeze(1))
        return logits, h_last

In [19]:
# initialize model, loss function, and optimizer

model = CharRNN(4, 8, 4)  # input_size, hidden_size, output_size
criterion = nn.CrossEntropyLoss()  # Takes the model outputs and target indices
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)


In [20]:
# Train one step

# Gradients accumulate across backward() calls, so clear them first; otherwise
# re-running this cell would mix in gradients from earlier runs.
optimizer.zero_grad()
output, _ = model(input_tensor) # Forward pass
loss = criterion(output, target_tensor) # Compute loss
loss.backward() # Backpropagation
optimizer.step() # Update weights

# `output` was computed before the weight update above.
print("Output logits:\n", output)

Output logits:
 tensor([[-0.0728, -0.2635, -0.1429,  0.0543],
        [ 0.1242, -0.3715, -0.1328,  0.1894],
        [-0.1763, -0.3454, -0.1156,  0.1728],
        [-0.1083, -0.4030, -0.2436,  0.1000]], grad_fn=<AddmmBackward0>)


## PyTorch With GPU

In [32]:
# Import Libraries

import torch
import torch.nn as nn


In [33]:
# Prepare data

chars = "helo"  # Vocabulary
sequence = "hello"  # Training sequence
char_to_idx = {symbol: position for position, symbol in enumerate(chars)}  # Character -> index
idx_to_char = dict(enumerate(chars))  # Index -> character

In [34]:
# Encode input and target sequences as vocabulary indices
# (the one-hot conversion happens when the tensors are built)

input_seq = list(map(char_to_idx.__getitem__, sequence[:-1]))  # Indices of "hell"
target_seq = list(map(char_to_idx.__getitem__, sequence[1:]))  # Indices of "ello"


In [35]:
# convert to PyTorch tensors

# Selecting rows of the identity matrix yields one-hot vectors; indexing with
# None inserts the middle axis, giving (seq_len, batch_size, input_size).
input_tensor = torch.eye(len(chars))[input_seq][:, None, :]
target_tensor = torch.tensor(target_seq)  # Target character indices

In [36]:
# Define RNN model

class CharRNN(nn.Module):
    """An ``nn.RNN`` layer whose per-step outputs are fed to an ``nn.Linear`` layer."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the recurrent layer and the output projection."""
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h=None):
        """Return ``(outputs, final_hidden_state)`` for input ``x`` and optional state ``h``."""
        hidden_seq, final_h = self.rnn(x, h)
        # Axis 1 is dropped only if it has size 1 (a single-sequence batch).
        squeezed = hidden_seq.squeeze(1)
        return self.fc(squeezed), final_h

In [37]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [38]:
# initialize model, loss function, and optimizer

# The model and both tensors must be on the same device.
model = CharRNN(4, 8, 4).to(device)  # input_size, hidden_size, output_size
input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)


In [47]:
# Train one step

# Gradients accumulate across backward() calls, so clear them first; otherwise
# re-running this cell would mix in gradients from earlier runs.
optimizer.zero_grad()
output, _ = model(input_tensor) # Forward pass
loss = criterion(output, target_tensor) # Compute loss
loss.backward() # Backpropagation
optimizer.step() # Update weights

# Move the probabilities to the CPU and detach them from the graph before converting to NumPy.
print("Output predictions:\n", torch.softmax(output, dim=1).cpu().detach().numpy().round(2))


Output predictions:
 [[0.15 0.23 0.37 0.26]
 [0.13 0.19 0.45 0.23]
 [0.14 0.23 0.38 0.26]
 [0.13 0.19 0.37 0.31]]
