Algorithm 1:Token embedding



In [1]:
import torch
import torch.nn as nn

class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
    
    def forward(self, token_id):
        return self.embedding(token_id)

def test_token_embedding():
    vocab_size = 10000  # |N_v|
    embedding_dim = 300  # d_e

    print("Initializing TokenEmbedding...")
    token_embedding = TokenEmbedding(vocab_size, embedding_dim)

    # Test 1: Check output shape
    print("\nTest 1: Checking output shape")
    token_id = torch.tensor([42])
    embedding_vector = token_embedding(token_id)
    if embedding_vector.shape == (1, embedding_dim):
        print(f"PASS: Output shape is correct: {embedding_vector.shape}")
    else:
        print(f"FAIL: Expected shape (1, {embedding_dim}), but got {embedding_vector.shape}")

    # Test 2: Check if different tokens produce different embeddings
    print("\nTest 2: Checking if different tokens produce different embeddings")
    token_id_1 = torch.tensor([10])
    token_id_2 = torch.tensor([20])
    embedding_1 = token_embedding(token_id_1)
    embedding_2 = token_embedding(token_id_2)
    if not torch.allclose(embedding_1, embedding_2):
        print("PASS: Different tokens produce different embeddings")
    else:
        print("FAIL: Different tokens produced the same embedding")

    # Test 3: Check if the same token always produces the same embedding
    print("\nTest 3: Checking if the same token produces consistent embeddings")
    embedding_1_repeat = token_embedding(token_id_1)
    if torch.allclose(embedding_1, embedding_1_repeat):
        print("PASS: Same token produces consistent embeddings")
    else:
        print("FAIL: Same token produced different embeddings")

test_token_embedding()






Initializing TokenEmbedding...

Test 1: Checking output shape
PASS: Output shape is correct: torch.Size([1, 300])

Test 2: Checking if different tokens produce different embeddings
PASS: Different tokens produce different embeddings

Test 3: Checking if the same token produces consistent embeddings
PASS: Same token produces consistent embeddings


Algorithm 2: Positional  embedding: 



In [2]:
import torch
import torch.nn as nn

class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionalEmbedding, self).__init__()
        self.positional_embedding = nn.Embedding(max_len, d_model)
    
    def forward(self, positions):
        return self.positional_embedding(positions)

# Testing the PositionalEmbedding class

# Define the model parameters
d_model = 8  # Dimension of the embedding
max_len = 10  # Maximum length of the sequence

# Instantiate the PositionalEmbedding class
pos_emb = PositionalEmbedding(d_model, max_len)

# Create a sample position tensor
positions = torch.tensor([0, 1, 2, 3, 4])  # Example positions in the sequence

# Forward pass to get the positional embeddings
output = pos_emb(positions)

# Print the results
print("Positions:")
print(positions)
print("\nPositional Embeddings:")
print(output)


Positions:
tensor([0, 1, 2, 3, 4])

Positional Embeddings:
tensor([[-0.3664,  0.3055,  0.5400,  0.1085,  0.2731, -1.6780, -0.8474, -2.1193],
        [ 0.0931, -1.1981, -1.0825,  0.5379, -0.0384,  1.8018,  0.3153, -0.3435],
        [-0.3769, -0.7387, -0.3788, -0.2342,  1.3154,  1.7451,  0.5810,  0.4470],
        [-0.4017, -0.8696,  0.6587,  0.4861,  0.1649, -2.0578, -0.1555, -1.0668],
        [-0.8478,  0.2871, -0.1655,  1.4034,  0.4210, -0.5587, -0.2899, -0.8643]],
       grad_fn=<EmbeddingBackward0>)


Algorithm 3: Basic Single-Query attention

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BasicSingleQueryAttention(nn.Module):
    def __init__(self, din, datt, dout):
        super(BasicSingleQueryAttention, self).__init__()
        self.Wq = nn.Linear(din, datt)
        self.Wk = nn.Linear(din, datt)
        self.Wv = nn.Linear(din, dout)
        self.bq = nn.Parameter(torch.zeros(datt))
        self.bk = nn.Parameter(torch.zeros(datt))
        self.bv = nn.Parameter(torch.zeros(dout))
    
    def forward(self, e, et):
        # Compute query, keys, and values
        q = self.Wq(e) + self.bq
        k = self.Wk(et) + self.bk
        v = self.Wv(et) + self.bv
        
        # Compute attention weights
        scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(k.size(-1), dtype=torch.float32))
        alpha_t = F.softmax(scores, dim=-1)
        
        # Compute the output as a weighted sum of the values
        u = torch.sum(alpha_t.unsqueeze(-1) * v, dim=1)
        
        return u

# Testing the BasicSingleQueryAttention class

# Define model parameters
din = 8    # Dimension of the input token vectors
datt = 8   # Dimension of the attention projections
dout = 8   # Dimension of the output vector

# Instantiate the BasicSingleQueryAttention class
attention = BasicSingleQueryAttention(din, datt, dout)

# Create sample input tensors
e = torch.rand(din)  # Vector representation of the current token
et = torch.rand(5, din)  # Vector representations of context tokens

# Forward pass to get the output
output = attention(e, et)

# Print the results
print("Current Token Representation (e):")
print(e)
print("\nContext Tokens Representations (et):")
print(et)
print("\nAttention Output:")
print(output)


Current Token Representation (e):
tensor([0.4292, 0.5354, 0.0491, 0.7914, 0.0320, 0.0460, 0.6593, 0.8800])

Context Tokens Representations (et):
tensor([[0.3340, 0.5503, 0.9377, 0.6325, 0.8560, 0.3793, 0.2951, 0.3250],
        [0.6116, 0.2268, 0.3247, 0.9012, 0.5142, 0.0788, 0.3687, 0.9991],
        [0.4418, 0.3248, 0.2440, 0.0244, 0.0200, 0.0375, 0.9091, 0.6481],
        [0.5830, 0.6217, 0.8130, 0.7250, 0.5637, 0.6455, 0.7217, 0.6539],
        [0.0652, 0.8452, 0.8847, 0.7397, 0.2845, 0.5236, 0.6467, 0.6613]])

Attention Output:
tensor([ 0.1365, -0.0128, -0.1075,  0.0437,  0.0684], grad_fn=<SumBackward1>)


Algorithm 4: Self Attention

In [4]:
#Import the required libraries and classes
import torch
import torch.nn.functional as F

def self_attention(Q, K, V):
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    attn_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attn_weights, V)
    return output, attn_weights

# example data
batch_size = 1
seq_len = 3
d_k = 4

#queries, keys, and values for testing
Q = torch.rand(batch_size, seq_len, d_k)
K = torch.rand(batch_size, seq_len, d_k)
V = torch.rand(batch_size, seq_len, d_k)

# Apply self-attention
output, attn_weights = self_attention(Q, K, V)

# Shape Check
if output.shape != V.shape:
    raise ValueError(f"Output shape {output.shape} does not match input shape {V.shape}")

# Attention Weights Sum Check
attn_weights_sum = attn_weights.sum(dim=-1)
if not torch.allclose(attn_weights_sum, torch.ones_like(attn_weights_sum)):
    raise ValueError("Attention weights do not sum to 1")

# Print the results
print("Queries (Q):")
print(Q)
print("\nKeys (K):")
print(K)
print("\nValues (V):")
print(V)
print("\nAttention Weights:")
print(attn_weights)
print("\nSelf-Attention Output:")
print(output)

#  Calculation Check
Q_manual = torch.tensor([[[1.0, 1.0], [1.0, 1.0]]])
K_manual = torch.tensor([[[1.0, 1.0], [1.0, 1.0]]])
V_manual = torch.tensor([[[1.0, 2.0], [1.0, 2.0]]])

# Apply self-attention
output_manual, attn_weights_manual = self_attention(Q_manual, K_manual, V_manual)

# Manually calculate attention weights and output
expected_attn_weights = torch.tensor(attn_weights_manual)
expected_output = torch.tensor(output_manual)

#Test1: Verify attention weights
if not torch.allclose(attn_weights_manual, expected_attn_weights):
    raise ValueError(f"Expected {expected_attn_weights}, but got {attn_weights_manual}")

# Test2: Verify output
if not torch.allclose(output_manual, expected_output):
    raise ValueError(f"Expected {expected_output}, but got {output_manual}")

print("Attention Weights and Output are correct!")




Queries (Q):
tensor([[[0.7524, 0.7599, 0.0182, 0.9471],
         [0.8302, 0.7609, 0.2017, 0.9719],
         [0.4551, 0.5293, 0.5499, 0.5052]]])

Keys (K):
tensor([[[0.7406, 0.3117, 0.2835, 0.4902],
         [0.2361, 0.6238, 0.1198, 0.1800],
         [0.8271, 0.0879, 0.6143, 0.8410]]])

Values (V):
tensor([[[0.4381, 0.4962, 0.1358, 0.2185],
         [0.4792, 0.6490, 0.5115, 0.6546],
         [0.6116, 0.1929, 0.4689, 0.9123]]])

Attention Weights:
tensor([[[0.3417, 0.2743, 0.3840],
         [0.3401, 0.2628, 0.3970],
         [0.3326, 0.2847, 0.3826]]])

Self-Attention Output:
tensor([[[0.5160, 0.4217, 0.3668, 0.6045],
         [0.5178, 0.4160, 0.3668, 0.6086],
         [0.5162, 0.4237, 0.3702, 0.6081]]])
Attention Weights and Output are correct!


  expected_attn_weights = torch.tensor(attn_weights_manual)
  expected_output = torch.tensor(output_manual)


Algorithm 5: Multi Headed Attention

In [5]:
import torch
import torch.nn.functional as F

class MultiHeadAttention(torch.nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        
        self.linear_Q = torch.nn.Linear(d_model, d_model)
        self.linear_K = torch.nn.Linear(d_model, d_model)
        self.linear_V = torch.nn.Linear(d_model, d_model)
        self.linear_out = torch.nn.Linear(d_model, d_model)
    
    def forward(self, Q, K, V):
        batch_size = Q.size(0)
        
        Q = self.linear_Q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.linear_K(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.linear_V(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_k, dtype=torch.float32))
        attn_weights = F.softmax(scores, dim=-1)
        output = torch.matmul(attn_weights, V).transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        return self.linear_out(output)

# Testing the MultiHeadAttention class

# Create sample input tensors
batch_size = 2
seq_len = 5
d_model = 8
num_heads = 2

# Random queries, keys, and values for testing
Q = torch.rand(batch_size, seq_len, d_model)
K = torch.rand(batch_size, seq_len, d_model)
V = torch.rand(batch_size, seq_len, d_model)

# Instantiate the MultiHeadAttention class
multi_head_attention = MultiHeadAttention(d_model, num_heads)

# Forward pass
output = multi_head_attention(Q, K, V)

# Print the results
print("Queries (Q):")
print(Q)
print("\nKeys (K):")
print(K)
print("\nValues (V):")
print(V)
print("\nMulti-Head Attention Output:")
print(output)

# Test1: Check the shape of the output
expected_output_shape = (batch_size, seq_len, d_model)
assert output.shape == expected_output_shape, f"Output shape {output.shape} does not match expected shape {expected_output_shape}"

print("Multi-Head Attention output shape is correct!")


Queries (Q):
tensor([[[0.3103, 0.5171, 0.1094, 0.4216, 0.9282, 0.0211, 0.4052, 0.5655],
         [0.3803, 0.6952, 0.8784, 0.7460, 0.5943, 0.2777, 0.2367, 0.2167],
         [0.5384, 0.9434, 0.1484, 0.4566, 0.9863, 0.4564, 0.8737, 0.6148],
         [0.0328, 0.8144, 0.7495, 0.7358, 0.2058, 0.6979, 0.0830, 0.9954],
         [0.3006, 0.5749, 0.0427, 0.1986, 0.9882, 0.1850, 0.0674, 0.4321]],

        [[0.1207, 0.8643, 0.0437, 0.8941, 0.6642, 0.7079, 0.2138, 0.8552],
         [0.4074, 0.4329, 0.4630, 0.3193, 0.8165, 0.7253, 0.6936, 0.8091],
         [0.3120, 0.2342, 0.4066, 0.9199, 0.6162, 0.2377, 0.1977, 0.3163],
         [0.1416, 0.7976, 0.7212, 0.3243, 0.2996, 0.2788, 0.6903, 0.7252],
         [0.5932, 0.4542, 0.0579, 0.5437, 0.3136, 0.9310, 0.9686, 0.9712]]])

Keys (K):
tensor([[[0.1532, 0.4731, 0.3110, 0.3604, 0.4949, 0.7306, 0.9031, 0.8793],
         [0.5894, 0.0083, 0.8383, 0.0659, 0.5458, 0.5877, 0.5884, 0.1184],
         [0.6591, 0.7337, 0.1310, 0.9161, 0.2011, 0.5538, 0.2587, 0.5992

Algorithm 6: Layer Normalization

In [2]:
import torch

class LayerNorm(torch.nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.scale = torch.nn.Parameter(torch.ones(d_model))
        self.shift = torch.nn.Parameter(torch.zeros(d_model))
        self.eps = eps
    
    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.scale * (x - mean) / (std + self.eps) + self.shift

# Testing the LayerNorm class

# Create a sample input tensor
batch_size = 2
seq_len = 5
d_model = 8

# Random input tensor for testing
x = torch.rand(batch_size, seq_len, d_model)

# Instantiate the LayerNorm class
layer_norm = LayerNorm(d_model)

# Forward pass
output = layer_norm(x)

# Print the results
print("Input (x):")
print(x)
print("\nLayerNorm Output:")
print(output)

# Check the shape of the output
expected_output_shape = (batch_size, seq_len, d_model)
assert output.shape == expected_output_shape, f"Output shape {output.shape} does not match expected shape {expected_output_shape}"

# Check if the mean of the normalized output is close to zero and the standard deviation is close to one
output_mean = output.mean(dim=-1)
output_std = output.std(dim=-1)

print("\nOutput Mean (per feature):")
print(output_mean)
print("\nOutput Standard Deviation (per feature):")
print(output_std)

# Verify that the mean is close to zero and the standard deviation is close to one
mean_close_to_zero = torch.allclose(output_mean, torch.zeros_like(output_mean), atol=1e-5)
std_close_to_one = torch.allclose(output_std, torch.ones_like(output_std), atol=1e-5)

if not mean_close_to_zero:
    print("Mean is not close to zero.")
if not std_close_to_one:
    print("Standard deviation is not close to one.")

assert mean_close_to_zero, "Mean of the normalized output is not close to zero"
assert std_close_to_one, "Standard deviation of the normalized output is not close to one"

print("LayerNorm output is correct!")


Input (x):
tensor([[[0.6730, 0.0393, 0.2422, 0.3432, 0.8349, 0.2660, 0.5552, 0.0876],
         [0.2978, 0.6528, 0.6984, 0.2043, 0.7365, 0.2762, 0.3128, 0.6423],
         [0.1707, 0.7318, 0.6638, 0.8711, 0.9573, 0.6590, 0.5851, 0.9833],
         [0.9339, 0.1842, 0.8975, 0.5075, 0.7982, 0.0733, 0.0222, 0.0952],
         [0.2145, 0.9904, 0.2172, 0.1104, 0.2826, 0.3466, 0.0355, 0.0180]],

        [[0.5575, 0.6447, 0.3804, 0.0915, 0.0267, 0.6132, 0.3795, 0.1032],
         [0.9970, 0.2188, 0.1447, 0.8744, 0.1595, 0.2305, 0.5848, 0.9647],
         [0.3207, 0.7426, 0.8021, 0.9830, 0.8479, 0.6076, 0.9073, 0.4856],
         [0.0055, 0.1092, 0.0075, 0.5047, 0.8556, 0.3745, 0.2527, 0.0981],
         [0.2415, 0.0697, 0.3685, 0.2235, 0.9081, 0.6701, 0.9360, 0.8069]]])

LayerNorm Output:
tensor([[[ 1.0367, -1.2069, -0.4886, -0.1308,  1.6100, -0.4041,  0.6198,
          -1.0360],
         [-0.8063,  0.7854,  0.9895, -1.2251,  1.1604, -0.9032, -0.7388,
           0.7382],
         [-2.0467,  0.1118, -0

Algorithm 7: Un-Embedding


In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn

class Unembedding(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Unembedding, self).__init__()
        self.linear = nn.Linear(d_model, vocab_size)
    
    def forward(self, x):
        return F.log_softmax(self.linear(x), dim=-1)

# Testing the Unembedding class

# Define model param
d_model = 8      # Dimension of the model
vocab_size = 10  # Size of the vocabulary

# Instantiate the Unembedding class
unembedding = Unembedding(d_model, vocab_size)

# Create a sample input tensor
batch_size = 2
seq_len = 5
input_tensor = torch.rand(batch_size, seq_len, d_model)

# Forward pass to get the output
output = unembedding(input_tensor)

# Print the results
print("Input Tensor:")
print(input_tensor)
print("\nUnembedding Output:")
print(output)

# Check the shape of the output
expected_output_shape = (batch_size, seq_len, vocab_size)
assert output.shape == expected_output_shape, f"Output shape {output.shape} does not match expected shape {expected_output_shape}"

print("Unembedding output shape is correct!")


Input Tensor:
tensor([[[0.8037, 0.5185, 0.4576, 0.8712, 0.0828, 0.5390, 0.0203, 0.2325],
         [0.0614, 0.5910, 0.5577, 0.7081, 0.9924, 0.0757, 0.2376, 0.9929],
         [0.7960, 0.0503, 0.9653, 0.3061, 0.2017, 0.1018, 0.7190, 0.7597],
         [0.1039, 0.2429, 0.5565, 0.7926, 0.0304, 0.1916, 0.7768, 0.9296],
         [0.5250, 0.1673, 0.3328, 0.8441, 0.7479, 0.5023, 0.0496, 0.6746]],

        [[0.5939, 0.5569, 0.1761, 0.9483, 0.6539, 0.3119, 0.5168, 0.5212],
         [0.6581, 0.6763, 0.4474, 0.3233, 0.2153, 0.2479, 0.4800, 0.2957],
         [0.6499, 0.3989, 0.1904, 0.6794, 0.9713, 0.4954, 0.1838, 0.4393],
         [0.9507, 0.8477, 0.7939, 0.3968, 0.4259, 0.9036, 0.8160, 0.6578],
         [0.5237, 0.7211, 0.5870, 0.2909, 0.3612, 0.7647, 0.2361, 0.0112]]])

Unembedding Output:
tensor([[[-2.4364, -2.4295, -2.0336, -1.9029, -2.3706, -2.3000, -2.7691,
          -2.5996, -1.9223, -2.6960],
         [-2.2849, -2.2570, -1.9966, -2.0270, -2.7172, -2.5164, -2.4518,
          -2.3075, -1.9774,

Algorithm 8 Encoder Decoder Transformer Forward Pass

In [9]:
import torch
import torch.nn as nn

# Define a mock Encoder and Decoder for testing purposes
class MockEncoder(nn.Module):
    def __init__(self):
        super(MockEncoder, self).__init__()
    
    def forward(self, src):
        # Dummy implementation
        return src

class MockDecoder(nn.Module):
    def __init__(self):
        super(MockDecoder, self).__init__()
    
    def forward(self, tgt, memory):
        # Dummy implementation
        return tgt

# Define your TransformerED model
class TransformerED(torch.nn.Module):
    def __init__(self, encoder, decoder):
        super(TransformerED, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, tgt):
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        return output

# Test your TransformerED model
def test_transformer_ed():
    # Create mock encoder and decoder
    encoder = MockEncoder()
    decoder = MockDecoder()
    
    # Initialize TransformerED model with mock encoder and decoder
    model = TransformerED(encoder, decoder)
    
    # Create dummy inputs
    src = torch.randn(3, 10, 64)  # Batch size 3, sequence length 10, embedding size 64
    tgt = torch.randn(3, 5, 64)   # Batch size 3, sequence length 5, embedding size 64
    
    # Pass inputs through the model
    output = model(src, tgt)
    
    # Assert output shape or any other relevant test
    assert output.shape == tgt.shape, f"Expected output shape {tgt.shape}, but got {output.shape}"
    
    print("Transformer model test passed.")

# Run the test
test_transformer_ed()


Transformer model test passed.


Algorithm 9 Encoder Only Transformer Forward Pass

In [3]:
import torch
import torch.nn as nn

# Define a mock encoder for testing
class MockEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(MockEncoder, self).__init__()
        self.linear = nn.Linear(input_dim, hidden_dim)
    
    def forward(self, x):
        return self.linear(x)

# Define the BERT class
class BERT(nn.Module):
    def __init__(self, encoder):
        super(BERT, self).__init__()
        self.encoder = encoder
    
    def forward(self, input_seq):
        output = self.encoder(input_seq)
        return output

# Testing the BERT class

# Create a sample input sequence
batch_size = 2
seq_len = 5
input_dim = 8
hidden_dim = 16

# Random input sequence for testing
input_seq = torch.rand(batch_size, seq_len, input_dim)

# Initiate the mock encoder and the BERT class
mock_encoder = MockEncoder(input_dim, hidden_dim)
bert_model = BERT(mock_encoder)

# Forward pass
output = bert_model(input_seq)

# Print the results
print("Input Sequence:")
print(input_seq)
print("\nBERT Output:")
print(output)

# Check the shape of the output
expected_output_shape = (batch_size, seq_len, hidden_dim)
assert output.shape == expected_output_shape, f"Output shape {output.shape} does not match expected shape {expected_output_shape}"

print("BERT output shape is correct!")


Input Sequence:
tensor([[[0.0123, 0.0487, 0.6565, 0.9062, 0.9578, 0.9143, 0.9001, 0.9651],
         [0.9134, 0.5352, 0.6798, 0.7069, 0.9205, 0.4562, 0.6485, 0.5332],
         [0.8936, 0.6976, 0.5772, 0.4466, 0.4275, 0.5398, 0.7640, 0.5166],
         [0.1108, 0.9545, 0.7013, 0.7492, 0.1525, 0.2904, 0.2380, 0.8247],
         [0.9369, 0.6789, 0.2084, 0.0808, 0.5408, 0.0084, 0.1422, 0.8883]],

        [[0.0292, 0.8140, 0.1304, 0.9156, 0.9735, 0.2640, 0.1121, 0.0237],
         [0.6488, 0.2644, 0.1613, 0.6476, 0.3756, 0.2225, 0.0025, 0.7841],
         [0.5375, 0.9858, 0.9521, 0.7677, 0.2883, 0.7155, 0.8090, 0.7118],
         [0.6931, 0.2761, 0.8604, 0.5321, 0.1667, 0.0019, 0.1825, 0.1979],
         [0.1029, 0.5356, 0.0836, 0.6257, 0.3853, 0.7208, 0.5792, 0.4792]]])

BERT Output:
tensor([[[ 0.4235,  0.0565,  0.4561, -0.3044, -0.0441, -0.3350,  1.0221,
           0.5668, -0.4276, -0.2551, -0.7985,  0.5039,  0.5373,  0.0824,
          -0.0416, -0.4198],
         [ 0.0115,  0.0567,  0.1962, -0.4

Algorithm 10 Decoder Only Transformer Forward Pass

In [2]:
import torch
import torch.nn as nn

# Define a mock decoder for testing
class MockDecoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(MockDecoder, self).__init__()
        self.linear = nn.Linear(input_dim, hidden_dim)
    
    def forward(self, x):
        return self.linear(x)

# Define the GPT class
class GPT(nn.Module):
    def __init__(self, decoder):
        super(GPT, self).__init__()
        self.decoder = decoder
    
    def forward(self, input_seq):
        output = self.decoder(input_seq)
        return output

# Testing the GPT class

# Define model parame
input_dim = 8  # Dimension of the input sequence
hidden_dim = 16  # Dimension of the hidden layer in the mock decoder

# Instantiate the mock decoder and the GPT class
mock_decoder = MockDecoder(input_dim, hidden_dim)
gpt_model = GPT(mock_decoder)

# Create a sample input sequence
batch_size = 2
seq_len = 5
input_seq = torch.rand(batch_size, seq_len, input_dim)

# Forward pass to get the output
output = gpt_model(input_seq)

# Print the results
print("Input Sequence:")
print(input_seq)
print("\nGPT Output:")
print(output)

# Check the shape of the output
expected_output_shape = (batch_size, seq_len, hidden_dim)
assert output.shape == expected_output_shape, f"Output shape {output.shape} does not match expected shape {expected_output_shape}"

print("GPT output shape is correct!")


Input Sequence:
tensor([[[0.3108, 0.6857, 0.3788, 0.8841, 0.7553, 0.1768, 0.2128, 0.4038],
         [0.9586, 0.0653, 0.4560, 0.7207, 0.5442, 0.2432, 0.7239, 0.4283],
         [0.3311, 0.5133, 0.7004, 0.3936, 0.7205, 0.1069, 0.4762, 0.4689],
         [0.9218, 0.6598, 0.9886, 0.6121, 0.6697, 0.9771, 0.2160, 0.2816],
         [0.0665, 0.7581, 0.2189, 0.2522, 0.1204, 0.4010, 0.5582, 0.9560]],

        [[0.9547, 0.9027, 0.6145, 0.3295, 0.2217, 0.5260, 0.9754, 0.7816],
         [0.9498, 0.5644, 0.9866, 0.8768, 0.2674, 0.0650, 0.9938, 0.5533],
         [0.8005, 0.1937, 0.4296, 0.0428, 0.6288, 0.9077, 0.8755, 0.6899],
         [0.4976, 0.8016, 0.0941, 0.9904, 0.2570, 0.0965, 0.3930, 0.1906],
         [0.7597, 0.9085, 0.3927, 0.5773, 0.5488, 0.1074, 0.6197, 0.6681]]])

GPT Output:
tensor([[[-0.2901,  0.4368,  0.5083,  0.1564,  0.2875, -0.3014, -0.1412,
           0.1948,  0.3387, -0.2137,  0.4809,  0.0216, -0.1465, -0.6224,
          -0.0057, -0.7066],
         [ 0.0635,  0.2487,  0.5825,  0.08

Algorithm 11: Encoder Decoder Transformer Training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define a simple Transformer model for testing purposes
class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, nheads, num_layers):
        super(SimpleTransformer, self).__init__()
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.transformer = nn.Transformer(hidden_dim, nheads, num_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, tgt):
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        output = self.transformer(src, tgt)
        output = self.fc(output)
        return output

# Create a dummy DataLoader
batch_size = 2
seq_len = 5
vocab_size = 10
hidden_dim = 8
nheads = 2
num_layers = 2

# Generate random source and target sequences
src_data = torch.randint(0, vocab_size, (batch_size, seq_len))
tgt_data = torch.randint(0, vocab_size, (batch_size, seq_len))

dataset = TensorDataset(src_data, tgt_data)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the model, optimizer, and loss criterion
model = SimpleTransformer(vocab_size, vocab_size, hidden_dim, nheads, num_layers)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the training function
def train_transformerED(model, data_loader, optimizer, criterion, device):
    model.train()
    for src, tgt in data_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        loss = criterion(output.view(-1, output.size(-1)), tgt.view(-1))
        loss.backward()
        optimizer.step()
        print(f"Loss: {loss.item()}")

# Test the training function
train_transformerED(model, data_loader, optimizer, criterion, device)



Algorithm 12 Encoder Only Transformer Training

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

def train_BERT(model, data_loader, optimizer, criterion, device):
    model.train()
    for input_seq, labels in data_loader:
        input_seq, labels = input_seq.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(input_seq)
        loss = criterion(output.view(-1, output.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()
    return loss.item()  # Return the last batch loss for testing purposes

def test_train_BERT():
    print("Starting BERT training tests...")

    # Set up a simple mock model
    class MockBERT(nn.Module):
        def __init__(self, vocab_size, hidden_size):
            super(MockBERT, self).__init__()
            self.embedding = nn.Embedding(vocab_size, hidden_size)
            self.linear = nn.Linear(hidden_size, vocab_size)
        
        def forward(self, x):
            return self.linear(self.embedding(x).mean(dim=1))

    # Test parameters
    vocab_size = 1000
    hidden_size = 128
    seq_length = 10
    batch_size = 32
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and move to device
    model = MockBERT(vocab_size, hidden_size).to(device)
    
    # Create mock data
    input_data = torch.randint(0, vocab_size, (batch_size, seq_length))
    labels = torch.randint(0, vocab_size, (batch_size,))
    dataset = TensorDataset(input_data, labels)
    data_loader = DataLoader(dataset, batch_size=batch_size)

    # Initialize optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    # Test 1: Check if training runs without errors
    print("\nTest 1: Checking if training runs without errors")
    try:
        loss = train_BERT(model, data_loader, optimizer, criterion, device)
        print(f"PASS: Training completed successfully. Final batch loss: {loss}")
    except Exception as e:
        print(f"FAIL: Training raised an exception: {str(e)}")

    # Test 2: Check if loss decreases over multiple epochs
    print("\nTest 2: Checking if loss decreases over multiple epochs")
    initial_loss = train_BERT(model, data_loader, optimizer, criterion, device)
    for epoch in range(5):
        new_loss = train_BERT(model, data_loader, optimizer, criterion, device)
    
    if new_loss < initial_loss:
        print(f"PASS: Loss decreased from {initial_loss} to {new_loss}")
    else:
        print(f"FAIL: Loss did not decrease. Initial: {initial_loss}, Final: {new_loss}")

    # Test 3: Check if model parameters are updated
    print("\nTest 3: Checking if model parameters are updated")
    initial_params = [p.clone().detach() for p in model.parameters()]
    train_BERT(model, data_loader, optimizer, criterion, device)
    updated_params = list(model.parameters())
    
    params_changed = all(not torch.allclose(i, u) for i, u in zip(initial_params, updated_params))
    if params_changed:
        print("PASS: Model parameters were updated during training")
    else:
        print("FAIL: Model parameters did not change during training")

    # Test 4: Check if training works with different batch sizes
    print("\nTest 4: Checking if training works with different batch sizes")
    for test_batch_size in [1, 16, 64]:
        test_loader = DataLoader(dataset, batch_size=test_batch_size)
        try:
            train_BERT(model, test_loader, optimizer, criterion, device)
            print(f"PASS: Training successful with batch size {test_batch_size}")
        except Exception as e:
            print(f"FAIL: Training failed with batch size {test_batch_size}. Error: {str(e)}")

    print("\nAll tests completed.")

# Run the tests
test_train_BERT()

Starting BERT training tests...

Test 1: Checking if training runs without errors
PASS: Training completed successfully. Final batch loss: 6.924795627593994

Test 2: Checking if loss decreases over multiple epochs
PASS: Loss decreased from 6.878496170043945 to 6.647015571594238

Test 3: Checking if model parameters are updated
PASS: Model parameters were updated during training

Test 4: Checking if training works with different batch sizes
PASS: Training successful with batch size 1
PASS: Training successful with batch size 16
PASS: Training successful with batch size 64

All tests completed.


Algorithm 13: Decoder Only Transformer Training

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Define a simple mock model
class MockGPTModel(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(MockGPTModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.linear = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        return self.linear(embedded)

# Define the test function
def test_train_GPT():
    # Mock parameters
    vocab_size = 100
    hidden_size = 10
    seq_length = 5
    batch_size = 2
    
    # Create mock data
    input_data = torch.randint(0, vocab_size, (batch_size, seq_length))
    labels = torch.randint(0, vocab_size, (batch_size, seq_length))
    
    # Create DataLoader
    dataset = TensorDataset(input_data, labels)
    data_loader = DataLoader(dataset, batch_size=batch_size)
    
    # Create mock model, optimizer, and criterion
    model = MockGPTModel(vocab_size, hidden_size)
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()
    
    # Specify device (CPU for testing)
    device = torch.device('cpu')
    
    # Call the train_GPT function
    train_GPT(model, data_loader, optimizer, criterion, device)
    
    # Check if the model parameters have been updated
    for param in model.parameters():
        assert param.grad is not None, "Model parameters were not updated."

# Example train_GPT function
def train_GPT(model, data_loader, optimizer, criterion, device):
    model.train()
    for input_seq, labels in data_loader:
        input_seq, labels = input_seq.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(input_seq)
        loss = criterion(output.view(-1, output.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()

# Run the test
test_train_GPT()
print("Test passed!")



Test passed!


Algorithm 14: Decoder Only Transformer Model Inference

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Mock GPT model definition
class MockGPT(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(MockGPT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.fc = nn.Linear(d_model, vocab_size)
    
    def forward(self, input_seq):
        x = self.embedding(input_seq)
        x = self.fc(x)
        return x

# gpt_inference function definition
def gpt_inference(model, input_seq, max_len, temperature, device):
    model.eval()
    generated = input_seq.to(device)
    with torch.no_grad():
        for _ in range(max_len):
            output = model(generated)
            next_token_logits = output[:, -1, :] / temperature
            next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated

# Test function definition
def test_gpt_inference():
    print("Starting GPT inference tests...")

    # Test parameters
    vocab_size = 1000
    d_model = 128
    input_len = 5
    max_len = 10
    temperature = 1.0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and move to device
    model = MockGPT(vocab_size, d_model).to(device)
    
    # Create mock input
    input_seq = torch.randint(0, vocab_size, (1, input_len), device=device)
    
    # Run inference
    output = gpt_inference(model, input_seq, max_len, temperature, device)
    
    # Assertions to check output
    assert output.size(1) == input_len + max_len, f"Expected output length {input_len + max_len}, got {output.size(1)}"
    assert torch.all(output >= 0) and torch.all(output < vocab_size), "Output tokens should be within vocabulary range."

    print("All tests passed!")

# Run the test
test_gpt_inference()



Starting GPT inference tests...
All tests passed!


Algorithm 15: Encoder-Decoder Transformer Model Inference

In [7]:
import torch
import torch.nn as nn

def transformerED_inference(model, src, tgt_start_token, max_len, device):
    model.eval()
    src = src.to(device)
    tgt = torch.tensor([[tgt_start_token]], device=device)
    with torch.no_grad():
        memory, hidden_state = model.encode(src)
        for _ in range(max_len):
            output, hidden_state = model.decode(tgt, hidden_state)
            next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
            tgt = torch.cat((tgt, next_token), dim=1)
    return tgt

class MockTransformerED(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(MockTransformerED, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoder_lstm = nn.LSTM(d_model, d_model, batch_first=True)
        self.decoder_lstm = nn.LSTM(d_model, d_model, batch_first=True)
        self.decoder_fc = nn.Linear(d_model, vocab_size)
    
    def encode(self, src):
        src_emb = self.embedding(src)
        memory, hidden_state = self.encoder_lstm(src_emb)
        return memory, hidden_state
    
    def decode(self, tgt, hidden_state):
        tgt_emb = self.embedding(tgt)
        output, hidden_state = self.decoder_lstm(tgt_emb, hidden_state)
        output = self.decoder_fc(output)
        return output, hidden_state

def test_transformerED_inference():
    print("Starting Transformer Encoder-Decoder inference tests...")

    # Test parameters
    vocab_size = 1000
    d_model = 128
    src_len = 10
    max_len = 15
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Initialize model and move to device
    model = MockTransformerED(vocab_size, d_model).to(device)
    
    # Create mock input
    src = torch.randint(0, vocab_size, (1, src_len), device=device)
    tgt_start_token = vocab_size - 1  # Using last token as start token
    
    # Run inference
    output = transformerED_inference(model, src, tgt_start_token, max_len, device)
    
    # Assertions to check output
    assert output.size(1) == max_len + 1, f"Expected output length {max_len + 1}, got {output.size(1)}"
    assert output[0, 0].item() == tgt_start_token, "First token of output should be the start token."
    assert torch.all(output >= 0) and torch.all(output < vocab_size), "Output tokens should be within vocabulary range."

    print("All tests passed!")

# Run the test
test_transformerED_inference()


Starting Transformer Encoder-Decoder inference tests...
All tests passed!
