In [38]:
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [39]:
df = pd.read_parquet("./test-00000-of-00001.parquet")  # If using local file

In [40]:

english_sentences = []
deutsch_sentences = []
for i in range(len(df))[:100]:
    english = df.iloc[i].iloc[0]['en']
    deutsch = df.iloc[i].iloc[0]['de']
    english_sentences.append(english)
    deutsch_sentences.append(deutsch)

In [41]:
def BPETokenizer(corpus_sentences,vocab_size=30000):
    # Initialize a BPE tokenizer
    tokenizer = Tokenizer(BPE())

    # Set up a trainer with desired vocabulary size
    trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=2, special_tokens=["<unk>", "<pad>", "<bos>", "<eos>"])

    # Define a pre-tokenizer to split input text into words
    tokenizer.pre_tokenizer = Whitespace()

    # Tokenizer expects an iterator of strings
    tokenizer.train_from_iterator(corpus_sentences, trainer=trainer)
    tokenizer.enable_padding(length=GLOBALS['CONTEXT-SIZE'], pad_id=tokenizer.token_to_id("<pad>"), pad_token="<pad>")

    return tokenizer


GLOBALS = {
    "INPUT-VOCABULARY-SIZE" : 30_000, # number of accepted distinct INPUT tokens.
    "OUTPUT-VOCABULARY-SIZE" : 30_000, # number of accepted distinct OUTPUTtokens.
    'INPUT-EMBEDDING-DIMENSION' : 4096 ,# dimension of each embedding vector for a token,
    "CONTEXT-SIZE" : 512, # Fixed length of an input sequence
    "ATTENTION-HEAD-COUNT" : 4, # Number of attention heads to use. should properly divide input-embedding-dimension
    "FFN-HIDDEN-DIMENSION" : 4096 * 2, # Dimension of hidden layer in FFN ,Usually higher than Embedding Dimension.
    "ENCODER-BLOCK-COUNT" : 4, # Number of blocks of stacked encoders
}
english_encoder = BPETokenizer(english_sentences,GLOBALS["INPUT-VOCABULARY-SIZE"])
deutsch_encoder = BPETokenizer(deutsch_sentences,GLOBALS["OUTPUT-VOCABULARY-SIZE"])



In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

##########################################
# Input Embedding Layer
##########################################
class InputEmbeddingLayer(nn.Module):
    def __init__(self, vocabulary_size, embedding_vector_dimension):
        super(InputEmbeddingLayer, self).__init__()
        # nn.Embedding registers the lookup table as a parameter
        self.embedding = nn.Embedding(vocabulary_size, embedding_vector_dimension)
    
    def forward(self, tokens):
        # Output shape : (seq_len, embedding_dim)
        return self.embedding(tokens)

##########################################
# Positional Encoding Layer
##########################################
class PositionalEncodingLayer(nn.Module):
    def __init__(self, sequence_length, embedding_vector_dimension):
        super(PositionalEncodingLayer, self).__init__()
        # Precompute positional encoding and register as buffer so it's moved to GPU with the model.
        encoding = torch.zeros(sequence_length, embedding_vector_dimension)
        position = torch.arange(0, sequence_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_vector_dimension, 2, dtype=torch.float) *
                             -(math.log(10000.0) / embedding_vector_dimension))
        encoding[:, 0::2] = torch.sin(position * div_term)
        encoding[:, 1::2] = torch.cos(position * div_term)
        # Register as a buffer (non-trainable but saved with the model)
        self.register_buffer('encoding', encoding)

    def forward(self, embeddings):
        # embeddings shape: (seq_len, embedding_dim) or broadcastable shape.
        return embeddings + self.encoding

##########################################
# Multi-Head Attention
##########################################
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # d_model is the embedding dimension; must be divisible by num_heads.
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        
        # Linear projections for Q, K, V. Bias is often set to False.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        # Final output projection.
        self.W_o = nn.Linear(d_model, d_model, bias=False)
    
    def split_heads(self, x):
        # x shape: (batch, seq_len, d_model)
        batch_size, seq_len, _ = x.size()
        # Reshape to (batch, seq_len, num_heads, head_dim)
        x = x.view(batch_size, seq_len, self.num_heads, self.head_dim)
        # Transpose to (batch, num_heads, seq_len, head_dim)
        return x.transpose(1, 2)
    
    def combine_heads(self, x):
        # x shape: (batch, num_heads, seq_len, head_dim)
        batch_size, num_heads, seq_len, head_dim = x.size()
        # Transpose to (batch, seq_len, num_heads, head_dim) then reshape to (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous()  # ensure contiguous memory
        return x.view(batch_size, seq_len, self.d_model)
    
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Q, K, V shape: (batch, num_heads, seq_len, head_dim)
        d_k = Q.size(-1)
        # Compute attention scores: (batch, num_heads, seq_len, seq_len)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        # Softmax over the last dimension (keys)
        attention_weights = F.softmax(scores, dim=-1)
        output = torch.matmul(attention_weights, V)  # (batch, num_heads, seq_len, head_dim)
        return attention_weights, output
    
    def forward(self, x, mask=None):
        # x shape: (batch, seq_len, d_model)
        Q = self.W_q(x)
        K = self.W_k(x)
        V = self.W_v(x)
        # Split into heads.
        Q = self.split_heads(Q)  # (batch, num_heads, seq_len, head_dim)
        K = self.split_heads(K)
        V = self.split_heads(V)
        # Compute attention.
        attention_weights, attention_output = self.scaled_dot_product_attention(Q, K, V, mask)
        # Combine heads back.
        attention_output = self.combine_heads(attention_output)  # (batch, seq_len, d_model)
        # Final linear projection.
        output = self.W_o(attention_output)
        return attention_weights, output

##########################################
# Add & Norm Layer (Residual Connection + LayerNorm)
##########################################
class AddAndNorm(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super(AddAndNorm, self).__init__()
        self.norm = nn.LayerNorm(d_model)  # Normalizes over last dimension (d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer_output):
        # Apply dropout to sublayer output, then add the residual (x) and normalize.
        return self.norm(x + self.dropout(sublayer_output))

##########################################
# Position-wise Feed Forward Network (FFN)
##########################################
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Apply two linear transformations with ReLU activation and dropout in between.
        return self.linear2(self.dropout(self.relu(self.linear1(x))))

##########################################
# Encoder Block
##########################################
class EncoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderBlock, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.add_and_norm1 = AddAndNorm(d_model, dropout)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.add_and_norm2 = AddAndNorm(d_model, dropout)

    def forward(self, x, mask=None):
        # Multi-head attention sublayer.
        attention_weights, attention_output = self.mha(x, mask)
        x = self.add_and_norm1(x, attention_output)
        # Position-wise FFN sublayer.
        ffn_output = self.ffn(x)
        x = self.add_and_norm2(x, ffn_output)
        return x



##########################################
# Encoder (Stack of Encoder Blocks)
##########################################
class Encoder(nn.Module):
    def __init__(self, num_blocks, d_model, num_heads, d_ff, dropout=0.1):
        super(Encoder, self).__init__()
        # Using nn.ModuleList ensures submodules are registered.
        self.blocks = nn.ModuleList([
            EncoderBlock(d_model, num_heads, d_ff, dropout) for _ in range(num_blocks)
        ])

    def forward(self, x, mask=None):
        # Pass input through each encoder block sequentially.
        for block in self.blocks:
            x = block(x, mask)
        return x
    

In [None]:



import torch

vocab_size = GLOBALS['INPUT-VOCABULARY-SIZE']
d_model = GLOBALS['INPUT-EMBEDDING-DIMENSION']
seq_len = GLOBALS['CONTEXT-SIZE']
input_embedding_layer = InputEmbeddingLayer(vocab_size,d_model)
encoder_block_count = GLOBALS['ENCODER-BLOCK-COUNT']
num_heads = GLOBALS['ATTENTION-HEAD-COUNT']
ffn_hidden_dimension = GLOBALS['FFN-HIDDEN-DIMENSION']
positional_encoding_layer = PositionalEncodingLayer(seq_len,d_model)

encoder = Encoder(encoder_block_count,d_model,num_heads,ffn_hidden_dimension)

# Create a causal mask with shape (1, 1, seq_len, seq_len)
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)


for english_sentence,deutsch_sentence in zip(english_sentences,deutsch_sentences):
    
    english_encoding = english_encoder.encode(english_sentence).ids
    deutsch_encoding = deutsch_encoder.encode(deutsch_sentence).ids
    

    input_embeddings = input_embedding_layer.forward(torch.LongTensor(english_encoding).reshape(1,-1))
    
    input_embeddings = positional_encoding_layer.forward(input_embeddings)
    
    x = input_embeddings
    encoder.forward(x)
    print(x)
    
    
    # print("English: ",english_encoding)
    # print("Deutsch: ",deutsch_encoding)
    break



tensor([[[-1.0169,  1.3837,  0.1888,  ...,  0.5041, -0.2973, -0.0765],
         [-0.1123,  1.3558,  1.4663,  ...,  1.2477,  1.4175,  1.9930],
         [ 1.5835, -1.4333,  1.1004,  ...,  0.6044, -0.9925,  0.6191],
         ...,
         [ 0.4850,  0.5302, -1.2330,  ...,  0.7893, -0.5802,  0.7583],
         [ 1.2964,  0.0193, -1.3790,  ...,  0.7893, -0.5801,  0.7583],
         [ 1.3049, -0.9395, -0.6666,  ...,  0.7893, -0.5800,  0.7583]]],
       grad_fn=<AddBackward0>)
