In [143]:
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [144]:
# Load the training corpus from a Parquet file in the working directory.
# The following cell reads the first column of every row as a mapping with
# 'en' and 'de' entries.
df = pd.read_parquet("train.parquet")  # If using local file

In [145]:

# Number of leading rows used as the (small) training corpus for the tokenizers.
NUM_SENTENCE_PAIRS = 100

# Every cell of the first column holds one translation pair with 'en' and 'de'
# entries. Slice that column once instead of building a row Series per lookup;
# slicing also copes with frames shorter than NUM_SENTENCE_PAIRS.
translation_pairs = df.iloc[:NUM_SENTENCE_PAIRS, 0]
english_sentences = [pair['en'] for pair in translation_pairs]
deutsch_sentences = [pair['de'] for pair in translation_pairs]

In [146]:
def BPETokenizer(corpus_sentences,vocab_size=30000,context_size=None):
    """Train a byte-pair-encoding tokenizer on ``corpus_sentences``.

    Args:
        corpus_sentences: Iterable of strings used as the training corpus.
        vocab_size: Upper bound on the learned vocabulary, special tokens included.
        context_size: Fixed length of every encoded sequence. Defaults to
            ``GLOBALS['CONTEXT-SIZE']`` when omitted.

    Returns:
        A trained ``tokenizers.Tokenizer`` whose encodings are truncated and
        padded (with ``<pad>``) to exactly ``context_size`` ids.
    """
    if context_size is None:
        # Looked up at call time, so GLOBALS only has to exist when the function runs.
        context_size = GLOBALS['CONTEXT-SIZE']

    unknown_token = "<unk>"
    padding_token = "<pad>"

    # Tell the model which token stands for unknown symbols; reserving "<unk>" in
    # the trainer alone does not make the model emit it at encoding time.
    tokenizer = Tokenizer(BPE(unk_token=unknown_token))

    # Set up a trainer with desired vocabulary size
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[unknown_token, padding_token, "<bos>", "<eos>"],
    )

    # Define a pre-tokenizer to split input text into words
    tokenizer.pre_tokenizer = Whitespace()

    # Tokenizer expects an iterator of strings
    tokenizer.train_from_iterator(corpus_sentences, trainer=trainer)

    # Padding only lengthens short sequences; truncation is needed as well so that
    # long sentences do not exceed the fixed context size.
    tokenizer.enable_truncation(max_length=context_size)
    tokenizer.enable_padding(
        length=context_size,
        pad_id=tokenizer.token_to_id(padding_token),
        pad_token=padding_token,
    )

    return tokenizer


# Hyper-parameters shared by the tokenizers and the embedding layers.
GLOBALS = {
    # Number of distinct INPUT tokens accepted.
    "INPUT-VOCABULARY-SIZE": 30_000,
    # Number of distinct OUTPUT tokens accepted.
    "OUTPUT-VOCABULARY-SIZE": 30_000,
    # Width of the embedding vector produced for each token.
    "INPUT-EMBEDDING-DIMENSION": 4096,
    # Fixed length of an input sequence.
    "CONTEXT-SIZE": 512,
}

# One tokenizer per language, each trained on its own side of the corpus.
english_encoder = BPETokenizer(
    english_sentences,
    vocab_size=GLOBALS["INPUT-VOCABULARY-SIZE"],
)
deutsch_encoder = BPETokenizer(
    deutsch_sentences,
    vocab_size=GLOBALS["OUTPUT-VOCABULARY-SIZE"],
)



In [153]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class InputEmbeddingLayer(nn.Module):
    """Learnable lookup table that turns token ids into embedding vectors."""

    def __init__(self,vocabulary_size,embedding_vector_dimension):
        super().__init__()

        # One trainable row per vocabulary entry, each row being
        # `embedding_vector_dimension` values wide.
        self.embedding = nn.Embedding(
            num_embeddings=vocabulary_size,
            embedding_dim=embedding_vector_dimension,
        )

    def forward(self,tokens):
        '''
        Look up the embedding vector of every id in `tokens`.

        Output shape : (len(tokens),embedding_dim)
        '''
        embedded_tokens = self.embedding(tokens)
        return embedded_tokens
    

class PositionalEncodingLayer(nn.Module):
    """Adds fixed sinusoidal position information to token embeddings.

    A table of shape (sequence_length, embedding_vector_dimension) is computed
    once: even columns hold sin(position * div_term) and odd columns hold
    cos(position * div_term). The table is not trainable.
    """

    def __init__(self,sequence_length,embedding_vector_dimension):
        """
        sequence_length : number of positions (rows) in the table, i.e. the
                          longest sequence that can be encoded.
        embedding_vector_dimension : width of each embedding vector; odd widths
                          are supported.
        """
        super().__init__()

        encoding = torch.zeros(sequence_length, embedding_vector_dimension)

        # Create a position tensor
        position = torch.arange(0, sequence_length, dtype=torch.float).unsqueeze(1)

        # Compute the div_term
        div_term = torch.exp(torch.arange(0, embedding_vector_dimension, 2, dtype=torch.float) *
                            -(math.log(10000.0) / embedding_vector_dimension))

        # Apply the sinusoidal functions
        angles = position * div_term
        encoding[:, 0::2] = torch.sin(angles)
        # For an odd width there is one cosine column fewer than sine columns,
        # so drop the surplus angle column before assigning.
        encoding[:, 1::2] = torch.cos(angles[:, :embedding_vector_dimension // 2])

        # A buffer follows the module through .to()/.cuda(); persistent=False keeps
        # it out of the state_dict, exactly as the former plain attribute was.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self,embeddings):
        """
        embeddings : tensor of shape (sequence, N_embed_dim) or
                     (batch, sequence, N_embed_dim).

        Returns the embeddings with the encoding of the first `sequence`
        positions added; the shape is unchanged.
        """
        if embeddings.dim() < 2:
            # A lone vector has no sequence axis; broadcast it against every position.
            return embeddings + self.encoding

        # Use only as many positions as the input has tokens, so sequences
        # shorter than the table are accepted.
        token_count = embeddings.size(-2)
        return embeddings + self.encoding[:token_count]
        
        

In [154]:



import torch

input_embedding_layer = InputEmbeddingLayer(GLOBALS['INPUT-VOCABULARY-SIZE'],GLOBALS['INPUT-EMBEDDING-DIMENSION'])

positional_encoding_layer = PositionalEncodingLayer(GLOBALS['CONTEXT-SIZE'],GLOBALS['INPUT-EMBEDDING-DIMENSION'])

# Smoke-test the embedding pipeline on the first sentence pair only.
first_pair = next(zip(english_sentences, deutsch_sentences), None)
if first_pair is not None:
    english_sentence, deutsch_sentence = first_pair

    # Token ids for both languages; the tokenizers pad them to CONTEXT-SIZE.
    english_encoding = english_encoder.encode(english_sentence).ids
    deutsch_encoding = deutsch_encoder.encode(deutsch_sentence).ids

    # Call the modules themselves rather than .forward(): Module.__call__ runs
    # any registered hooks, which a direct .forward() call silently skips.
    input_embeddings = input_embedding_layer(torch.LongTensor(english_encoding))
    input_embeddings = positional_encoding_layer(input_embeddings)
    print(input_embeddings)



tensor([[-0.0745,  0.0561,  1.1936,  ..., -0.0871, -0.8049, -1.3031],
        [ 0.2833, -0.7635,  3.2787,  ...,  1.4501,  1.4383, -0.7224],
        [ 0.6348, -0.2284,  1.2468,  ...,  0.0390,  0.3357,  1.4654],
        ...,
        [-0.8920,  1.7560, -2.1955,  ...,  0.5288,  0.2063,  0.7464],
        [-0.0806,  1.2451, -2.3415,  ...,  0.5288,  0.2064,  0.7464],
        [-0.0721,  0.2863, -1.6291,  ...,  0.5288,  0.2065,  0.7464]],
       grad_fn=<AddBackward0>)
