In [173]:
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [174]:
# Load the training split from a parquet file located next to the notebook.
df = pd.read_parquet(path="./train.parquet")

In [175]:

# Collect the first MAX_SENTENCES English/German sentence pairs.
# Every cell of the first column is indexed with 'en' and 'de' below, so each
# cell is expected to be a mapping holding both translations of one sentence.
MAX_SENTENCES = 100

# Slice the column once instead of materialising a full row Series twice per
# iteration (df.iloc[i] builds a new Series for every call).
translation_pairs = df.iloc[:MAX_SENTENCES, 0]

english_sentences = [pair['en'] for pair in translation_pairs]
deutsch_sentences = [pair['de'] for pair in translation_pairs]

In [176]:
# Hyper-parameters shared by the cells of this notebook.
GLOBALS = {
    # Number of accepted distinct INPUT tokens.
    "INPUT-VOCABULARY-SIZE": 30_000,
    # Number of accepted distinct OUTPUT tokens.
    "OUTPUT-VOCABULARY-SIZE": 30_000,
    # Dimension of each token's embedding vector.
    "d_model": 4096,
    # Fixed length every encoded sequence is padded to by the tokenizers.
    "INPUT-SEQUENCE-LENGTH": 512,
}

In [None]:
def BPETokenizer(corpus_sentences,vocab_size=30000):
    """Train a byte-pair-encoding tokenizer on the given sentences.

    Args:
        corpus_sentences: Iterable of strings used as the training corpus.
        vocab_size: Upper bound on the size of the learned vocabulary,
            special tokens included.

    Returns:
        A trained ``tokenizers.Tokenizer`` with whitespace pre-tokenization
        and padding enabled up to ``GLOBALS['INPUT-SEQUENCE-LENGTH']`` ids.
    """
    unknown_token = "<unk>"
    padding_token = "<pad>"

    # Tell the model which token stands for symbols outside the vocabulary.
    # Reserving "<unk>" in the trainer alone does not make the model emit it.
    tokenizer = Tokenizer(BPE(unk_token=unknown_token))

    # Split raw text into words before merges are learned / applied.
    tokenizer.pre_tokenizer = Whitespace()

    # Special tokens are listed first so they are reserved in the vocabulary;
    # merges are only learned for pairs seen at least `min_frequency` times.
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[unknown_token, padding_token, "<bos>", "<eos>"],
    )

    # The tokenizer expects an iterator of strings.
    tokenizer.train_from_iterator(corpus_sentences, trainer=trainer)

    # Pad every encoding to the notebook-wide fixed sequence length.
    # NOTE(review): only padding is enabled here; sequences longer than this
    # length are presumably returned at full length -- confirm whether
    # truncation should be enabled as well.
    tokenizer.enable_padding(
        length=GLOBALS['INPUT-SEQUENCE-LENGTH'],
        pad_id=tokenizer.token_to_id(padding_token),
        pad_token=padding_token,
    )

    return tokenizer


# Train one tokenizer per language, each with its own vocabulary budget.
english_encoder = BPETokenizer(
    english_sentences,
    vocab_size=GLOBALS["INPUT-VOCABULARY-SIZE"],
)
deutsch_encoder = BPETokenizer(
    deutsch_sentences,
    vocab_size=GLOBALS["OUTPUT-VOCABULARY-SIZE"],
)


In [183]:

# Sanity check: encode the first English sentence and print its token ids.
# Padding is enabled on the tokenizer, so the printed list includes pad ids.
if english_sentences:
    english_encoding = english_encoder.encode(english_sentences[0]).ids
    print(english_encoding)

[135, 8, 66, 54, 83, 97, 123, 99, 8, 66, 54, 92, 205, 117, 94, 191, 48, 81, 126, 84, 11, 8, 256, 84, 165, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 