Notebook for experimenting with what has been implemented so far: loading the tokenizer and building token-budgeted dataloaders.

In [1]:
# Enable IPython's autoreload extension so edits to imported local modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

In [2]:
from tokenizers import Tokenizer

# Location of the serialized tokenizer, relative to the working directory.
TOKENIZER_PATH = "tokenizers/de_en_tokenizer.json"

# Restore the tokenizer from its JSON file.
de_en_tokenizer = Tokenizer.from_file(TOKENIZER_PATH)

# Last expression of the cell: shows the vocabulary size as the cell output.
de_en_tokenizer.get_vocab_size()

37000

In [3]:
import importlib
from datasets import Dataset
import dnlp2025.dataset  # your local module
# Reload the submodule that is actually used below. Reloading only the parent
# package re-runs its __init__ but does not re-execute the already-imported
# `dnlp2025.dataset` module, so edits to it would not be picked up.
importlib.reload(dnlp2025.dataset)

def get_mock_split_small() -> Dataset:
    """Build a tiny in-memory German-English translation split.

    Every row uses the ``{"translation": {"de": ..., "en": ...}}`` layout.
    The sentences differ in length, which makes it easy to see how examples
    are grouped into batches.

    Returns:
        A ``Dataset`` holding seven translation pairs.
    """
    # (German, English) sentence pairs, kept in their original order.
    sentence_pairs = [
        ("Guten morgen.", "Good morning."),
        ("Guten tag.", "Good day."),
        # NOTE(review): "Wilkommen." is a misspelling of "Willkommen."; it is
        # left unchanged because the recorded outputs were produced from it.
        ("Wilkommen.", "Welcome."),
        ("Wie geht es Ihnen?", "How are you?"),
        ("Das ist ein Buch.", "That is a book."),
        ("Ich denke, dass ich einen Kaffee brauche.", "I think I need a coffee."),
        # Repaired: the word "müde" had been corrupted to "m dde".
        ("Ich bin ein bisschen müde.", "I am a bit tired."),
    ]
    return Dataset.from_list([
        {"translation": {"de": german, "en": english}}
        for german, english in sentence_pairs
    ])

# get_mock_split_small() already returns a Dataset, so use it directly rather
# than passing it through Dataset.from_list a second time.
mock_split = get_mock_split_small()

# NOTE: despite the "train" in the variable name, this loader is created with
# the "test" split setting so that no batch is dropped.
de_en_train_dataloader = dnlp2025.dataset.create_dataloader(
    mock_split,
    "test", # If set to "train", then the last batch will be removed
    de_en_tokenizer,
    max_tokens_per_batch=20,
    shuffle=False # Shuffling disabled to see how the batches are created
)


Tokenizing dataset:   0%|          | 0/7 [00:00<?, ? examples/s]



In [4]:
# Legend for the special ids. In the printed batches every encoder sequence
# starts with 1 and ends with 2 before the 0-padding, so BOS is 1 and EOS is 2
# (the previous legend had the two swapped).
# NOTE(review): ids are hard-coded here; confirm against the tokenizer config.
print("Special token ids: Padding - 0, BOS - 1, EOS - 2")

# enumerate(..., start=1) numbers the batches from 1 without a manual counter.
for batch_num, batch in enumerate(de_en_train_dataloader, start=1):
    print("\nBatch: ", batch_num)
    print("Encoder input ids: ")
    print(batch['encoder_input_ids'])

    # Decoder inputs are expected to carry BOS but no trailing EOS.
    print("\nDecoder input ids (should strip EOS token): ")
    print(batch['decoder_input_ids'])

    # Labels are expected to carry EOS but no leading BOS.
    print("\nLabel ids (should strip BOS token): ")
    print(batch['labels'])

Special token ids: Padding - 0, EOS - 1, BOS - 2

Batch:  1
Encoder input ids: 
tensor([[    1, 14070,    16,     2,     0,     0],
        [    1, 12492, 10038,    16,     2,     0],
        [    1, 12492,  4328,    16,     2,     0],
        [    1,  7815,  3862,  3974,    33,     2]])

Decoder input ids (should strip EOS token): 
tensor([[    1, 12905,  4391,    16,     0,     0],
        [    1, 34138,  9304,    16,     0,     0],
        [    1, 34138,  6169,    16,     0,     0],
        [    1,  5118,  5255,  3769,  4755,    33]])

Label ids (should strip BOS token): 
tensor([[12905,  4391,    16,     2,     0,     0],
        [34138,  9304,    16,     2,     0,     0],
        [34138,  6169,    16,     2,     0,     0],
        [ 5118,  5255,  3769,  4755,    33,     2]])

Batch:  2
Encoder input ids: 
tensor([[    1,  5336,  3772,    67,  5689,    16,     2,     0],
        [    1,    43,  3811,    67,  5760, 28026,    16,     2]])

Decoder input ids (should strip EOS token): 

In [5]:
from src.dnlp2025.download_datasets import download_wmt14_de_en

# Obtain the German-English dataset through the project's download helper.
de_en_dataset = download_wmt14_de_en()

# Carve a 10% sample out of the training split; the fixed seed keeps the
# selection the same on every run.
train_portion = de_en_dataset['train']
subsample_splits = train_portion.train_test_split(train_size=0.1, seed=42)
small_dataset = subsample_splits['train']


In [6]:
# Keyword options for the loader over the sampled training data.
full_loader_options = {
    "max_tokens_per_batch": 25000,
    "shuffle": True,
    "num_workers": 6,
}

# Same call as for the mock split, but with the "train" setting and the
# options collected above.
de_en_full_dataloader = dnlp2025.dataset.create_dataloader(
    small_dataset,
    "train",
    de_en_tokenizer,
    **full_loader_options,
)

Tokenizing dataset (num_proc=10):   0%|          | 0/450878 [00:00<?, ? examples/s]



In [7]:
# Show the length of the loader built above (presumably the number of
# batches — confirm against what create_dataloader returns).
len(de_en_full_dataloader)

571

In [8]:
# Peek at a single batch: print the first item the loader yields, then stop
# iterating immediately.
for batch in de_en_full_dataloader:
    print(batch)
    break


{'encoder_input_ids': tensor([[    1,  4505,  4006,  ...,  7538,    16,     2],
        [    1,    41,  7410,  ..., 34734,    16,     2],
        [    1,  3869,  3971,  ...,  5665,    16,     2],
        ...,
        [    1,  4179,  3772,  ...,  3998,    16,     2],
        [    1, 13791,  3790,  ..., 10277,    16,     2],
        [    1,  8821,  3801,  ...,  6806,    16,     2]]), 'decoder_input_ids': tensor([[    1,  4527, 16724,  ...,     0,     0,     0],
        [    1,  4151,    41,  ...,     0,     0,     0],
        [    1,  3869,  5312,  ...,     0,     0,     0],
        ...,
        [    1, 18205,  4031,  ...,     0,     0,     0],
        [    1,  3972, 18404,  ...,     0,     0,     0],
        [    1, 10236, 15128,  ...,     0,     0,     0]]), 'labels': tensor([[ 4527, 16724,  4034,  ...,     0,     0,     0],
        [ 4151,    41,  7410,  ...,     0,     0,     0],
        [ 3869,  5312, 15210,  ...,     0,     0,     0],
        ...,
        [18205,  4031,  4354,  ...