Notebook for experimenting with what's implemented

In [None]:
%load_ext autoreload
%autoreload 2

In [43]:
from tokenizers import Tokenizer

de_en_tokenizer = Tokenizer.from_file("tokenizers/de_en_tokenizer.json")

de_en_tokenizer.get_vocab_size()

30000

In [40]:
de_en_tokenizer.get_vocab_size()

30000

In [28]:
import importlib
from datasets import Dataset
import dnlp2025.dataset  # your local module
importlib.reload(dnlp2025)

def get_mock_split_small():
    return Dataset.from_list([
        { "translation": { "de": "Guten morgen.", "en": "Good morning." }},
        { "translation": { "de": "Guten tag.", "en": "Good day." }},
        { "translation": { "de": "Wilkommen.", "en": "Welcome." }},
        { "translation": { "de": "Wie geht es Ihnen?", "en": "How are you?" }},
        { "translation": { "de": "Das ist ein Buch.", "en": "That is a book." }},
        { "translation": { "de": "Ich denke, dass ich einen Kaffee brauche.", "en": "I think I need a coffee." }},
        { "translation": { "de": "Ich bin ein bisschen m dde.", "en": "I am a bit tired." }}
    ])

mock_split = Dataset.from_list(get_mock_split_small())

de_en_train_dataloader = dnlp2025.dataset.create_dataloader(
    mock_split,
    "test", # If set to "train", then the last batch will be removed
    de_en_tokenizer,
    max_tokens_per_batch=20,
    shuffle=False # Shuffling disabled to see how the batches are created
)


Create dataloader called
Bos token:  1
EOS token:  2
Tokenizing dataset


Tokenizing dataset:   0%|          | 0/7 [00:00<?, ? examples/s]

Preprocess function called for:  {'translation': [{'de': 'Guten morgen.', 'en': 'Good morning.'}, {'de': 'Guten tag.', 'en': 'Good day.'}, {'de': 'Wilkommen.', 'en': 'Welcome.'}, {'de': 'Wie geht es Ihnen?', 'en': 'How are you?'}, {'de': 'Das ist ein Buch.', 'en': 'That is a book.'}, {'de': 'Ich denke, dass ich einen Kaffee brauche.', 'en': 'I think I need a coffee.'}, {'de': 'Ich bin ein bisschen m dde.', 'en': 'I am a bit tired.'}]}
Dataset tokenized
Sampler initialized
Dataloader initialized


In [24]:
print("Special token ids: Padding - 0, EOS - 1, BOS - 2")

batch_num = 0
for batch in de_en_train_dataloader:
    batch_num += 1

    print("\nBatch: ", batch_num)
    print("Encoder input ids: ")
    print(batch['encoder_input_ids'])

    print("\nDecoder input ids (should strip EOS token): ")
    print(batch['decoder_input_ids'])

    print("\nLabel ids (should strip BOS token): ")
    print(batch['labels'])

Special token ids: Padding - 0, EOS - 1, BOS - 2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Batch:  1
Encoder input ids: 
tensor([[    1, 14070,    16,     2,     0,     0],
        [    1, 12492, 10038,    16,     2,     0],
        [    1, 12492,  4328,    16,     2,     0],
        [    1,  7815,  3862,  3974,    33,     2]])

Decoder input ids (should strip EOS token): 
tensor([[    1, 12905,  4391,    16,     0,     0],
        [    1,    41,  5289,  9304,    16,     0],
        [    1,    41,  5289,  6169,    16,     0],
        [    1,  5118,  5255,  3769,  4755,    33]])

Label ids (should strip BOS token): 
tensor([[12905,  4391,    16,     2,     0,     0],
        [   41,  5289,  9304,    16,     2,     0],
        [   41,  5289,  6169,    16,     2,     0],
        [ 5118,  5255,  3769,  4755,    33,     2]])

Batch:  2
Encoder input ids: 
tensor([[    1,  5336,  3772,    67,  5689,    16,     2,     0],
        [    1,    43,  3811,    67,  5760, 28026,    16,     2]])

Decoder input ids (should strip EOS token): 
tensor([[    1,  4151,  3831,  3791,  7057,    1

In [35]:
from src.dnlp2025.download_datasets import download_wmt14_de_en

de_en_dataset = download_wmt14_de_en()
small_dataset = de_en_dataset['train'].train_test_split(train_size=0.1, seed=42)['train']


In [36]:
de_en_full_dataloader = dnlp2025.dataset.create_dataloader(
    small_dataset,
    "train",
    de_en_tokenizer,
    max_tokens_per_batch=25000,
    shuffle=True,
    num_workers=6)

Tokenizing dataset (num_proc=10):   0%|          | 0/450878 [00:00<?, ? examples/s]



In [37]:
len(de_en_full_dataloader)

584

In [39]:
for batch in de_en_full_dataloader:
    print(batch)
    break


{'encoder_input_ids': tensor([[    1, 15607,  4712,  ...,  3923,  9944,     2],
        [    1,  4004,  3964,  ...,  9139,    16,     2],
        [    1,  4863,  3772,  ..., 13443,    16,     2],
        ...,
        [    1,  3885,  5077,  ...,  8689,    16,     2],
        [    1,  3885,  9889,  ...,  3923,    16,     2],
        [    1,    40,  3892,  ..., 28280,    16,     2]]), 'decoder_input_ids': tensor([[    1,  9495,  9542,  ...,     0,     0,     0],
        [    1, 10390, 19170,  ...,     0,     0,     0],
        [    1,  3869, 18639,  ...,     0,     0,     0],
        ...,
        [    1,  4275,  4095,  ...,     0,     0,     0],
        [    1,  4275, 12688,  ...,     0,     0,     0],
        [    1, 12772,  5274,  ...,     0,     0,     0]]), 'labels': tensor([[ 9495,  9542,    16,  ...,     0,     0,     0],
        [10390, 19170,    14,  ...,     0,     0,     0],
        [ 3869, 18639,  5862,  ...,     0,     0,     0],
        ...,
        [ 4275,  4095,  5039,  ...