In [None]:
import sys
sys.path.append("../")
import torch

# Tokenize
from src.components.bpe_tokenizer import BPETokenizer
# Embed tokens
from src.components.token_embedding import TokenEmbedding

**1. Parameters**

In [16]:
# Batch geometry: number of sequences per batch, and the fixed token length
# that sequences are padded/truncated to before they are stacked.
batch_size, seq_len = 2, 10

# Model geometry: vocabulary size handed to both the tokenizer and the
# embedding layer, and the width of each embedding vector.
vocab_size, d_model = 300, 128

**2. Train the Tokenizer**

In [17]:
# Tiny three-sentence training corpus. The text starts with a newline and
# every sentence ends with one, so the value matches a triple-quoted block.
sample = (
    "\n"
    "The quick brown fox jumps over the lazy dog.\n"
    "Machine learning is fascinating and powerful.\n"
    "Transformers have revolutionized natural language processing.\n"
)

# Fit the BPE tokenizer on the corpus, requesting `vocab_size` entries.
tokenizer = BPETokenizer(vocab_size=vocab_size)
tokenizer.train(sample)

[32m2026-02-03 10:45:41.408[0m | [1mINFO    [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m63[0m - [1mTraining BPE with target vocab size: 300[0m
[32m2026-02-03 10:45:41.409[0m | [1mINFO    [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m67[0m - [1mFound 28 word pieces[0m
[32m2026-02-03 10:45:41.413[0m | [32m[1mSUCCESS [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m106[0m - [32m[1m
Final vocab size: 302[0m


## **A) Example with 1 sentence**

**3. Encode the text**

In [36]:
# Turn one sentence into a flat list of token ids.
test = "The fox jumps"
encoded = tokenizer.encode(test)

# Add a leading batch axis: the result has shape (1, num_tokens).
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
encoded_tensor

tensor([[266, 277, 282]])

**4. Generate the embedding**

In [37]:
# Fail early with a readable message if an id cannot be looked up.
# The embedding is built with `vocab_size` entries, but the tokenizer's
# training log reports a larger final vocabulary than the requested size,
# so ids >= vocab_size are possible for other inputs.
# NOTE(review): presumably TokenEmbedding indexes a (vocab_size, d_model)
# table - confirm, and consider sizing it from the tokenizer's real vocabulary.
assert encoded_tensor.numel() == 0 or int(encoded_tensor.max()) < vocab_size, (
    f"token id {int(encoded_tensor.max())} is out of range for an embedding "
    f"with {vocab_size} entries"
)

emb = TokenEmbedding(vocab_size, d_model)
out = emb(encoded_tensor)               # Shape: (1, num_tokens, d_model)

[32m2026-02-03 10:53:03.806[0m | [34m[1mDEBUG   [0m | [36msrc.components.token_embedding[0m:[36m__init__[0m:[36m24[0m - [34m[1mTokenEmbedding: vocab=300, dim=128[0m


*Remember that every token gets its own embedding vector: 3 tokens → 3 vectors of size `d_model` (128).*

In [38]:
# One d_model-sized vector per input token: (1, num_tokens, d_model).
print(out.size())

torch.Size([1, 3, 128])


## **B) Example with batch size 2**

**3. Encode the text**

In [None]:
# Two sentences that encode to different numbers of tokens.
sentence1, sentence2 = "The fox jumps", "Machine learning"

# Encode each sentence independently into its own list of token ids.
encoded1, encoded2 = (tokenizer.encode(text) for text in (sentence1, sentence2))

# Show both id lists, one per line.
print(encoded1, encoded2, sep="\n")

[266, 277, 282]
[298, 299, 97, 114, 110, 259]


**4. Padding/truncating to fixed length**

*To stack several sequences into a single batch tensor, every sequence must have the same length, so we pad the short ones and truncate the long ones to a fixed `seq_len`.*

In [43]:
# Id used to fill the tail of sequences shorter than the target length.
# NOTE(review): 0 may also be a regular token id in this tokenizer; if it
# defines a dedicated <|pad|> token, use that token's id here instead.
PAD_ID = 0


def pad_or_truncate(token_ids, length, pad_id=PAD_ID):
    """Return a copy of ``token_ids`` that is exactly ``length`` items long.

    Args:
        token_ids: Sequence of integer token ids.
        length: Target number of ids in the result.
        pad_id: Id appended to sequences shorter than ``length``.

    Returns:
        A new list holding the first ``length`` ids, right-padded with
        ``pad_id`` when the input is shorter than ``length``.
    """
    clipped = list(token_ids)[:length]
    return clipped + [pad_id] * (length - len(clipped))


# Bring both sentences to `seq_len` ids so they can be stacked into one tensor.
padded1 = pad_or_truncate(encoded1, seq_len)
padded2 = pad_or_truncate(encoded2, seq_len)

print(padded1)
print(padded2)

batch = torch.tensor([padded1, padded2])  # Shape: (2, seq_len)
print(batch.shape)

[266, 277, 282, 0, 0, 0, 0, 0, 0, 0]
[298, 299, 97, 114, 110, 259, 0, 0, 0, 0]
torch.Size([2, 10])


**5. Generate the embedding**

In [45]:
# TokenEmbedding is already imported in the notebook's first cell, so it is
# not imported again here.

# Fail early with a readable message if an id cannot be looked up.
# The embedding is built with `vocab_size` entries, but the tokenizer's
# training log reports a larger final vocabulary than the requested size,
# so ids >= vocab_size are possible for other inputs.
# NOTE(review): presumably TokenEmbedding indexes a (vocab_size, d_model)
# table - confirm, and consider sizing it from the tokenizer's real vocabulary.
assert batch.numel() == 0 or int(batch.max()) < vocab_size, (
    f"token id {int(batch.max())} is out of range for an embedding "
    f"with {vocab_size} entries"
)

emb = TokenEmbedding(vocab_size, d_model)
out = emb(batch)                        # Shape: (2, seq_len, d_model)

[32m2026-02-03 10:58:00.851[0m | [34m[1mDEBUG   [0m | [36msrc.components.token_embedding[0m:[36m__init__[0m:[36m24[0m - [34m[1mTokenEmbedding: vocab=300, dim=128[0m


In [47]:
# Shape of the whole batch, then of each of its two sequences, one per line.
print(out.shape, out[0].shape, out[1].shape, sep="\n")

torch.Size([2, 10, 128])
torch.Size([10, 128])
torch.Size([10, 128])


---