In [7]:
import sys
sys.path.append("../")
import torch

# Tokenize
from src.components.bpe_tokenizer import BPETokenizer
# Input Embeddings (Just the combination of TokenEmbedding and PositionalEncoding)
from src.components.input_embeddings import InputEmbeddings

**1. Parameters**

In [8]:
# Toy hyperparameters shared by the cells below.
batch_size, seq_len = 2, 10      # sentences per batch, token ids per (padded) sentence
vocab_size, d_model = 300, 128   # target tokenizer vocab size, embedding dimension
max_seq_len = 512                # maximum length handed to InputEmbeddings

**2. Train the Tokenizer**

In [9]:
# Tiny training corpus: a leading blank line followed by one sentence per line.
sample = (
    "\n"
    "The quick brown fox jumps over the lazy dog.\n"
    "Machine learning is fascinating and powerful.\n"
    "Transformers have revolutionized natural language processing.\n"
)

# Train the BPE tokenizer on the corpus, using `vocab_size` as the target size.
tokenizer = BPETokenizer(vocab_size=vocab_size)
tokenizer.train(sample)

[32m2026-02-03 11:09:11.424[0m | [1mINFO    [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m63[0m - [1mTraining BPE with target vocab size: 300[0m
[32m2026-02-03 11:09:11.425[0m | [1mINFO    [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m67[0m - [1mFound 28 word pieces[0m
[32m2026-02-03 11:09:11.427[0m | [32m[1mSUCCESS [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m106[0m - [32m[1m
Final vocab size: 302[0m


**3. Encode the text**

In [10]:
sentence1 = "The fox jumps"
sentence2 = "Machine learning"

# Encode both sentences (in order) into lists of token ids.
encoded1, encoded2 = [tokenizer.encode(text) for text in (sentence1, sentence2)]

# One id list per line.
print(encoded1, encoded2, sep="\n")

[266, 277, 282]
[298, 299, 97, 114, 110, 259]


**4. Padding/truncating to fixed length**

*To stack sequences into a single batch tensor they must all have the same length, so shorter sequences are padded and longer ones are truncated to `seq_len`.*

In [None]:
def pad_or_truncate(token_ids, length, pad_id=0):
    """Return a copy of ``token_ids`` that is exactly ``length`` items long.

    Args:
        token_ids: Sequence of integer token ids.
        length: Number of items the result must contain.
        pad_id: Id appended on the right when ``token_ids`` is too short.

    Returns:
        A new list: the first ``length`` ids of ``token_ids``, right-padded
        with ``pad_id`` if fewer than ``length`` ids were available.
    """
    clipped = list(token_ids[:length])
    return clipped + [pad_id] * (length - len(clipped))


# 4. Bring every sequence to the same length (seq_len) so they can be batched.
# NOTE(review): 0 is used as the padding id; if id 0 is a real token in this
# tokenizer, pass the id of a dedicated <|pad|> token via `pad_id` instead.
padded1 = pad_or_truncate(encoded1, seq_len)
padded2 = pad_or_truncate(encoded2, seq_len)

print(padded1)
print(padded2)

# Batch of real token ids, shape (number of sentences, seq_len).
# For a quick smoke test without a tokenizer, random ids also work:
#   x = torch.randint(0, vocab_size, (batch_size, seq_len))
x = torch.tensor([padded1, padded2], dtype=torch.long)
print(x.shape)

[266, 277, 282, 0, 0, 0, 0, 0, 0, 0]
[298, 299, 97, 114, 110, 259, 0, 0, 0, 0]
torch.Size([2, 10])


**5. Generate the final embedding**

In [13]:
# Guard the embedding lookup: tokenizer training can end with more entries than
# the requested target (the recorded run logged a final vocab of 302 for a
# target of 300), while the embedding below is built with `vocab_size`.
# Failing here gives a clear message instead of an index error inside the lookup.
min_token_id, max_token_id = int(x.min()), int(x.max())
assert 0 <= min_token_id and max_token_id < vocab_size, (
    f"token ids span [{min_token_id}, {max_token_id}] but the embedding is "
    f"built with vocab_size={vocab_size}"
)

# Build the combined token + positional embedding and apply it to the batch.
emb = InputEmbeddings(vocab_size, d_model, max_seq_len, dropout=0.1)
out = emb(x)

[32m2026-02-03 11:09:33.816[0m | [34m[1mDEBUG   [0m | [36msrc.components.token_embedding[0m:[36m__init__[0m:[36m24[0m - [34m[1mTokenEmbedding: vocab=300, dim=128[0m
[32m2026-02-03 11:09:33.819[0m | [34m[1mDEBUG   [0m | [36msrc.components.positional_encoding[0m:[36m__init__[0m:[36m54[0m - [34m[1mSinusoidalPositionalEncoding: max_len=512, dim=128, dropout=0.1[0m
[32m2026-02-03 11:09:33.821[0m | [34m[1mDEBUG   [0m | [36msrc.components.input_embeddings[0m:[36m__init__[0m:[36m31[0m - [34m[1mInputEmbeddings: vocab=300, dim=128, max_len=512, dropout=0.1[0m


In [14]:
# Shape of the token-id batch that was fed to the embedding.
# NOTE(review): `out.shape` may have been intended here — confirm.
x.size()

torch.Size([2, 10])

---