In [1]:
import sys
# Add the parent directory to the import path; presumably the notebook lives one
# level below the project root, so this is what makes `src` importable — confirm.
sys.path.append("../")
import torch

# Byte-pair-encoding tokenizer
from src.components.bpe_tokenizer import BPETokenizer
# GPT model (the whole model class, not a single transformer block)
from src.components.model import GPTModel

**1. Parameters**

In [2]:
# --- Model hyper-parameters (all forwarded to GPTModel by keyword) ---
vocab_size = 300
d_model = 128
n_heads = 4
n_layers = 2
d_ff = 512
max_seq_len = 128

# --- Shape of the demo batch built later in the notebook ---
batch_size = 2
seq_len = 10

# --- Reproducibility ---
# Seed torch's global RNG before any torch-based random work happens
# (weight initialisation, dropout, sampling), so that "Restart & Run All"
# gives the same numbers every time.
seed = 42
torch.manual_seed(seed)

# The demo sequences are padded to seq_len, which must fit in the model's
# maximum sequence length.
assert seq_len <= max_seq_len, "seq_len must not exceed max_seq_len"

**2. Instantiate the model**

In [3]:
# Collect the constructor arguments in one mapping, then build the model from it.
# NOTE(review): the tokenizer trained further down logs a final vocab size of 302,
# which is larger than the vocab_size passed here — confirm that no token id
# >= vocab_size can ever be fed to the model.
model_kwargs = dict(
    vocab_size=vocab_size,
    d_model=d_model,
    n_heads=n_heads,
    n_layers=n_layers,
    d_ff=d_ff,
    max_seq_len=max_seq_len,
)
model = GPTModel(**model_kwargs)

[32m2026-02-03 13:00:09.954[0m | [34m[1mDEBUG   [0m | [36msrc.components.token_embedding[0m:[36m__init__[0m:[36m24[0m - [34m[1mTokenEmbedding: vocab=300, dim=128[0m
[32m2026-02-03 13:00:09.966[0m | [34m[1mDEBUG   [0m | [36msrc.components.positional_encoding[0m:[36m__init__[0m:[36m54[0m - [34m[1mSinusoidalPositionalEncoding: max_len=128, dim=128, dropout=0.1[0m
[32m2026-02-03 13:00:09.967[0m | [34m[1mDEBUG   [0m | [36msrc.components.input_embeddings[0m:[36m__init__[0m:[36m31[0m - [34m[1mInputEmbeddings: vocab=300, dim=128, max_len=128, dropout=0.1[0m
[32m2026-02-03 13:00:09.968[0m | [34m[1mDEBUG   [0m | [36msrc.components.layer_norm[0m:[36m__init__[0m:[36m24[0m - [34m[1mLayerNorm: d_model=128, eps=1e-06[0m
[32m2026-02-03 13:00:09.968[0m | [34m[1mDEBUG   [0m | [36msrc.components.layer_norm[0m:[36m__init__[0m:[36m24[0m - [34m[1mLayerNorm: d_model=128, eps=1e-06[0m
[32m2026-02-03 13:00:09.971[0m | [34m[1mDEBUG   [0m

**3. Train the Tokenizer**

In [4]:
# Tiny three-sentence corpus used to fit the BPE merges.
corpus_lines = [
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is fascinating and powerful.",
    "Transformers have revolutionized natural language processing.",
]
# Leading and trailing newline are kept so the text matches a triple-quoted block.
sample = "\n" + "\n".join(corpus_lines) + "\n"

# Train a tokenizer whose target vocabulary size is the shared vocab_size setting.
tokenizer = BPETokenizer(vocab_size=vocab_size)
tokenizer.train(sample)

[32m2026-02-03 13:00:09.992[0m | [1mINFO    [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m63[0m - [1mTraining BPE with target vocab size: 300[0m
[32m2026-02-03 13:00:09.993[0m | [1mINFO    [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m67[0m - [1mFound 28 word pieces[0m
[32m2026-02-03 13:00:09.996[0m | [32m[1mSUCCESS [0m | [36msrc.components.bpe_tokenizer[0m:[36mtrain[0m:[36m106[0m - [32m[1m
Final vocab size: 302[0m


**4. Encode the text**

In [5]:
# Two short example sentences.
sentence1, sentence2 = "The fox jumps", "Machine learning"

# Encode each sentence into a list of token ids with the trained tokenizer,
# then show both lists, one per line.
encoded1, encoded2 = (tokenizer.encode(text) for text in (sentence1, sentence2))
print(encoded1, encoded2, sep="\n")

[266, 277, 282]
[298, 299, 97, 114, 110, 259]


**5. Padding/truncating to fixed length**

*When using batching, we need to pad/truncate the sequences to a fixed length.*

In [6]:
def pad_or_truncate(token_ids, length, pad_id=0):
    """Return a copy of ``token_ids`` that is exactly ``length`` items long.

    Args:
        token_ids: sequence of integer token ids.
        length: target length of the returned list.
        pad_id: id appended on the right when ``token_ids`` is too short.

    Returns:
        A new list: ``token_ids`` cut to ``length`` items if it was longer,
        otherwise right-padded with ``pad_id`` up to ``length``.
    """
    clipped = list(token_ids[:length])
    return clipped + [pad_id] * (length - len(clipped))


# Bring both sequences to the same length (seq_len) so they can be stacked into
# one batch tensor.
# NOTE(review): id 0 is used as padding here; if the tokenizer has a dedicated
# <|pad|> token, pass its id as pad_id instead.
padded1 = pad_or_truncate(encoded1, seq_len)
padded2 = pad_or_truncate(encoded2, seq_len)

print(padded1)
print(padded2)

x = torch.tensor([padded1, padded2])
print(x.shape)

# Guard the invariants the forward pass relies on: the batch has the configured
# shape, and every id is a valid index for a model built with vocab_size entries
# (the tokenizer reported a final vocabulary larger than vocab_size).
assert x.shape == (batch_size, seq_len), f"unexpected batch shape {tuple(x.shape)}"
assert 0 <= int(x.min()) and int(x.max()) < vocab_size, "token id outside the model vocabulary"

[266, 277, 282, 0, 0, 0, 0, 0, 0, 0]
[298, 299, 97, 114, 110, 259, 0, 0, 0, 0]
torch.Size([2, 10])


**6. Training mode**

*This is the forward pass of the model: it produces the logits that will then be used in training.
It is the step that comes just before the softmax and the loss calculation.*

In [8]:
# Forward pass on the padded batch: one score per vocabulary entry for every
# position of every sequence.
logits = model(x)
print(logits.shape)

# Check the expected shape instead of only stating it in a comment.
assert logits.shape == (batch_size, seq_len, vocab_size), (
    f"expected {(batch_size, seq_len, vocab_size)}, got {tuple(logits.shape)}"
)
logits

torch.Size([2, 10, 300])


tensor([[[ 0.2323,  0.4477,  0.2641,  ...,  0.0418, -0.2899,  0.1908],
         [ 0.0621,  0.3840,  0.3036,  ..., -0.0570, -0.3446, -0.2908],
         [ 0.1192,  0.2311,  0.2357,  ..., -0.1601, -0.5479, -0.3426],
         ...,
         [ 0.3862, -0.0397, -0.2463,  ...,  0.0656, -0.2158, -0.1340],
         [ 0.1332, -0.2551, -0.1081,  ...,  0.0121, -0.4112, -0.3136],
         [ 0.3712, -0.1096, -0.2335,  ...,  0.1228, -0.3239, -0.0541]],

        [[ 0.1119,  0.2297,  0.1429,  ...,  0.0237,  0.4376,  0.1735],
         [ 0.0795,  0.2076,  0.1459,  ..., -0.0544, -0.4255,  0.4761],
         [ 0.0335,  0.0063, -0.1720,  ...,  0.1461,  0.0365, -0.1258],
         ...,
         [ 0.1671, -0.0316, -0.1185,  ..., -0.0098, -0.2157, -0.1831],
         [ 0.2995, -0.1255, -0.0480,  ...,  0.0048, -0.3992, -0.1822],
         [ 0.3188, -0.2018, -0.3893,  ...,  0.0040, -0.3512, -0.1650]]],
       grad_fn=<UnsafeViewBackward0>)

**7. Inference**

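*Take a look at the generate() function for the details; basically, it computes the logits (as before) and applies softmax to get the probabilities of the next token. Because the model has not been trained in this notebook, the generated text is expected to look random.*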

In [None]:
# Encode the prompt and wrap it in a list so the tensor has a leading batch
# dimension of 1.
prompt_text = "The quick brown fox"
encoded_prompt = tokenizer.encode(prompt_text)
encoded_prompt_tensor = torch.tensor([encoded_prompt])
print(encoded_prompt_tensor)

# Generate
# The component logs show dropout=0.1, so switch to evaluation mode before
# generating, and disable gradient tracking since no backward pass follows.
# NOTE(review): assumes GPTModel is a torch.nn.Module; generate() may already
# do both of these internally, in which case repeating them is harmless.
model.eval()
with torch.no_grad():
    generated = model.generate(
        encoded_prompt_tensor,
        max_new_tokens=10,
        temperature=1.0
    )
print(generated)

# Decode the single generated sequence (prompt + new tokens) back to text.
encoded_list = generated[0].tolist()
decoded_text = tokenizer.decode(encoded_list)
print(decoded_text)

tensor([[266, 271, 275, 277]])
tensor([[266, 271, 275, 277, 173,  71, 287, 259,  20, 189, 270, 246, 257,  55]])
The quick brown fox�G theing� quic�er7


---