In [1]:
import torch

In [1]:
from src.char_tokenizer import CharTokenizer, CharTokenizerConfig

# Testing Tokenizer

In [4]:
# small dummy text
corpus = "hello world hi"

# build tokenizer
tok = CharTokenizer(corpus, config=CharTokenizerConfig(add_bos=True, add_eos=True))

print("Vocab:", tok.stoi)
print("Vocab size:", tok.vocab_size)

Vocab: {'<BOS>': 0, '<EOS>': 1, ' ': 2, 'd': 3, 'e': 4, 'h': 5, 'i': 6, 'l': 7, 'o': 8, 'r': 9, 'w': 10}
Vocab size: 11


In [5]:
# encode a sample
sample = "hi"
ids = tok.encode(sample)
print("Encoded:", ids)

Encoded: [0, 5, 6, 1]


In [6]:
# decode it back
decoded = tok.decode(ids)
print("Decoded:", decoded)

Decoded: hi


In [8]:
# test save/load
tok.save("artifacts/char_tok.json")
loaded = CharTokenizer.load("artifacts/char_tok.json")

print("Loaded encode:", loaded.encode(sample))
print("Loaded decode:", loaded.decode(loaded.encode(sample)))


Loaded encode: [0, 5, 6, 1]
Loaded decode: hi


In [1]:
import typing

# Testing Dataset

In [2]:
from pathlib import Path
import torch
from torch.utils.data import DataLoader

from src.dataset import CharDataset
from src.char_tokenizer import CharTokenizer

root = Path(".")

# 1) Load preprocessed data
train_ids = torch.load(root / "data" / "train_ids.pt")

In [3]:
# 2) Make a small dataset
block_size = 8  # tiny just for inspection
train_ds = CharDataset(train_ids, block_size=block_size)

In [5]:

print("Dataset length:", len(train_ds)), len(train_ids)

Dataset length: 1003845


(None, 1003854)

In [6]:

# 3) Look at a single sample
x, y = train_ds[0]
print("x shape:", x.shape)
print("y shape:", y.shape)
print("x ids:", x.tolist())
print("y ids:", y.tolist())

x shape: torch.Size([8])
y shape: torch.Size([8])
x ids: [20, 49, 58, 59, 60, 3, 17, 49]
y ids: [49, 58, 59, 60, 3, 17, 49, 60]


In [7]:
# 4) Decode to human-readable text
tok = CharTokenizer.load(root / "data" / "char_tokenizer.json")
print("x text:", tok.decode(x.tolist()))
print("y text:", tok.decode(y.tolist()))

x text: First Ci
y text: irst Cit


In [8]:
# 5) Wrap in a DataLoader to see batching
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True)

batch_x, batch_y = next(iter(train_loader))
print("Batch x shape:", batch_x.shape)  # [batch, block_size]
print("Batch y shape:", batch_y.shape)

Batch x shape: torch.Size([4, 8])
Batch y shape: torch.Size([4, 8])


In [9]:
batch_x

tensor([[48, 45, 58,  3, 56, 41, 52, 45],
        [12,  2, 18, 55, 59, 60,  3, 60],
        [45,  3, 54, 55,  3, 47, 58, 41],
        [ 2, 17, 26, 23, 20, 20, 29, 32]])

# Testing Embedding Creation

In [11]:
import torch
from src.embeddings import GPTEmbedding
from src.char_tokenizer import CharTokenizer

In [13]:
# Load tokenizer to get vocab_size
tok = CharTokenizer.load("data/char_tokenizer.json")
vocab_size = tok.vocab_size


# ðŸ”§ Recommended tiny config
d_model = 128      # embedding / hidden size
block_size = 128   # max context length


In [14]:
embed = GPTEmbedding(vocab_size=vocab_size,
                     d_model=d_model,
                     block_size=block_size)

In [None]:
# Fake batch of token IDs just to test shapes
batch_size = 4
idx = torch.randint(0, vocab_size, (batch_size, block_size))  # [4, 128] - every element is a random integer between 0 and vocab_sizeâˆ’1, size is (4, 128)

out = embed(idx)
print("Input shape :", idx.shape)   # torch.Size([4, 128])
print("Output shape:", out.shape)   # torch.Size([4, 128, 128])

Input shape : torch.Size([4, 128])
Output shape: torch.Size([4, 128, 128])


In [24]:
idx[0].shape

torch.Size([128])

# Testing Single head Attention block

In [25]:
import torch
from src.attention import SingleHeadSelfAttention

In [26]:
# -------------------------
# Hyperparameters
# -------------------------
batch_size = 2
block_size = 5     # sequence length T
d_model = 8        # embedding dimension
d_head = 4         # attention head size


In [28]:
# -------------------------
# Create random input
# -------------------------
x = torch.randn(batch_size, block_size, d_model)  # [B, T, d_model]
print("Input shape:", x.shape)

# -------------------------
# Create attention module
# -------------------------
att = SingleHeadSelfAttention(
    d_model=d_model,
    d_head=d_head,
    block_size=block_size
)

# -------------------------
# Forward pass
# -------------------------
out = att(x)

print("Output shape:", out.shape)
print(out)


Input shape: torch.Size([2, 5, 8])
Output shape: torch.Size([2, 5, 4])
tensor([[[-0.8871,  0.1435, -0.5848, -0.2220],
         [-0.9292,  0.3800,  0.1793,  0.1037],
         [-0.3648,  0.1516,  0.2950,  0.2457],
         [-0.0495,  0.1000,  0.1602,  0.1275],
         [ 0.0432, -0.0264, -0.2434,  0.0873]],

        [[-0.7209, -0.0144,  0.1006,  0.8175],
         [-0.3836, -0.0228, -0.6610,  0.6331],
         [-0.1534, -0.1081, -0.5483,  0.6056],
         [ 0.3325, -0.4727, -0.4010,  0.2374],
         [-0.0347, -0.1647, -0.1051,  0.1884]]], grad_fn=<UnsafeViewBackward0>)


# Testing entire transformer block

In [35]:
import torch
from src.block import TransformerBlock

In [36]:
# -------------------------
# Hyperparameters
# -------------------------
batch_size = 2
block_size = 5      # sequence length T
d_model = 16        # embedding size
d_head = 8          # attention head size

# -------------------------
# Fake input: embeddings
# -------------------------
x = torch.randn(batch_size, block_size, d_model)
print("Input shape:", x.shape)

Input shape: torch.Size([2, 5, 16])


In [37]:



# -------------------------
# Create one Transformer block
# -------------------------
block = TransformerBlock(
    d_model=d_model,
    d_head=d_head,
    block_size=block_size,
    mlp_hidden_mult=4,
    dropout=0.1,
)

# -------------------------
# Forward pass
# -------------------------
out = block(x)

print("Output shape:", out.shape)
print(out[:1, :2])  # print first batch, first 2 tokens for a quick look

Output shape: torch.Size([2, 5, 16])
tensor([[[-1.2179,  0.2314, -0.4609, -2.1366, -1.6178, -1.2819, -0.4578,
           2.0654,  0.0117,  0.0976, -0.0517, -0.4331, -1.6662, -0.1376,
           2.3905, -1.5338],
         [ 0.1174,  0.2518, -1.8656, -0.8964, -1.3390, -0.0432,  2.8794,
           1.4301,  0.1454, -0.4416,  1.0007, -0.1694, -0.8234,  0.8987,
          -0.2490, -1.8497]]], grad_fn=<SliceBackward0>)
