In [10]:
import random


def generate_math_data(num_samples=10000, min_val=1, max_val=100):
    data = []
    operations = [
        ("+", lambda a, b: a + b),
        ("-", lambda a, b: a - b),
        ("*", lambda a, b: a * b),
        (
            "/",
            lambda a, b: round(a / b, 2) if b != 0 else None,
        ),  # Avoid division by zero
    ]

    for _ in range(num_samples):
        a = random.randint(min_val, max_val)
        b = random.randint(min_val, max_val)
        op, func = random.choice(operations)

        result = func(a, b)
        if result is not None:  # Skip division by zero cases
            question = f"{a} {op} {b} = ?"
            answer = str(result)
            data.append({"input": question, "output": answer})

    return data

In [11]:
data = generate_math_data(10000)
print("Sample data:", data[:5])

Sample data: [{'input': '100 - 10 = ?', 'output': '90'}, {'input': '63 + 3 = ?', 'output': '66'}, {'input': '59 - 51 = ?', 'output': '8'}, {'input': '29 - 5 = ?', 'output': '24'}, {'input': '2 + 37 = ?', 'output': '39'}]


In [12]:
import string

VOCAB = {ch: i for i, ch in enumerate(string.digits + "+= ?", start=1)}
VOCAB["PAD"] = 0  # Padding token
VOCAB["<EOS>"] = 15


def tokenize(expression):
    return [VOCAB[ch] for ch in expression if ch in VOCAB]


# Example
expr = "23 + 45 = ?"
tokenized = tokenize(expr)
print("Tokenized:", tokenized)

Tokenized: [3, 4, 13, 11, 13, 5, 6, 13, 12, 13, 14]


In [13]:
len(VOCAB)

16

In [14]:
VOCAB

{'0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10,
 '+': 11,
 '=': 12,
 ' ': 13,
 '?': 14,
 'PAD': 0,
 '<EOS>': 15}

In [15]:
import torch
import torch.nn as nn

# Define embedding layer (size: vocab_size x embedding_dim)
vocab_size = len(VOCAB)
embedding_dim = 16  # Small embedding size

embedding_layer = nn.Embedding(vocab_size, embedding_dim)

# Convert tokenized input to tensor
token_tensor = torch.tensor(tokenized).unsqueeze(0)  # Add batch dim

# Get embeddings
embedded = embedding_layer(token_tensor)
print("Embeddings shape:", embedded.shape)  # (1, seq_length, embedding_dim)

Embeddings shape: torch.Size([1, 11, 16])


In [105]:
import string

import torch
from torch.utils.data import DataLoader, Dataset


class MathDataset(Dataset):
    def __init__(self, num_samples=10000, min_val=1, max_val=100):
        self.VOCAB = {ch: i for i, ch in enumerate(string.digits + "/*-+= ?", start=1)}
        self.VOCAB["PAD"] = 0  # Padding token
        self.VOCAB["<EOS>"] = max(self.VOCAB.values()) + 1
        self.ID2VOCAB = {v: k for k, v in self.VOCAB.items()}

        self.data = []
        for _ in range(num_samples):
            question, answer = self.generate_math_data(min_val, max_val)
            input_tokens = self.tokenize(question)  # FIXED: Use self.tokenize
            output_tokens = self.tokenize(answer)
            self.data.append((input_tokens, output_tokens))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def tokenize(self, expression):
        """Tokenize a string expression into a list of token IDs."""
        return [
            self.VOCAB[ch] for ch in expression if ch in self.VOCAB
        ]  # FIXED: Use self.VOCAB

    def decode(self, tokens):
        """Convert a tensor or list of token IDs back into a string."""
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.tolist()  # Convert tensor to list

        # Exclude special tokens (PAD, <EOS>)
        return "".join(
            self.ID2VOCAB[token]
            for token in tokens
            if token in self.ID2VOCAB and self.ID2VOCAB[token] not in ["PAD", "<EOS>"]
        )

    def generate_math_data(self, min_val=1, max_val=100):
        operations = [
            ("+", lambda a, b: a + b),
            ("-", lambda a, b: a - b),
            ("*", lambda a, b: a * b),
            (
                "/",
                lambda a, b: round(a / b, 2) if b != 0 else None,
            ),  # Avoid division by zero
        ]

        a = random.randint(min_val, max_val)
        b = random.randint(min_val, max_val)
        op, func = random.choice(operations)

        result = func(a, b)
        if result is not None:
            question = f"{a} {op} {b} = ?"
            answer = str(result)
            return question, answer
        else:
            return self.generate_math_data(min_val, max_val)


# Create dataset
dataset = MathDataset(num_samples=10)

# Test sample
input_sample, output_sample = dataset[0]
decoded_input = dataset.decode(input_sample)
decoded_output = dataset.decode(output_sample)

print("Tokenized Input:", input_sample)
print("Decoded Input:", decoded_input)
print("Tokenized Output:", output_sample)
print("Decoded Output:", decoded_output)

Tokenized Input: [9, 1, 16, 14, 16, 4, 9, 16, 15, 16, 17]
Decoded Input: 80 + 38 = ?
Tokenized Output: [2, 2, 9]
Decoded Output: 118


In [106]:
def collate_fn(batch):
    """Collate function to pad sequences to the same length"""
    max_input_len = max(len(x[0]) for x in batch)
    max_output_len = max(len(x[1]) for x in batch)

    inputs = []
    outputs = []

    for x, y in batch:
        inputs.append(x + [VOCAB["PAD"]] * (max_input_len - len(x)))
        outputs.append(y + [VOCAB["PAD"]] * (max_output_len - len(y)))

    return torch.tensor(inputs), torch.tensor(outputs)

In [111]:
# Create DataLoader
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Test batch
batch_inputs, batch_outputs = next(iter(dataloader))
print("Batch Input Shape:", batch_inputs.shape)
print("Batch Output Shape:", batch_outputs.shape)

Batch Input Shape: torch.Size([2, 11])
Batch Output Shape: torch.Size([2, 4])


In [113]:
print(dataset.decode(batch_inputs[0]))
print(dataset.decode(batch_outputs[0]))

93 * 58 = ?
5394


In [114]:
dataset.VOCAB

{'0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10,
 '/': 11,
 '*': 12,
 '-': 13,
 '+': 14,
 '=': 15,
 ' ': 16,
 '?': 17,
 'PAD': 0,
 '<EOS>': 18}

In [109]:
%load_ext autoreload
%autoreload 2

In [110]:
from model import TinyTransformer

In [13]:
TinyTransformer(vocab_size=16)

TinyTransformer(
  (embedding): Embedding(16, 16, padding_idx=0)
  (pos_encoder): PositionalEncoding()
  (encoder): ModuleList(
    (0-1): 2 x EncoderLayer(
      (self_attn): MultiHeadSelfAttention(
        (q_proj): Linear(in_features=16, out_features=16, bias=True)
        (k_proj): Linear(in_features=16, out_features=16, bias=True)
        (v_proj): Linear(in_features=16, out_features=16, bias=True)
        (out_proj): Linear(in_features=16, out_features=16, bias=True)
        (softmax): Softmax(dim=-1)
      )
      (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=16, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=16, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder): ModuleList(
    (0-1): 2 x DecoderLayer(
      (self_attn): MultiHeadSelfAttention(
        (q_proj): Linear(in_features=16, out_features=16, bias=True)
        

---

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dataset import BracketDataset

In [3]:
dataset = BracketDataset(num_samples=5)
dataset.data[0]
dataset.max_len
dataset.VOCAB

{'¿': 1,
 ']': 2,
 '}': 3,
 '?': 4,
 '(': 5,
 ')': 6,
 '{': 7,
 '[': 8,
 '>': 9,
 '<': 10,
 'PAD': 0,
 '<EOS>': 11,
 '<SOS>': 12}

In [4]:
for inp, out in dataset:
    print(f"input {len(inp)} ,output {len(out)}")
    print(f"Input: {dataset.decode(inp)}  ,Output: {dataset.decode(out)}")
    print(f"input {inp} output {out}")

input 12 ,output 12
Input: <SOS> ¿ ¿ ( < { { ¿ ¿ < [ <EOS>  ,Output: <SOS> ] > ? ? } } > ) ? ? <EOS>
input tensor([12,  1,  1,  5, 10,  7,  7,  1,  1, 10,  8, 11]) output tensor([12,  2,  9,  4,  4,  3,  3,  9,  6,  4,  4, 11])
input 12 ,output 12
Input: <SOS> { { ¿ < ¿ [ [ [ < { <EOS>  ,Output: <SOS> } > ] ] ] ? > ? } } <EOS>
input tensor([12,  7,  7,  1, 10,  1,  8,  8,  8, 10,  7, 11]) output tensor([12,  3,  9,  2,  2,  2,  4,  9,  4,  3,  3, 11])
input 12 ,output 12
Input: <SOS> ( ¿ [ ¿ < ¿ ¿ { [ { <EOS>  ,Output: <SOS> } ] } ? ? > ? ] ? ) <EOS>
input tensor([12,  5,  1,  8,  1, 10,  1,  1,  7,  8,  7, 11]) output tensor([12,  3,  2,  3,  4,  4,  9,  4,  2,  4,  6, 11])
input 12 ,output 12
Input: <SOS> [ { ¿ { { [ ¿ { ¿ < <EOS>  ,Output: <SOS> > ? } ? ] } } ? } ] <EOS>
input tensor([12,  8,  7,  1,  7,  7,  8,  1,  7,  1, 10, 11]) output tensor([12,  9,  4,  3,  4,  2,  3,  3,  4,  3,  2, 11])
input 12 ,output 12
Input: <SOS> { { ¿ ( [ [ ( ( { ¿ <EOS>  ,Output: <SOS> ? } ) ) ] ] )

In [5]:
from model import TinyTransformer, load_model

model = TinyTransformer(
    vocab_size=len(dataset.VOCAB),
    d_model=16,
    num_heads=4,
    num_layers=4,
    ff_dim=128,
    max_len=dataset.max_len,
)


model = load_model(model, "model.pth", device="cuda")

  state_dict = torch.load(file_path)


In [6]:
dataset.decode(dataset.data[1][0])

'<SOS> { { ¿ < ¿ [ [ [ < { <EOS>'

In [7]:
dataset.decode(dataset.data[1][1])

'<SOS> } > ] ] ] ? > ? } } <EOS>'

In [8]:
dataset.data[1][1]

[12, 3, 9, 2, 2, 2, 4, 9, 4, 3, 3, 11]

In [9]:
dataset.VOCAB

{'¿': 1,
 ']': 2,
 '}': 3,
 '?': 4,
 '(': 5,
 ')': 6,
 '{': 7,
 '[': 8,
 '>': 9,
 '<': 10,
 'PAD': 0,
 '<EOS>': 11,
 '<SOS>': 12}

In [17]:
import json

with open("vocab.json") as f:
    config = json.load(f)

dataset.VOCAB = config
dataset.update_vocab(config)

In [18]:
dataset.data[0][0]

[12, 1, 1, 5, 10, 7, 7, 1, 1, 10, 8, 11]

In [19]:
dataset.decode(dataset.data[0][0])

'<SOS> < < ) ( { { < < ( ? <EOS>'

In [20]:
import json

import torch

out = model.generate(
    torch.tensor(dataset.data[1][0]).to("cuda"),
    max_len=12,
    eos_token=dataset.VOCAB["<EOS>"],
    sos_token=dataset.VOCAB["<SOS>"],
)
print("INPUT")
print(dataset.data[1][0])
print(dataset.decode(dataset.data[1][0]))
print("------")
print("OUTPUT")
print(out)
dataset.decode(out)

INPUT
[12, 7, 7, 1, 10, 1, 8, 8, 8, 10, 7, 11]
<SOS> { { < ( < ? ? ? ( { <EOS>
------
OUTPUT
tensor([12,  2,  5,  6,  5,  6,  6,  5,  6,  2,  2, 11], device='cuda:0')


'<SOS> } ) > ) > > ) > } } <EOS>'

In [9]:
from torch.utils.data import DataLoader

dataset = BracketDataset(num_samples=500)
dataloader = DataLoader(
    dataset,
    batch_size=2,
)

dataset.data

[([12, 6, 7, 10, 7, 10, 6, 7, 7, 7, 7, 11],
  [12, 1, 1, 1, 1, 4, 8, 1, 8, 1, 4, 11]),
 ([12, 7, 5, 9, 7, 7, 9, 10, 7, 9, 10, 11],
  [12, 8, 2, 1, 8, 2, 1, 1, 2, 3, 1, 11]),
 ([12, 10, 5, 9, 10, 9, 5, 7, 5, 9, 6, 11],
  [12, 4, 2, 3, 1, 3, 2, 8, 2, 3, 8, 11]),
 ([12, 6, 10, 10, 10, 10, 7, 9, 10, 6, 9, 11],
  [12, 2, 4, 8, 2, 1, 8, 8, 8, 8, 4, 11]),
 ([12, 6, 6, 6, 6, 6, 5, 5, 6, 7, 5, 11],
  [12, 3, 1, 4, 3, 3, 4, 4, 4, 4, 4, 11]),
 ([12, 6, 7, 5, 10, 7, 9, 5, 9, 6, 7, 11],
  [12, 1, 4, 2, 3, 2, 1, 8, 3, 1, 4, 11]),
 ([12, 7, 6, 9, 10, 5, 7, 6, 6, 5, 7, 11],
  [12, 1, 3, 4, 4, 1, 3, 8, 2, 4, 1, 11]),
 ([12, 5, 7, 7, 5, 9, 7, 5, 7, 10, 9, 11],
  [12, 2, 8, 1, 3, 1, 2, 3, 1, 1, 3, 11]),
 ([12, 5, 5, 10, 6, 7, 7, 9, 6, 10, 5, 11],
  [12, 3, 8, 4, 2, 1, 1, 4, 8, 3, 3, 11]),
 ([12, 10, 10, 10, 9, 6, 7, 5, 6, 5, 7, 11],
  [12, 1, 3, 4, 3, 1, 4, 2, 8, 8, 8, 11]),
 ([12, 5, 6, 6, 10, 7, 5, 6, 7, 5, 7, 11],
  [12, 1, 3, 1, 4, 3, 1, 8, 4, 4, 3, 11]),
 ([12, 6, 9, 10, 5, 7, 7, 5, 5, 10, 10, 11],


In [93]:
for batch_idx, (input_tokens, output_tokens) in enumerate(dataloader):
    print(f"Batch {batch_idx + 1}")
    print("Input tokens:", input_tokens)
    print("Output tokens:", output_tokens)
    print()

Batch 1
Input tokens: tensor([[12,  6,  3,  5,  3,  7,  7,  5,  3,  7,  7, 11],
        [12,  5,  3,  6,  7,  7,  9,  9,  5,  5,  5, 11]])
Output tokens: tensor([[12,  2,  2,  8,  1,  2,  2,  8,  1,  8,  4, 11],
        [12,  1,  1,  1, 10, 10,  2,  2,  4,  8,  1, 11]])

Batch 2
Input tokens: tensor([[12,  6,  7,  6,  3,  7,  5,  6,  7,  7,  5, 11],
        [12,  3,  3,  6,  9,  5,  7,  6,  9,  9,  6, 11]])
Output tokens: tensor([[12,  1,  2,  2,  4,  1,  2,  8,  4,  2,  4, 11],
        [12,  4, 10, 10,  4,  2,  1, 10,  4,  8,  8, 11]])

Batch 3
Input tokens: tensor([[12,  6,  6,  6,  3,  3,  6,  5,  6,  6,  7, 11]])
Output tokens: tensor([[12,  2,  4,  4,  1,  4,  8,  8,  4,  4,  4, 11]])



In [94]:
print("Input: ", dataset.decode(input_tokens[0]))
print("Output: ", dataset.decode(output_tokens[0]))
print("Output tokens: ", output_tokens)

Input:  <SOS> ¿ ¿ ¿ [ [ ¿ < ¿ ¿ ( <EOS>
Output:  <SOS> ) ? ? > ? ] ] ? ? ? <EOS>
Output tokens:  tensor([[12,  2,  4,  4,  1,  4,  8,  8,  4,  4,  4, 11]])


In [95]:
output_tokens[:, 1:].reshape(-1)

tensor([ 2,  4,  4,  1,  4,  8,  8,  4,  4,  4, 11])

In [91]:
a = remove_eos(output_tokens)
a

tensor([[12, 10]])