In [20]:
import random

def generate_math_problem():
    """Return one randomly generated word problem together with its answer.

    One of eight templates is picked uniformly at random (addition,
    distance = speed * time, rectangle area, money left, production total,
    speed from distance, rectangle perimeter, marbles left). The numbers are
    drawn with ``random.randint`` and the correct answer is embedded in the
    returned sentence after ``"Answer: "``.

    Returns:
        str: a single-line problem statement ending with its answer.
    """
    problem_type = random.randint(1, 8)

    if problem_type == 1:
        name = random.choice(['John', 'Sara', 'Alex'])
        a = random.randint(1, 10)
        b = random.randint(1, 10)
        item = random.choice(['apples', 'candies'])
        return f"{name} has {a} {item}. They get {b} more. How many do they have now? Answer: {a + b}."

    elif problem_type == 2:
        v = random.randint(30, 100)
        t = random.randint(1, 5)
        return f"A car travels at {v} km/h for {t} hours. How far does it go? Answer: {v * t} km."

    elif problem_type == 3:
        w = random.randint(2, 10)
        l = random.randint(2, 10)
        return f"A rectangle has width {w} and length {l}. What is the area? Answer: {w * l}."

    elif problem_type == 4:
        p = random.randint(1000, 5000)
        s = random.randint(100, 900)
        return f"A person has ${p} and spends ${s}. How much is left? Answer: ${p - s}."

    elif problem_type == 5:
        r = random.randint(5, 20)
        h = random.randint(1, 10)
        return f"A machine produces {r} units/hour. How many in {h} hours? Answer: {r * h} units."

    elif problem_type == 6:
        # Draw the speed and derive the distance from it, so the distance is
        # always even (still within 50..200) and the stated answer is exact.
        # Drawing an arbitrary distance and floor-dividing it by 2 produced a
        # wrong answer for every odd distance.
        speed = random.randint(25, 100)
        distance = 2 * speed
        return f"A train moves {distance} km in 2 hours. What is the speed? Answer: {speed} km/h."

    elif problem_type == 7:
        side1 = random.randint(3, 15)
        side2 = random.randint(3, 15)
        return f"Rectangle with sides {side1} and {side2}. Perimeter? Answer: {2 * (side1 + side2)}."

    else:  # problem_type == 8
        # total >= 20 and given <= 20, so the answer is never negative.
        total = random.randint(20, 50)
        given = random.randint(5, 20)
        return f"You have {total} marbles and give away {given}. How many left? Answer: {total - given}."

# Build the dataset: 100 independently generated problem strings.
math_data = [generate_math_problem() for _sample in range(100)]

# Preview the first five problems, numbered from 1.
for number, problem in enumerate(math_data[:5], start=1):
    print(f"Problem {number}: {problem}")

Problem 1: You have 46 marbles and give away 12. How many left? Answer: 34.
Problem 2: A rectangle has width 5 and length 4. What is the area? Answer: 20.
Problem 3: A person has $4663 and spends $453. How much is left? Answer: $4210.
Problem 4: A car travels at 80 km/h for 4 hours. How far does it go? Answer: 320 km.
Problem 5: A machine produces 19 units/hour. How many in 1 hours? Answer: 19 units.


In [25]:
# Sanity check on the dataset size. The notebook never defines `toy_data`;
# the generated problems live in `math_data`, whose length matches the
# recorded output of 100.
len(math_data)

100

In [3]:
import torch

In [57]:
from collections import Counter

# Character-level vocabulary over the whole corpus. Problems are joined with
# newlines, so '\n' is itself one of the characters.
all_text = "\n".join(math_data)
chars = sorted(set(all_text))
vocab_size = len(chars)

# Lookup tables in both directions: id -> character and character -> id.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

def encode(s):
    """Map every character of ``s`` to its integer id via ``stoi``.

    Returns a list of ints; a character missing from ``stoi`` raises KeyError.
    """
    return list(map(stoi.__getitem__, s))

def decode(l):
    """Turn a sequence of integer ids back into a string using ``itos``."""
    return ''.join(itos[token_id] for token_id in l)


In [8]:

# len(train_data)

288

In [7]:
# len(val_data)

32

In [28]:
import torch

# Number of tokens (characters) in each training window.
block_size = 64

# Encode the newline-joined corpus into one flat list of token ids.
full_text = "\n".join(math_data)
data = encode(full_text)

# The first 90% of the token stream is used for training; the rest is held out.
n = int(0.9 * len(data))
train_data, val_data = torch.tensor(data[:n]), torch.tensor(data[n:])

def get_batch(split, batch_size=32):
    """Sample a random batch of (input, target) windows from one split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.
        batch_size: number of windows to draw.

    Returns:
        ``(x, y)``: two tensors stacked from ``batch_size`` windows of
        ``block_size`` tokens each, where every row of ``y`` is the matching
        row of ``x`` shifted one position forward in the source tensor.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Start offsets are capped so the target window (one past the input) stays in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)


In [51]:
import torch
import torch.nn as nn

class GPTConfig:
    """Hyperparameters for the ``GPT`` model.

    Args:
        vocab_size: number of distinct tokens the model can read and emit.
        block_size: maximum sequence length the model accepts.
        n_layer: number of stacked transformer layers.
        n_head: number of attention heads per layer.
        n_embd: embedding width; must be a multiple of ``n_head``.
        dropout: dropout probability, in ``[0, 1]``.

    Raises:
        ValueError: if ``n_head`` is not positive, ``n_embd`` is not a
            multiple of ``n_head``, or ``dropout`` lies outside ``[0, 1]``.
            Checking here reports the problem where the configuration is
            written rather than later, inside the PyTorch layer constructors.
    """

    def __init__(self, vocab_size, block_size=128, n_layer=4, n_head=4, n_embd=64, dropout=0.1):
        if n_head <= 0:
            raise ValueError(f"n_head must be positive, got {n_head}")
        if n_embd % n_head != 0:
            raise ValueError(
                f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
            )
        if not 0.0 <= dropout <= 1.0:
            raise ValueError(f"dropout must be in [0, 1], got {dropout}")

        self.vocab_size = vocab_size
        self.block_size = block_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout

    def __repr__(self):
        # Readable display when a config is the last expression of a cell.
        fields = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"GPTConfig({fields})"

class GPT(nn.Module):
    """Small GPT-style language model built on ``nn.TransformerEncoder``.

    Token embeddings are added to a learned positional embedding (initialised
    to zeros), passed through the encoder stack under a causal mask, then
    layer-normalised and projected to per-token vocabulary logits.

    Args:
        config: object exposing ``vocab_size``, ``block_size``, ``n_layer``,
            ``n_head``, ``n_embd`` and ``dropout`` attributes.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        embed_dim = config.n_embd

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.pos_embedding = nn.Parameter(torch.zeros(1, config.block_size, embed_dim))
        self.dropout = nn.Dropout(config.dropout)

        self.blocks = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=embed_dim,
                nhead=config.n_head,
                dropout=config.dropout,
                batch_first=True,
            ),
            num_layers=config.n_layer,
        )
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, config.vocab_size)

        self._init_weights()

    def _init_weights(self):
        """Re-initialise the token-embedding and output-head weights from N(0, 0.02)."""
        for weight in (self.token_embedding.weight, self.head.weight):
            nn.init.normal_(weight, mean=0.0, std=0.02)

    def forward(self, idx):
        """Compute next-token logits for a batch of token ids.

        Args:
            idx: 2-D tensor of token ids, indexed as (batch, time).

        Returns:
            Logits of shape (batch, time, vocab_size).

        Raises:
            ValueError: if the time dimension exceeds ``config.block_size``.
        """
        _, seq_len = idx.shape
        max_len = self.config.block_size
        if seq_len > max_len:
            raise ValueError(f"Input too long ({seq_len} tokens), max is {max_len}")

        # (B, T, C) token embeddings plus the first T positional rows, (1, T, C).
        hidden = self.token_embedding(idx) + self.pos_embedding[:, :seq_len, :]
        hidden = self.dropout(hidden)

        # True strictly above the diagonal marks the future positions a token
        # must not attend to.
        future = torch.ones(seq_len, seq_len, device=idx.device).triu(diagonal=1).bool()
        hidden = self.blocks(hidden, mask=future, src_key_padding_mask=None)

        return self.head(self.ln_f(hidden))  # (B, T, vocab_size)


In [52]:
import torch.nn.functional as F

# Prefer the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = GPT(GPTConfig(vocab_size))
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

for step in range(4000):
    x, y = (batch.to(device) for batch in get_batch('train'))

    # Flatten (B, T, C) logits and (B, T) targets so cross-entropy scores every position.
    logits = model(x)
    B, T, C = logits.shape
    loss = F.cross_entropy(logits.reshape(B * T, C), y.reshape(B * T))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the training loss every 100 steps.
    if not step % 100:
        print(f"Step {step}, Loss: {loss.item():.4f}")


Step 0, Loss: 3.9098
Step 100, Loss: 1.5354
Step 200, Loss: 0.5473
Step 300, Loss: 0.3424
Step 400, Loss: 0.3003
Step 500, Loss: 0.2764
Step 600, Loss: 0.2538
Step 700, Loss: 0.2379
Step 800, Loss: 0.2297
Step 900, Loss: 0.2412
Step 1000, Loss: 0.2367
Step 1100, Loss: 0.1890
Step 1200, Loss: 0.2008
Step 1300, Loss: 0.1823
Step 1400, Loss: 0.1797
Step 1500, Loss: 0.1688
Step 1600, Loss: 0.1611
Step 1700, Loss: 0.1633
Step 1800, Loss: 0.1505
Step 1900, Loss: 0.1552
Step 2000, Loss: 0.1423
Step 2100, Loss: 0.1290
Step 2200, Loss: 0.1397
Step 2300, Loss: 0.1406
Step 2400, Loss: 0.1324
Step 2500, Loss: 0.1319
Step 2600, Loss: 0.1301
Step 2700, Loss: 0.1162
Step 2800, Loss: 0.1187
Step 2900, Loss: 0.1278
Step 3000, Loss: 0.1196
Step 3100, Loss: 0.1222
Step 3200, Loss: 0.1284
Step 3300, Loss: 0.1270
Step 3400, Loss: 0.1239
Step 3500, Loss: 0.1031
Step 3600, Loss: 0.1047
Step 3700, Loss: 0.1201
Step 3800, Loss: 0.1135
Step 3900, Loss: 0.1225


In [46]:
@torch.no_grad()
def generate(idx, max_new_tokens=100):
    """Extend ``idx`` by sampling ``max_new_tokens`` tokens from the global ``model``.

    Args:
        idx: tensor of token ids, indexed as (batch, time), used as the prompt.
        max_new_tokens: number of tokens to append.

    Returns:
        The prompt tensor with the sampled tokens concatenated along dim 1.

    Note:
        Switches ``model`` to eval mode and leaves it there.
    """
    model.eval()
    for _step in range(max_new_tokens):
        # Feed only the most recent ``block_size`` tokens to the model.
        context = idx[:, -block_size:]
        # Keep the scores for the final position, i.e. the next token.
        last_logits = model(context)[:, -1, :]
        sampled = torch.multinomial(F.softmax(last_logits, dim=-1), num_samples=1)
        idx = torch.cat((idx, sampled), dim=1)
    return idx

# Sample 200 new tokens starting from the single character "A", then print the text.
start = torch.tensor([[stoi["A"]]], device=device)
out = generate(start, max_new_tokens=200)
sample_ids = out[0].tolist()
print(decode(sample_ids))


Answer: 14.
A person has $2685 and spends $548. How much is left? Answer: $933.
Sara has 9 candies. They get 2 more. How many do they have now? Answer: 7.
A person has $15000 and spends $548. How much 


In [53]:
@torch.no_grad()
def generate(prompt: str, max_new_tokens: int = 100, temperature: float = 1.0, top_k: int = None):
    """Continue ``prompt`` by sampling characters from the global ``model``.

    Args:
        prompt: non-empty text to condition on; every character must be in
            the vocabulary known to ``encode``.
        max_new_tokens: number of tokens to append.
        temperature: divisor applied to the logits before sampling. ``0``
            selects greedy decoding (always the most likely token) instead of
            dividing by zero.
        top_k: if given, sample only among the ``top_k`` highest-scoring
            tokens (capped at the vocabulary size); ``None`` samples from the
            full distribution.

    Returns:
        str: the prompt followed by the generated text.

    Raises:
        ValueError: if ``prompt`` is empty, ``temperature`` is negative, or
            ``top_k`` is given but smaller than 1.

    Note:
        Switches ``model`` to eval mode and leaves it there.
    """
    # Validate up front so bad arguments give a clear error instead of a
    # failure deep inside the model or the sampling calls.
    if not prompt:
        raise ValueError("prompt must contain at least one character")
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")
    if top_k is not None and top_k < 1:
        raise ValueError(f"top_k must be a positive integer or None, got {top_k}")

    model.eval()
    device = next(model.parameters()).device

    idx = torch.tensor([encode(prompt)], dtype=torch.long).to(device)

    for _ in range(max_new_tokens):
        # Feed only the most recent ``block_size`` tokens; keep the last position's scores.
        idx_cond = idx[:, -block_size:]
        logits = model(idx_cond)[:, -1, :]

        if temperature == 0:
            # Greedy decoding: scaling by 1/0 is undefined, so take the argmax directly.
            next_token = logits.argmax(dim=-1, keepdim=True)
        else:
            logits = logits / temperature
            if top_k is not None:
                # Renormalise over the k best tokens, then map the sampled
                # position back to its vocabulary id.
                k = min(top_k, logits.shape[-1])
                top_logits, top_indices = torch.topk(logits, k)
                probs = F.softmax(top_logits, dim=-1)
                next_token = top_indices.gather(1, torch.multinomial(probs, num_samples=1))
            else:
                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

        idx = torch.cat((idx, next_token), dim=1)

    return decode(idx[0].tolist())



In [54]:
# Sample 100 new tokens after the prompt with temperature 0.8, restricted to
# the 20 highest-scoring tokens at each step (top_k is smaller than vocab size).
output_text = generate(prompt="A train travels", max_new_tokens=100, temperature=0.8, top_k=20)
print(output_text)



A train travels at 45 km/h for 2 hours. How far does it go? Answer: 90 km.
A car travels at 78 km/h for 1 hours. Ho


In [56]:

# Persist the model's state dict (its parameters and buffers) to disk.
checkpoint_path = "mini_gpt_v2.pth"
torch.save(model.state_dict(), checkpoint_path)
