In [20]:
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root.
# NOTE(review): assumes the notebook is launched from a direct sub-directory of the project — confirm.
PROJECT_ROOT = Path.cwd().parent
# Put the project root first on the import path; presumably so the `src.mintrans` import below resolves.
sys.path.insert(0, str(PROJECT_ROOT))

### Loss analysis

In [21]:
from torch.utils.data import DataLoader
from src.mintrans import FibonacciModDataset, MinimalTransformer, evaluate_model, train_model
import torch

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Tiny throwaway dataset, built only to inspect what a single sample looks like.
data = FibonacciModDataset(num_samples=10)
# Index the dataset directly instead of calling the dunder method by hand.
# The item is an (input, target) pair; the target is the input shifted one step ahead.
print(data[0])

(tensor([2, 2, 4, 6, 0, 6, 6, 2, 8]), tensor([2, 4, 6, 0, 6, 6, 2, 8, 0]))


### With the default number of epochs (`12`)

In [22]:
# The same value is used as the dataset modulus and as the model's vocabulary size.
vocab_size = 10
train_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)



# Fresh model; train_model is called without `epochs`, so its default epoch count applies.
model = MinimalTransformer(vocab_size=vocab_size).to(device=device)
train_model(model, train_loader)
# NOTE: evaluated on the same loader that was used for training, so this is training accuracy.
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.2085
Epoch 2, Loss: 1.3469
Epoch 3, Loss: 0.9227
Epoch 4, Loss: 0.7797
Epoch 5, Loss: 0.6970
Epoch 6, Loss: 0.6500
Epoch 7, Loss: 0.6144
Epoch 8, Loss: 0.5838
Epoch 9, Loss: 0.5582
Epoch 10, Loss: 0.5343
Epoch 11, Loss: 0.5168
Epoch 12, Loss: 0.5009
Total Training Time: 9.10 seconds
Accuracy: 83.65%


### Training for `40` additional epochs

In [23]:
# `model` is not re-created in this cell, so these 40 epochs continue from whatever weights it already has.
train_model(model, train_loader, epochs=40)
# Still measured on the training loader, i.e. this is training accuracy.
evaluate_model(model, train_loader)

Epoch 1, Loss: 0.4848
Epoch 2, Loss: 0.4731
Epoch 3, Loss: 0.4632
Epoch 4, Loss: 0.4517
Epoch 5, Loss: 0.4482
Epoch 6, Loss: 0.4443
Epoch 7, Loss: 0.4356
Epoch 8, Loss: 0.4338
Epoch 9, Loss: 0.4325
Epoch 10, Loss: 0.4235
Epoch 11, Loss: 0.4285
Epoch 12, Loss: 0.4201
Epoch 13, Loss: 0.4167
Epoch 14, Loss: 0.4155
Epoch 15, Loss: 0.4127
Epoch 16, Loss: 0.4070
Epoch 17, Loss: 0.4126
Epoch 18, Loss: 0.4066
Epoch 19, Loss: 0.4104
Epoch 20, Loss: 0.4030
Epoch 21, Loss: 0.3977
Epoch 22, Loss: 0.3975
Epoch 23, Loss: 0.3959
Epoch 24, Loss: 0.4166
Epoch 25, Loss: 0.3956
Epoch 26, Loss: 0.4029
Epoch 27, Loss: 0.4009
Epoch 28, Loss: 0.3895
Epoch 29, Loss: 0.3946
Epoch 30, Loss: 0.3896
Epoch 31, Loss: 0.3911
Epoch 32, Loss: 0.4027
Epoch 33, Loss: 0.3889
Epoch 34, Loss: 0.3887
Epoch 35, Loss: 0.3889
Epoch 36, Loss: 0.3948
Epoch 37, Loss: 0.3885
Epoch 38, Loss: 0.3938
Epoch 39, Loss: 0.3797
Epoch 40, Loss: 0.3901
Total Training Time: 29.22 seconds
Accuracy: 86.44%


Accuracy rises from `83.65%` to `86.44%` (about `2.8` percentage points) after `40` additional epochs of training on the same model.

### Increasing the batch size from `32` to `64` with the default `12` epochs

Accuracy falls to roughly 75–80% (`78.31%` in the run below).

In [24]:
vocab_size = 10 # also passed as the dataset's modulus (`mod`) below
train_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)
# Only change from the previous experiment: batch_size is 64 instead of 32.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)



# A new model is created, so training starts from freshly initialised weights.
model = MinimalTransformer(vocab_size=vocab_size).to(device=device)
train_model(model, train_loader)
# NOTE: evaluated on the training loader, so this is training accuracy.
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.2940
Epoch 2, Loss: 1.8292
Epoch 3, Loss: 1.5082
Epoch 4, Loss: 1.3372
Epoch 5, Loss: 1.2004
Epoch 6, Loss: 1.0723
Epoch 7, Loss: 0.9414
Epoch 8, Loss: 0.8482
Epoch 9, Loss: 0.7849
Epoch 10, Loss: 0.7374
Epoch 11, Loss: 0.6887
Epoch 12, Loss: 0.6579
Total Training Time: 2.89 seconds
Accuracy: 78.31%


### Switching back to default `batch_size` of `32`

In [25]:
vocab_size = 10 # also passed as the dataset's modulus (`mod`) below
# A new dataset is sampled here, so this run does not see exactly the same samples as the earlier batch_size=32 run.
train_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)



# A new model is created, so training starts from freshly initialised weights.
model = MinimalTransformer(vocab_size=vocab_size).to(device=device)
train_model(model, train_loader)
# NOTE: evaluated on the training loader, so this is training accuracy.
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.0016
Epoch 2, Loss: 1.1204
Epoch 3, Loss: 0.9242
Epoch 4, Loss: 0.8132
Epoch 5, Loss: 0.7289
Epoch 6, Loss: 0.6728
Epoch 7, Loss: 0.6383
Epoch 8, Loss: 0.6069
Epoch 9, Loss: 0.5875
Epoch 10, Loss: 0.5706
Epoch 11, Loss: 0.5528
Epoch 12, Loss: 0.5378
Total Training Time: 5.56 seconds
Accuracy: 81.48%


## Split dataset (train / test)

In [26]:
from torch.utils.data import random_split

vocab_size = 10
generated_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)
train_size = int(0.8 * len(generated_ds)) # 80% of the samples go to training
test_size = len(generated_ds) - train_size # the remainder is held out for testing

# Random partition of the sample indices into two disjoint subsets.
# NOTE(review): disjoint indices can still hold identical sequences if the dataset repeats windows — verify.
train_ds, test_ds = random_split(generated_ds, [train_size, test_size]) # randomly splits our dataset

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
# No shuffling for the test loader; order does not affect the accuracy total.
test_loader = DataLoader(test_ds, batch_size=32)

model = MinimalTransformer(vocab_size=vocab_size).to(device=device)
train_model(model, train_loader)

# First experiment that reports accuracy on the held-out subset instead of the training data.
evaluate_model(model, test_loader)

Epoch 1, Loss: 2.2056
Epoch 2, Loss: 1.4975
Epoch 3, Loss: 1.0486
Epoch 4, Loss: 0.8584
Epoch 5, Loss: 0.7431
Epoch 6, Loss: 0.6609
Epoch 7, Loss: 0.5904
Epoch 8, Loss: 0.5428
Epoch 9, Loss: 0.5086
Epoch 10, Loss: 0.4825
Epoch 11, Loss: 0.4651
Epoch 12, Loss: 0.4503
Total Training Time: 7.67 seconds
Accuracy: 84.91%


## Split dataset with an increased number of epochs

In [28]:
import torch.nn as nn 

def train_model(model, dataloader, epochs=10, lr=1e-3):
    """Train ``model`` with Adam and cross-entropy, printing the loss after every epoch.

    Training accuracy is tracked per epoch as well. It is computed from the
    logits produced during the update steps (model in train mode), and the mean
    over all epochs is printed once training finishes.

    Args:
        model: Module mapping a batch of token ids to logits whose last
            dimension is the number of classes.
        dataloader: Iterable of ``(x, y)`` tensor batches. It is iterated once
            per epoch.
        epochs: Number of passes over ``dataloader``. ``0`` is a no-op.
        lr: Learning rate passed to Adam.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    model.train()
    # Move batches to wherever the model lives instead of relying on a
    # notebook-global `device` that may not match (or may not exist).
    run_device = next(model.parameters()).device

    accuracy_per_e = []

    for epoch in range(epochs):
        total_loss = 0.0
        correct, total, num_batches = 0, 0, 0

        for x, y in dataloader:
            x, y = x.to(run_device), y.to(run_device)
            optimizer.zero_grad()
            logits = model(x)
            # Flatten (batch, seq) into one axis so every position is one classification example.
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            loss.backward()
            optimizer.step()

            pred = logits.argmax(dim=-1)
            correct += (pred == y).sum().item()
            total += y.numel()
            total_loss += loss.item()
            num_batches += 1

        # Counting batches (rather than len(dataloader)) also gives a clear
        # error instead of a ZeroDivisionError when there is nothing to train on.
        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        acc = correct / total if total else 0.0
        avg_loss = total_loss / num_batches
        accuracy_per_e.append(acc)

        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # With epochs=0 there is no accuracy history to average.
    if accuracy_per_e:
        print(f"Mean accuracy across epochs (Training): {sum(accuracy_per_e) / len(accuracy_per_e):.4%}")

# Calls the train_model defined just above, which shadows the one imported from src.mintrans.
# `model` and the loaders are not re-created here, so training continues from the model's current weights.
train_model(model, train_loader, epochs=150)
# Accuracy on the held-out loader.
evaluate_model(model, test_loader)

Epoch 1, Loss: 0.4396
Epoch 2, Loss: 0.4281
Epoch 3, Loss: 0.4209
Epoch 4, Loss: 0.4162
Epoch 5, Loss: 0.4101
Epoch 6, Loss: 0.4073
Epoch 7, Loss: 0.4027
Epoch 8, Loss: 0.3937
Epoch 9, Loss: 0.3876
Epoch 10, Loss: 0.3877
Epoch 11, Loss: 0.3894
Epoch 12, Loss: 0.3961
Epoch 13, Loss: 0.3793
Epoch 14, Loss: 0.3711
Epoch 15, Loss: 0.3725
Epoch 16, Loss: 0.3714
Epoch 17, Loss: 0.3675
Epoch 18, Loss: 0.3629
Epoch 19, Loss: 0.3669
Epoch 20, Loss: 0.3947
Epoch 21, Loss: 0.3614
Epoch 22, Loss: 0.3645
Epoch 23, Loss: 0.3609
Epoch 24, Loss: 0.3557
Epoch 25, Loss: 0.3534
Epoch 26, Loss: 0.3624
Epoch 27, Loss: 0.3521
Epoch 28, Loss: 0.3499
Epoch 29, Loss: 0.3490
Epoch 30, Loss: 0.3495
Epoch 31, Loss: 0.3479
Epoch 32, Loss: 0.3565
Epoch 33, Loss: 0.3511
Epoch 34, Loss: 0.3419
Epoch 35, Loss: 0.3448
Epoch 36, Loss: 0.3447
Epoch 37, Loss: 0.3472
Epoch 38, Loss: 0.3524
Epoch 39, Loss: 0.3426
Epoch 40, Loss: 0.3446
Epoch 41, Loss: 0.3423
Epoch 42, Loss: 0.3372
Epoch 43, Loss: 0.3382
Epoch 44, Loss: 0.33

## Hyperparameter adjustment

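Adjusting hyperparameters (with the settings listed below, the model no longer learns). Note that the code cell that follows currently uses different values: `batch_size=1536`, `epochs=56`, `d_model=512`, `n_heads=16`, `num_layers=1`.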


`batch_size`: `256`

`epoch`: `20`

`LR`: `10^-3`

`d_model`: `32`

`n_head`: `8`

`n_layer`: `6` 

In [None]:


vocab_size = 10
# Large batch: with 4000 training samples (80% of 5000) this yields 3 batches, i.e. 3 optimizer steps per epoch.
batch_size = 1536
generated_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)

# 80/20 train/test split; `random_split` is imported in an earlier cell.
train_size = int(0.8 * len(generated_ds))
test_size = len(generated_ds) - train_size
train_ds, test_ds = random_split(generated_ds, [train_size, test_size]) 

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=batch_size)


# Much wider model than the default used in the earlier cells (d_model=512, 16 heads), still a single layer.
model = MinimalTransformer(vocab_size=vocab_size, d_model=512,n_heads=16, num_layers=1).to(device)

train_model(model, train_loader, epochs=56)
# Accuracy on the held-out split.
evaluate_model(model, test_loader)


Epoch 1, Loss: 2.7638
Epoch 2, Loss: 1.8719
Epoch 3, Loss: 1.1850
Epoch 4, Loss: 0.7841
Epoch 5, Loss: 0.5680
Epoch 6, Loss: 0.4283
Epoch 7, Loss: 0.3474
Epoch 8, Loss: 0.3066
Epoch 9, Loss: 0.2869
Epoch 10, Loss: 0.2835
Epoch 11, Loss: 0.2811
Epoch 12, Loss: 0.3027
Epoch 13, Loss: 0.3268
Epoch 14, Loss: 0.3183
Epoch 15, Loss: 0.3314
Epoch 16, Loss: 0.3211
Epoch 17, Loss: 0.3060
Epoch 18, Loss: 0.3006
Epoch 19, Loss: 0.2930
Epoch 20, Loss: 0.2812
Epoch 21, Loss: 0.2802
Epoch 22, Loss: 0.2741
Epoch 23, Loss: 0.2739
Epoch 24, Loss: 0.2709
Epoch 25, Loss: 0.2697
Epoch 26, Loss: 0.2689
Epoch 27, Loss: 0.2649
Epoch 28, Loss: 0.2640
Epoch 29, Loss: 0.2667
Epoch 30, Loss: 0.2674
Epoch 31, Loss: 0.2657
Epoch 32, Loss: 0.2685
Epoch 33, Loss: 0.2680
Epoch 34, Loss: 0.2682
Epoch 35, Loss: 0.2666
Epoch 36, Loss: 0.2650
Epoch 37, Loss: 0.2663
Epoch 38, Loss: 0.2652
Epoch 39, Loss: 0.2643
Epoch 40, Loss: 0.2623
Epoch 41, Loss: 0.2624
Epoch 42, Loss: 0.2609
Epoch 43, Loss: 0.2627
Epoch 44, Loss: 0.26

## Change in Fib Generator

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import random
import time


# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
# The training and evaluation helpers below read this module-level name.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class FibonacciModDataset(Dataset):
    """Next-token prediction samples cut from a Fibonacci sequence taken modulo ``mod``.

    One long sequence seeded with ``1, 1`` is generated once. ``num_samples``
    windows of ``seq_len + 1`` consecutive terms are then drawn at random start
    positions. Each sample is a pair ``(x, y)`` of ``torch.long`` tensors of
    length ``seq_len``: ``x`` is the window without its last term and ``y`` is
    the window without its first term (``x`` shifted one step ahead).

    Note:
        A Fibonacci sequence modulo ``mod`` is periodic (period 60 for
        ``mod=10``), so the number of *distinct* windows is bounded by that
        period however large ``num_samples`` is. Randomly splitting the samples
        into train and test sets therefore places identical sequences in both.

    Args:
        seq_len: Length of each input/target sequence; must be at least 1.
        mod: Modulus applied to every term; also the number of distinct tokens.
        num_samples: Number of windows to draw.
        global_len: Length of the underlying sequence the windows are cut from.

    Raises:
        ValueError: If ``seq_len`` or ``mod`` is not positive, ``num_samples``
            is negative, or a window of ``seq_len + 1`` terms does not fit in
            ``global_len``.
    """

    def __init__(self, seq_len=10, mod=10, num_samples=10000, global_len=1000):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        if mod < 1:
            raise ValueError(f"mod must be >= 1, got {mod}")
        if num_samples < 0:
            raise ValueError(f"num_samples must be >= 0, got {num_samples}")
        window = seq_len + 1  # one extra term so the target can be shifted by one
        if global_len < window:
            raise ValueError(
                f"global_len ({global_len}) is too short for windows of {window} terms"
            )

        self.mod = mod
        self.global_seq = self.generate_fib_sequence(global_len, mod)

        # torch.randint excludes its upper bound, so pass the *count* of valid
        # start positions (0 .. len - window inclusive); otherwise the last
        # window could never be drawn.
        num_starts = len(self.global_seq) - window + 1
        starts = torch.randint(0, num_starts, (num_samples,)).tolist()

        self.samples = []
        for start_idx in starts:
            seq = self.global_seq[start_idx:start_idx + window]
            x = torch.tensor(seq[:-1], dtype=torch.long)
            y = torch.tensor(seq[1:], dtype=torch.long)
            self.samples.append((x, y))

    def generate_fib_sequence(self, length, mod):
        """Return the first ``length`` Fibonacci terms (seeded 1, 1), each reduced modulo ``mod``."""
        # Reduce the seeds as well so every term is a valid token id in [0, mod).
        seq = [1 % mod, 1 % mod]
        while len(seq) < length:  # extend until the requested length is reached
            seq.append((seq[-1] + seq[-2]) % mod)
        # Trim the two seeds when fewer than two terms were requested.
        return seq[:max(length, 0)]

    def __len__(self):
        """Number of sampled windows."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` tensor pair stored at position ``idx``."""
        return self.samples[idx]

class MinimalTransformer(nn.Module):
    """Attention-only causal sequence model.

    Tokens are embedded and summed with a learned positional embedding. The
    result passes through ``num_layers`` multi-head self-attention layers, each
    wrapped in a residual connection, and is then projected to vocabulary
    logits. There is no feed-forward sub-layer and no normalisation.

    Args:
        vocab_size: Number of distinct token ids; also the size of the output logits.
        d_model: Embedding / attention width.
        n_heads: Number of attention heads per layer.
        num_layers: Number of stacked self-attention layers.
        max_seq_len: Longest sequence the positional embedding can represent.
    """

    def __init__(self, vocab_size, d_model=32, n_heads=2, num_layers=1, max_seq_len=20):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_seq_len, d_model)
        self.layers = nn.ModuleList([
            nn.MultiheadAttention(d_model, n_heads, batch_first=True)
            for _ in range(num_layers)
        ])
        self.out_proj = nn.Linear(d_model, vocab_size)

    def forward(self, tokens):
        """Map ``(batch, seq)`` token ids to ``(batch, seq, vocab_size)`` logits.

        Raises:
            ValueError: If the sequence is longer than the positional embedding table.
        """
        _, T = tokens.shape
        max_len = self.pos_embed.num_embeddings
        # Fail early with a readable message instead of an out-of-range
        # embedding lookup on positions >= max_seq_len.
        if T > max_len:
            raise ValueError(
                f"sequence length {T} exceeds max_seq_len {max_len}"
            )
        pos = torch.arange(T, device=tokens.device)
        x = self.token_embed(tokens) + self.pos_embed(pos).unsqueeze(0)
        # Additive causal mask: -inf strictly above the diagonal, 0 elsewhere,
        # so each position attends only to itself and earlier positions.
        attn_mask = torch.triu(
            torch.full((T, T), float('-inf'), device=tokens.device), diagonal=1
        )
        for attn in self.layers:
            attn_out, _ = attn(x, x, x, attn_mask=attn_mask)
            x = x + attn_out  # residual connection around the attention layer
        return self.out_proj(x)

def train_model(model, dataloader, epochs=12, lr=0.001):
    """Train ``model`` with Adam and cross-entropy, printing loss per epoch and total wall time.

    Args:
        model: Module mapping a batch of token ids to logits whose last
            dimension is the number of classes.
        dataloader: Iterable of ``(x, y)`` tensor batches. It is iterated once
            per epoch.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate passed to Adam.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    model.train()
    # Move batches to wherever the model lives instead of relying on a
    # module-level `device` that may not match the model's placement.
    run_device = next(model.parameters()).device
    start_time = time.time()

    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for x, y in dataloader:
            x, y = x.to(run_device), y.to(run_device)

            logits = model(x)
            # Flatten (batch, seq) into one axis so every position is one classification example.
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Counting batches (rather than len(dataloader)) gives a clear error
        # instead of a ZeroDivisionError when there is nothing to train on.
        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    end_time = time.time()
    print(f"Total Training Time: {end_time - start_time:.2f} seconds")

def evaluate_model(model, dataloader):
    """Print the token-level accuracy of ``model`` over every batch in ``dataloader``.

    A prediction is the argmax over the last logits dimension. Accuracy is the
    fraction of target positions predicted exactly. The model is switched to
    eval mode and left in that mode.

    Args:
        model: Module mapping a batch of token ids to per-position logits.
        dataloader: Iterable of ``(x, y)`` tensor batches.

    Raises:
        ValueError: If ``dataloader`` yields no target tokens.
    """
    # Follow the model's own placement rather than a module-level `device`;
    # a parameter-free model is evaluated on the CPU.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    correct, total = 0, 0
    model.eval()
    with torch.no_grad():  # no gradients are needed for scoring
        for x, y in dataloader:
            x, y = x.to(run_device), y.to(run_device)
            logits = model(x)
            pred = logits.argmax(dim=-1)
            correct += (pred == y).sum().item()
            total += y.numel()

    # Accuracy is undefined without samples; say so instead of dividing by zero.
    if total == 0:
        raise ValueError("dataloader yielded no target tokens; accuracy is undefined")
    print(f"Accuracy: {correct / total:.2%}")

# Seed PyTorch's global RNG so dataset sampling, the train/test split, batch
# shuffling and weight initialisation repeat under Restart & Run All
# (GPU kernels may still introduce small run-to-run differences).
SEED = 0
torch.manual_seed(SEED)

vocab_size = 10  # used both as the dataset modulus and as the model's vocabulary size
batch_size = 128
generated_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)



# 80/20 train/test split over sample indices.
# NOTE: the dataset draws windows from a periodic sequence, so the same
# sequence can appear in both subsets; the test accuracy below is therefore
# not a measure of generalisation to unseen sequences.
train_size = int(0.8 * len(generated_ds))
test_size = len(generated_ds) - train_size
train_ds, test_ds = random_split(generated_ds, [train_size, test_size])

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=batch_size)


# Default-sized model, as defined by MinimalTransformer's constructor defaults.
model = MinimalTransformer(vocab_size=vocab_size).to(device)

train_model(model, train_loader, epochs=30)
evaluate_model(model, test_loader)

Epoch 1, Loss: 2.1516
Epoch 2, Loss: 1.4913
Epoch 3, Loss: 0.9155
Epoch 4, Loss: 0.7043
Epoch 5, Loss: 0.5824
Epoch 6, Loss: 0.4974
Epoch 7, Loss: 0.4328
Epoch 8, Loss: 0.3848
Epoch 9, Loss: 0.3515
Epoch 10, Loss: 0.3262
Epoch 11, Loss: 0.3076
Epoch 12, Loss: 0.2928
Epoch 13, Loss: 0.2821
Epoch 14, Loss: 0.2727
Epoch 15, Loss: 0.2648
Epoch 16, Loss: 0.2573
Epoch 17, Loss: 0.2517
Epoch 18, Loss: 0.2483
Epoch 19, Loss: 0.2439
Epoch 20, Loss: 0.2394
Epoch 21, Loss: 0.2363
Epoch 22, Loss: 0.2316
Epoch 23, Loss: 0.2269
Epoch 24, Loss: 0.2222
Epoch 25, Loss: 0.2185
Epoch 26, Loss: 0.2171
Epoch 27, Loss: 0.2162
Epoch 28, Loss: 0.2146
Epoch 29, Loss: 0.2118
Epoch 30, Loss: 0.2122
Total Training Time: 6.78 seconds
Accuracy: 91.96%
