In [1]:
import sys
from pathlib import Path

# Make the repository root (the parent of the current working directory)
# importable, so that `src.*` packages resolve from this notebook.
PROJECT_ROOT = Path.cwd().parent
# Only insert when the root is not already the first entry: re-running this
# cell would otherwise push one more duplicate onto sys.path each time.
if str(PROJECT_ROOT) not in sys.path[:1]:
    sys.path.insert(0, str(PROJECT_ROOT))

### Loss analysis

In [2]:
from torch.utils.data import DataLoader
from src.mintrans import FibonacciModDataset, MinimalTransformer, evaluate_model, train_model
import torch


# Sanity check: build a tiny dataset and print its first item
# (a pair of tensors in the recorded output).
data = FibonacciModDataset(num_samples=10)
# Use subscription rather than calling the `__getitem__` dunder directly.
print(data[0])

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0]))


### With the default `10` epochs

In [3]:
# Baseline experiment: 5000 samples, batch size 32, freshly constructed model.
vocab_size = 10  # doubles as the dataset's modulus (`mod`)
train_ds = FibonacciModDataset(mod=vocab_size, num_samples=5000)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=32)

model = MinimalTransformer(vocab_size=vocab_size)
# No `epochs` argument is passed, so train_model falls back to its default
# (10 epochs according to the recorded output).
train_model(model, train_loader)
# NOTE(review): accuracy is measured on train_loader, i.e. the same data the
# model was trained on; no held-out set is used in this cell.
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.1729
Epoch 2, Loss: 1.2150
Epoch 3, Loss: 0.9172
Epoch 4, Loss: 0.7751
Epoch 5, Loss: 0.6967
Epoch 6, Loss: 0.6472
Epoch 7, Loss: 0.6076
Epoch 8, Loss: 0.5777
Epoch 9, Loss: 0.5520
Epoch 10, Loss: 0.5352
Accuracy: 81.46%


### Training for `40` more epochs (`50` in total)

In [4]:
# Continue training the existing `model` (no new model is built in this cell),
# so these 40 epochs come on top of the epochs it has already been trained for;
# the recorded loss picks up close to where the previous run ended.
# NOTE(review): this cell is not idempotent - re-running it presumably trains
# the same model even further.
train_model(model, train_loader, epochs=40)
# NOTE(review): accuracy is measured on train_loader, the training data itself.
evaluate_model(model, train_loader)

Epoch 1, Loss: 0.5147
Epoch 2, Loss: 0.5000
Epoch 3, Loss: 0.4894
Epoch 4, Loss: 0.4834
Epoch 5, Loss: 0.4745
Epoch 6, Loss: 0.4646
Epoch 7, Loss: 0.4616
Epoch 8, Loss: 0.4557
Epoch 9, Loss: 0.4459
Epoch 10, Loss: 0.4444
Epoch 11, Loss: 0.4399
Epoch 12, Loss: 0.4384
Epoch 13, Loss: 0.4349
Epoch 14, Loss: 0.4289
Epoch 15, Loss: 0.4268
Epoch 16, Loss: 0.4252
Epoch 17, Loss: 0.4303
Epoch 18, Loss: 0.4188
Epoch 19, Loss: 0.4125
Epoch 20, Loss: 0.4268
Epoch 21, Loss: 0.4236
Epoch 22, Loss: 0.4155
Epoch 23, Loss: 0.4152
Epoch 24, Loss: 0.4113
Epoch 25, Loss: 0.4143
Epoch 26, Loss: 0.4087
Epoch 27, Loss: 0.4060
Epoch 28, Loss: 0.4023
Epoch 29, Loss: 0.4089
Epoch 30, Loss: 0.4046
Epoch 31, Loss: 0.4199
Epoch 32, Loss: 0.4043
Epoch 33, Loss: 0.4000
Epoch 34, Loss: 0.4046
Epoch 35, Loss: 0.4011
Epoch 36, Loss: 0.4086
Epoch 37, Loss: 0.4040
Epoch 38, Loss: 0.3979
Epoch 39, Loss: 0.3941
Epoch 40, Loss: 0.3942
Accuracy: 86.68%


We get a `~5.22` percentage-point accuracy increase (`81.46%` → `86.68%`) with `5` times as many epochs (`50` in total instead of `10`)

### Increasing the batch size from `32` to `64`, keeping the number of epochs at `10`

Accuracy varies between roughly 75% and 80% from run to run (the run recorded below reached 81.44%)

In [5]:
# Batch-size experiment: same setup as the batch-size-32 run, but with
# batch_size=64. Dataset, loader and model are all rebuilt from scratch.
vocab_size = 10  # doubles as the dataset's modulus (`mod`)
train_ds = FibonacciModDataset(mod=vocab_size, num_samples=5000)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=64)

model = MinimalTransformer(vocab_size=vocab_size)
# No `epochs` argument: train_model uses its default (10 epochs per the output).
train_model(model, train_loader)
# NOTE(review): evaluated on train_loader, i.e. on the training data itself.
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.2914
Epoch 2, Loss: 1.5690
Epoch 3, Loss: 1.1068
Epoch 4, Loss: 0.9250
Epoch 5, Loss: 0.8132
Epoch 6, Loss: 0.7357
Epoch 7, Loss: 0.6830
Epoch 8, Loss: 0.6449
Epoch 9, Loss: 0.6154
Epoch 10, Loss: 0.5877
Accuracy: 81.44%


### Switching back to the baseline `batch_size` of `32`

In [6]:
# Repeat of the batch-size-32 configuration with a newly built dataset,
# loader and model.
# NOTE(review): no RNG seed is set in the visible cells, so the numbers may
# differ from the earlier batch-size-32 run.
vocab_size = 10  # doubles as the dataset's modulus (`mod`)
train_ds = FibonacciModDataset(mod=vocab_size, num_samples=5000)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=32)

model = MinimalTransformer(vocab_size=vocab_size)
# No `epochs` argument: train_model uses its default (10 epochs per the output).
train_model(model, train_loader)
# NOTE(review): evaluated on train_loader, i.e. on the training data itself.
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.0949
Epoch 2, Loss: 1.1679
Epoch 3, Loss: 0.9011
Epoch 4, Loss: 0.7663
Epoch 5, Loss: 0.6871
Epoch 6, Loss: 0.6363
Epoch 7, Loss: 0.6009
Epoch 8, Loss: 0.5723
Epoch 9, Loss: 0.5491
Epoch 10, Loss: 0.5301
Accuracy: 82.67%
