In [17]:
import sys
from pathlib import Path

# Assumes the notebook is launched from a sub-directory of the project, so the
# parent of the current working directory is the project root.
PROJECT_ROOT = Path.cwd().parent
# Put the project root first on sys.path so the `src.mintrans` package used by
# the following cells can be imported.
sys.path.insert(0, str(PROJECT_ROOT))

### Loss analysis

In [18]:
from torch.utils.data import DataLoader
from src.mintrans import FibonacciModDataset, MinimalTransformer, evaluate_model, train_model
import torch


# Build a tiny dataset just to inspect what a single sample looks like.
data = FibonacciModDataset(num_samples=10)
# Use subscription instead of calling the dunder method by hand; the sample is
# printed as a pair of tensors (the second is the first shifted by one step).
print(data[0])

(tensor([8, 6, 4, 0, 4, 4, 8, 2, 0]), tensor([6, 4, 0, 4, 4, 8, 2, 0, 2]))


### With the default `10` epochs

In [19]:
# Vocabulary size of the model; the same value is passed as the dataset's `mod`.
vocab_size = 10
train_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)



# Fresh model, trained with train_model's default arguments.
model = MinimalTransformer(vocab_size=vocab_size)
train_model(model, train_loader)
# NOTE(review): the model is scored on the same loader it was trained on, so
# the reported figure is training accuracy, not held-out accuracy.
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.0602
Epoch 2, Loss: 1.1740
Epoch 3, Loss: 0.8985
Epoch 4, Loss: 0.7712
Epoch 5, Loss: 0.7027
Epoch 6, Loss: 0.6506
Epoch 7, Loss: 0.6140
Epoch 8, Loss: 0.5821
Epoch 9, Loss: 0.5587
Epoch 10, Loss: 0.5316
Accuracy: 80.60%


### Training for `40` more epochs (`50` in total)

In [20]:
# `model` is not re-created here, so these 40 epochs continue from the weights
# left by the previous training call rather than starting from scratch.
train_model(model, train_loader, epochs=40)
# Still scored on the training loader.
evaluate_model(model, train_loader)

Epoch 1, Loss: 0.5116
Epoch 2, Loss: 0.4916
Epoch 3, Loss: 0.4788
Epoch 4, Loss: 0.4686
Epoch 5, Loss: 0.4590
Epoch 6, Loss: 0.4480
Epoch 7, Loss: 0.4384
Epoch 8, Loss: 0.4409
Epoch 9, Loss: 0.4311
Epoch 10, Loss: 0.4269
Epoch 11, Loss: 0.4181
Epoch 12, Loss: 0.4223
Epoch 13, Loss: 0.4145
Epoch 14, Loss: 0.4117
Epoch 15, Loss: 0.4083
Epoch 16, Loss: 0.3995
Epoch 17, Loss: 0.4002
Epoch 18, Loss: 0.4028
Epoch 19, Loss: 0.3982
Epoch 20, Loss: 0.3950
Epoch 21, Loss: 0.3922
Epoch 22, Loss: 0.3904
Epoch 23, Loss: 0.3969
Epoch 24, Loss: 0.3898
Epoch 25, Loss: 0.3896
Epoch 26, Loss: 0.3864
Epoch 27, Loss: 0.3873
Epoch 28, Loss: 0.3790
Epoch 29, Loss: 0.3807
Epoch 30, Loss: 0.3861
Epoch 31, Loss: 0.3761
Epoch 32, Loss: 0.3728
Epoch 33, Loss: 0.3689
Epoch 34, Loss: 0.3861
Epoch 35, Loss: 0.3781
Epoch 36, Loss: 0.3716
Epoch 37, Loss: 0.3714
Epoch 38, Loss: 0.3650
Epoch 39, Loss: 0.3740
Epoch 40, Loss: 0.3652
Accuracy: 87.31%


Accuracy increases by about `6.7` percentage points (`80.60%` → `87.31%`) after `40` additional epochs, i.e. `5` times as many epochs in total.

### Increasing the batch size from `32` to `64`, keeping `10` epochs

Accuracy stays roughly in the ~75–80% range.

In [21]:
vocab_size = 10 # that is mod in our case
# A new dataset is generated for this experiment instead of reusing the
# earlier one.
train_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)
# Only the batch size differs from the batch_size=32 runs.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)



# Re-create the model so training starts from freshly initialised weights.
model = MinimalTransformer(vocab_size=vocab_size)
train_model(model, train_loader)
# Scored on the training loader (training accuracy).
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.3182
Epoch 2, Loss: 1.8917
Epoch 3, Loss: 1.3045
Epoch 4, Loss: 1.0016
Epoch 5, Loss: 0.8647
Epoch 6, Loss: 0.7786
Epoch 7, Loss: 0.7189
Epoch 8, Loss: 0.6718
Epoch 9, Loss: 0.6379
Epoch 10, Loss: 0.6109
Accuracy: 80.25%


### Switching back to default `batch_size` of `32`

In [22]:
vocab_size = 10 # that is mod in our case
# A new dataset is generated for this run.
train_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)
# Back to a batch size of 32 for comparison with the batch_size=64 run.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)



# Re-create the model so training starts from freshly initialised weights.
model = MinimalTransformer(vocab_size=vocab_size)
train_model(model, train_loader)
# Scored on the training loader (training accuracy).
evaluate_model(model, train_loader)

Epoch 1, Loss: 2.2086
Epoch 2, Loss: 1.6410
Epoch 3, Loss: 1.3067
Epoch 4, Loss: 1.0093
Epoch 5, Loss: 0.8591
Epoch 6, Loss: 0.7811
Epoch 7, Loss: 0.7294
Epoch 8, Loss: 0.6971
Epoch 9, Loss: 0.6655
Epoch 10, Loss: 0.6406
Accuracy: 79.93%


## Split dataset (train/test)

In [23]:
from torch.utils.data import random_split

vocab_size = 10
generated_ds = FibonacciModDataset(num_samples=5000, mod=vocab_size)
train_size = int(0.8 * len(generated_ds)) # 80% of the samples go to training
test_size = len(generated_ds) - train_size # the remainder is held out for testing

# NOTE(review): no `generator` is passed, so the partition presumably differs
# on every run — confirm whether a fixed seed is wanted.
# NOTE(review): if a sequence is fully determined by its first values there can
# be only a limited number of distinct samples, in which case the train and
# test subsets may contain identical sequences — verify against
# FibonacciModDataset.
train_ds, test_ds = random_split(generated_ds, [train_size, test_size]) # randomly splits our dataset

# Only the training loader is shuffled.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

model = MinimalTransformer(vocab_size=vocab_size)
train_model(model, train_loader)

# Unlike the earlier cells, accuracy is measured on the test loader.
evaluate_model(model, test_loader)

Epoch 1, Loss: 2.2762
Epoch 2, Loss: 1.7730
Epoch 3, Loss: 1.3019
Epoch 4, Loss: 0.9687
Epoch 5, Loss: 0.8391
Epoch 6, Loss: 0.7652
Epoch 7, Loss: 0.7061
Epoch 8, Loss: 0.6672
Epoch 9, Loss: 0.6377
Epoch 10, Loss: 0.6122
Accuracy: 80.67%


## Split dataset with an increased number of epochs

In [24]:
import torch.nn as nn 

def train_model(model, dataloader, epochs=10, lr=1e-3):
    """Train `model` with Adam and cross-entropy, tracking training accuracy.

    Notebook-local variant that shadows the `train_model` imported from
    `src.mintrans`. Besides the per-epoch average loss it prints the mean
    training accuracy over all epochs.

    Args:
        model: module called as `model(x)`; its output is treated as logits
            whose last dimension indexes the classes.
        dataloader: sized iterable yielding `(x, y)` batches.
        epochs: number of passes over `dataloader`. With `0` nothing is
            trained and nothing is printed.
        lr: learning rate handed to the Adam optimizer.

    Returns:
        None. Progress is reported through `print`.

    Raises:
        ValueError: if `dataloader` contains no batches.
    """
    # An empty loader would otherwise surface as an opaque ZeroDivisionError
    # when the per-epoch averages are computed.
    if len(dataloader) == 0:
        raise ValueError("dataloader is empty; nothing to train on")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    model.train()

    accuracy_per_e = []

    for epoch in range(epochs):
        total_loss = 0.0
        correct, total = 0, 0

        for x, y in dataloader:
            optimizer.zero_grad()
            logits = model(x)
            # Flatten every leading dimension so each position counts as one
            # classification sample.
            loss = loss_fn(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            optimizer.step()

            # Accuracy is taken from the logits computed before this step's
            # weight update.
            pred = logits.argmax(dim=-1)
            correct += (pred == y).sum().item()
            total += y.numel()
            total_loss += loss.item()

        acc = correct / total
        avg_loss = total_loss / len(dataloader)
        accuracy_per_e.append(acc)

        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

    # With epochs=0 there is no accuracy to average; skip the summary instead
    # of dividing by zero.
    if accuracy_per_e:
        print(f"Mean accuracy across epochs (Training): {sum(accuracy_per_e) / len(accuracy_per_e):.4%}")

# Uses the train_model defined in this cell. `model` is not re-created, so the
# 150 epochs continue from its current weights.
train_model(model, train_loader, epochs=150)
# Accuracy is measured on `test_loader`, not on the training loader.
evaluate_model(model, test_loader)

Epoch 1, Loss: 0.5923
Epoch 2, Loss: 0.5775
Epoch 3, Loss: 0.5640
Epoch 4, Loss: 0.5523
Epoch 5, Loss: 0.5457
Epoch 6, Loss: 0.5290
Epoch 7, Loss: 0.5207
Epoch 8, Loss: 0.5153
Epoch 9, Loss: 0.5033
Epoch 10, Loss: 0.4943
Epoch 11, Loss: 0.4923
Epoch 12, Loss: 0.4793
Epoch 13, Loss: 0.4771
Epoch 14, Loss: 0.4728
Epoch 15, Loss: 0.4720
Epoch 16, Loss: 0.4653
Epoch 17, Loss: 0.4607
Epoch 18, Loss: 0.4620
Epoch 19, Loss: 0.4616
Epoch 20, Loss: 0.4479
Epoch 21, Loss: 0.4470
Epoch 22, Loss: 0.4413
Epoch 23, Loss: 0.4384
Epoch 24, Loss: 0.4449
Epoch 25, Loss: 0.4363
Epoch 26, Loss: 0.4322
Epoch 27, Loss: 0.4319
Epoch 28, Loss: 0.4230
Epoch 29, Loss: 0.4251
Epoch 30, Loss: 0.4327
Epoch 31, Loss: 0.4233
Epoch 32, Loss: 0.4158
Epoch 33, Loss: 0.4159
Epoch 34, Loss: 0.4127
Epoch 35, Loss: 0.4128
Epoch 36, Loss: 0.4181
Epoch 37, Loss: 0.4117
Epoch 38, Loss: 0.4078
Epoch 39, Loss: 0.4115
Epoch 40, Loss: 0.4118
Epoch 41, Loss: 0.4033
Epoch 42, Loss: 0.4010
Epoch 43, Loss: 0.4047
Epoch 44, Loss: 0.40

## Hyperparameter adjustment

Adjusted hyperparameters (the model no longer learns):


`batch_size`: `256`

`epoch`: `20`

`LR`: `10^-3`

`d_model`: `32`

`n_head`: `8`

`n_layer`: `6` 

In [28]:
# NOTE(review): the wildcard import presumably rebinds `train_model` to the
# package version, replacing the notebook-local definition (the recorded output
# has no "Mean accuracy" line). Prefer importing the needed names explicitly.
from src.mintrans import *

# NOTE(review): `model` and `train_loader` are not rebuilt in this cell, so
# nothing here applies the hyperparameters listed in the markdown; the
# execution count also jumps from 24 to 28 — confirm which cells created them.
train_model(model, train_loader, epochs=20)
evaluate_model(model, test_loader)


Epoch 1, Loss: 0.3684
Epoch 2, Loss: 0.3575
Epoch 3, Loss: 0.3562
Epoch 4, Loss: 0.3797
Epoch 5, Loss: 0.3784
Epoch 6, Loss: 0.3565
Epoch 7, Loss: 0.3588
Epoch 8, Loss: 0.3552
Epoch 9, Loss: 0.3580
Epoch 10, Loss: 0.3598
Epoch 11, Loss: 0.4149
Epoch 12, Loss: 0.3637
Epoch 13, Loss: 0.3575
Epoch 14, Loss: 0.3587
Epoch 15, Loss: 0.3579
Epoch 16, Loss: 0.3645
Epoch 17, Loss: 0.3702
Epoch 18, Loss: 0.3878
Epoch 19, Loss: 0.3620
Epoch 20, Loss: 0.3567
Accuracy: 87.48%
