In [1]:
import sys
import os
import torch as T
import numpy as np

# Make the parent of the working directory importable so that project packages
# (e.g. `src`) resolve. Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
SEED = 42

# Prefer the GPU when one is visible; everything below moves tensors to `device`.
device = T.device('cuda' if T.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Seed every RNG this notebook imports, not just torch, so that any
# NumPy-based randomness is reproducible as well.
np.random.seed(SEED)
T.manual_seed(SEED)

if T.cuda.is_available():
    T.cuda.manual_seed_all(SEED)

Using device: cuda


## Dataset Loading

In [3]:
from torch.utils.data import TensorDataset, DataLoader

### 1. Dataset Hyperparameters

In [4]:
# Number of samples per batch; used by every DataLoader constructed below.
BATCH_SIZE = 64

### 2. Loading Dataset

In [5]:
def load_dataset(path):
    """Load a saved (encoder_data, decoder_data, targets) triple as a TensorDataset.

    Args:
        path: Location of a file written with ``torch.save`` that contains
            exactly three tensors sharing the same first dimension.

    Returns:
        A ``TensorDataset`` yielding ``(encoder_input, decoder_input, target)``.

    Raises:
        ValueError: If the file does not unpack into exactly three objects.
    """
    # map_location="cpu" keeps the load working on machines without the device
    # the tensors were saved from; batches are moved to the compute device
    # later, inside the training / evaluation loops.
    # NOTE(review): torch.load unpickles the file — only load trusted datasets.
    encoder_data, decoder_data, targets = T.load(path, map_location="cpu")
    return TensorDataset(encoder_data, decoder_data, targets)

In [6]:
# One training set per context; the dict keys label the batches during
# round-robin training.
train_datasets = {
    "color": load_dataset("../datasets/train_context_0.pt"),
    "shape": load_dataset("../datasets/train_context_1.pt"),
    "quantity": load_dataset("../datasets/train_context_2.pt"),
}

# Training batches are reshuffled every epoch.
train_loaders = {
    ctx: DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True)
    for ctx, ds in train_datasets.items()
}

# Evaluation loaders are not shuffled: ordering has no benefit for evaluation,
# and a fixed order keeps batch-averaged metrics identical from run to run.
validation_dataset = load_dataset('../datasets/validation_dataset.pt')
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataset = load_dataset('../datasets/test_dataset.pt')
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Transformer Model Creation

In [8]:
from src.transformer import Transformer

### 1. Transformer Hyperparameters

In [11]:
VOCABULARY_SIZE = 70        # 64 cards + 4 categories + SEP + EOS
EMBEDDING_SIZE = 216        # larger embedding to capture card features; 216 / 6 heads = 36 per head
N_ATTENTION_HEADS = 6       # more heads for better multi-feature attention
N_BLOCKS = 3                # number of stacked blocks (unchanged from the previous configuration)
MAX_SEQUENCE_LENGTH = 10    # longer max sequence to accommodate multiple past trials
FF_DIMS = 256               # larger feedforward layer for better representation
DROPOUT_PROB = 0.2        # reduce dropout slightly to retain signal in small batches
# NOTE(review): presumably the number of values per card feature (4 * 4 * 4 = 64 matches
# the 64 card tokens above) — confirm against src/transformer.py.
CARD_DIMS = (4, 4, 4)

### 2. Transformer Initialisation

In [12]:
# Build the model from the hyperparameters above. All arguments except `device`
# are positional, so their order must match Transformer.__init__.
# NOTE(review): VOCABULARY_SIZE is passed twice — presumably as the source and
# target vocabulary sizes; confirm against src/transformer.py.
transformer = Transformer(
    VOCABULARY_SIZE, VOCABULARY_SIZE, CARD_DIMS, EMBEDDING_SIZE, N_ATTENTION_HEADS,
    N_BLOCKS, MAX_SEQUENCE_LENGTH, FF_DIMS, DROPOUT_PROB, device=device
)

## Training Transformer

In [13]:
import itertools
import numpy as np
import torch as T
from torch.utils.data import DataLoader
from torch import nn, optim

### 1. Train, Validate, Evaluate Model Functions

In [14]:
def train_model(
    train_loader: DataLoader,
    validation_loader: DataLoader,
    model: Transformer,
    criterion: nn.CrossEntropyLoss,
    optimizer: optim.Optimizer,
    max_epochs: int = 20,
    device: str | T.device = "cpu",
    patience: int = 3,
):
    best_val_loss = np.inf
    patience_counter = 0

    train_losses, train_accs, train_perplexities = [], [], []
    val_losses, val_accs, val_perplexities = [], [], []
    best_model_state = model.state_dict()

    for epoch in range(max_epochs):
        print(f"\nEpoch {epoch + 1}/{max_epochs}")
        print("-" * 40)

        # --- Training ---
        model.train()
        epoch_train_losses = []
        total_correct = 0
        total_samples = 0

        for batch_idx, (encoder_input, decoder_input, target) in enumerate(train_loader):
            encoder_input, decoder_input, target = encoder_input.to(device), decoder_input.to(device), target.view(-1).to(device)
    
            # Forward pass
            logits = model(encoder_input, decoder_input)  # [batch, seq_len, vocab]
            logits = logits[:, -1, :]  # only the final step prediction [batch, vocab]

            loss = criterion(logits, target)

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accuracy
            preds = logits.argmax(dim=1)
            total_correct += (preds == target).sum().item()
            total_samples += target.size(0)

            epoch_train_losses.append(loss.item())

            if batch_idx % 100 == 0 or batch_idx == len(train_loader) - 1:
                print(f"Train Batch {batch_idx+1}/{len(train_loader)} | Loss: {loss.item():.4f}")

        train_loss = np.mean(epoch_train_losses)
        train_acc = total_correct / total_samples
        train_perplexity = np.exp(train_loss)

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        train_perplexities.append(train_perplexity)

        print(f"[Epoch {epoch+1}] Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train Perplexity: {train_perplexity:.4f}")

        # --- Validation ---
        model.eval()
        val_batch_losses = []
        val_correct = 0
        val_samples = 0

        with T.no_grad():
            for encoder_input, decoder_input, target in validation_loader:
                encoder_input, decoder_input, target = encoder_input.to(device), decoder_input.to(device), target.view(-1).to(device)

                logits = model(encoder_input, decoder_input) # [batch, seq_len, vocab]
                logits = logits[:, -1, :]  # only the final step prediction [batch, vocab]

                loss = criterion(logits, target)

                preds = logits.argmax(dim=1)
                val_correct += (preds == target).sum().item()
                val_samples += target.size(0)

                val_batch_losses.append(loss.item())

        val_loss = np.mean(val_batch_losses)
        val_acc = val_correct / val_samples
        val_perplexity = np.exp(val_loss)
        val_losses.append(val_loss)
        val_accs.append(val_acc)
        val_perplexities.append(val_perplexity)

        print(f"[Epoch {epoch+1}] Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val Perplexity: {val_perplexity:.4f}")
        print("-" * 40)

        # --- Early Stopping ---
        # if val_loss < best_val_loss:
        #     best_val_loss = val_loss
        #     patience_counter = 0
        #     best_model_state = model.state_dict()
        #     print(f"Validation loss improved — saving model (Loss: {val_loss:.4f})")
        # else:
        #     patience_counter += 1
        #     print(f"No improvement ({patience_counter}/{patience})")
        #     if patience_counter >= patience:
        #         print("\nEarly stopping triggered. Restoring best model.")
        #         model.load_state_dict(best_model_state)
        #         break

    print("\nTraining complete")

    return {
        "train_losses": train_losses,
        "train_accs": train_accs,
        "train_perplexities": train_perplexities,
        "val_losses": val_losses,
        "val_accs": val_accs,
        "val_perplexities": val_perplexities,
        "best_val_loss": best_val_loss,
    }


In [15]:
def train_model_round_robin(
    train_loaders: dict[str, DataLoader], validation_loader: DataLoader, model: nn.Module,
    criterion: nn.CrossEntropyLoss, optimizer: optim.Optimizer, max_epochs: int = 20,
    device: str | T.device = "cpu", patience: int = 3,
):
    best_val_loss = np.inf
    patience_counter = 0
    best_model_state = model.state_dict()

    history = {k: [] for k in [
        "train_losses", "train_accs", "train_perplexities",
        "val_losses", "val_accs", "val_perplexities"
    ]}

    for epoch in range(max_epochs):
        print(f"\n[Epoch {epoch + 1}/{max_epochs}]")
        print("-" * 60)

        model.train()
        epoch_losses, total_correct, total_samples = [], 0, 0

        # --- Build a round-robin iterator across all loaders ---
        loaders_cycle = itertools.cycle(train_loaders.items())
        active_iters = {ctx: iter(dl) for ctx, dl in train_loaders.items()}

        # Find smallest loader length to roughly balance epoch size
        min_len = min(len(dl) for dl in train_loaders.values())
        total_batches = min_len * len(train_loaders)

        for batch_idx in range(total_batches):
            context, _ = next(loaders_cycle)
            loader_iter = active_iters[context]

            try:
                encoder_input, decoder_input, target = next(loader_iter)
            except StopIteration:
                # Restart exhausted iterator
                active_iters[context] = iter(train_loaders[context])
                encoder_input, decoder_input, target = next(active_iters[context])

            encoder_input, decoder_input, target = (
                encoder_input.to(device),
                decoder_input.to(device),
                target.view(-1).to(device)
            )

            # Forward pass
            logits = model(encoder_input, decoder_input)
            logits = logits[:, -1, :]  # predict final token only
            loss = criterion(logits, target)

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Metrics
            preds = logits.argmax(dim=1)
            total_correct += (preds == target).sum().item()
            total_samples += target.size(0)
            epoch_losses.append(loss.item())

            if batch_idx % 50 == 0 or batch_idx == total_batches - 1:
                print(f"[{context}] Batch {batch_idx+1}/{total_batches} | Loss: {loss.item():.4f}")

        # --- Epoch stats ---
        train_loss = np.mean(epoch_losses)
        train_acc = total_correct / total_samples
        train_perplexity = np.exp(train_loss)

        print(f"[Epoch {epoch+1}] Train Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | Perplexity: {train_perplexity:.4f}")

        # --- Validation ---
        model.eval()
        val_losses, val_correct, val_samples = [], 0, 0

        with T.no_grad():
            for encoder_input, decoder_input, target in validation_loader:
                encoder_input, decoder_input, target = (
                    encoder_input.to(device),
                    decoder_input.to(device),
                    target.view(-1).to(device)
                )
                logits = model(encoder_input, decoder_input)
                logits = logits[:, -1, :]
                loss = criterion(logits, target)
                preds = logits.argmax(dim=1)
                val_correct += (preds == target).sum().item()
                val_samples += target.size(0)
                val_losses.append(loss.item())

        val_loss = np.mean(val_losses)
        val_acc = val_correct / val_samples
        val_perplexity = np.exp(val_loss)

        print(f"[Epoch {epoch+1}] Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | Perplexity: {val_perplexity:.4f}")
        print("-" * 60)

        # --- Logging ---
        history["train_losses"].append(train_loss)
        history["train_accs"].append(train_acc)
        history["train_perplexities"].append(train_perplexity)
        history["val_losses"].append(val_loss)
        history["val_accs"].append(val_acc)
        history["val_perplexities"].append(val_perplexity)

        # --- Early stopping ---
        # if val_loss < best_val_loss:
        #     best_val_loss = val_loss
        #     best_model_state = model.state_dict()
        #     patience_counter = 0
        # else:
        #     patience_counter += 1
        #     if patience_counter >= patience:
        #         print("Early stopping — restoring best model.")
        #         model.load_state_dict(best_model_state)
        #         break

    print("\n Training complete (Round-Robin Mode)")
    return history


In [16]:
def test_model(test_loader: DataLoader, model: Transformer, criterion: nn.CrossEntropyLoss, device: str | T.device = "cpu"):

    model.eval()
    test_batch_losses = []
    test_correct = 0
    test_tokens = 0

    with T.no_grad():
        for encoder_input, decoder_input, target in test_loader:
            encoder_input, decoder_input, target = encoder_input.to(device), decoder_input.to(device), target.view(-1).to(device)
            
            logits = model(encoder_input, decoder_input)[:, -1, :]
            loss = criterion(logits, target)

            preds = logits.argmax(dim=1)
            test_correct += (preds == target).sum().item()
            test_tokens += target.size(0)

            test_batch_losses.append(loss.item())

    test_loss = np.mean(test_batch_losses)
    test_acc = test_correct / test_tokens
    test_perplexity = np.exp(test_loss)

    return {
        "test_loss": test_loss,
        "test_acc": test_acc,
        "test_perplexity": test_perplexity
    }

### 2. Train Transformer Model

In [17]:
# Optimisation hyperparameters for the training run below.
# BATCH_SIZE is deliberately not re-declared here: the DataLoaders were already
# built with the value set in the dataset section, so changing it at this point
# would have no effect on training.
LEARNING_RATE = 3e-4
WEIGHT_DECAY = 1e-2
WARMUP_STEPS = 400
LABEL_SMOOTHING = 0.1
MAX_EPOCHS = 100

In [18]:
# Label-smoothed cross-entropy loss, optimised with AdamW.
criterion =  nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTHING)
optimizer = optim.AdamW(transformer.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# NOTE(review): this scheduler is never passed to train_model_round_robin, and that
# function does not call scheduler.step(), so the learning rate stays at LEARNING_RATE
# for the whole run. T_max is also taken from WARMUP_STEPS although CosineAnnealingLR
# anneals the rate rather than warming it up — confirm the intended schedule.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=WARMUP_STEPS)  

# Interleave batches from the per-context loaders; `results` receives the
# per-epoch history dict returned by the training function.
results = train_model_round_robin(
    train_loaders, validation_loader, transformer, criterion, 
    optimizer, max_epochs=MAX_EPOCHS, device=device
)


[Epoch 1/100]
------------------------------------------------------------
[color] Batch 1/300 | Loss: 4.6329
[quantity] Batch 51/300 | Loss: 1.9630
[shape] Batch 101/300 | Loss: 1.9775
[color] Batch 151/300 | Loss: 1.9750
[quantity] Batch 201/300 | Loss: 1.9506
[shape] Batch 251/300 | Loss: 1.9705
[quantity] Batch 300/300 | Loss: 1.9675
[Epoch 1] Train Loss: 2.0072 | Acc: 0.2449 | Perplexity: 7.4427
[Epoch 1] Val Loss: 1.9667 | Acc: 0.2528 | Perplexity: 7.1468
------------------------------------------------------------

[Epoch 2/100]
------------------------------------------------------------
[color] Batch 1/300 | Loss: 1.9694
[quantity] Batch 51/300 | Loss: 1.9635
[shape] Batch 101/300 | Loss: 2.0081
[color] Batch 151/300 | Loss: 1.9806
[quantity] Batch 201/300 | Loss: 1.9820
[shape] Batch 251/300 | Loss: 1.9797
[quantity] Batch 300/300 | Loss: 1.9589
[Epoch 2] Train Loss: 1.9763 | Acc: 0.2514 | Perplexity: 7.2162
[Epoch 2] Val Loss: 1.9650 | Acc: 0.2528 | Perplexity: 7.1347
-----

### 3. Test Transformer Model

In [21]:
# Evaluate the trained model on the held-out test set.
results = test_model(test_loader, transformer, criterion, device)
# Single quotes inside the f-string: reusing the enclosing double quotes is only
# valid syntax from Python 3.12 onwards.
print(f"Test Loss: {results['test_loss']:.4f} | Test Acc: {results['test_acc']:.4f} | Test Perplexity: {results['test_perplexity']:.4f}")

Test Loss: 3.0553 | Test Acc: 0.2467 | Test Perplexity: 21.2285


## Model Inference

In [65]:
from datasets.wcst import WCST
# Helper instance used below only for visualise_batch.
# NOTE(review): the meaning of the argument 10 is not visible in this notebook —
# presumably a batch size; confirm against datasets/wcst.py.
wcst = WCST(10)

In [None]:
def model_inference(model: Transformer, source_sequence, start_tokens):
    """Greedily predict one token and append it to `start_tokens`.

    The model is switched to eval mode and run once without gradient
    tracking. The arg-max over the logits at the last position is taken as
    the next token.

    Args:
        model: Called as ``model(source_sequence, start_tokens)``.
        source_sequence: Encoder-side input batch.
        start_tokens: Decoder-side input batch to extend.

    Returns:
        `start_tokens` with one extra column holding the predicted token.
    """
    model.eval()

    with T.no_grad():
        final_step_logits = model(source_sequence, start_tokens)[:, -1, :]

    # Greedy selection; keepdim keeps a column axis so the result can be concatenated.
    chosen_token = final_step_logits.argmax(dim=-1, keepdim=True)

    return T.cat((start_tokens, chosen_token), dim=1)


In [73]:
# Take the first ten "quantity" samples and move them to the compute device.
encoder_input, decoder_input, target = train_datasets["quantity"][:10]
encoder_input = encoder_input.to(device)
decoder_input = decoder_input.to(device)
target = target.to(device)

# model_inference returns decoder_input with the predicted token already appended.
prediction = model_inference(transformer, encoder_input, decoder_input)

print("# Actual Trials")
test_batch = [np.asarray(item.cpu()) for item in [encoder_input, T.concatenate([decoder_input, target], dim=1)]]
output = wcst.visualise_batch(test_batch)

print("# Predicted Trials")
# Use `prediction` as-is: concatenating decoder_input in front of it again would
# show the decoder prefix twice and no longer line up with the actual trials.
prediction_batch = [np.asarray(item.cpu()) for item in [encoder_input, prediction]]
output = wcst.visualise_batch(prediction_batch)

# Actual Trials
[array(['blue', 'cross', '1'], dtype='<U6'), array(['blue', 'cross', '3'], dtype='<U6'), array(['green', 'star', '4'], dtype='<U6'), array(['yellow', 'square', '2'], dtype='<U6'), array(['yellow', 'circle', '4'], dtype='<U6'), 'SEP', 'C3', 'EOS', array(['yellow', 'star', '2'], dtype='<U6'), 'SEP', 'C4']
[array(['blue', 'circle', '4'], dtype='<U6'), array(['blue', 'star', '3'], dtype='<U6'), array(['yellow', 'circle', '2'], dtype='<U6'), array(['green', 'circle', '1'], dtype='<U6'), array(['green', 'star', '4'], dtype='<U6'), 'SEP', 'C1', 'EOS', array(['blue', 'star', '1'], dtype='<U6'), 'SEP', 'C4']
[array(['blue', 'cross', '2'], dtype='<U6'), array(['yellow', 'circle', '1'], dtype='<U6'), array(['red', 'cross', '3'], dtype='<U6'), array(['green', 'circle', '4'], dtype='<U6'), array(['green', 'star', '3'], dtype='<U6'), 'SEP', 'C3', 'EOS', array(['blue', 'circle', '3'], dtype='<U6'), 'SEP', 'C3']
[array(['yellow', 'circle', '1'], dtype='<U6'), array(['red', 'square', '3