In [1]:
import torch
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR, SequentialLR

from models import *
from my_datasets import *

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
def train_model(model, optimizer, dataloader, lr_scheduler, tqdm_tick=10):
    """Run one pass over ``dataloader``, taking one optimizer step per batch.

    Every batch must be a mapping with ``"tokens"`` and ``"labels"`` tensors;
    both are moved to the module-level ``device`` before the forward pass.
    ``model(tokens, labels=labels)`` must return an object exposing ``.loss``
    (backpropagated as-is) and ``.logits``. Accuracy is computed by
    thresholding the logits at 0 and comparing element-wise with ``labels``.

    This function does not switch the model into train mode; the caller is
    responsible for that.

    Args:
        model: Callable module being trained.
        optimizer: Optimizer stepped (then zeroed) once per batch.
        dataloader: Iterable of batches, wrapped in a tqdm progress bar.
        lr_scheduler: Scheduler stepped once per batch, after the optimizer.
        tqdm_tick: Every ``tqdm_tick`` batches the progress-bar description is
            refreshed with the mean loss/accuracy over the most recent
            ``tqdm_tick`` batches. A non-positive value disables the refresh.

    Returns:
        dict with keys ``"model"`` (the same ``model`` object),
        ``"all_losses"`` and ``"all_accs"`` (lists of per-batch Python floats).
    """
    all_losses, all_accs = [], []
    pbar = tqdm(dataloader)
    for i, batch in enumerate(pbar):
        tokens, labels = batch["tokens"].to(device), batch["labels"].to(device)
        out = model(tokens, labels=labels)
        loss = out.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        lr_scheduler.step()

        # Store metrics as plain floats so the history neither pins device
        # memory nor needs to be re-packed into a tensor on every refresh.
        pred = (out.logits > 0).long()
        all_accs.append((pred == labels).float().mean().item())
        all_losses.append(loss.item())

        if tqdm_tick > 0 and (i + 1) % tqdm_tick == 0:
            # Average over the *latest* window. (`[:-tqdm_tick]` would select
            # everything except that window and is empty on the first tick.)
            recent_losses = all_losses[-tqdm_tick:]
            recent_accs = all_accs[-tqdm_tick:]
            avg_loss = sum(recent_losses) / len(recent_losses)
            avg_acc = sum(recent_accs) / len(recent_accs)
            pbar.set_description(
                f"loss {avg_loss:.3f}, acc {avg_acc:.3f}"
            )

    return {
        "model": model,
        "all_losses": all_losses,
        "all_accs": all_accs,
    }

In [3]:
# --- Model ------------------------------------------------------------------
n, k = 16, 3
model = MyTheoryARModel(n, num_steps=k)
model.train()
model.to(device)

# --- Data: exactly `num_steps` batches of `bsz` examples each ----------------
bsz = 512
num_steps = 8192
dataset_len = bsz * num_steps
dataset = AutoregCustomTokensDataset(n, dataset_len)
dataloader = DataLoader(dataset, batch_size=bsz)

# --- Optimiser ---------------------------------------------------------------
optimizer = AdamW(model.parameters(), lr=5e-4)

# --- LR schedule: linear warm-up over the first 10% of steps, then linear
# decay over the remainder (scaling factors 0.01 -> 1.0 -> 0.01). -------------
warmup_steps = int(num_steps * 0.1)
lr_scheduler = SequentialLR(
    optimizer,
    schedulers=[
        LinearLR(
            optimizer,
            start_factor=0.01,
            end_factor=1.0,
            total_iters=warmup_steps,
        ),
        LinearLR(
            optimizer,
            start_factor=1.0,
            end_factor=0.01,
            total_iters=num_steps - warmup_steps,
        ),
    ],
    milestones=[warmup_steps],
)

In [None]:
# Train for one pass over the dataloader; `ret` holds what train_model returns.
ret = train_model(
    model=model,
    optimizer=optimizer,
    dataloader=dataloader,
    lr_scheduler=lr_scheduler,
)

  0%|          | 0/8192 [00:00<?, ?it/s]

