In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.profiler import profile, record_function, ProfilerActivity

from model import Net
from train import get_args, train
from dataset import CLSDataset

In [3]:
# Notebook equivalent of the command line
#   python train.py --batch-size 1 --embedding-dim 128 --hidden-dim 128 --num-layers 6
# NOTE(review): "--batch" is presumably accepted as an abbreviation of
# "--batch-size" (args.batch_size is read when the DataLoader is built) —
# confirm against train.get_args.
args = get_args(
    args=[
        "--batch", "1",
        "--embedding-dim", "128",
        "--hidden-dim", "128",
        "--num-layers", "6",
    ]
)

In [4]:
# Build the dataset, then batch it using the dataset's own collate function
# and the batch size taken from the parsed arguments.
train_set = CLSDataset()
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=args.batch_size,
    collate_fn=train_set.collate_fn,
)

In [None]:
# Use the GPU when CUDA is usable; otherwise stay on the CPU.
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda")

In [None]:
# Instantiate the network from the parsed arguments and the dataset's
# dictionary, then move it onto the selected device.
model = Net(args, train_set.dictionary)
model = model.to(device)

{
    "vocab_size": 1264,
    "max_position_embeddings": 6000,
    "n_embed": 128,
    "n_layer": 6,
    "n_head": 8,
    "ffn_dim": 128,
    "static_position_embeddings": true,
    "pad_token_id": 1
}
{
    "vocab_size": 1264,
    "max_position_embeddings": 800,
    "n_embed": 128,
    "n_layer": 6,
    "n_head": 8,
    "ffn_dim": 128,
    "static_position_embeddings": true,
    "pad_token_id": 1
}


In [None]:
torch.save(model, "model.pt")
# check size of model
!du -sh model.pt

11M	model.pt


In [None]:
# Sample input for the memory measurement: take the first batch the loader
# yields and move every value in it onto `device`.
inputs = {name: value.to(device) for name, value in next(iter(train_loader)).items()}

In [None]:
def num_bytes2human_readable(num_bytes):
    """Format a byte count as a short string using 1024-based units.

    Args:
        num_bytes: Number of bytes (int or float). Negative values are
            accepted and rendered with a leading "-", which is useful for
            signed memory deltas.

    Returns:
        A string such as "10.1 MB" or "-161.7 MB". Units run from "bytes"
        up to "PB"; anything of 1024 PB or more is still expressed in PB.
    """
    sign = "-" if num_bytes < 0 else ""
    magnitude = -num_bytes if num_bytes < 0 else num_bytes

    units = ("bytes", "KB", "MB", "GB", "TB", "PB")
    for unit in units[:-1]:
        if magnitude < 1024.0:
            return "%s%3.1f %s" % (sign, magnitude, unit)
        magnitude /= 1024.0

    # The loop has divided once per unit it skipped, so the remaining value
    # is in units of the final entry (PB). Labelling it "TB" here would
    # under-report the size by a factor of 1024.
    return "%s%3.1f %s" % (sign, magnitude, units[-1])

In [None]:
def estimate_memory(model, inputs):
    """Report how allocated CUDA memory changes as a model is loaded and run once.

    The model and inputs are first moved to the CPU to establish a baseline,
    then moved to the module-level ``device`` and run through one forward
    (``model.get_loss``) and one backward pass. After each step the change in
    ``torch.cuda.memory_allocated(device)`` since the previous step is recorded.

    These are net deltas of *currently allocated* memory, not peaks, so the
    backward figure can be negative when more memory is released during
    backward than is newly allocated.

    Args:
        model: A module providing ``cpu()``, ``to()``, ``zero_grad()`` and
            ``get_loss(**inputs)``; ``get_loss`` must return a value that
            supports ``backward()``.
        inputs: Mapping of keyword name to a value supporting ``cpu()`` and
            ``to(device)``. The caller's mapping is not modified.

    Returns:
        A 4-tuple of human-readable strings:
        ``(model_memory, inputs_memory, compute_memory, back_prop_memory)``.

    Side effects:
        Leaves ``model`` on ``device`` with gradients populated by the
        backward pass; any gradients present on entry are discarded.
    """
    def allocated():
        # Bytes currently held by tensors on `device`.
        return torch.cuda.memory_allocated(device)

    # Gradients left over from an earlier backward pass travel with the module
    # in .cpu()/.to(), so they would be counted as "model" memory and would
    # change the backward delta. Drop them so repeated calls measure the same
    # thing.
    model.zero_grad(set_to_none=True)

    # Baseline: nothing belonging to this measurement lives on the device.
    model.cpu()
    inputs = {k: v.cpu() for k, v in inputs.items()}
    torch.cuda.empty_cache()
    before_mem = allocated()

    model.to(device)
    after_model = allocated()

    inputs = {k: v.to(device) for k, v in inputs.items()}
    after_inputs = allocated()

    loss = model.get_loss(**inputs)
    after_forward = allocated()

    loss.backward()
    after_backward = allocated()

    # Each figure is the change relative to the previous checkpoint.
    model_memory = after_model - before_mem
    inputs_memory = after_inputs - after_model
    compute_memory = after_forward - after_inputs
    back_prop_memory = after_backward - after_forward

    return (
        num_bytes2human_readable(model_memory),
        num_bytes2human_readable(inputs_memory),
        num_bytes2human_readable(compute_memory),
        num_bytes2human_readable(back_prop_memory),
    )


In [None]:
# Displays (model, inputs, forward, backward) memory deltas as formatted strings.
# NOTE(review): the last figure can be negative, as in the recorded output —
# presumably because memory held after the forward pass is released during
# backward; confirm.
estimate_memory(model, inputs)

('10.1 MB', '8.0 KB', '168.4 MB', '-161.7 MB')

In [5]:
# Scan every batch and keep the largest size seen along dimension 2 of the
# "choices" tensor.
# NOTE(review): dimension 2 is presumably the sequence/token axis — confirm
# against CLSDataset.collate_fn.
max_length = 0
for samples in train_loader:
    choices = samples["choices"]
    if choices.shape[2] > max_length:
        max_length = choices.shape[2]


In [6]:
# Display the largest dimension-2 size found across all "choices" batches.
max_length

170