In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np

# BERT MLM

In [None]:
# Load the "bert-base-uncased" checkpoint twice over: once as a tokenizer and once
# as a model carrying a masked-language-modelling head.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

In [None]:
# Build 10 synthetic batches of random token ids, each of shape (16, 256),
# with ids drawn from the half-open range [0, len(tokenizer)).
data = []
for _ in range(10):
    input_ids = torch.randint(low=0, high=len(tokenizer), size=(16, 256))
    data += [input_ids]


In [None]:
# Move the model to the GPU; binding the returned module keeps the cell output empty.
bert = bert.cuda()

In [None]:
import time

In [None]:
# Benchmark: average wall-clock time of one training step (forward, backward, update).
optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-3)

# CUDA kernels are launched asynchronously, so synchronize before reading the clock:
# otherwise pending work from earlier cells would be billed to this loop.
torch.cuda.synchronize()
start = time.perf_counter()

for input_ids in data:
    # Copy the batch to the GPU once and reuse it as both inputs and labels.
    batch = input_ids.cuda()
    # `backward()` adds to `.grad` instead of overwriting it, so the gradients of
    # the previous step must be cleared before computing new ones.
    optimizer.zero_grad()
    out = bert(input_ids=batch, labels=batch)
    out.loss.backward()
    optimizer.step()

# Wait until the last step has actually finished on the GPU before stopping the clock.
torch.cuda.synchronize()
end = time.perf_counter()

# Average seconds per step; `max(..., 1)` only guards against an empty `data` list.
print((end - start) / max(len(data), 1))

In [None]:
# Extrapolate the timing measured over 10 steps to one million steps and convert
# the result from seconds to whole days (truncated by `int`).
print(
    "Time for 1M steps (in days):",
    int(1e6 * (end - start) / 10 / 3600 / 24),
)

# Gradient accumulation

In [None]:
def f(theta):
    """Return half the sum of the squared entries of ``theta``.

    This is ``sum_i 0.5 * theta_i**2``, whose gradient with respect to
    ``theta`` is ``theta`` itself.
    """
    half_squares = 0.5 * theta ** 2
    return half_squares.sum()

Here we have, for each component $\theta_i$:
\begin{equation}
\dfrac{\partial}{\partial \theta_i} f(\theta) = \dfrac{\partial}{\partial \theta_i}  \sum_{j=1}^n\dfrac{1}{2} \theta_j^2 = \theta_i.
\end{equation}

In [None]:
# Parameter vector [0., 1., ..., 9.] stored in float64 and tracked by autograd.
theta = torch.arange(10, dtype=torch.float64, requires_grad=True)
theta

In [None]:
# Re-create theta so that no gradient from a previous cell is attached to it.
theta = torch.arange(10, dtype=torch.float64, requires_grad=True)

# A single backward pass fills theta.grad with the gradient of f at theta.
loss = f(theta)
loss.backward()
print("Gradient attached to theta:", theta.grad)

In [None]:
# Demonstrate that successive `backward()` calls add to `.grad` instead of replacing it.
theta = torch.arange(10, dtype=torch.float64, requires_grad=True)
print("Initial gradient:", theta.grad)  # None: no backward pass has run yet
for i in range(2):
    loss = f(theta)
    # The gradient is never zeroed between iterations, so each call adds to theta.grad.
    loss.backward()
    # No blocking `input()` prompt here, so the cell also completes under
    # "Restart & Run All" and non-interactive execution.
    print(f"Gradient attached to theta at step {i+1}:", theta.grad)

In a training loop, gradient accumulation can then be implemented as follows:

In [None]:
# The model reload below is left commented out, so `bert` must already exist
# from an earlier cell; only the tokenizer is loaded again here.
# bert = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Build 10 small synthetic batches of random token ids, each of shape (batch_size, 8),
# with ids drawn from the half-open range [0, len(tokenizer)).
batch_size = 2
data = []
for _ in range(10):
    input_ids = torch.randint(low=0, high=len(tokenizer), size=(batch_size, 8))
    data += [input_ids]

In [None]:
# Make sure the model lives on the GPU; binding the returned module keeps the cell output empty.
bert = bert.cuda()

In [None]:
# Training loop with gradient accumulation: gradients of `gradient_accumulation`
# consecutive micro-batches are summed in `.grad` before each optimizer update.
optimizer = torch.optim.AdamW(bert.parameters(), lr=1e-3)
optimizer.zero_grad()

iteration_steps = 0      # number of micro-batches processed
optimization_steps = 0   # number of optimizer updates performed
gradient_accumulation = 2

for input_ids in data:
    # Copy the micro-batch to the GPU once and reuse it as both inputs and labels.
    batch = input_ids.cuda()
    out = bert(input_ids=batch, labels=batch)

    # Divide the loss so that the accumulated gradient is the average over the
    # micro-batches; without this, `backward()` would produce their sum.
    loss = out.loss / gradient_accumulation
    loss.backward()

    iteration_steps += 1

    if (iteration_steps % gradient_accumulation) == 0:
        optimizer.step()
        optimizer.zero_grad()
        optimization_steps += 1

# Apply any gradients left over when the number of micro-batches is not a multiple
# of `gradient_accumulation`, instead of silently discarding them. Those gradients
# were still divided by `gradient_accumulation`, so this last update is scaled down.
if (iteration_steps % gradient_accumulation) != 0:
    optimizer.step()
    optimizer.zero_grad()
    optimization_steps += 1

print("Total number of data iterations:", iteration_steps)
print("Total number of optimization steps:", optimization_steps)

# GPT2-Large memory requirements

Make sure to free the CUDA memory before running this section (for instance, by restarting the notebook kernel).

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# Load the pretrained GPT2-Large checkpoint as a causal language model.
gpt = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2-large",
)


In [None]:
# Move GPT2-Large to the GPU, then count the elements of all its parameter tensors.
gpt = gpt.cuda()
num_params = sum(param.numel() for param in gpt.parameters())
print(f"The number of parameters of GPT2-Large is: {num_params}")

In [None]:
# Run the `nvidia-smi` shell command to inspect the GPU state (e.g. memory in use)
# now that the model has been moved to the GPU.
!nvidia-smi