In [1]:
import numpy as np
from datasets import Dataset

In [2]:
seq_len, dataset_size = 512, 512
dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),
}
ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")

In [3]:
from pynvml import *

In [4]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
print_gpu_utilization()

GPU memory occupied: 215 MB.


In [6]:
import torch

In [7]:
torch.ones((1, 1)).to("cuda")

print_gpu_utilization()

GPU memory occupied: 335 MB.


In [8]:
from transformers import AutoModelForSequenceClassification

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased").to("cuda")

print_gpu_utilization()

2023-06-12 18:35:14.430117: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-12 18:35:14.608547: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.pr

GPU memory occupied: 861 MB.


In [12]:
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [13]:
from transformers import TrainingArguments, Trainer, logging

logging.set_verbosity_error()


training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



{'train_runtime': 27.5414, 'train_samples_per_second': 18.59, 'train_steps_per_second': 4.648, 'train_loss': 0.018498804420232773, 'epoch': 1.0}
Time: 27.54
Samples/second: 18.59
GPU memory occupied: 4501 MB.


In [14]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

{'train_runtime': 31.3987, 'train_samples_per_second': 16.306, 'train_steps_per_second': 4.077, 'train_loss': 1.4724218090123031e-05, 'epoch': 1.0}
Time: 31.40
Samples/second: 16.31
GPU memory occupied: 4511 MB.


In [15]:
training_args = TrainingArguments(
    per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

{'train_runtime': 41.0883, 'train_samples_per_second': 12.461, 'train_steps_per_second': 3.115, 'train_loss': 4.305037464291672e-07, 'epoch': 1.0}
Time: 41.09
Samples/second: 12.46
GPU memory occupied: 4511 MB.


In [16]:
training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

{'train_runtime': 21.3262, 'train_samples_per_second': 24.008, 'train_steps_per_second': 6.002, 'train_loss': 4.0745362639427185e-08, 'epoch': 1.0}
Time: 21.33
Samples/second: 24.01
GPU memory occupied: 4515 MB.


In [17]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

{'train_runtime': 26.9022, 'train_samples_per_second': 19.032, 'train_steps_per_second': 4.758, 'train_loss': 4.656612873077393e-10, 'epoch': 1.0}
Time: 26.90
Samples/second: 19.03
GPU memory occupied: 4515 MB.


In [18]:
training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

{'train_runtime': 38.0921, 'train_samples_per_second': 13.441, 'train_steps_per_second': 3.36, 'train_loss': 0.0, 'epoch': 1.0}
Time: 38.09
Samples/second: 13.44
GPU memory occupied: 4515 MB.


In [19]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    optim="adafactor",
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

{'train_runtime': 27.6753, 'train_samples_per_second': 18.5, 'train_steps_per_second': 4.625, 'train_loss': 0.0, 'epoch': 1.0}
Time: 27.68
Samples/second: 18.50
GPU memory occupied: 4515 MB.
