In [1]:
import numpy as np
from datasets import Dataset

In [2]:
seq_len, dataset_size = 512, 512

dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),
}

training_data = Dataset.from_dict(dummy_data)
eval_data = Dataset.from_dict(dummy_data)  # Create a dummy evaluation dataset

training_data.set_format("pt")
eval_data.set_format("pt")

In [3]:
from pynvml import *

In [4]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [5]:
print_gpu_utilization()

GPU memory occupied: 215 MB.


In [6]:
import torch

In [7]:
torch.ones((1, 1)).to("cuda")

print_gpu_utilization()

GPU memory occupied: 335 MB.


In [10]:
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, TrainingArguments, Trainer

2023-06-12 18:48:03.250860: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-12 18:48:03.280935: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = tokenizer.pad_token_id

print_gpu_utilization()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU memory occupied: 335 MB.


In [12]:
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [13]:
from transformers import TrainingArguments, Trainer, logging

logging.set_verbosity_error()


training_args = TrainingArguments(per_device_train_batch_size=1, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=training_data, eval_dataset=eval_data)
result = trainer.train()
print_summary(result)



{'loss': 0.0019, 'learning_rate': 1.1718750000000001e-06, 'epoch': 0.98}
{'eval_loss': 2.3283062977608182e-10, 'eval_runtime': 9.286, 'eval_samples_per_second': 55.137, 'eval_steps_per_second': 6.892, 'epoch': 0.98}
{'train_runtime': 57.3275, 'train_samples_per_second': 8.931, 'train_steps_per_second': 8.931, 'train_loss': 0.001902676303870976, 'epoch': 1.0}
Time: 57.33
Samples/second: 8.93
GPU memory occupied: 3479 MB.


In [15]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=training_data, eval_dataset=eval_data)
result = trainer.train()
print_summary(result)



{'train_runtime': 35.617, 'train_samples_per_second': 14.375, 'train_steps_per_second': 3.594, 'train_loss': 2.3283062144940914e-09, 'epoch': 1.0}
Time: 35.62
Samples/second: 14.38
GPU memory occupied: 3479 MB.


In [16]:
training_args = TrainingArguments(
    per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args
)

trainer = Trainer(model=model, args=training_args, train_dataset=training_data, eval_dataset=eval_data)
result = trainer.train()
print_summary(result)

{'train_runtime': 46.2662, 'train_samples_per_second': 11.066, 'train_steps_per_second': 2.767, 'train_loss': 0.0, 'epoch': 1.0}
Time: 46.27
Samples/second: 11.07
GPU memory occupied: 3479 MB.


In [17]:
training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=training_data, eval_dataset=eval_data)
result = trainer.train()
print_summary(result)

{'train_runtime': 26.1324, 'train_samples_per_second': 19.593, 'train_steps_per_second': 4.898, 'train_loss': 0.0, 'epoch': 1.0}
Time: 26.13
Samples/second: 19.59
GPU memory occupied: 3483 MB.


In [18]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=training_data, eval_dataset=eval_data)
result = trainer.train()
print_summary(result)

{'train_runtime': 32.7121, 'train_samples_per_second': 15.652, 'train_steps_per_second': 3.913, 'train_loss': 0.0, 'epoch': 1.0}
Time: 32.71
Samples/second: 15.65
GPU memory occupied: 3483 MB.


In [19]:
training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args)

trainer = Trainer(model=model, args=training_args, train_dataset=training_data, eval_dataset=eval_data)
result = trainer.train()
print_summary(result)

{'train_runtime': 43.6654, 'train_samples_per_second': 11.726, 'train_steps_per_second': 2.931, 'train_loss': 0.0, 'epoch': 1.0}
Time: 43.67
Samples/second: 11.73
GPU memory occupied: 3483 MB.


In [20]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    optim="adafactor",
    **default_args,
)

trainer = Trainer(model=model, args=training_args, train_dataset=training_data, eval_dataset=eval_data)
result = trainer.train()
print_summary(result)

{'train_runtime': 33.7098, 'train_samples_per_second': 15.188, 'train_steps_per_second': 3.797, 'train_loss': 0.0, 'epoch': 1.0}
Time: 33.71
Samples/second: 15.19
GPU memory occupied: 3483 MB.
