In [1]:
pip install transformers datasets accelerate nvidia-ml-py3 wandb python_dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
from datasets import Dataset

# Size of the synthetic dataset: `dataset_size` rows of `seq_len` token ids each.
seq_len, dataset_size = 512, 512

# Seeded generator so the dummy data is identical on every Restart & Run All.
rng = np.random.default_rng(0)

dummy_data = {
    # Random integers in [100, 30000) standing in for token ids.
    "input_ids": rng.integers(100, 30000, (dataset_size, seq_len)),
    # One random binary label (0 or 1) per row.
    "labels": rng.integers(0, 2, (dataset_size,)),
}
ds = Dataset.from_dict(dummy_data)
# Ask the dataset to hand back PyTorch tensors ("pt" format).
ds.set_format("pt")

In [3]:
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print the memory NVML reports as used on one GPU, in MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which is
            the device this function always queried before the parameter
            existed, so existing zero-argument calls behave the same.

    The value printed is ``info.used`` from ``nvmlDeviceGetMemoryInfo``,
    integer-divided by 1024**2.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB")
    finally:
        # Pair every nvmlInit() with a shutdown, even if the query raises,
        # so repeated calls do not leave NVML initialised.
        nvmlShutdown()
    
def print_summary(result):
    """Print runtime, throughput and current GPU memory for a training result.

    Args:
        result: object whose ``metrics`` mapping contains the keys
            ``'train_runtime'`` and ``'train_samples_per_second'``.

    Both numbers are printed with two decimal places, followed by the line
    produced by ``print_gpu_utilization()``.
    """
    metrics = result.metrics

    runtime = metrics['train_runtime']
    print(f"Time: {runtime:.2f}")

    throughput = metrics['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")

    print_gpu_utilization()

In [4]:
# Baseline reading, taken before this notebook has placed anything on the GPU.
print_gpu_utilization()

GPU memory occupied: 442 MB


In [5]:
import torch

# Move a throwaway 1x1 tensor to the GPU and read the memory again.
# NOTE(review): presumably this is here to measure the fixed cost of
# initialising CUDA, since the tensor itself is tiny and is not kept.
torch.ones((1, 1)).to("cuda")
print_gpu_utilization()

GPU memory occupied: 547 MB


In [6]:
from transformers import AutoModelForSequenceClassification

# Checkpoint used for every experiment in this notebook.
checkpoint = "google-bert/bert-large-uncased"

# Load the sequence-classification model, then move it to the GPU.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model = model.to("cuda")

# Reading after the model weights are on the device.
print_gpu_utilization()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU memory occupied: 1835 MB


In [7]:
import wandb
import dotenv
import os
import gc

# Load variables from a .env file (if any) into the process environment.
dotenv.load_dotenv()

# Log in to Weights & Biases with the key stored in WANDB_KEY.
# os.environ[...] raises KeyError if the variable is not set.
wandb.login(key=os.environ["WANDB_KEY"])

[34m[1mwandb[0m: Currently logged in as: [33mfalconlee236[0m ([33mOptiMap[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/isang-yun/.netrc


True

In [8]:
# Keyword arguments shared by every TrainingArguments built in this notebook.
default_args = {
    "output_dir": "tmp",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "wandb",  # W&B logging on
    # Listed exactly once: a repeated key in a dict literal is silently
    # overridden by the later value, so "no" is the setting that applies.
    "eval_strategy": "no",
}

In [9]:
from transformers import TrainingArguments, Trainer, logging

# Only show transformers log messages at error level.
logging.set_verbosity_error()

# Run with 4 samples per device per step; everything else comes from default_args.
run_config = {**default_args, "per_device_train_batch_size": 4}
training_args = TrainingArguments(**run_config)
trainer = Trainer(train_dataset=ds, args=training_args, model=model)

# Train, then report runtime, throughput and GPU memory.
result = trainer.train()
print_summary(result)

# Close the W&B run started for this training.
wandb.finish()

I0000 00:00:1737707127.191031  181567 cuda_executor.cc:1004] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1737707127.193664  181350 service.cc:146] XLA service 0x5654693c1360 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1737707127.193691  181350 service.cc:154]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1737707127.194182  181350 se_gpu_pjrt_client.cc:897] Using BFC allocator.
I0000 00:00:1737707127.194235  181350 gpu_helpers.cc:114] XLA backend allocating 11731746816 bytes on device 0 for BFCAllocator.
I0000 00:00:1737707127.194269  181350 gpu_helpers.cc:154] XLA backend will use up to 3910582272 bytes on device 0 for CollectiveBFCAllocator.
I0000 00:00:1737707127.194427  181350 cuda_executor.cc:100

I0000 00:00:1737707129.177616  181350 cuda_dnn.cc:530] Loaded cuDNN version 90100
2025-01-24 08:27:14.080935: W external/xla/xla/service/hlo_rematerialization.cc:3005] Can't reduce memory use below 7.51GiB (8067496761 bytes) by rematerialization; only reduced to 9.54GiB (10239437064 bytes), down from 10.70GiB (11490555772 bytes) originally
2025-01-24 08:29:25.613002: W external/xla/xla/service/hlo_rematerialization.cc:3005] Can't reduce memory use below 7.51GiB (8067496761 bytes) by rematerialization; only reduced to 9.71GiB (10429593364 bytes), down from 10.85GiB (11655448448 bytes) originally


Time: 591.53
Samples/second: 0.87
GPU memory occupied: 11777 MB


0,1
train/epoch,▁
train/global_step,▁

0,1
total_flos,477148858023936.0
train/epoch,1.0
train/global_step,128.0
train_loss,0.72563
train_runtime,591.5288
train_samples_per_second,0.866
train_steps_per_second,0.216


In [10]:
from transformers import TrainingArguments, Trainer, logging

# Only show transformers log messages at error level.
logging.set_verbosity_error()

# Run with 1 sample per device per step; everything else comes from default_args.
# NOTE(review): `model` is the same object the previous run already trained,
# and it is not reloaded here - confirm this is intended for the comparison.
run_config = {**default_args, "per_device_train_batch_size": 1}
training_args = TrainingArguments(**run_config)
trainer = Trainer(train_dataset=ds, args=training_args, model=model)

# Train, then report runtime, throughput and GPU memory.
result = trainer.train()
print_summary(result)

# Close the W&B run started for this training.
wandb.finish()

  xldata.append(torch.load(xbio))


Time: 1047.24
Samples/second: 0.49
GPU memory occupied: 11779 MB


0,1
train/epoch,▁█
train/global_step,▁█
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
total_flos,477148858023936.0
train/epoch,1.0
train/global_step,512.0
train/grad_norm,29.10516
train/learning_rate,0.0
train/loss,0.8578
train_loss,0.85375
train_runtime,1047.2399
train_samples_per_second,0.489
train_steps_per_second,0.489


In [11]:
# Drop every notebook-level reference to the model before asking for memory back.
# `trainer` was built with model=model and keeps its own reference, so deleting
# `model` alone would leave the weights alive.
del model, trainer
# Collect any reference cycles that still point at the freed objects.
gc.collect()
# Ask PyTorch to release cached GPU memory.
torch.cuda.empty_cache()

## 발견 사항

1epoch → 512개 데이터를 학습하는데 
GPU 0 Memory Allocated (%)	12349341696 → 12.34GB를 씀
저 위에 있는 11330MB는 model load + cuda kernel이기 때문에 
실제 학습할 때는 대략 1GB 정도 사용
그렇다면 이를 4 batch size가 아닌, 1 batch size를 쓰면 어떨까?
위의 이론에 따르면 메모리 사용량이 줄어야 한다. → 왜냐하면 한 번에 GPU에 올리는 데이터 양이 적어지기 때문
또한 학습 시간은 늘어나야 함

### 결과
batch size => 4 일 때 
* GPU 0 Memory Allocated (%)	12351438848 -> 12.351GB 사용
* 학습 시간 591.53 second
batch size => 1 일 때 
* GPU 0 Memory Allocated (%)	12349341696 → 12.349GB를 씀
* 학습 시간 1047.24 second


실제로 배치 크기가 줄어들면 학습 시간은 늘어나고 메모리 사용량은 소폭(12351438848 → 12349341696 바이트, 약 2MB) 감소한 것을 확인