##어떤 모델을 선택했는가?


In [4]:
!pip install datasets



In [5]:
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [6]:
!pip install pynvml



In [7]:
print_gpu_utilization()

GPU memory occupied: 3693 MB.


In [8]:
!pip install transformers datasets torch pynvml

import torch
from transformers import T5ForConditionalGeneration
from pynvml import *



In [9]:
model = T5ForConditionalGeneration.from_pretrained("t5-large").to("cuda")


In [10]:
print("t5-large 모델 적용 후 :")
print_gpu_utilization()

t5-large 모델 적용 후 :
GPU memory occupied: 6507 MB.


In [12]:
input_text = "안녕 반가워 오늘이 finetuning study 막날이라니 슬퍼서 눈물이 나와 뚝뚝 ㅋㅋ"
input_ids = torch.tensor([[model.config.pad_token_id] * 32]).to("cuda")
print("After creating input tensor:")
print_gpu_utilization()

with torch.no_grad():
    output = model.generate(input_ids)

print("After running inference:")
print_gpu_utilization()


After creating input tensor:
GPU memory occupied: 6603 MB.
After running inference:
GPU memory occupied: 6603 MB.


In [13]:
!pip install wandb
import wandb

wandb.login()
wandb.init(project="t5-large-memory-test")




[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchallengef0802[0m ([33msongyeog[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
import torch
import wandb
from transformers import T5ForConditionalGeneration
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

def get_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    gpu_memory_mb = info.used // 1024 ** 2
    return gpu_memory_mb

def log_gpu_utilization(step, description):
    gpu_memory = get_gpu_utilization()
    print(f"Step {step}: {description} - GPU memory occupied: {gpu_memory} MB.")
    wandb.log({"GPU memory (MB)": gpu_memory}, step=step)

wandb.init(project="t5-large-memory-test")

step = 0

log_gpu_utilization(step, "Before loading the model")
step += 1

model = T5ForConditionalGeneration.from_pretrained("t5-large").to("cuda")
log_gpu_utilization(step, "After loading the model")
step += 1

input_text = "translate English to French: This is a test sentence."
input_ids = torch.tensor([[model.config.pad_token_id] * 32]).to("cuda")
log_gpu_utilization(step, "After creating input tensor")
step += 1

with torch.no_grad():
    output = model.generate(input_ids)

log_gpu_utilization(step, "After running inference")
step += 1

Step 0: Before loading the model - GPU memory occupied: 3797 MB.
Step 1: After loading the model - GPU memory occupied: 3797 MB.
Step 2: After creating input tensor - GPU memory occupied: 3797 MB.
Step 3: After running inference - GPU memory occupied: 3805 MB.


3. 실험 환경
    1. 어떤 GPU를 사용했는지
    2. GPU의 VRAM은 얼마나 있는지
    3. CPU 코어는 몇개인지

In [19]:
import torch
import psutil
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetCount, nvmlDeviceGetName

def get_gpu_info():
    nvmlInit()
    device_count = nvmlDeviceGetCount()

    for i in range(device_count):
        handle = nvmlDeviceGetHandleByIndex(i)
        gpu_name = nvmlDeviceGetName(handle)
        vram_info = nvmlDeviceGetMemoryInfo(handle)
        total_vram = vram_info.total // (1024**2)

        print(f"GPU {i}: {gpu_name}")
        print(f"Total VRAM: {total_vram} MB\n")

def get_cpu_info():
    cpu_count = psutil.cpu_count(logical=True)  # 논리적 CPU 코어 수
    physical_cores = psutil.cpu_count(logical=False)  # 물리적 코어 수

    print(f"Total CPU cores (Logical): {cpu_count}")
    print(f"Physical CPU cores: {physical_cores}\n")

print("**Experiment Environment**")
get_gpu_info()
get_cpu_info()

if torch.cuda.is_available():
    print(f"PyTorch CUDA Available: ✅ Yes (Device: {torch.cuda.get_device_name(0)})")
else:
    print("PyTorch CUDA Available: ❌ No")


**Experiment Environment**
GPU 0: NVIDIA A100-SXM4-40GB
Total VRAM: 40960 MB

Total CPU cores (Logical): 12
Physical CPU cores: 6

PyTorch CUDA Available: ✅ Yes (Device: NVIDIA A100-SXM4-40GB)


#Gradient Accumulation 적용

In [20]:
!pip install transformers datasets torch pynvml accelerate wandb




###TF32 적용

In [21]:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [22]:
import torch
import wandb
import psutil
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

wandb.init(project="t5-large-tf32-gradient-accumulation")


In [23]:
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [24]:
tokenizer = T5Tokenizer.from_pretrained("t5-large")

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [25]:
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=150, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/2871 [00:00<?, ? examples/s]



In [26]:
model = T5ForConditionalGeneration.from_pretrained("t5-large").to("cuda")

In [27]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    logging_dir="./logs",
    logging_steps=1,
    report_to="wandb",
    save_strategy="no"
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

In [34]:
def get_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    return info.used // (1024 ** 2)
def log_gpu_utilization(step, description=""):
    gpu_memory = get_gpu_utilization()
    print(f"Step {step}: {description} - GPU memory occupied: {gpu_memory} MB.")
    wandb.log({"GPU memory (MB)": gpu_memory}, step=step)


step = 0

# 학습 시작 전 GPU 사용량
log_gpu_utilization(step, "Before Training")
step += 1

# 학습 시작
trainer.train()

# 학습 후 GPU 사용량
log_gpu_utilization(step, "After Training")

wandb.finish()

Step 0: Before Training - GPU memory occupied: 6615 MB.


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1,10.3117
2,8.6873
3,7.1787
4,5.321
5,4.2756
6,3.532
7,2.8278
8,2.1581
9,1.8136
10,1.5564




Step 1: After Training - GPU memory occupied: 34845 MB.


0,1
GPU memory (MB),▁▁▁█
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
train/grad_norm,███▇█▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁
train/loss,█▇▆▅▄▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
GPU memory (MB),3805.0
total_flos,1.7974222651392e+16
train/epoch,2.89136
train/global_step,66.0
train/grad_norm,0.33215
train/learning_rate,0.0
train/loss,0.5779
train_loss,1.27851
train_runtime,324.4573
train_samples_per_second,26.546


In [35]:
model.gradient_checkpointing_enable()

In [36]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    logging_dir="./logs",
    gradient_checkpointing=True,
    logging_steps=1,
    report_to="wandb",
    save_strategy="no"
)

In [38]:
wandb.init(project="t5-large-gradient-checkpointing")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

step = 0

log_gpu_utilization(step, "Before Training")
step += 1

trainer.train()

log_gpu_utilization(step, "After Training")

wandb.finish()

Step 0: Before Training - GPU memory occupied: 34845 MB.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,0.5751
2,0.5989
3,0.5863
4,0.5629
5,0.571
6,0.5324
7,0.5795
8,0.5297
9,0.5339
10,0.564


Step 1: After Training - GPU memory occupied: 15663 MB.


0,1
GPU memory (MB),▁
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/grad_norm,▅▂▂▃▃▂▂▅▂▂▂▂▁▂▂▂▂▄█▃▂▂▃▂▂▃▃▂▃▃▃▂▃▃▂▃▂▃▄▃
train/learning_rate,███▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,███▇▇▇▇▇▆▇▁▆▆▆▆▆▆▆▇▆▇▇▆▆▆▆▁▆▇▇▇▆▆▇▆▆▆▆▆▇

0,1
GPU memory (MB),34845.0
total_flos,1.7974222651392e+16
train/epoch,2.89136
train/global_step,66.0
train/grad_norm,0.38358
train/learning_rate,0.0
train/loss,0.5204
train_loss,0.51032
train_runtime,453.6838
train_samples_per_second,18.985


##Mixed Precision 적용

In [39]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    fp16=True, #이 부분
    logging_dir="./logs",
    logging_steps=1,
    report_to="wandb",
    save_strategy="no"
)


In [40]:
wandb.init(project="t5-large-gradient-checkpointing + mixed precision")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

step = 0

log_gpu_utilization(step, "Before Training")
step += 1

trainer.train()

log_gpu_utilization(step, "After Training")

wandb.finish()

Step 0: Before Training - GPU memory occupied: 15663 MB.


Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


Step 1: After Training - GPU memory occupied: 10037 MB.


0,1
GPU memory (MB),▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
GPU memory (MB),15663.0
total_flos,1.7974222651392e+16
train/epoch,2.89136
train/global_step,66.0
train/grad_norm,
train/learning_rate,5e-05
train/loss,0.0
train_loss,0.0
train_runtime,485.6355
train_samples_per_second,17.736


# 34.8GB → 10GB로 최종적으로 메모리 사용량을 줄일 수 있었다.