In [2]:
import time

import pandas as pd
import torch
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pick the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device: {device}")

device: cuda


# Model

In [3]:
# Load the tokenizer and model (local path or Hugging Face model id).
model_path = "meta-llama/Llama-3.2-1B"  # or "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Use half precision only on the GPU. The notebook falls back to "cpu" when
# CUDA is unavailable, and fp16 on CPU is slow and not supported by every op
# in older PyTorch builds, so load in float32 there instead.
model_dtype = torch.float16 if device == "cuda" else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=model_dtype).to(
    device
)
# Put the module in evaluation mode; as the cell's last expression this also
# displays the model architecture.
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [16]:
# Prompt lengths (in tokens) to benchmark, doubling at each step.
sequence_lengths = [64, 128, 256, 512, 1024, 2048, 4096]

# One record per benchmarked sequence length.
results = []

# torch.cuda.* bookkeeping is only valid on a CUDA device. The notebook also
# allows a CPU fallback, so those calls are guarded below.
on_cuda = str(device).startswith("cuda")

# Warm-up pass: on a fresh kernel the first generate() call pays one-time
# initialisation costs that would otherwise inflate the first measurement.
warmup_inputs = tokenizer("Hello world. ", return_tensors="pt").to(device)
with torch.no_grad():
    model.generate(**warmup_inputs, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)

for seq_len in sequence_lengths:
    # Build a prompt of roughly seq_len tokens (assumes ~3 tokens per
    # repetition); truncation caps the encoded input at exactly seq_len.
    prompt = "Hello world. " * (seq_len // 3)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=seq_len).to(device)

    if on_cuda:
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        # Make sure no previously queued GPU work leaks into the timed region.
        torch.cuda.synchronize()

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start = time.perf_counter()
    with torch.no_grad():
        # A single new token: the cost is dominated by processing the prompt.
        model.generate(**inputs, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)
    if on_cuda:
        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        torch.cuda.synchronize()
    end = time.perf_counter()

    # Peak GPU memory allocated since the reset above, in MB (NaN on CPU).
    if on_cuda:
        max_memory = torch.cuda.max_memory_allocated(device) / (1024 * 1024)
    else:
        max_memory = float("nan")

    results.append({
        "seq_len": seq_len,
        "shape": inputs['input_ids'].shape,
        "inference_time": round(end - start, 4),
        "max_memory_MB": round(max_memory, 2)
    })

# Show the results (pandas is already imported in the notebook's first cell).
df = pd.DataFrame(results)
print(df)

   seq_len      shape  inference_time  max_memory_MB
0       64    (1, 64)          0.0101        2373.51
1      128   (1, 128)          0.0104        2378.33
2      256   (1, 256)          0.0148        2390.36
3      512   (1, 512)          0.0265        2414.44
4     1024  (1, 1024)          0.0437        2462.58
5     2048  (1, 2048)          0.0850        2558.88
6     4096  (1, 4096)          0.1615        2751.46
