In [1]:
!pip install --upgrade transformers peft accelerate datasets bitsandbytes accelerate

[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from itertools import islice
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [2]:
# 1. 모델 및 토크나이저 로딩
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.88it/s]


In [3]:
# 2. QLoRA 설정
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # 🔥 핵심
)
model = get_peft_model(model, peft_config)

In [4]:
# model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3])

In [5]:
# 3. 데이터셋 로딩 및 전처리
# dataset = load_dataset("codeparrot/github-code-clean", split="train[:5000]")  # 일부만 사용 (예제 목적)
!export HF_DATASETS_VERBOSE=1
dataset = load_dataset(
    path="/datasets/github-code/github-code-clean",
    data_dir="/datasets/github-code/hf_data",
    cache_dir="/datasets/github-code/hf_cache",
    trust_remote_code=True,
    streaming=True
)

# 앞에서 10000개만 추출
subset = list(islice(dataset["train"], 10000))

# Hugging Face Dataset으로 변환
dataset = Dataset.from_list(subset)

In [6]:
dataset

Dataset({
    features: ['code', 'repo_name', 'path', 'language', 'license', 'size'],
    num_rows: 10000
})

In [7]:
# 4. 코드 전용 프롬프트 포맷
def format_example(example):
    code = example["code"]
    return {"text": f"# Python code snippet:\n{code.strip()}"}

dataset = dataset.map(format_example)


Map: 100%|██████████| 10000/10000 [00:02<00:00, 4799.91 examples/s]


In [8]:
# 5. 토크나이징 + 라벨 추가 (language modeling 용)
def tokenize_and_add_labels(example):
    text = example.get("code") or example.get("text") or example.get("content")
    result = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512
    )
    result["labels"] = result["input_ids"].copy()  # causal LM용 라벨 설정
    return result

tokenized_dataset = dataset.map(
    tokenize_and_add_labels,
    batched=True,
    remove_columns=dataset.column_names  # 모델에 필요한 input_ids, attention_mask, labels만 남김
)

Map: 100%|██████████| 10000/10000 [00:18<00:00, 527.71 examples/s]


In [9]:
from torch.optim import AdamW

# 학습 가능한 파라미터만 추출 (LoRA 파라미터만 학습 대상인 경우)
optimizer = AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=2e-4,  # 일반적으로 2e-4 ~ 1e-4 사이에서 시작
    weight_decay=0.01
)

In [10]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
from accelerate import Accelerator

accelerator = Accelerator()
print(accelerator.state)  # 멀티 GPU 및 설정 상태 확인


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:


# DataLoader 설정
train_dataloader = DataLoader(
    tokenized_dataset,
    batch_size=2,  # 주인님의 GPU 메모리에 따라 조절하세요 (예: 2 ~ 8)
    shuffle=True,
    collate_fn=default_data_collator  # input_ids, attention_mask, labels 자동 정렬
)

model, train_dataloader, optimizer = accelerator.prepare(
    model, train_dataloader, optimizer
)

In [11]:
# 6. 학습 설정
training_args = TrainingArguments(
    output_dir="./gemma-2b-code-finetuned",
    remove_unused_columns=False,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    fp16=True,
    optim="paged_adamw_32bit",
    report_to="none"
)


In [12]:
# 7. Trainer 구성
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
from tqdm import tqdm
num_epochs=1
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # 로그 출력
        if step % 100 == 0:
            accelerator.print(f"[Epoch {epoch}] Step {step} - Loss: {loss.item():.4f}")

    accelerator.print(f"===> Epoch {epoch} 완료. 평균 Loss: {total_loss / len(train_dataloader):.4f}")

  0%|          | 0/5000 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 35.12 MiB is free. Process 815127 has 15.73 GiB memory in use. Of the allocated memory 15.34 GiB is allocated by PyTorch, and 28.82 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print(type(tokenized_dataset))

In [None]:

# 9. 모델 저장
model.save_pretrained("./gemma-2b-code-finetuned")
tokenizer.save_pretrained("./gemma-2b-code-finetuned")