# LoRA

### LoRA를 활용한 GPT-2 감성 분석 모델 튜닝

In [1]:
!pip install peft datasets transformers

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
from huggingface_hub import login
login(token="hf_xxx")

In [3]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### 1. 기반 모델 load

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
base_model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### 2. LoRA 설정 (LoraConfig)

In [6]:
# LoRA 설정
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn", "c_fc", "c_proj"],
    lora_dropout=0.1
)

In [7]:
# 기반 모델 + LoRA 설정 == 학습할 모델
from peft import get_peft_model

model = get_peft_model(base_model, lora_config).to(device)



##### 3. 데이터 준비

In [8]:
from datasets import load_dataset

dataset = load_dataset("imdb")

In [None]:
pos_sample = [data for data in dataset["train"] if data["label"] == 1][:500]
neg_sample = [data for data in dataset["train"] if data["label"] == 0][:500]

train_texts = [data["text"] for data in pos_sample + neg_sample]
train_labels = [data["label"] for data in pos_sample + neg_sample]

In [None]:
pos_eval = [data for data in dataset["test"] if data["label"] == 1][:100]
neg_eval = [data for data in dataset["test"] if data["label"] == 0][:100]

eval_texts = [data["text"] for data in pos_eval + neg_eval]
eval_labels = [data["label"] for data in pos_eval + neg_eval]

In [11]:
# 토큰화 함수
def preprocess_data(texts, labels):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )
    encodings["labels"] = torch.tensor(labels, dtype=torch.long)
    
    return encodings

In [12]:
# 토큰화 함수 적용 -> 전처리
train_encodings = preprocess_data(train_texts, train_labels)
eval_encodings = preprocess_data(eval_texts, eval_labels)

In [13]:
# 데이터셋 변환을 위한 클래스
class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {
            key: val[idx] for key, val in self.encodings.items()
        }

In [14]:
# 데이터셋 형태로 변환
train_dataset = IMDBDataset(train_encodings)
eval_dataset = IMDBDataset(eval_encodings)

In [None]:
# 배치를 만들어 줄 함수
def collate_fn(batch):
    batch = {
        key: torch.stack([item[key] for item in batch]) for key in batch[0]
    }
    return batch

##### 4. 학습 준비

In [None]:
# 학습 설정
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    save_steps=100,
    save_total_limit=2,
    eval_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True    
)

In [17]:
# Trainer 설정 (학습할 모델 + 학습 설정 + 데이터셋)
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=collate_fn
)

In [18]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5106,0.597282
2,0.4646,0.445758
3,0.3631,0.481778
4,0.3135,0.421033
5,0.1858,0.419017


TrainOutput(global_step=1250, training_loss=0.5302536145687103, metrics={'train_runtime': 140.2144, 'train_samples_per_second': 35.66, 'train_steps_per_second': 8.915, 'total_flos': 1324603146240000.0, 'train_loss': 0.5302536145687103, 'epoch': 5.0})

##### 5. 추론

In [19]:
def predict_sentiment(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()

        return "긍정" if prediction == 1 else "부정"

In [35]:
# test_review = "I enjoyed watching the movie!"
test_review = "It was boring!"
result = predict_sentiment(test_review)
result

'부정'

In [None]:
# base model 추론 결과와 비교
reload_base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def predict_sentiment_origin(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    with torch.no_grad():
        outputs = reload_base_model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()

        return prediction

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# base model 추론 결과와 비교
# test_review = "I enjoyed watching the movie!"
test_review = "It was boring!"
result = predict_sentiment_origin(test_review)
result

0