# PEFT LoRA (Google Colab) Edition
Uses the Hugging Face PEFT library to apply LoRA fine-tuning, pins the torch/transformers versions, and trains a spam/ham classifier on the SMS Spam Collection dataset.


In [None]:
# ????????????????????
%pip install -q --upgrade torch==2.8.0 --index-url https://download.pytorch.org/whl/cu121             transformers==4.51.3 peft==0.13.2 accelerate==1.0.1             pandas==2.2.2 numpy==2.0.2 tqdm==4.67.1 matplotlib==3.10.7             requests==2.32.5 safetensors==0.6.2


In [None]:
import io
import os
import random
import zipfile

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)
from peft import LoraConfig, TaskType, get_peft_model
from tqdm import tqdm
import requests

# Fix the seeds of Python's `random` module and of PyTorch.
random.seed(42)
torch.manual_seed(42)

# Use CUDA when PyTorch reports it as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


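## Data preparation: download + balance + split
- If the file is not available locally, download the SMS Spam Collection from UCI.
- Balance the two classes, then split the data 7:1:2 into train/validation/test sets.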


In [None]:
def create_balanced_dataset(df):
    """Return a shuffled frame with equally many ham and spam rows.

    The larger class is downsampled (seed 123) to the size of the smaller
    one, so the function works regardless of which class is the majority.
    Labels may be given either as the strings "ham"/"spam" or as 0/1.

    Args:
        df: DataFrame with a "Label" column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the balanced,
        shuffled rows.

    Raises:
        ValueError: if either class has no rows.
    """
    label_col = df["Label"]
    spam_mask = (label_col == "spam") | (label_col == 1)
    ham_mask = (label_col == "ham") | (label_col == 0)
    spam_count = int(spam_mask.sum())
    ham_count = int(ham_mask.sum())
    if spam_count == 0 or ham_count == 0:
        raise ValueError("?????? spam ? ham ??????????")

    ham_subset = df[ham_mask]
    spam_subset = df[spam_mask]
    if ham_count >= spam_count:
        # Usual case for this dataset: ham is the majority class.
        ham_subset = ham_subset.sample(spam_count, random_state=123)
    else:
        # Previously this case crashed inside `.sample()` because more ham
        # rows were requested than exist; downsample spam instead.
        spam_subset = spam_subset.sample(ham_count, random_state=123)

    balanced = pd.concat([ham_subset, spam_subset])
    return balanced.sample(frac=1, random_state=123).reset_index(drop=True)

def random_split(df, train_frac=0.7, val_frac=0.1):
    """Shuffle *df* (seed 123) and cut it into train/validation/test parts.

    Args:
        df: DataFrame to split; it is not modified.
        train_frac: Fraction of rows assigned to the training part.
        val_frac: Fraction of rows assigned to the validation part.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of consecutive slices of the
        shuffled frame; the test part receives whatever rows remain.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    n_rows = len(shuffled)
    n_train = int(n_rows * train_frac)
    boundary = n_train + int(n_rows * val_frac)
    return (
        shuffled.iloc[:n_train],
        shuffled.iloc[n_train:boundary],
        shuffled.iloc[boundary:],
    )

def ensure_sms_file(local_path="sms_spam_collection/SMSSpamCollection.tsv"):
    """Return the path of the SMS Spam Collection file, downloading if needed.

    If *local_path* already exists it is returned untouched. Otherwise the
    archive is fetched from the UCI repository, the "SMSSpamCollection"
    member is extracted, and its text is written to *local_path* so that the
    next call finds the cached copy.

    Args:
        local_path: Where the tab-separated data file is (or should be) stored.

    Returns:
        The path of the data file on disk (always *local_path*).

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    if os.path.exists(local_path):
        return local_path

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        raw = zf.read("SMSSpamCollection").decode("utf-8")

    # Write to the same path the cache check looks at; the previous version
    # saved to a different file, so every call downloaded the archive again.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(local_path, "w", encoding="utf-8") as f:
        f.write(raw)
    return local_path

def load_sms_dataframe():
    """Load the SMS data, balance it, encode labels and split it.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple produced by ``random_split``
        with a 0.7 / 0.1 / remainder split. Each frame has a "Label" column
        (0 = ham, 1 = spam) and a "Text" column.
    """
    import csv  # local import: only needed for the quoting constant below

    data_path = ensure_sms_file()
    df = pd.read_csv(
        data_path,
        # Escaped tab instead of a literal tab character, which is invisible
        # and easily destroyed by editors or formatters.
        sep="\t",
        names=["Label", "Text"],
        header=None,
        # The messages are raw text, not CSV-quoted fields. With the default
        # quoting, a message containing an unbalanced '"' can swallow the
        # following line(s) into a single row.
        quoting=csv.QUOTE_NONE,
    )
    balanced = create_balanced_dataset(df)
    label_map = {"ham": 0, "spam": 1}
    # Accept labels that are already numeric as well as "ham"/"spam".
    balanced["Label"] = balanced["Label"].map(
        lambda v: label_map[v] if v in label_map else int(v)
    )
    return random_split(balanced, train_frac=0.7, val_frac=0.1)

class SpamSequenceDataset(Dataset):
    """Pre-tokenized spam/ham dataset built from a DataFrame.

    All texts in the "Text" column are tokenized once at construction time,
    padded/truncated to ``max_length``; the "Label" column becomes a long
    tensor. Each item is a dict with "input_ids", "attention_mask" and
    "labels".
    """

    _FIELDS = ("input_ids", "attention_mask", "labels")

    def __init__(self, df, tokenizer, max_length=96):
        texts = df["Text"].tolist()
        encoded = tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.labels = torch.tensor(df["Label"].tolist(), dtype=torch.long)
        self.input_ids = encoded["input_ids"]
        self.attention_mask = encoded["attention_mask"]

    def __len__(self):
        # One label per example, so the label tensor's length is the size.
        return self.labels.shape[0]

    def __getitem__(self, idx):
        return {field: getattr(self, field)[idx] for field in self._FIELDS}


In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Padding to a fixed length needs a pad token; reuse EOS when none is defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

max_length = 96
train_df, val_df, test_df = load_sms_dataframe()

# Tokenize each split once, up front.
train_dataset, val_dataset, test_dataset = (
    SpamSequenceDataset(split_df, tokenizer, max_length=max_length)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation/test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_ds, batch_size=8, shuffle=False)
    for split_ds in (val_dataset, test_dataset)
)

print(f"Train/Val/Test sizes: {len(train_dataset)}/{len(val_dataset)}/{len(test_dataset)}")


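## Apply LoRA with Hugging Face PEFT
- Wrap an AutoModelForSequenceClassification model with LoRA.
- With LoRA, only a small fraction of the parameters is trained; `print_trainable_parameters()` reports the count.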


In [None]:
# GPT-2 backbone with a 2-way classification head.
base_model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# The model config must carry the same pad token id the tokenizer pads with.
base_model.config.pad_token_id = tokenizer.pad_token_id
base_model.resize_token_embeddings(len(tokenizer))

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["c_attn", "c_fc", "c_proj"],
    # GPT-2's targeted projections are Conv1D layers, which store weights as
    # (fan_in, fan_out). PEFT documents this flag for exactly that case;
    # setting it explicitly avoids PEFT's warn-and-override at wrap time.
    fan_in_fan_out=True,
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
model.to(device)


## Train and evaluate the LoRA model
Training uses linear warmup, gradient clipping, and validation after every epoch.


In [None]:
def evaluate(model, data_loader):
    """Compute accuracy and mean per-example loss of *model* on *data_loader*.

    Args:
        model: Callable module that accepts the batch dict as keyword
            arguments and returns an object with ``.loss`` and ``.logits``.
        data_loader: Iterable of dict batches; each batch must contain a
            "labels" tensor.

    Returns:
        ``(accuracy, average_loss)``; both are 0.0 when no example was seen.

    Note:
        The model is left in eval mode. Batches are moved to the device of
        the model's first parameter, so no global ``device`` is required.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for batch in data_loader:
            if target_device is not None:
                batch = {k: v.to(target_device) for k, v in batch.items()}
            outputs = model(**batch)
            batch_size = batch["labels"].size(0)
            preds = outputs.logits.argmax(dim=-1)
            # Assumes outputs.loss is a mean over the batch. Weighting by the
            # batch size yields a true per-example average even when the last
            # batch is smaller than the others.
            total_loss += outputs.loss.item() * batch_size
            correct += (preds == batch["labels"]).sum().item()
            total += batch_size

    if total == 0:
        return 0.0, 0.0
    return correct / total, total_loss / total

def train(model, train_loader, val_loader, epochs=3, lr=5e-4):
    """Fine-tune *model* with AdamW, linear warmup/decay and gradient clipping.

    After every epoch the model is evaluated on *val_loader* and the mean
    training loss, validation loss and validation accuracy are printed.

    Args:
        model: Module returning an object with ``.loss`` when called with a
            batch dict as keyword arguments.
        train_loader: Sized iterable of training batches (dicts of tensors).
        val_loader: Iterable of validation batches passed to ``evaluate``.
        epochs: Number of passes over *train_loader*.
        lr: Peak learning rate reached at the end of warmup.
    """
    # Only parameters that require gradients are optimized and clipped;
    # frozen weights never receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=lr)

    # Follow the model's own device instead of relying on a global variable.
    target_device = next(model.parameters()).device

    steps_per_epoch = len(train_loader)
    total_steps = epochs * steps_per_epoch
    # At least 10 warmup steps, but never more than the whole schedule
    # (matters for very short runs).
    warmup_steps = min(max(10, int(0.1 * total_steps)), total_steps)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    for epoch in range(epochs):
        model.train()  # evaluate() leaves the model in eval mode
        running_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            batch = {k: v.to(target_device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, max_norm=1.0)
            optimizer.step()
            scheduler.step()
            running_loss += loss.item()

        val_acc, val_loss = evaluate(model, val_loader)
        # max(..., 1) guards against division by zero for an empty loader.
        print(
            f"Epoch {epoch+1}: train_loss={running_loss/max(steps_per_epoch, 1):.4f} "
            f"| val_loss={val_loss:.4f} | val_acc={val_acc*100:.1f}%"
        )


In [None]:
# Fine-tune, then report performance on the held-out test split.
train(model, train_loader, val_loader, epochs=3, lr=5e-4)

test_acc, test_loss = evaluate(model, test_loader)
print(f"Test: loss={test_loss:.4f}, acc={test_acc*100:.1f}%")

# A few hand-written messages for a quick qualitative check.
sample_texts = [
    "Hey, want to grab lunch together?",
    "URGENT! You've won ! Click now to claim your prize!",
    "The meeting has been moved to 2pm",
    "Free iPhone! Limited time offer! Call immediately!",
]

model.eval()
with torch.no_grad():
    for text in sample_texts:
        # Tokenize exactly like the training data (same padding and length).
        encoded = tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        logits = model(**encoded).logits
        probs = logits.softmax(dim=-1)[0]
        pred = probs.argmax().item()
        if pred == 1:
            label = "spam"
        else:
            label = "ham"
        print(f"{label.upper()} | p(spam)={probs[1]:.3f} | text={text[:50]}...")
