# In [4]:  (Jupyter cell prompt left over from the notebook export)
# run_cnn_baselines.py
# PyTorch CNN baselines for text classification
# - "bertcnn": BERT encoder + CNN head (recommended)
# - "textcnn": lightweight baseline
# Fix: auto-cap max_len to model's max_position_embeddings (e.g., KcBERT=300)

import os, math, random, numpy as np, pandas as pd, torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, classification_report
from transformers import (
    AutoTokenizer, AutoModel, AutoConfig, set_seed
)

# ---------------- CFG ----------------
class CFG:
    """Run-wide settings, read directly as class attributes.

    ``max_len`` is not constant: ``prepare_tokenizer_and_maxlen`` overwrites it
    with ``min(max_len, <model position limit>)`` before any batch is built.
    """
    model_type   = "bertcnn"          # "bertcnn" or "textcnn"; any other value falls into the bertcnn branch of main()
    bert_name    = "beomi/KcBERT-base"  # checkpoint used for the tokenizer and (bertcnn only) the encoder
    max_len      = 512                # requested sequence length; capped to the model limit at runtime
    batch_size   = 16                 # shared by the train, validation and test loaders
    epochs       = 5
    lr           = 2e-5               # learning rate when model_type is bertcnn
    lr_textcnn   = 1e-3               # learning rate when model_type is textcnn
    weight_decay = 0.01               # applied only to the bertcnn parameter group that excludes bias/LayerNorm weights
    val_ratio    = 0.15               # fraction of train.csv held out by the stratified split
    seed         = 2025               # used for set_seed and for the split's random_state
    data_dir     = "data"             # must contain train.csv and test.csv
    out_dir      = "outputs"          # checkpoint, validation report and submission are written here
    device       = "cuda" if torch.cuda.is_available() else "cpu"

# Import-time setup: these three statements run as soon as the module is loaded.
# Create the output directory up front; exist_ok avoids failing on reruns.
os.makedirs(CFG.out_dir, exist_ok=True)
# Seed through the transformers helper so repeated runs use the same seed.
set_seed(CFG.seed)
# Module-level device handle read by evaluate(), train_one_epoch() and main().
device = torch.device(CFG.device)

# --------- IO ----------
def load_train_df(path):
    """Read the training CSV and normalise it to ``text`` / ``label`` columns.

    The file is expected to provide ``conversation`` and ``class`` columns,
    which are renamed to ``text`` and ``label``. Both columns are cast to
    ``str`` and newlines inside the text are replaced by single spaces.

    Args:
        path: anything ``pandas.read_csv`` accepts.

    Returns:
        The loaded DataFrame with the renamed, cleaned columns.
    """
    column_map = {"conversation": "text", "class": "label"}
    frame = pd.read_csv(path).rename(columns=column_map)

    as_text = frame["text"].astype(str)
    frame["text"] = as_text.str.replace("\n", " ")

    frame["label"] = frame["label"].astype(str)
    return frame

def load_test_df(path):
    """Read the test CSV and add a cleaned ``text`` column.

    The original ``conversation`` column (and every other column) is kept;
    ``text`` holds the same content cast to ``str`` with newlines replaced by
    single spaces.

    Args:
        path: anything ``pandas.read_csv`` accepts.

    Returns:
        The loaded DataFrame with the extra ``text`` column.
    """
    frame = pd.read_csv(path)
    raw_text = frame["conversation"].astype(str)
    frame["text"] = raw_text.str.replace("\n", " ")
    return frame

# Submission code mapping: class name as it appears in train.csv -> code string
# written to the submission file (edit if needed).
# The insertion order also fixes the label-index order used in main().
NAME2CODE = {
    '협박 대화': '00',
    '갈취 대화': '01',
    '직장 내 괴롭힘 대화': '02',
    '기타 괴롭힘 대화': '03',
    '일반 대화': '04',
}

# -------- Tokenizer & Collate --------
def prepare_tokenizer_and_maxlen():
    """Load the tokenizer for ``CFG.bert_name`` and clamp ``CFG.max_len``.

    Side effect: ``CFG.max_len`` is lowered to the model config's
    ``max_position_embeddings`` when it exceeds it (512 is assumed when the
    config has no such attribute). The tokenizer's ``model_max_length`` is set
    to the resulting value.

    Returns:
        The fast tokenizer for ``CFG.bert_name``.
    """
    model_config = AutoConfig.from_pretrained(CFG.bert_name)
    position_limit = getattr(model_config, "max_position_embeddings", 512)
    if position_limit < CFG.max_len:
        # Requested length would overflow the position embeddings; shrink it.
        CFG.max_len = position_limit

    tokenizer = AutoTokenizer.from_pretrained(CFG.bert_name, use_fast=True)
    tokenizer.model_max_length = CFG.max_len
    return tokenizer

def make_collate(tokenizer):
    """Build a ``collate_fn`` that tokenizes a batch of ``(text, label)`` pairs.

    Args:
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.

    Returns:
        A function mapping a list of ``(text, label)`` pairs to
        ``({"input_ids", "attention_mask"}, labels)`` where ``labels`` is a
        ``torch.long`` tensor. Every batch is padded/truncated to the value of
        ``CFG.max_len`` at call time.
    """
    def _fn(batch):
        batch_texts, batch_labels = zip(*batch)
        encoded = tokenizer(
            list(batch_texts),
            padding="max_length",
            truncation=True,
            max_length=CFG.max_len,  # read at call time, i.e. after capping
            return_tensors="pt",
        )
        # Forward only the two tensors the models accept as keyword arguments.
        features = {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
        }
        targets = torch.tensor(batch_labels, dtype=torch.long)
        return features, targets

    return _fn

class RawDataset(torch.utils.data.Dataset):
    """Map-style dataset over raw texts with optional labels.

    Items are ``(text, label)`` pairs; when no labels were supplied the label
    slot is the placeholder ``-1``.
    """

    def __init__(self, texts, labels=None):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # Resolve the label first, then the text, and return them as a pair.
        label = self.labels[i] if self.labels is not None else -1
        text = self.texts[i]
        return text, label

# -------- Models --------
class TextCNN(nn.Module):
    """Embedding -> parallel 1-D convolutions -> global max-pool -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        pad_id: padding token id, passed to the embedding as ``padding_idx``.
        num_labels: number of output classes.
        emb_dim: embedding width.
        kernels: kernel sizes; one ``Conv1d`` is created per entry.
        channels: output channels of every convolution.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, pad_id, num_labels, emb_dim=300, kernels=(3,4,5), channels=128, dropout=0.5):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_id)
        self.convs = nn.ModuleList([nn.Conv1d(emb_dim, channels, k) for k in kernels])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(channels * len(kernels), num_labels)

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape ``[B, num_labels]``.

        Args:
            input_ids: ``[B, L]`` token ids.
            attention_mask: optional ``[B, L]`` mask (non-zero = real token).
                When given, conv outputs whose window starts on a masked
                position are excluded from the max-pool.
        """
        x = self.emb(input_ids).transpose(1, 2)  # [B, L, E] -> [B, E, L]
        feats = []
        for conv in self.convs:
            h = F.relu(conv(x))                  # [B, C, L'] with L' = L - k + 1
            if attention_mask is not None:
                mask = attention_mask[:, : h.size(-1)].unsqueeze(1)  # [B, 1, L']
                # ReLU output is >= 0, so 0.0 can never beat a real activation.
                # Filling with -inf instead would make a fully masked row pool
                # to -inf and turn the logits (and the loss) into NaN/inf.
                h = h.masked_fill(mask == 0, 0.0)
            feats.append(h.max(dim=-1).values)   # global max-pool -> [B, C]
        z = torch.cat(feats, dim=1)              # [B, C * len(kernels)]
        z = self.dropout(z)
        return self.fc(z)                        # [B, num_labels]

class BertCnnHead(nn.Module):
    """Pretrained encoder followed by a multi-kernel CNN + max-pool classifier.

    Args:
        bert_name: checkpoint name/path given to ``AutoModel.from_pretrained``.
        num_labels: number of output classes.
        kernels: kernel sizes; one ``Conv1d`` over the hidden states per entry.
        channels: output channels of every convolution.
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, bert_name, num_labels, kernels=(2,3,4), channels=128, dropout=0.1):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(bert_name)
        hid = self.backbone.config.hidden_size
        self.convs = nn.ModuleList([nn.Conv1d(hid, channels, k) for k in kernels])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(channels * len(kernels), num_labels)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``[B, num_labels]``.

        Args:
            input_ids: ``[B, L]`` token ids.
            attention_mask: ``[B, L]`` mask (non-zero = real token). It is
                passed to the encoder and also used to exclude conv outputs
                whose window starts on a masked position from the max-pool.
        """
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        x = out.last_hidden_state.transpose(1, 2)  # [B, L, H] -> [B, H, L]
        feats = []
        for conv in self.convs:
            h = F.relu(conv(x))                    # [B, C, L'] with L' = L - k + 1
            mask = attention_mask[:, : h.size(-1)].unsqueeze(1)  # [B, 1, L']
            # ReLU output is >= 0, so 0.0 can never beat a real activation.
            # Filling with -inf instead would make a fully masked row pool to
            # -inf and turn the logits (and the loss) into NaN/inf.
            h = h.masked_fill(mask == 0, 0.0)
            feats.append(h.max(dim=-1).values)     # global max-pool -> [B, C]
        z = torch.cat(feats, dim=1)                # [B, C * len(kernels)]
        z = self.dropout(z)
        return self.fc(z)                          # [B, num_labels]

# -------- Train/Eval --------
@torch.no_grad()
def evaluate(model, loader):
    """Score ``model`` on ``loader`` without tracking gradients.

    The model is switched to eval mode and is left in that mode.

    Args:
        model: module called as ``model(**features)`` and returning logits.
        loader: iterable of ``(features_dict, labels)`` batches.

    Returns:
        ``(macro_f1, y_true, y_pred)`` where the last two are NumPy arrays of
        gold and predicted class ids in loader order.
    """
    model.eval()
    prob_batches = []
    gold_batches = []
    for features, targets in loader:
        features = {name: tensor.to(device) for name, tensor in features.items()}
        targets = targets.to(device)
        scores = model(**features)
        prob_batches.append(scores.softmax(1).cpu())
        gold_batches.append(targets.cpu())

    probabilities = torch.cat(prob_batches).numpy()
    gold = torch.cat(gold_batches).numpy()
    predicted = probabilities.argmax(1)
    macro_f1 = f1_score(gold, predicted, average="macro")
    return macro_f1, gold, predicted

def train_one_epoch(model, loader, optimizer, scheduler=None):
    """Run one optimisation pass over ``loader``.

    Each batch does forward, cross-entropy, backward, gradient-norm clipping
    at 1.0 and an optimizer step. When a scheduler is given it is stepped once
    per batch, not once per epoch.

    Args:
        model: module called as ``model(**features)`` and returning logits.
        loader: iterable of ``(features_dict, labels)`` batches with ``len()``.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: optional LR scheduler.

    Returns:
        Mean training loss over the batches (0.0 for an empty loader).
    """
    model.train()
    running_loss = 0.0
    progress = tqdm(loader, ncols=100, leave=False)
    for features, targets in progress:
        features = {name: tensor.to(device) for name, tensor in features.items()}
        targets = targets.to(device)

        batch_loss = F.cross_entropy(model(**features), targets)

        optimizer.zero_grad(set_to_none=True)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        if scheduler:
            scheduler.step()

        running_loss += batch_loss.item()

    num_batches = max(1, len(loader))  # guard against division by zero
    return running_loss / num_batches

# -------- Main --------
def main():
    """Train the configured model, keep the best checkpoint, and write a submission.

    Reads ``train.csv`` / ``test.csv`` from ``CFG.data_dir``, holds out
    ``CFG.val_ratio`` of the training rows with a stratified split, trains for
    ``CFG.epochs`` epochs and keeps the weights with the highest validation
    macro-F1. Writes three files to ``CFG.out_dir``: the checkpoint, the
    validation classification report and ``submission_<model_type>.csv``.

    Raises:
        RuntimeError: if no epoch ran, so there is no checkpoint to save.
    """
    # 1) Data
    train_df = load_train_df(os.path.join(CFG.data_dir, "train.csv"))
    test_df = load_test_df(os.path.join(CFG.data_dir, "test.csv"))

    # Labels: follow the NAME2CODE order where possible; unknown names sort
    # last (stable, in order of first appearance).
    name_rank = {name: rank for rank, name in enumerate(NAME2CODE)}
    labels = sorted(
        train_df["label"].unique().tolist(),
        key=lambda name: name_rank.get(name, 999),
    )
    lid = {label: i for i, label in enumerate(labels)}
    # Unknown class names fall back to their zero-padded label index.
    id2code = {i: NAME2CODE.get(label, f"{i:02d}") for i, label in enumerate(labels)}
    y_all = np.array([lid[label] for label in train_df["label"].tolist()])
    texts = train_df["text"].tolist()

    # 2) Tokenizer (also caps CFG.max_len to the model limit)
    tok = prepare_tokenizer_and_maxlen()

    # 3) Stratified train/validation split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=CFG.val_ratio, random_state=CFG.seed)
    tr_idx, va_idx = next(sss.split(np.arange(len(texts)), y_all))
    tr_texts, va_texts = [texts[i] for i in tr_idx], [texts[i] for i in va_idx]
    tr_labels, va_labels = y_all[tr_idx].tolist(), y_all[va_idx].tolist()

    # 4) Datasets / loaders
    collate = make_collate(tok)
    train_ds = RawDataset(tr_texts, tr_labels)
    val_ds = RawDataset(va_texts, va_labels)
    test_ds = RawDataset(test_df["text"].tolist(), None)

    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=CFG.batch_size, shuffle=True, collate_fn=collate, num_workers=0)
    val_loader = torch.utils.data.DataLoader(val_ds, batch_size=CFG.batch_size, shuffle=False, collate_fn=collate, num_workers=0)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=CFG.batch_size, shuffle=False, collate_fn=collate, num_workers=0)

    # 5) Model, optimizer, scheduler
    if CFG.model_type == "textcnn":
        model = TextCNN(
            # len(tok) also counts added tokens; tok.vocab_size does not, and an
            # id beyond the embedding table would raise an index error.
            vocab_size=len(tok),
            pad_id=tok.pad_token_id if tok.pad_token_id is not None else 0,
            num_labels=len(labels),
            emb_dim=300, kernels=(3, 4, 5), channels=128, dropout=0.2,
        )
        optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr_textcnn)
        scheduler = None
    else:
        model = BertCnnHead(CFG.bert_name, num_labels=len(labels), kernels=(2, 3, 4), channels=128, dropout=0.1)
        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        decay_params, plain_params = [], []
        for param_name, param in model.named_parameters():
            if any(nd in param_name for nd in no_decay):
                plain_params.append(param)
            else:
                decay_params.append(param)
        grouped = [
            {"params": decay_params, "weight_decay": CFG.weight_decay},
            {"params": plain_params, "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(grouped, lr=CFG.lr)

        # Linear warmup over the first 10% of steps, then linear decay to 10% of
        # the base LR. The scheduler is stepped once per batch.
        total_steps = len(train_loader) * CFG.epochs
        warmup = max(1, int(0.1 * total_steps))
        # Clamp so the decay phase never gets total_iters=0 on very short runs.
        decay_steps = max(1, total_steps - warmup)
        scheduler = torch.optim.lr_scheduler.SequentialLR(
            optimizer,
            schedulers=[
                torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup),
                torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=decay_steps),
            ],
            milestones=[warmup],
        )

    model = model.to(device)

    # 6) Train, keeping the best validation checkpoint
    best_f1, best_state, report = -1.0, None, ""
    label_ids = list(range(len(labels)))
    for ep in range(1, CFG.epochs + 1):
        tr_loss = train_one_epoch(model, train_loader, optimizer, scheduler)
        f1, y_true, y_pred = evaluate(model, val_loader)
        print(f"[{CFG.model_type}] Epoch {ep}/{CFG.epochs}  loss={tr_loss:.4f}  val_macro_f1={f1:.4f}")
        if f1 > best_f1:
            best_f1 = f1
            # clone() is essential: on a CPU run .cpu() returns the very same
            # tensor, so without a copy the snapshot would keep tracking the
            # live weights and silently end up as the last epoch, not the best.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            # Explicit label ids keep target_names aligned even if a class is
            # missing from the validation split.
            report = classification_report(
                y_true, y_pred, labels=label_ids, target_names=labels, digits=4
            )

    if best_state is None:
        raise RuntimeError("no training epoch ran (CFG.epochs must be >= 1); nothing to save")

    # 7) Save the checkpoint and the validation report
    torch.save({"state_dict": best_state, "labels": labels}, os.path.join(CFG.out_dir, f"{CFG.model_type}_best.pt"))
    with open(os.path.join(CFG.out_dir, f"{CFG.model_type}_val_report.txt"), "w", encoding="utf-8") as f:
        f.write(report)

    # 8) Predict the test set with the best weights
    model.load_state_dict(best_state)
    model.eval()
    preds = []
    with torch.no_grad():
        for enc, _ in tqdm(test_loader, ncols=100, desc="Predict"):
            enc = {k: v.to(device) for k, v in enc.items()}
            logits = model(**enc)
            preds.append(logits.argmax(1).cpu())
    pred_ids = torch.cat(preds).numpy()
    pred_codes = [id2code[int(i)] for i in pred_ids]

    sub = pd.DataFrame({"idx": test_df["idx"], "class": pred_codes})
    out_path = os.path.join(CFG.out_dir, f"submission_{CFG.model_type}.csv")
    sub.to_csv(out_path, index=False)
    print("saved:", out_path)

# Run the full pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


                                                                                                    

# ---- Captured output of the run above (notebook stdout, kept for reference) ----
# [bertcnn] Epoch 1/5  loss=0.7510  val_macro_f1=0.9122
# [bertcnn] Epoch 2/5  loss=0.2144  val_macro_f1=0.9206
# [bertcnn] Epoch 3/5  loss=0.1109  val_macro_f1=0.9302
# [bertcnn] Epoch 4/5  loss=0.0336  val_macro_f1=0.9288
# [bertcnn] Epoch 5/5  loss=0.0142  val_macro_f1=0.9340
# Predict: 100%|██████████████████████████████████████████████████████| 32/32 [00:01<00:00, 18.95it/s]
# saved: outputs\submission_bertcnn.csv



