In [None]:
import os, json, math
import pandas as pd, numpy as np, torch
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import f1_score, classification_report

from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    BertForSequenceClassification, get_linear_schedule_with_warmup, set_seed
)

class CFG:
    """Static configuration (hyper-parameters and paths) for this script."""

    # Pretrained checkpoint and tokenizer truncation length.
    model_name: str = "skt/kobert-base-v1"
    max_len: int = 384

    # Optimisation settings.
    lr: float = 3e-5
    batch_size: int = 16
    epochs: int = 3
    seed: int = 2025

    # Fraction of the training set held out for validation.
    val_ratio: float = 0.1

    # Input and output directories.
    data_dir: str = "data"
    out_dir: str = "outputs"

    # Replacement text substituted for newline characters in conversations.
    nl_token: str = " [NL] "

# Make sure the checkpoint and report directories exist before anything
# tries to write into them.
os.makedirs(f"{CFG.out_dir}/models", exist_ok=True)
os.makedirs(f"{CFG.out_dir}/reports", exist_ok=True)

# Fix the random seed through transformers' set_seed helper.
set_seed(CFG.seed)

# Run on the GPU when one is visible to torch, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def load_train_df(path: str) -> pd.DataFrame:
    """Read the training CSV and normalise it to the internal schema.

    The external columns ``conversation`` and ``class`` are renamed to
    ``text`` and ``label``. Both are cast to ``str``, and every newline in
    ``text`` is replaced with ``CFG.nl_token``.

    Args:
        path: Location of the training CSV file.

    Returns:
        The loaded frame with ``text`` and ``label`` string columns.
    """
    df = pd.read_csv(path)
    # Map the external schema onto the internal one.
    df = df.rename(columns={"conversation": "text", "class": "label"})
    # regex=False makes this a literal replacement on every pandas version
    # (the default for `regex` differs between pandas 1.x and 2.x) and keeps
    # the replacement token from ever being read as a regex template.
    df["text"] = df["text"].astype(str).str.replace("\n", CFG.nl_token, regex=False)
    df["label"] = df["label"].astype(str)
    return df

def load_test_df(path: str) -> pd.DataFrame:
    """Read the test CSV and normalise it to the internal schema.

    The untouched conversation is kept in ``text_original``. The
    ``conversation`` column is renamed to ``text`` and gets the same
    newline replacement that is applied to the training data.

    Args:
        path: Location of the test CSV file.

    Returns:
        The loaded frame with ``text`` and ``text_original`` string columns.
    """
    df = pd.read_csv(path)
    # Keep the raw conversation for later use before it is modified.
    df["text_original"] = df["conversation"].astype(str)
    df = df.rename(columns={"conversation": "text"})
    # Same newline handling as the training data. regex=False forces a
    # literal replacement regardless of the installed pandas version.
    df["text"] = df["text"].astype(str).str.replace("\n", CFG.nl_token, regex=False)
    return df

def build_label_maps(train_df):
    """Build the ordered label list and the label <-> id lookup tables.

    Labels are ordered numerically when every one of them converts to an
    int, and with the default ordering otherwise.

    Args:
        train_df: Frame with a ``label`` column.

    Returns:
        A ``(labels, lid, inv)`` tuple: the ordered labels, a label -> id
        dict, and the inverse id -> label dict.
    """
    distinct = train_df["label"].unique().tolist()
    try:
        # Numeric labels: order by integer value rather than lexically.
        labels = sorted(distinct, key=int)
    except (ValueError, TypeError):
        labels = sorted(distinct)
    lid = dict(zip(labels, range(len(labels))))
    inv = dict(enumerate(labels))
    return labels, lid, inv

class TxtClsDS(Dataset):
    """Dataset of ``(text, label_id)`` pairs built from a data frame.

    Texts come from the ``text`` column. When ``has_label`` is true, the
    ``label`` column is translated to ids through ``lid``; otherwise no
    labels are stored and every item carries the placeholder id ``-1``.
    """

    def __init__(self, df, lid=None, has_label=True):
        self.texts = df["text"].tolist()
        self.has_label = has_label
        if has_label:
            self.labels = [lid[name] for name in df["label"].tolist()]
        else:
            self.labels = None

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        label_id = self.labels[i] if self.has_label else -1
        return self.texts[i], label_id

def make_collate(tokenizer, max_len):
    """Create a DataLoader ``collate_fn`` bound to a tokenizer.

    Args:
        tokenizer: Callable applied to the list of batch texts.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        A function turning a list of ``(text, label_id)`` items into an
        ``(encoding, labels)`` pair, where ``labels`` is a ``torch.long``
        tensor.
    """
    def _collate(batch):
        batch_texts = [sample[0] for sample in batch]
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
        )
        label_ids = [sample[1] for sample in batch]
        return encoded, torch.tensor(label_ids, dtype=torch.long)

    return _collate

# Worker seeding hook for reproducibility (only needed when num_workers > 0).
# def seed_worker(worker_id):
#     np.random.seed(CFG.seed + worker_id)

@torch.no_grad()
def evaluate(model, loader):
    """Score ``model`` on ``loader`` with macro-averaged F1.

    Args:
        model: Model whose forward output exposes ``.logits``.
        loader: Iterable of ``(encoding, labels)`` batches.

    Returns:
        A ``(macro_f1, y_true, y_pred)`` tuple, where the last two are NumPy
        arrays of gold and predicted class ids.
    """
    model.eval()
    batch_logits, batch_targets = [], []
    for enc, labels in loader:
        inputs = {name: tensor.to(device) for name, tensor in enc.items()}
        labels = labels.to(device)
        output = model(**inputs)
        # Collect everything on the CPU so the results can be concatenated
        # and converted to NumPy afterwards.
        batch_logits.append(output.logits.detach().cpu())
        batch_targets.append(labels.detach().cpu())
    scores = torch.cat(batch_logits, 0).numpy()
    y_true = torch.cat(batch_targets, 0).numpy()
    y_pred = scores.argmax(1)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return macro_f1, y_true, y_pred

def train_epoch(model, loader, optimizer, scheduler):
    """Run one pass over ``loader`` and return the mean batch loss.

    Each batch does a forward pass with labels, backpropagates, clips the
    gradient norm to 1.0, then steps the optimizer and the scheduler.

    Args:
        model: Model whose forward output exposes ``.loss``.
        loader: Iterable of ``(encoding, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        The summed batch losses divided by the number of batches (the
        divisor is at least 1, so an empty loader yields 0.0).
    """
    model.train()
    running_loss = 0.0
    for enc, labels in loader:
        inputs = {name: tensor.to(device) for name, tensor in enc.items()}
        targets = labels.to(device)
        loss = model(**inputs, labels=targets).loss

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    n_batches = len(loader)
    return running_loss / (n_batches if n_batches > 1 else 1)

def load_tok_model(model_name, num_labels):
    """Load the tokenizer and a sequence-classification model.

    The model is first loaded through ``AutoModelForSequenceClassification``.
    If that raises, the checkpoint is loaded with
    ``BertForSequenceClassification`` instead, and the original error is
    printed so the fallback is visible in the run log.

    Args:
        model_name: Checkpoint name or path given to ``from_pretrained``.
        num_labels: Number of output classes set on the model config.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    cfg = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    try:
        mdl = AutoModelForSequenceClassification.from_pretrained(model_name, config=cfg)
    except Exception as err:
        # Deliberate best-effort: if the Auto class fails, force the BERT
        # classification head. Report why, rather than hiding the cause.
        print(
            "[load_tok_model] AutoModelForSequenceClassification failed "
            f"({type(err).__name__}: {err}); "
            "falling back to BertForSequenceClassification."
        )
        mdl = BertForSequenceClassification.from_pretrained(model_name, config=cfg)
    return tok, mdl

@torch.no_grad()
def _predict_label_ids(model, loader):
    """Return the argmax class id of every sample in ``loader`` as a NumPy array."""
    model.eval()
    batches = []
    for enc, _ in loader:
        enc = {k: v.to(device) for k, v in enc.items()}
        batches.append(model(**enc).logits.argmax(1).cpu())
    if not batches:
        # torch.cat rejects an empty list; an empty test set has no predictions.
        return np.empty(0, dtype=np.int64)
    return torch.cat(batches).numpy()


def main():
    """Train the classifier, write validation reports, and save test predictions.

    Steps:
      1. Load the train/test CSVs and build the label maps.
      2. Split off a validation subset with a seeded ``random_split``.
      3. Fine-tune for ``CFG.epochs`` epochs, saving a checkpoint whenever
         the validation macro-F1 improves.
      4. Write the classification report and score of the best epoch.
      5. Reload the best checkpoint and predict the test set in batches.
      6. Save ``submission.csv`` with the original conversation text.
    """
    train_df = load_train_df(f"{CFG.data_dir}/train.csv")
    test_df  = load_test_df(f"{CFG.data_dir}/test.csv")

    labels, lid, inv = build_label_maps(train_df)
    full_ds = TxtClsDS(train_df, lid, has_label=True)

    # Automatic validation split, seeded so it is the same on every run.
    val_size = max(1, int(len(full_ds) * CFG.val_ratio))
    train_size = len(full_ds) - val_size
    split_gen = torch.Generator().manual_seed(CFG.seed)
    train_ds, val_ds = random_split(full_ds, [train_size, val_size], generator=split_gen)

    tok, model = load_tok_model(CFG.model_name, num_labels=len(labels))
    model = model.to(device)

    collate = make_collate(tok, CFG.max_len)
    # num_workers=0 keeps data loading in the main process (Windows compatibility).
    train_loader = DataLoader(train_ds, batch_size=CFG.batch_size, shuffle=True,
                              num_workers=0, collate_fn=collate)
    val_loader   = DataLoader(val_ds,   batch_size=CFG.batch_size, shuffle=False,
                              num_workers=0, collate_fn=collate)

    # Linear schedule with the first 10% of all optimizer steps as warm-up.
    total_steps = len(train_loader) * CFG.epochs
    optim = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=0.01)
    sched = get_linear_schedule_with_warmup(optim, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps)

    # Passing the full id list to classification_report keeps it from raising
    # when the validation split happens to miss one of the classes.
    label_ids = list(range(len(labels)))

    best_f1, best_path, best_report = -1.0, None, ""
    for ep in range(1, CFG.epochs+1):
        loss = train_epoch(model, train_loader, optim, sched)
        f1, y_true, y_pred = evaluate(model, val_loader)
        print(f"[Epoch {ep}] loss={loss:.4f}  val_macro_f1={f1:.4f}")
        if f1 > best_f1:
            best_f1 = f1
            best_path = f"{CFG.out_dir}/models/kobert_best.pt"
            torch.save({"state_dict": model.state_dict(), "labels": labels}, best_path)
            best_report = classification_report(
                y_true, y_pred, labels=label_ids, target_names=labels, digits=4
            )

    with open(f"{CFG.out_dir}/reports/val_classification_report.txt","w",encoding="utf-8") as f:
        f.write(best_report)
    with open(f"{CFG.out_dir}/reports/baseline_results.json","w",encoding="utf-8") as f:
        json.dump({"val_macro_f1": float(best_f1)}, f, ensure_ascii=False, indent=2)

    # The in-memory model holds the last epoch's weights, which may be worse
    # than the saved best. Restore the best checkpoint so the submission
    # matches the reported validation score.
    if best_path is not None:
        checkpoint = torch.load(best_path, map_location=device)
        model.load_state_dict(checkpoint["state_dict"])

    # Predict the test set batch by batch to keep memory use bounded.
    print("\n=== Test Prediction ===")
    test_ds = TxtClsDS(test_df, has_label=False)
    test_loader = DataLoader(test_ds, batch_size=CFG.batch_size, shuffle=False,
                             num_workers=0, collate_fn=collate)

    pred_ids = _predict_label_ids(model, test_loader)
    pred_labels = [inv[int(i)] for i in pred_ids]

    # The submission carries the original conversation, not the
    # newline-substituted text fed to the model.
    sub = pd.DataFrame({
        "idx": test_df["idx"],
        "conversation": test_df["text_original"],
        "class": pred_labels
    })
    os.makedirs(CFG.out_dir, exist_ok=True)
    sub.to_csv(f"{CFG.out_dir}/submission.csv", index=False)
    print(f"Saved: {CFG.out_dir}/submission.csv")
    print(f"Predictions shape: {sub.shape}")
    print(f"Sample predictions:\n{sub.head()}")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
