# 라이브러리 설치 및 임포트, 시드 고정

In [1]:
!unzip ./3595-4FOLD.zip

Archive:  ./3595-4FOLD.zip
  inflating: fold0.csv               
  inflating: fold1.csv               
  inflating: fold2.csv               
  inflating: fold3.csv               
  inflating: sample_submission.csv   
  inflating: test_preprocessed.csv   


In [2]:
!pip install -r requirements.txt \
  --extra-index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu124


In [3]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, TrainerCallback
from transformers import pipeline
import torch
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import roc_auc_score
import datetime as dt
import random
import re
import os
from tqdm import tqdm
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

SEED = 42
seed_everything(SEED) # Seed 고정

# 데이터 불러오기

In [5]:
# 전체 fold 파일 경로 리스트 (0~4)
val_fold_idx = 2  # 예: 3을 넣으면 fold3.csv가 validation으로, 나머지(0,1,2,4)가 train으로 사용


fold_paths = [f"./fold{i}.csv" for i in range(4)]

FOLD_VAL   = fold_paths[val_fold_idx]
FOLD_TRAIN = [path for idx, path in enumerate(fold_paths) if idx != val_fold_idx]

print("▶ Train folds:", FOLD_TRAIN)
print("▶ Validation fold:", FOLD_VAL)

TEST_CSV        = "./test_preprocessed.csv"
SUBMISSION_CSV  = "./sample_submission.csv"

▶ Train folds: ['./fold0.csv', './fold1.csv', './fold3.csv']
▶ Validation fold: ./fold2.csv


In [6]:
# ==============================================================
# 1) fold1~4 → 학습, fold0 → 검증
# ==============================================================

# 학습용 데이터프레임
train_df = pd.concat(
    [pd.read_csv(p, encoding="utf-8-sig") for p in FOLD_TRAIN],
    ignore_index=True
)

# 검증용 데이터프레임
val_df   = pd.read_csv(FOLD_VAL, encoding="utf-8-sig")

# ── 필요 없는 열 제거 & 컬럼명 통일 ──
train_df = train_df[['full_text', 'generated']].rename(
    columns={'full_text':'text', 'generated':'label'}
)
val_df   = val_df  [['full_text', 'generated']].rename(
    columns={'full_text':'text', 'generated':'label'}
)

# ── 학습 세트 셔플 ──
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

print("최종 학습 샘플 수:", len(train_df))
print("최종 학습 클래스 분포:", train_df['label'].value_counts().to_dict())
print("검증 샘플 수:", len(val_df))
print("검증 클래스 분포:", val_df['label'].value_counts().to_dict())

최종 학습 샘플 수: 91143
최종 학습 클래스 분포: {0: 45572, 1: 45571}
검증 샘플 수: 30381
검증 클래스 분포: {1: 15191, 0: 15190}


In [7]:
# ==============================================================
# 2) Hugging Face Dataset 변환
# ==============================================================
train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)

In [8]:
# ==============================================================
# 3) 토큰화
# ==============================================================

MODEL_NAME = "LGAI-EXAONE/EXAONE-3.5-32B-Instruct"  # 사전학습 모델 이름 (Hugging Face 모델 허브)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# 학습/검증 데이터를 토큰화
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# 토크나이저가 반환한 컬럼과 원본 텍스트 컬럼 정리 (모델 입력에 필요 없는 컬럼 제거)
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

# 라벨 컬럼명 변경
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset   = val_dataset.rename_column("label", "labels")

Map:   0%|          | 0/91143 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 91143/91143 [00:06<00:00, 13542.67 examples/s]
Map: 100%|██████████| 30381/30381 [00:02<00:00, 13210.89 examples/s]


In [9]:
# ==============================================================
# 4) Data Collator
# ==============================================================
data_collator = DataCollatorWithPadding(tokenizer, padding=True)

In [10]:
# 장치 설정 (GPU 사용 가능 여부)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # A100에서는 bfloat16 사용 권장
    bnb_4bit_quant_type="nf4",             # NF4 양자화 방식
    bnb_4bit_use_double_quant=True         # 메모리 효율 추가 향상 옵션
)

# 사전훈련 모델 로드 (시퀀스 분류용 헤드 포함) 및 GPU 이동
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2, quantization_config=bnb_config, torch_dtype=torch.bfloat16, trust_remote_code=True)
model.to(device)

Loading checkpoint shards: 100%|██████████| 27/27 [00:27<00:00,  1.02s/it]
Some weights of ExaoneForSequenceClassification were not initialized from the model checkpoint at LGAI-EXAONE/EXAONE-3.5-32B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ExaoneForSequenceClassification(
  (transformer): ExaoneModel(
    (wte): Embedding(102400, 5120, padding_idx=0)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-63): 64 x ExaoneBlock(
        (ln_1): ExaoneRMSNorm()
        (attn): ExaoneAttention(
          (attention): ExaoneSdpaAttention(
            (rotary): ExaoneRotaryEmbedding()
            (k_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
            (v_proj): Linear4bit(in_features=5120, out_features=1024, bias=False)
            (q_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
            (out_proj): Linear4bit(in_features=5120, out_features=5120, bias=False)
          )
        )
        (ln_2): ExaoneRMSNorm()
        (mlp): ExaoneGatedMLP(
          (c_fc_0): Linear4bit(in_features=5120, out_features=27392, bias=False)
          (c_fc_1): Linear4bit(in_features=5120, out_features=27392, bias=False)
          (c_proj): Linear4bit(in_features=27392, out_featur

In [11]:
# LoRA 설정 구성
R = 32
LORA_ALPHA = 16
LORA_DROPOUT = 0.1
lora_config = LoraConfig(
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    task_type=TaskType.SEQ_CLS,
    target_modules= ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)

# 원본 모델에 LoRA 어댑터 추가
model = get_peft_model(model, lora_config)


In [12]:
model.print_trainable_parameters()

trainable params: 46,147,584 || all params: 31,525,069,824 || trainable%: 0.1464


In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = logits[:, 1]  # 클래스 1의 확률 추정값
    roc_auc = roc_auc_score(labels, probs)
    return {"roc_auc": roc_auc}

In [14]:
# 훈련 파라미터 설정
training_args = TrainingArguments(
    output_dir="./xone_model2_checkpoint", # 출력 디렉토리
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8, # 배치 크기 (GPU 메모리에 맞게 조절)
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    # eval_strategy="steps", # 매 epoch 후 검증
    save_strategy="epoch", # 매 epoch 후 모델 저장
    # gradient_accumulation_steps=1,
    # load_best_model_at_end=True, # 검증 성능 가장 좋은 모델 사용
    metric_for_best_model="roc_auc", # best 모델 선정 기준
    greater_is_better=True,
     # 👉 로그를 step마다 찍기
    logging_strategy="steps",   # 기본값이긴 하지만 명시하는 편이 안전
    logging_steps=1000,          # 100 step마다 loss 출력
    logging_first_step=True,    # 0-step 로그도 보고 싶다면
    save_total_limit=2, # 체크포인트 저장 제한
    seed=SEED,
    dataloader_drop_last=False,
    # 👇 로그를 어디로 보낼지: 콘솔 + TensorBoard 권장
    report_to="none",
    label_names=["labels"]
)

In [15]:
# Trainer 객체 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# 모델 훈련 시작
trainer.train()

Step,Training Loss
1,3.2031
1000,0.8937
2000,0.5655
3000,0.5421
4000,0.5227
5000,0.5256
6000,0.5198
7000,0.5071
8000,0.5156
9000,0.5179


In [None]:
# fine-tuned 모델을 로컬에 저장
output_dir = "./xone_model2"
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("모델이 저장되었습니다:", output_dir)


In [None]:
# model.eval 키고 test 추론

In [None]:
# 테스트 데이터 불러오기
test_df = pd.read_csv(TEST_CSV, encoding='utf-8-sig')
submission_df = pd.read_csv(SUBMISSION_CSV, encoding='utf-8-sig')

print("테스트 샘플 수:", len(test_df))
# 각 테스트 샘플에 대해 추론
pred_probs = []

In [None]:
trainer.model.eval()

In [None]:
# 만약 세션을 재시작해서 model이 메모리에 없다면, 다음처럼 불러올 수 있습니다:
# model = AutoModelForSequenceClassification.from_pretrained("fine-tuned-model")
# tokenizer = AutoTokenizer.from_pretrained("fine-tuned-model")
# (여기서는 trainer.model에 이미 최적 모델이 들어 있다고 가정)
# 추론 파이프라인 구성 (GPU 사용, 모든 클래스 점수 출력)
clf = pipeline(
    "text-classification",
    model=trainer.model, # 미세조정된 모델
    tokenizer=tokenizer,
    return_all_scores=True,
    # device=0
)

In [None]:
print("샘플 결과 예시:", clf(test_df['paragraph_text'][0]))


In [None]:
for text in test_df['paragraph_text']:
    scores = clf(text)[0]  # OK: 리스트 구조 한 번 벗겨냄
    prob_ai = None
    for s in scores:
        if s['label'] in ['LABEL_1', '1', 'generated']:
            prob_ai = s['score']
            break
    if prob_ai is None:
        prob_ai = scores[1]['score']  # fallback (정렬이 보장된다고 가정)
    pred_probs.append(prob_ai)

In [None]:
# 결과를 제출 데이터프레임에 기록
submission_df['generated'] = pred_probs

In [None]:
submission_df

In [None]:
submission_df.to_csv("./test_xone_fold2.csv", index=False, encoding="utf-8-sig")

In [None]:
# val 배치 추론

In [None]:
def tokenize_test(batch):
    return tokenizer(batch["text"], truncation=True)

val_ds = Dataset.from_pandas(val_df)

val_ds = val_ds.map(tokenize_test, batched=True,
                      remove_columns=["text", "label"])

In [None]:
def collate(features):
    """
    • 동적 padding → tensor 변환
    • tokenizer가 추가한 'length' 류 메타키 제거
    """
    batch = data_collator(features)
    return batch

In [30]:
BATCH_TEST = 8
loader = DataLoader(
    val_ds,
    batch_size=BATCH_TEST,
    shuffle=False,
    collate_fn=collate,
    pin_memory=True,
)

probs_list = []

with torch.no_grad():
    for batch in tqdm(loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = trainer.model(**batch).logits
        probs  = torch.softmax(logits, dim=-1)[:, 1]
        probs_list.append(probs.cpu())

probs = torch.cat(probs_list).to(torch.float32).numpy()
print(f"[✓] Inference done – {len(probs)} samples")

100%|██████████| 3798/3798 [46:03<00:00,  1.37it/s]

[✓] Inference done – 30381 samples





In [31]:
val_df['generated'] = probs

In [32]:
val_df['ID'] = pd.read_csv(FOLD_VAL, encoding="utf-8-sig")['id']

In [33]:
val_df = val_df[['ID', 'generated', 'label']]

In [36]:
val_df.to_csv("./val_xone_fold2.csv", index=False, encoding="utf-8-sig")

In [35]:
val_df

Unnamed: 0,ID,generated,label
0,FOLD2_00000,1.000000,1
1,FOLD2_00001,0.259766,0
2,FOLD2_00002,1.000000,1
3,FOLD2_00003,0.275391,1
4,FOLD2_00004,0.365234,0
...,...,...,...
30376,FOLD2_30376,0.255859,1
30377,FOLD2_30377,0.363281,0
30378,FOLD2_30378,0.306641,0
30379,FOLD2_30379,0.308594,1
