In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under it can be read and
# written like a local directory (Colab-only; this prompts for authorization).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Remove the runtime's existing torch stack, then install the CUDA 11.8
# builds of torch 2.1.2 / torchvision 0.16.2 / torchaudio 2.1.2 from the
# PyTorch wheel index.
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118

# Only transformers is pinned (4.38.2). accelerate, peft and datasets are
# unpinned, so the versions installed depend on when this cell is run.
!pip install transformers==4.38.2 accelerate peft datasets


Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.1.2
  Downloading https://download.pytorch.org/whl/cu118/torch-2.1.2%2Bcu118-cp311-cp311-linux_x86_64.whl (2325.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m998.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.16.2
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.16.2%2Bcu118-cp311-cp311-linux_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from datasets import Dataset
from datasets.features import Value

import random
import torch

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (the
    default generator plus all CUDA devices), then puts cuDNN into
    deterministic mode with benchmark autotuning turned off.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN autotuning for run-to-run repeatability.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs before any data or model work.
set_seed(42)

# 1) Load the train and test CSVs from Drive.
train_df = pd.read_csv('/content/drive/MyDrive/LLM/train.csv', encoding='utf-8-sig')
test_df = pd.read_csv('/content/drive/MyDrive/LLM/test.csv', encoding='utf-8-sig')

# 2) Use the full body text directly as the model input; the target column
#    becomes a float so it can be used as a regression label later.
train_df = (
    train_df
    .rename(columns={'full_text':'text', 'generated':'label'})
    .assign(label=lambda frame: frame['label'].astype(float))
)

# 3) Stratified K-fold setup.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

# Checkpoint name; change this to swap the backbone.
MODEL_NAME = 'monologg/kobigbird-bert-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Inspect the distribution of token counts over the training texts.
token_lengths = train_df['text'].apply(lambda x: len(tokenizer(x)['input_ids']))
length_stats = token_lengths.describe()
print(" Token length statistics:")
print(length_stats)

In [None]:
# 4) Tokenizer function
def tokenize_fn(examples):
    """Tokenize the ``'text'`` field of a batch with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly 1024 tokens.

    Args:
        examples: Mapping with a ``'text'`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    encode_options = {
        'max_length': 1024,
        'truncation': True,
        'padding': 'max_length',
    }
    return tokenizer(examples['text'], **encode_options)

# Per-fold test-set predictions and per-fold validation AUCs, filled in by
# the cross-validation loop below.
all_fold_preds = []
fold_aucs = []


# Metric function (identical for every fold, so it is defined once).
def compute_metrics(eval_pred):
    """Compute ROC AUC from the regression head's raw outputs.

    Args:
        eval_pred: ``(logits, labels)`` pair supplied by ``Trainer``.

    Returns:
        Dict with a single ``'eval_auc'`` entry.
    """
    logits, labels = eval_pred
    # reshape(-1) rather than squeeze() so a single-row batch stays 1-D.
    preds = logits.reshape(-1)
    auc = roc_auc_score(labels, preds)
    return {'eval_auc': auc}


# Build and tokenize the test set once; it does not depend on the fold, so
# re-tokenizing it inside the loop was repeated work.
test_ds = Dataset.from_pandas(
    test_df.rename(columns={'paragraph_text':'text'}).reset_index(drop=True)
).map(tokenize_fn, batched=True)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
    print(f"===== Fold {fold+1}/{n_splits} =====")

    # 5) Build the HuggingFace datasets for this fold.
    train_fold = Dataset.from_pandas(train_df.iloc[train_idx].reset_index(drop=True))
    val_fold   = Dataset.from_pandas(train_df.iloc[val_idx].reset_index(drop=True))
    # Rename the target column to 'labels' and cast it to float32.
    train_fold = train_fold.rename_column('label', 'labels').cast_column('labels', Value('float32'))
    val_fold   = val_fold.rename_column('label', 'labels').cast_column('labels', Value('float32'))

    # 6) Tokenize.
    train_fold = train_fold.map(tokenize_fn, batched=True)
    val_fold   = val_fold.map(tokenize_fn, batched=True)

    # 7) Fresh model per fold, with a single-output regression head.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=1,
        problem_type='regression'
    )

    # 8) Training arguments.
    # NOTE(review): eval_steps only takes effect when the evaluation strategy
    # is 'steps'; the strategy is left at its default here, so no
    # mid-training evaluation is expected — confirm that is intended.
    args = TrainingArguments(
        output_dir=f'./outputs/fold{fold}',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        eval_steps=500,
        save_steps=500,
        logging_steps=200,
        learning_rate=2e-5,
        weight_decay=0.01,
        max_grad_norm=1.0,
        warmup_ratio=0.1,
        gradient_accumulation_steps=2,
        seed=42,
        report_to=["none"]
    )

    # 10) Trainer.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_fold,
        eval_dataset=val_fold,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # 11) Train, then run ONE inference pass over the validation fold. With
    #     metric_key_prefix='eval' the returned metrics use the same keys as
    #     trainer.evaluate() ('eval_auc', ...), and the same pass yields the
    #     predictions, so the fold is not inferred twice.
    trainer.train()
    val_output = trainer.predict(val_fold, metric_key_prefix='eval')
    metrics = val_output.metrics
    auc_score = metrics.get('eval_auc', 0.0)
    fold_aucs.append(auc_score)
    print(f"Fold {fold+1} AUC: {auc_score:.4f}")

    # 12) Store the out-of-fold predictions. val_idx holds positions, so they
    #     are mapped to index labels before the label-based .loc assignment.
    val_preds = val_output.predictions.reshape(-1)
    train_df.loc[train_df.index[val_idx], 'fold_prob'] = val_preds

    # 13) Predict the test set with this fold's model.
    test_preds = trainer.predict(test_ds).predictions.reshape(-1)
    all_fold_preds.append(test_preds)

    # 16) Save this fold's model and tokenizer. This must run inside the
    #     loop: placed after it, only the final fold's model was ever saved
    #     even though the path is keyed by fold.
    save_dir = f'/content/drive/MyDrive/LLM/kobigbird-model_seed42/fold{fold}'
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

# 14) Report the cross-validation result.
avg_auc = float(np.mean(fold_aucs))
print(f"Average CV AUC: {avg_auc:.4f}")

# 15) Build the submission file from the mean of the per-fold test predictions.
preds = np.mean(np.vstack(all_fold_preds), axis=0)
sample_sub = pd.read_csv('/content/drive/MyDrive/LLM/sample_submission.csv', encoding='utf-8-sig')
sample_sub['generated'] = preds

submission_path = '/content/drive/MyDrive/LLM/kobigbird-model_seed42.csv'
sample_sub.to_csv(submission_path, index=False)

In [None]:
import os
# Send signal 9 to this process's own PID, which terminates the notebook
# kernel immediately; nothing after this line runs.
# NOTE(review): presumably done to release the runtime's RAM/GPU once the
# outputs are written — confirm.
os.kill(os.getpid(), 9)