In [None]:
# Mount Google Drive inside the Colab runtime so the CSV files stored under
# MyDrive are reachable through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score
from datasets import Dataset
from datasets.features import Value


# Data Load & Split

In [None]:
# Read the training and test tables from Drive. 'utf-8-sig' is passed so a
# leading byte-order mark, if present, is not kept as part of the first header.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in (
        '/content/drive/MyDrive/LLM/train.csv',
        '/content/drive/MyDrive/LLM/test.csv',
    )
)

In [None]:
# Explode every training document into one record per non-empty paragraph.
# Paragraphs are separated by blank lines ('\n\n'). Each record carries the
# index of the document it came from (`doc_id`, later used as the CV group key)
# and the document's `generated` value converted to float (`label`).
expanded = [
    {
        'doc_id': doc_idx,
        'paragraph': chunk.strip(),
        'label': float(generated_flag)
    }
    for doc_idx, full_text, generated_flag in zip(
        train_df.index, train_df['full_text'], train_df['generated']
    )
    for chunk in full_text.split('\n\n')
    if chunk.strip()  # skip whitespace-only fragments
]
train_exp = pd.DataFrame(expanded)

In [None]:
# Number of cross-validation folds, and the group-aware splitter built from it.
# The splitter is later fed `doc_id` as the group key, so paragraphs of one
# document stay on the same side of each split.
n_splits = 5
gkf = GroupKFold(n_splits)

In [None]:
# Accumulators filled during cross-validation:
#   preds     - one array of test-set predictions per fold
#   fold_aucs - one validation AUC per fold
preds, fold_aucs = [], []

In [None]:
# Identifier of the pretrained checkpoint; the same name is used to load both
# the tokenizer here and the sequence-classification model for each fold.
MODEL_NAME = 'monologg/koelectra-base-discriminator'
# Tokenizer matching the checkpoint above (fetched/loaded by from_pretrained).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
def tokenize_fn(examples, tokenizer=tokenizer):
    """Tokenize the 'paragraph' field of a batch of examples.

    Args:
        examples: Mapping with a 'paragraph' entry holding the text(s) to
            encode (presumably a list of strings when called through
            `Dataset.map(..., batched=True)`).
        tokenizer: Callable tokenizer; defaults to the module-level tokenizer
            bound at definition time.

    Returns:
        Whatever the tokenizer returns for the paragraphs, with every sequence
        truncated and padded to exactly 256 tokens.
    """
    encode_options = {
        'max_length': 256,
        'truncation': True,
        'padding': 'max_length',
    }
    paragraphs = examples['paragraph']
    return tokenizer(paragraphs, **encode_options)

In [None]:
# Cross-validated fine-tuning.
#
# Every per-fold step lives inside the loop body below, so that each of the
# n_splits folds is actually trained, scored and used for test inference.
# (If the loop only prints the fold header, the later steps run a single time
# with whatever fold / train_idx / val_idx the last iteration left behind.)


def compute_metrics(eval_pred):
    """Return the ROC-AUC of the model outputs against the labels.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        dict with the single key 'eval_auc'.
    """
    logits, labels = eval_pred
    # Single-output head: flatten to 1-D (also safe for a one-row batch).
    scores = logits.reshape(-1)
    return {'eval_auc': roc_auc_score(labels, scores)}


# Start from empty accumulators so re-running this cell never appends a second
# set of folds to results of an earlier run.
preds = []
fold_aucs = []

# The test set is identical for every fold: build and tokenize it once.
test_ds = Dataset.from_pandas(
    test_df.rename(columns={'paragraph_text': 'paragraph'}).reset_index(drop=True)
)
test_ds = test_ds.map(tokenize_fn, batched=True)

# Only these columns go into the fold datasets; this keeps the 'fold_prob'
# column (written during the loop) out of later folds' datasets.
fold_columns = ['doc_id', 'paragraph', 'label']

for fold, (train_idx, val_idx) in enumerate(gkf.split(train_exp, groups=train_exp['doc_id'])):
    print(f"===== Fold {fold+1}/{n_splits} =====")

    # Build the fold datasets and expose the target as float32 'labels'
    train_fold = Dataset.from_pandas(train_exp.iloc[train_idx][fold_columns].reset_index(drop=True))
    val_fold   = Dataset.from_pandas(train_exp.iloc[val_idx][fold_columns].reset_index(drop=True))
    train_fold = train_fold.rename_column('label', 'labels').cast_column('labels', Value('float32'))
    val_fold   = val_fold.rename_column('label', 'labels').cast_column('labels', Value('float32'))

    # Tokenize the paragraphs
    train_fold = train_fold.map(tokenize_fn, batched=True)
    val_fold   = val_fold.map(tokenize_fn, batched=True)

    # Fresh model for this fold (single-output regression head)
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=1,
        problem_type='regression'
    )

    # Training arguments; output directory and seed vary with the fold.
    # NOTE(review): no evaluation strategy is set, so eval_steps presumably has
    # no effect during training - confirm against the installed transformers version.
    args = TrainingArguments(
        output_dir=f'./outputs/fold{fold}',
        num_train_epochs=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        eval_steps=500,
        save_steps=500,
        logging_steps=200,
        learning_rate=2e-5,
        weight_decay=0.01,
        max_grad_norm=1.0,
        warmup_ratio=0.1,
        gradient_accumulation_steps=2,
        seed=42 + fold,
        report_to=["none"]
    )

    # Trainer setup and training
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_fold,
        eval_dataset=val_fold,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()

    # One pass over the validation fold yields both the predictions and the
    # metric. With metric_key_prefix='eval' the key stays 'eval_auc'; indexing
    # directly makes a missing metric fail loudly instead of recording 0.0.
    val_output = trainer.predict(val_fold, metric_key_prefix='eval')
    auc_value = val_output.metrics['eval_auc']
    fold_aucs.append(auc_value)
    print(f"Fold {fold+1} Validation AUC: {auc_value:.4f}")

    # Store out-of-fold predictions; val_idx is positional, so translate it to
    # index labels before assigning through .loc.
    val_probs = val_output.predictions.reshape(-1)
    train_exp.loc[train_exp.index[val_idx], 'fold_prob'] = val_probs

    # Test-set predictions of this fold's model
    test_preds = trainer.predict(test_ds).predictions.reshape(-1)
    preds.append(test_preds)

    # Drop references so this fold's model can be released before the next one is built
    del trainer, model

Map:   0%|          | 0/1962 [00:00<?, ? examples/s]

In [None]:
# 13. Print the mean validation AUC over all recorded folds
average_auc = float(np.average(fold_aucs))
print(f"Average CV AUC: {average_auc:.4f}")

Average CV AUC: 0.9335


In [None]:
# Mean of the per-fold validation AUCs, kept as a NumPy scalar.
fold_auc_array = np.asarray(fold_aucs)
avg_cv_auc = fold_auc_array.mean()
print(f"Validation AUC: {avg_cv_auc:.4f}")

Validation AUC: 0.9335


In [None]:
# Collapse the per-fold test predictions (one row per fold after stacking)
# into a single element-wise average across folds.
preds = np.vstack(preds).mean(axis=0)

In [None]:
# Fill the submission template's 'generated' column with the fold-averaged
# predictions and write the result back to Drive without the index column.
submission_path = '/content/drive/MyDrive/LLM/koelectra-base-discriminator_epoch4.csv'
sample_submission = pd.read_csv(
    '/content/drive/MyDrive/LLM/sample_submission.csv', encoding='utf-8-sig'
).assign(generated=preds)
sample_submission.to_csv(submission_path, index=False)