In [None]:
!pip install transformers datasets torch

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset

# Load the question-answering model and its tokenizer from the same checkpoint.
# (The checkpoint named below is a KoELECTRA model, not KoE5.)
model_name = "monologg/koelectra-base-v3-finetuned-korquad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Load the KorQuAD 2.0 dataset.
# NOTE(review): confirm that 'korquad_v2' resolves to a dataset (Hub id or local
# path); KorQuAD 2.0 is presumably published on the Hugging Face Hub as
# "KorQuAD/squad_kor_v2" -- verify before running.
datasets = load_dataset('korquad_v2')

In [None]:
# Preprocessing: tokenize each (question, context) pair and attach token-level
# answer labels for extractive question answering.
def _first_answer(answer):
    """Return ``(start_char, text)`` for one answer record, or ``(None, "")`` if empty.

    Accepts a SQuAD-style record whose ``text`` / ``answer_start`` fields are
    lists (the first entry is used) as well as a record that stores a single
    string / integer.
    """
    text, start = answer["text"], answer["answer_start"]
    if isinstance(text, (list, tuple)):
        if not text:
            return None, ""
        return start[0], text[0]
    return start, text


def _token_span(offsets, sequence_ids, start_char, end_char):
    """Map the context character span ``[start_char, end_char)`` to token indices.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for one encoded example.
        sequence_ids: per-token sequence id (``1`` marks context tokens).
        start_char: character offset of the answer start inside the context.
        end_char: exclusive character offset of the answer end.

    Returns:
        ``(start_token, end_token)`` with both ends inclusive, or ``None`` when
        the answer is not fully covered by the context tokens that were kept.
    """
    context_tokens = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return None
    first, last = context_tokens[0], context_tokens[-1]

    # The answer must lie completely inside the (possibly truncated) context window.
    if offsets[first][0] > start_char or offsets[last][1] < end_char:
        return None

    # Last context token that starts at or before the answer start.
    start_token = first
    while start_token <= last and offsets[start_token][0] <= start_char:
        start_token += 1
    start_token -= 1

    # First context token that ends at or after the answer end.
    end_token = last
    while end_token >= first and offsets[end_token][1] >= end_char:
        end_token -= 1
    end_token += 1

    return start_token, end_token


def preprocess_function(examples, max_length=384):
    """Tokenize a batch of QA examples and compute token-level answer positions.

    Each example yields exactly one encoded row (no sliding window), so the
    function can be used with ``Dataset.map(..., batched=True)`` without
    removing the original columns.

    Args:
        examples: batch mapping with ``"question"`` and ``"context"`` lists and
            an ``"answers"`` column (falls back to ``"answer"`` if that is the
            column present). The answer column may be a list of per-example
            records or a single mapping of parallel ``"text"`` /
            ``"answer_start"`` lists.
        max_length: total token length every example is padded / truncated to.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added as inclusive token indices. Examples without an answer, or whose
        answer was truncated out of the context window, are labelled ``(0, 0)``,
        i.e. the first token of the sequence.
    """
    questions = [q.strip() for q in examples["question"]]
    encoded = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        # Only the context may be shortened; the question is always kept whole.
        truncation="only_second",
        padding="max_length",
        # Character offsets are required to turn answer_start into token indices.
        return_offsets_mapping=True,
    )
    # Offsets are only needed here; drop them so they do not become a dataset column.
    offset_mapping = encoded.pop("offset_mapping")

    answers = examples["answers"] if "answers" in examples else examples["answer"]
    if isinstance(answers, dict):
        # Mapping of parallel lists -> one record per example.
        answers = [
            {"text": text, "answer_start": start}
            for start, text in zip(answers["answer_start"], answers["text"])
        ]

    start_positions, end_positions = [], []
    for i, answer in enumerate(answers):
        start_char, text = _first_answer(answer)
        span = None
        if start_char is not None and text:
            span = _token_span(
                offset_mapping[i],
                encoded.sequence_ids(i),
                start_char,
                start_char + len(text),
            )
        start_token, end_token = span if span is not None else (0, 0)
        start_positions.append(start_token)
        end_positions.append(end_token)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

# Apply the preprocessing to the loaded dataset, passing batches of examples
# (rather than single rows) to preprocess_function.
tokenized_datasets = datasets.map(
    function=preprocess_function,
    batched=True,
)

In [None]:
# Training configuration.
# The keyword that schedules evaluation was renamed from `evaluation_strategy`
# to `eval_strategy` in newer transformers releases. Use whichever name the
# installed TrainingArguments dataclass declares so the cell runs on both.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",  # where checkpoints and logs are written
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

# Build the Trainer from the training arguments, the QA model, the tokenizer,
# and the "train" / "validation" splits of the tokenized dataset.
# NOTE(review): newer transformers releases appear to prefer `processing_class=`
# over `tokenizer=` -- confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Train the model with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate the model and print whatever trainer.evaluate() returns.
results = trainer.evaluate()
print(results)