In [None]:
!pip install transformers datasets

In [None]:
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from transformers import TrainerCallback

In [None]:
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

# Load the KoGPT2 checkpoint together with its tokenizer.
model_name = 'skt/kogpt2-base-v2'

# Special tokens handed to the tokenizer; padding reuses the end-of-sequence token.
special_tokens = {
    'bos_token': '</s>',
    'eos_token': '</s>',
    'unk_token': '<unk>',
    'pad_token': '</s>',
}
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name, **special_tokens)
model = GPT2LMHeadModel.from_pretrained(model_name)


In [None]:
from google.colab import files

# Ask the user to upload the corpus through the Colab file picker.
# (presumably this places data.txt in the working directory, since it is opened by name below)
uploaded = files.upload()

# Read data.txt (plain-text dataset).
with open("data.txt", "r", encoding="utf-8") as corpus_file:
    raw_data = corpus_file.read()

# Keep every non-blank line, with surrounding whitespace removed.
texts = []
for raw_line in raw_data.split("\n"):
    cleaned_line = raw_line.strip()
    if cleaned_line:
        texts.append(cleaned_line)

# Ordered 85% / 15% split: the leading lines train, the trailing lines test (no shuffling).
train_size = int(0.85 * len(texts))
train_texts, test_texts = texts[:train_size], texts[train_size:]

train_dataset = Dataset.from_dict({'text': train_texts})
test_dataset = Dataset.from_dict({'text': test_texts})


In [None]:
# Tokenization
def tokenize_function(examples):
    """Tokenize a batch of raw text lines for causal-LM fine-tuning.

    Args:
        examples: A batch mapping with a "text" key holding a list of strings
            (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an extra
        "labels" entry. Labels equal the input ids on real tokens and -100 on
        padding positions, so padding does not contribute to the loss.
    """
    # The pad token is the same token as EOS, so once padding is masked out the
    # model would never see an end-of-text target. Append one explicit EOS per
    # sample to keep that signal.
    texts_with_eos = [text + tokenizer.eos_token for text in examples["text"]]
    model_inputs = tokenizer(
        texts_with_eos,
        padding='max_length',
        max_length=512,
        truncation=True,
        return_attention_mask=True,
    )
    # -100 is the label value the language-model loss ignores. Without this,
    # the loss is averaged over up to 512 positions that are mostly padding.
    model_inputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Run the tokenizer over both splits, one batch of rows at a time.
map_options = {"batched": True}
tokenized_train = train_dataset.map(tokenize_function, **map_options)
tokenized_test = test_dataset.map(tokenize_function, **map_options)

In [None]:
# Fine-tuning stage 1: adapt the model to the plain-text corpus (no evaluation).
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    # No evaluation is the Trainer default, so the strategy is intentionally not
    # passed: the `evaluation_strategy` keyword was renamed to `eval_strategy`
    # in newer transformers releases and passing the old name can raise a
    # TypeError there. Omitting it works on every version.
    save_strategy="epoch",
    # Logging happens once per epoch; `logging_steps` would be ignored in this
    # mode, so it is not set.
    logging_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=2,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
)

trainer.train()

In [None]:
import json
from google.colab import files
from datasets import Dataset

# Ask the user to upload the QA file through the Colab file picker.
uploaded = files.upload()

# Load qadata.txt (question/answer dataset stored as JSON).
with open("qadata.txt", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

qa_dataset = Dataset.from_dict(qa_data)

# Fail early with a clear message if the columns the preprocessing step reads are absent.
missing_columns = {"question", "answer"} - set(qa_dataset.column_names)
if missing_columns:
    raise ValueError(f"qadata.txt is missing required columns: {sorted(missing_columns)}")

# Fixed seed so the train/test split (and therefore every reported loss) is
# reproducible across kernel restarts.
train_testsplit = qa_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_testsplit['train']
test_dataset = train_testsplit['test']


In [None]:
# Tokenization
def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for causal-LM fine-tuning.

    Each sample is laid out as ``question + EOS + answer + EOS`` so that the
    same ``question + EOS`` prefix can be used as the prompt at generation time
    and the model sees an explicit end-of-answer target.

    Args:
        examples: A batch mapping with "question" and "answer" keys, each a
            list of strings of equal length.

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an extra
        "labels" entry. Labels equal the input ids on real tokens and -100 on
        padding positions, so padding does not contribute to the loss.
    """
    eos = tokenizer.eos_token
    inputs = [q + eos + a + eos for q, a in zip(examples['question'], examples['answer'])]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
    )
    # -100 is the label value the language-model loss ignores. Without this,
    # training and eval losses are dominated by padding positions.
    model_inputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Apply the QA preprocessing to both splits, one batch of rows at a time.
qa_map_options = {"batched": True}
tokenized_train_dataset = train_dataset.map(preprocess_function, **qa_map_options)
tokenized_test_dataset = test_dataset.map(preprocess_function, **qa_map_options)


In [None]:
# Fine-tuning stage 2: train on the question/answer pairs, evaluating every epoch.
# NOTE(review): output_dir is the same "./results" used by stage 1, so both
# stages rotate checkpoints in one directory — confirm that is intended.
stage2_common_args = dict(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_strategy="epoch",
    # Logging happens once per epoch; `logging_steps` would be ignored in this
    # mode, so it is not set.
    logging_strategy="epoch",
    logging_dir="./logs",
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none",
)

# Newer transformers releases name this option `eval_strategy`; older ones only
# know `evaluation_strategy`. Try the current name first and fall back, so the
# cell runs regardless of which version the unpinned install resolved to.
try:
    training_args = TrainingArguments(eval_strategy="epoch", **stage2_common_args)
except TypeError:
    training_args = TrainingArguments(evaluation_strategy="epoch", **stage2_common_args)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()


In [None]:
# Sample questions used to spot-check the fine-tuned model.
test_questions = [
    "의무유급제도란 무엇인가요?",
    "제적이란 무엇인가요?",
    "유급학기의 계절수업 성적은 어떻게 되나요?",
    "의무유급 학기의 성적은 어떻게 되나요?",
    "수료/수여대상자가 의무유급대상이 될 경우 어떻게 되나요?",
    "성적은 어떻게 평가되나요?",
    "매 학기에 정기시험은 몇회 있나요?",
    "A등급의 비율은 얼마로 제한되나요?",
    "상대평가 대상에서 제외되는 대상은 누구인가요?",
    "성적 확인이 제한되는 경우는 어떤 것이 있나요?",
    "학사경고 사항은 어디에 기재되나요?",
    "학사경고자는 어떤 조치가 따르나요?",
]

# Make sure dropout is disabled: the model may still be in training mode after Trainer.train().
model.eval()

for question in test_questions:
    # Same "question + EOS" prompt layout that the QA fine-tuning data uses.
    encoded = tokenizer(question + tokenizer.eos_token, return_tensors='pt').to(model.device)
    prompt_length = encoded["input_ids"].shape[1]
    outputs = model.generate(
        input_ids=encoded["input_ids"],
        # Pad and EOS are the same token, so the mask cannot be inferred from
        # the ids; pass it (and the pad id) explicitly.
        attention_mask=encoded["attention_mask"],
        pad_token_id=tokenizer.pad_token_id,
        max_length=512,
        do_sample=True,
        top_k=50,
    )
    # Decode only the newly generated tokens instead of slicing the decoded
    # text by len(question), which breaks if decoding does not reproduce the
    # question character-for-character.
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    print(f"질문: {question}")
    print(f"답변: {answer}")
    print("-" * 50)



In [None]:
# Callback that collects loss values as the Trainer logs them.
class LossRecorder(TrainerCallback):
    """Accumulate training and evaluation losses together with their epoch.

    Attributes are parallel lists: ``train_losses[i]`` was logged at
    ``train_epochs[i]``, and likewise for the ``eval_*`` pair.
    """

    def __init__(self):
        self.train_losses, self.train_epochs = [], []
        self.eval_losses, self.eval_epochs = [], []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store 'loss' / 'eval_loss' from a log record, tagged with ``state.epoch``."""
        if logs is None:
            return
        # (log key, loss list, epoch list) — training first, then evaluation.
        targets = (
            ('loss', self.train_losses, self.train_epochs),
            ('eval_loss', self.eval_losses, self.eval_epochs),
        )
        for key, losses, epochs in targets:
            if key in logs:
                losses.append(logs[key])
                epochs.append(state.epoch)


loss_recorder = LossRecorder()

In [None]:
# Additional training: reuse the existing arguments with the epoch count raised to 10,
# and attach the loss recorder so the curves can be plotted afterwards.
training_args.num_train_epochs = 10

continued_run_options = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    callbacks=[loss_recorder],
)
trainer = Trainer(**continued_run_options)

trainer.train()



In [None]:
# Pull the recorded loss curves out of the callback.
train_epochs, train_losses = loss_recorder.train_epochs, loss_recorder.train_losses
eval_epochs, eval_losses = loss_recorder.eval_epochs, loss_recorder.eval_losses

import matplotlib.pyplot as plt

# One figure, one axes: training and validation loss against epoch.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_epochs, train_losses, label='Training Loss')
ax.plot(eval_epochs, eval_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Epochs')
ax.legend()
ax.grid(True)
plt.show()


# Load the untouched pretrained checkpoint as a baseline for comparison.
baseline_model = GPT2LMHeadModel.from_pretrained(model_name)

for question in test_questions:
    # Same "question + EOS" prompt layout used for the fine-tuned model.
    encoded = tokenizer(question + tokenizer.eos_token, return_tensors='pt').to(baseline_model.device)
    prompt_length = encoded["input_ids"].shape[1]
    outputs = baseline_model.generate(
        input_ids=encoded["input_ids"],
        # Pad and EOS are the same token, so the mask cannot be inferred from
        # the ids; pass it (and the pad id) explicitly.
        attention_mask=encoded["attention_mask"],
        pad_token_id=tokenizer.pad_token_id,
        max_length=50,
        do_sample=True,
        top_k=50,
    )
    # Decode only the newly generated tokens. The previous version removed
    # len(question) characters twice (once when building `answer`, once again
    # when printing), which cut off the beginning of every baseline answer.
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    print(f"[베이스라인] 질문: {question}")
    print(f"[베이스라인] 답변: {answer}")
    print("-" * 50)
