꿈틀에서 생성된 동화를 AI 모델에 학습시키는 코드입니다.

# DB에 접근해 동화 내용 가져오기

In [66]:
import pymysql
import json
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment, then read the four
# database settings in one pass. A variable that is not set yields None.
load_dotenv()
DB_HOST, DB_USERNAME, DB_PASSWORD, DB_DATABASE = (
    os.environ.get(key)
    for key in ('DB_HOST', 'DB_USERNAME', 'DB_PASSWORD', 'DB_DATABASE')
)

In [69]:
# Fetch the fairy tales created in the chosen period and flatten each one into
# a single training string.
PERIOD_START = '2024-07-01 00:00:00'  # adjust to the desired period
PERIOD_END = '2024-08-03 23:59:59'

# Defined before the query so that later cells still find a (possibly empty)
# list when the query or the JSON decoding fails.
train_texts = []

conn = pymysql.connect(host=DB_HOST, user=DB_USERNAME, password=DB_PASSWORD, db=DB_DATABASE, charset='utf8')

try:
    with conn.cursor() as cursor:
        # The period is passed as bound parameters rather than baked into the SQL text.
        sql = "SELECT content FROM fairytale WHERE created_at BETWEEN %s AND %s;"
        cursor.execute(sql, (PERIOD_START, PERIOD_END))
        result = cursor.fetchall()

    for datas in result:
        # Each row holds one JSON object; its values are joined in order,
        # separated by single spaces, to form the full story text.
        pages = json.loads(datas[0])
        if not pages:
            # An empty object contributes no story.
            continue
        fairy = ' '.join(f"{fairytale}" for fairytale in pages.values())
        train_texts.append(fairy.rstrip())
except Exception as err:
    # Best effort: report the failure (the message reads "An exception
    # occurred.") and continue with whatever was collected so far. Unlike a
    # bare `except:`, this lets KeyboardInterrupt / SystemExit propagate.
    print('예외가 발생했습니다.', err)
finally:
    # Always release the database connection.
    conn.close()

# 슬라이딩 윈도우 설정

In [None]:
import torch
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import re
import warnings
import random
warnings.filterwarnings('ignore')
from datasets import Dataset, DatasetDict
from torch.utils.data import Dataset as TorchDataset

In [None]:
# Sliding-window parameters
window_size = 10  # window size (number of sentences)
sliding_step = 5  # stride between consecutive windows (number of sentences)

def sliding_window(sentences, window_size, step):
    """Split a list of sentences into overlapping windows.

    Windows of ``window_size`` items are taken every ``step`` items from the
    start of the list. If those windows do not reach the end of the list
    (including the case where the list is shorter than one window), one extra
    window aligned to the end of the list is appended so that no trailing
    sentence is dropped.

    Args:
        sentences: List of sentences (any sliceable sequence of items).
        window_size: Number of items per window; must be positive.
        step: Offset between the starts of consecutive windows; must be positive.

    Returns:
        A list of windows, each a slice of ``sentences``. An empty input
        yields an empty list; an input shorter than ``window_size`` yields a
        single window holding every item.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0 or step <= 0:
        raise ValueError("window_size and step must be positive integers")

    total = len(sentences)
    windows = [
        sentences[start:start + window_size]
        for start in range(0, total - window_size + 1, step)
    ]

    # The regular stride ends exactly on the last sentence only when
    # (total - window_size) is a multiple of step. Checking `total % step`
    # instead is only correct when window_size itself is a multiple of step.
    tail_uncovered = total < window_size or (total - window_size) % step != 0
    if total and tail_uncovered:
        # For inputs shorter than window_size this slice is the whole list.
        windows.append(sentences[-window_size:])
    return windows


# Apply the sliding window to every story and, in the same pass, report the
# stories that are shorter than one full window.
all_story_windows = []

for i, story in enumerate(train_texts):
    # Split the story into sentences once; the result feeds both the
    # windowing and the length check below.
    sentences = sent_tokenize(story)

    # Build the windows for this story
    story_windows = sliding_window(sentences, window_size, sliding_step)
    all_story_windows.append(story_windows)

    # Flag stories that do not contain enough sentences for a full window.
    # NOTE(review): the message says such a story "cannot have any windows",
    # but sliding_window may still return one shorter window for it - confirm
    # which behavior is intended and align the wording.
    if len(sentences) < window_size:
        print(f"Story {i} has less than {window_size} sentences and cannot have any windows.")

# Rebuild one training string per story by concatenating the sentences of all
# of its windows in order. Sentences that appear in more than one window are
# repeated in the result.
all_new_story = []

for story_window in all_story_windows:
    flattened = [sentence for window in story_window for sentence in window]
    # Join with single spaces, turn embedded newlines into spaces and trim
    # the ends.
    all_new_story.append(' '.join(flattened).replace('\n', ' ').strip())

# The windowed stories replace the raw texts as the training corpus.
train_texts = all_new_story

# 모델 학습

In [None]:
import torch
import numpy as np
from transformers import Trainer, TrainingArguments, GPT2LMHeadModel, PreTrainedTokenizerFast, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader, SubsetRandomSampler
import wandb

wandb.login()

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Location of the previously trained Kkumteul (DreamTwist) model, relative to
# this file. __file__ is not defined when the code runs as notebook cells, so
# fall back to the current working directory in that case.
try:
    current_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_dir = os.getcwd()
model_path = os.path.join(current_dir, '../models/story_generator')

# Load from model_path (the original referenced an undefined `model_name`).
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)


# Load the tokenizer from the same directory as the model
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

# Register the special tokens used by the training data
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'bos_token': '<BOS>'})
tokenizer.add_special_tokens({'eos_token': '<EOS>'})
tokenizer.add_special_tokens({'sep_token': '<SEP>'})

# Resize the embedding matrix to match the tokenizer's vocabulary size after
# registering the special tokens.
model.resize_token_embeddings(len(tokenizer))

# Dataset wrapper for the Kkumteul (DreamTwist) fairy-tale texts
class DreamTwistDataset(torch.utils.data.Dataset):
    """Tokenised fairy-tale texts exposed as a map-style PyTorch dataset.

    All texts are tokenised once at construction time, with padding and
    truncation to ``max_length``. Items are returned as CPU tensors; moving
    batches to the training device is left to the training loop, so the class
    does not depend on a module-level ``device`` and its output can be pinned
    or batched by a DataLoader.

    Args:
        texts: List of strings to tokenise.
        tokenizer: Callable tokenizer returning a mapping of field name to
            per-example sequences (e.g. ``input_ids``).
        max_length: Maximum number of tokens kept per example.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=max_length)

    def __getitem__(self, idx):
        """Return the encoded fields of example ``idx`` as a dict of tensors."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.encodings['input_ids'])

# Maximum number of tokens per training example
max_length = 512

# Tokenised Kkumteul (DreamTwist) training set
train_dataset = DreamTwistDataset(train_texts, tokenizer, max_length)

# Language-modeling collator with masked-LM disabled; batches are padded to a
# multiple of max_length.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, pad_to_multiple_of=max_length
)

training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    overwrite_output_dir=True,
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=50,
    per_device_train_batch_size=1,
    learning_rate=5e-5,
    # Evaluation, logging and checkpointing cadence
    eval_strategy='epoch',
    logging_first_step=True,
    logging_steps=100,
    save_steps=10_000,
    save_total_limit=2,
)

# Build the trainer and run training. The training set is also passed as the
# evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
)

trainer.train()

# 모델 내보내기

In [None]:
# Save the model first, then the tokenizer, into the same output directory
output_dir = './trained_model/new_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)