In [None]:
!pip install pytorch-lightning

In [None]:
import argparse
import logging

import numpy as np
import pandas as pd
import torch
from pytorch_lightning import Trainer,LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader, Dataset
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

In [None]:
# Command-line options for the chatbot script. This cell only builds the
# parser; it does not call parse_args().
parser = argparse.ArgumentParser(description='Simsimi based on KoGPT-2')

# (flag, add_argument keyword options) pairs, registered in this order so the
# generated --help listing keeps the same sequence.
_CLI_OPTIONS = (
    ('--chat', dict(
        action='store_true',
        default=False,
        help='response generation on given user input',
    )),
    ('--sentiment', dict(
        type=str,
        default='0',
        help='sentiment for system. 0 is neutral, 1 is negative, 2 is positive.',
    )),
    ('--model_params', dict(
        type=str,
        default='model_chp/model_-last.ckpt',
        help='model binary for starting chat',
    )),
    ('--train', dict(
        action='store_true',
        default=False,
        help='for training',
    )),
)

for _flag, _options in _CLI_OPTIONS:
    parser.add_argument(_flag, **_options)

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

# Special tokens used to lay out one training sample. BOS and EOS are the same
# string, so a single "</s>" serves as both markers.
Q_TKN, A_TKN = "<usr>", "<sys>"
BOS = EOS = "</s>"
MASK, SENT = "<unused0>", "<unused1>"
PAD = "<pad>"

# Any user-defined token has to be listed in additional_special_tokens.
TOKENIZER = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token=BOS,
    eos_token=EOS,
    unk_token="<unk>",
    pad_token=PAD,
    mask_token=MASK,
    additional_special_tokens=[Q_TKN, A_TKN, SENT],
)

# Words that should each be handled as a single token.
# NOTE(review): these look like the narrative-stage and emotion label values of
# the dataset — confirm against map_story.csv.
new_tokens = [
    '결말', '도입', '발단', '위기', '전개', '절정',
    '경멸', '관심', '두려움', '미움', '분노', '슬픔', '중립', '행복',
]

# Register the extra words with the tokenizer.
TOKENIZER.add_tokens(new_tokens)

model = GPT2LMHeadModel.from_pretrained("skt/kogpt2-base-v2")

# Tokens were added above, so the embedding matrix is resized to the new
# tokenizer length.
model.resize_token_embeddings(len(TOKENIZER))

In [None]:
import pandas as pd
# Read the story dataset from the working directory, then display it.
df = pd.read_csv('map_story.csv')
df

In [None]:
# Add a column holding the next row's scene text, so every row carries the
# current scene and the one that follows it. The last row gets NaN.
# NOTE(review): the shift pairs consecutive rows regardless of which story they
# belong to — confirm that pairing across story boundaries is acceptable.
next_scene = df['scene_content'].shift(-1)
df['scene_content2'] = next_scene
df

In [None]:
# Keep only rows that have a following scene (the shift leaves NaN on at least
# the last row), then remove the two "Unnamed" columns — presumably index
# columns left over from earlier CSV saves.
# errors='ignore' keeps this cell working when the CSV does not contain those
# columns; without it, drop() raises KeyError.
df_1 = (
    df.dropna(subset=['scene_content2'])
      .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
)
df_1

In [None]:
# Rename the columns to the short names used downstream
# ('Q', 'A', 'stage', 'emotion').
df_2 = df_1.rename(
    columns={
        'scene_content': 'Q',
        'scene_content2': 'A',
        'narrative_stage': 'stage',
        'mapped_emotion': 'emotion',
    }
)
df_2

In [None]:
# Keep only the first 270000 rows (all columns).
df_3 = df_2.iloc[:270000]
df_3

In [None]:
class CharDataset(Dataset):
    """Turn rows of a context/response table into fixed-length LM samples.

    Each row must provide a ``Q`` (context) and an ``A`` (response) value. A
    sample is tokenized as ``<q_token> Q`` followed by ``<a_token> A <eos>``
    and padded to ``max_len`` positions.

    Args:
        chats: Table-like object supporting ``len()`` and ``.iloc[idx]``, where
            the selected row can be indexed with ``'Q'`` and ``'A'``.
        max_len: Number of token positions in every returned sample.
    """

    def __init__(self, chats, max_len=32):
        self._data = chats
        self.first = True  # the first sample fetched is logged once
        # Special tokens defined at module level next to the tokenizer.
        self.q_token = Q_TKN
        self.a_token = A_TKN
        self.sent_token = SENT
        self.bos = BOS
        self.eos = EOS
        self.mask = MASK
        self.pad = PAD
        self.max_len = max_len
        self.tokenizer = TOKENIZER

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self._data)

    def __getitem__(self, idx):
        """Build one training sample from row ``idx``.

        Returns:
            A tuple ``(token_ids, mask, labels_ids)``:

            * ``token_ids`` — list of ``max_len`` input ids (context tokens,
              then response tokens, then padding).
            * ``mask`` — ``np.ndarray`` of ``max_len`` values; 1 on response
              positions (the ones whose prediction is scored), 0 elsewhere.
            * ``labels_ids`` — list of ``max_len`` target ids. Context
              positions hold the mask-token id. Response targets are shifted
              one step left, so position ``i`` is scored against the input
              token at position ``i + 1``; the last scored position and all
              padding positions hold the pad id.
        """
        turn = self._data.iloc[idx]
        q = str(turn['Q'])
        a = str(turn['A'])
        # Only the previous and next utterance are used for now; stage and
        # emotion conditioning is intentionally left out of the prompt.
        q_toked = self.tokenizer.tokenize(self.q_token + q)
        q_len = len(q_toked)
        a_toked = self.tokenizer.tokenize(self.a_token + a + self.eos)
        a_len = len(a_toked)

        if q_len + a_len > self.max_len:
            a_len = self.max_len - q_len
            if a_len <= 0:
                # The context alone fills the window: keep only its last
                # max_len // 2 tokens so the response gets the other half.
                q_toked = q_toked[-(self.max_len // 2):]
                q_len = len(q_toked)
                a_len = self.max_len - q_len
                assert a_len > 0
            a_toked = a_toked[:a_len]
            a_len = len(a_toked)

        # labels = [mask, ..., mask, A_1, ..., A_n, <eos>]
        # The response part starts at a_toked[1], not a_toked[0]: the training
        # loop compares the logits at position i with labels[i] without any
        # shift, so the next-token target has to be stored at position i.
        # Using a_toked unshifted would train the model to echo its own input.
        labels = [self.mask] * q_len + a_toked[1:]

        if self.first:
            logging.info("contexts : {}".format(q))
            logging.info("toked ctx: {}".format(q_toked))
            logging.info("response : {}".format(a))
            logging.info("toked response : {}".format(a_toked))
            logging.info('labels {}'.format(labels))
            self.first = False

        mask = [0] * q_len + [1] * a_len + [0] * (self.max_len - q_len - a_len)

        pad_id = self.tokenizer.pad_token_id

        # Pad the targets up to max_len.
        labels_ids = self.tokenizer.convert_tokens_to_ids(labels)
        labels_ids += [pad_id] * (self.max_len - len(labels_ids))

        # Pad the model input up to max_len.
        token_ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)
        token_ids += [pad_id] * (self.max_len - len(token_ids))

        return (token_ids, np.array(mask), labels_ids)

In [None]:
def collate_fn(batch):
    """Stack per-sample fields into three batch tensors.

    Args:
        batch: Sequence of samples; elements 0, 1 and 2 of each sample are
            stacked separately (token ids, mask, labels as produced by the
            dataset in this notebook).

    Returns:
        Tuple of three ``torch.LongTensor`` objects, one row per sample, in the
        order (element 0, element 1, element 2).
    """
    def _stack(position):
        # Convert each sample's field to an array first, then stack the rows.
        return np.array([np.array(sample[position]) for sample in batch])

    token_rows, mask_rows, label_rows = _stack(0), _stack(1), _stack(2)
    return (torch.LongTensor(token_rows),
            torch.LongTensor(mask_rows),
            torch.LongTensor(label_rows))


In [None]:
import tqdm
from tqdm import tqdm

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_set = CharDataset(df_3, max_len=32)
train_dataloader = DataLoader(
    train_set,
    batch_size=16,
    num_workers=0,
    # NOTE(review): batches are drawn in table order; confirm that training
    # without shuffling is intended.
    shuffle=False,
    collate_fn=collate_fn,
)

model.to(device)
# from_pretrained() hands the model back in evaluation mode (dropout off), so
# training mode has to be switched on explicitly before fine-tuning.
model.train()

learning_rate = 5e-6
# reduction="none" keeps one loss value per position so the mask can be applied.
criterion = torch.nn.CrossEntropyLoss(reduction="none")
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

epoch = 3
# No longer needed by the loop below; kept so that any later cell referencing
# it still runs.
Sneg = -1e18

# Train for `epoch` passes, reporting the running average loss at every 20%.
for epoch_num in range(epoch):
    dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch_num}")
    total_loss = 0.0
    total_batches = 0
    total_steps = len(train_dataloader)
    # Number of steps in 20% of an epoch; at least 1 so the modulo below never
    # divides by zero when there are fewer than five batches.
    step_20_percent = max(1, total_steps // 5)

    for batch_idx, samples in enumerate(dataloader):
        optimizer.zero_grad()
        token_ids, mask, label = samples
        token_ids, mask, label = (token_ids.to(device),
                                  mask.to(device),
                                  label.to(device))

        # Forward pass: logits have the vocabulary on the last dimension.
        out = model(token_ids)
        out = out.logits

        # Per-position loss; CrossEntropyLoss expects the class dimension at
        # index 1, hence the transpose.
        loss = criterion(out.transpose(2, 1), label)

        # Average over the positions selected by the mask only. Masked-out
        # positions contribute exactly zero, so the reported value is the real
        # per-token loss; the gradient is the same as when the logits of those
        # positions are overwritten with a constant.
        avg_loss = (loss * mask).sum() / mask.sum().clamp(min=1)
        avg_loss.backward()
        optimizer.step()

        total_loss += avg_loss.item()
        total_batches += 1

        # Print the running average at 20%, 40%, 60%, 80% and 100%.
        if (batch_idx + 1) % step_20_percent == 0 or (batch_idx + 1) == total_steps:
            percent_complete = (batch_idx + 1) / total_steps * 100
            partial_loss = total_loss / total_batches
            print(f"Epoch {epoch_num} - {percent_complete:.0f}% Complete - Average Loss: {partial_loss:.4f}")

    # Final loss for the epoch (guarded against an empty data loader).
    epoch_loss = total_loss / max(total_batches, 1)
    print(f"Epoch {epoch_num} - Final Average Loss: {epoch_loss:.4f}")