In [2]:
import ast
import random

import pandas as pd
import pytorch_lightning as pl
import torch
import transformers
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from tqdm.auto import tqdm

In [20]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over already-tokenized encoder / decoder sequences.

    Each constructor argument is a mapping that is indexed with ``'input_ids'``
    (and, for the encoder and decoder inputs, ``'attention_mask'``) and then
    with the sample index.
    """

    def __init__(self, encoder_inputs, decoder_inputs, decoder_targets):
        self.encoder_inputs = encoder_inputs
        self.decoder_inputs = decoder_inputs
        self.decoder_targets = decoder_targets

    def __len__(self):
        """Return the number of samples, taken from the encoder ``input_ids``."""
        encoder_ids = self.encoder_inputs['input_ids']
        return len(encoder_ids)

    def __getitem__(self, idx):
        """Return the five per-sample fields for sample ``idx`` as a dict."""
        sample = {}
        sample["encoder_input_ids"] = self.encoder_inputs['input_ids'][idx]
        sample["encoder_attention_mask"] = self.encoder_inputs['attention_mask'][idx]
        sample["decoder_input_ids"] = self.decoder_inputs['input_ids'][idx]
        sample["decoder_input_attention_mask"] = self.decoder_inputs['attention_mask'][idx]
        sample["decoder_target_ids"] = self.decoder_targets['input_ids'][idx]
        return sample
    
class Dataloader(pl.LightningDataModule):
    """Lightning data module that turns dialogue/persona CSV files into tokenized datasets.

    Every CSV is expected to have a ``session_dialog`` and a ``session_persona``
    column whose cells are string representations of Python lists of strings.

    Args:
        model_name: Hugging Face model id used to load the tokenizer.
        batch_size: Batch size for every dataloader.
        shuffle: Whether the training dataloader shuffles its samples.
        train_path, val_path, test_path, predict_path: CSV file paths.
        encoder_max_len: Padded/truncated token length of the dialogue input.
        decoder_max_len: Padded/truncated token length of the persona sequences.
    """

    # Prefix put in front of the decoder input; the target is the same text without it.
    _DECODER_PREFIX = '<s> '

    def __init__(self, model_name, batch_size, shuffle, train_path, val_path, test_path, predict_path,
                 encoder_max_len=500, decoder_max_len=200):
        super().__init__()
        self.model_name = model_name
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.train_path = train_path
        self.val_path = val_path
        self.test_path = test_path
        self.predict_path = predict_path

        self.encoder_max_len = encoder_max_len
        self.decoder_max_len = decoder_max_len

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)

        # Register the dialogue markers first, then the decoder start token.
        special_tokens_dict = {'additional_special_tokens': ['[BOS]', '[SEP]']}
        self.tokenizer.add_special_tokens(special_tokens_dict)
        self.tokenizer.add_special_tokens({'bos_token': '<s>'})

    def tokenizing(self, series, max_len):
        """Tokenize a pandas Series of strings, padded/truncated to ``max_len``.

        Returns the tokenizer output (``input_ids`` and ``attention_mask``) as
        PyTorch tensors.
        """
        return self.tokenizer(series.tolist(), padding='max_length', max_length=max_len,
                              return_tensors="pt", truncation=True)

    def preprocessing(self, dataframe):
        """Build the encoder inputs, decoder inputs and decoder targets for a frame.

        The list-valued cells are parsed with ``ast.literal_eval`` (never
        ``eval``) so that CSV content cannot execute code.  The input frame is
        not modified, so the method can be called repeatedly on the same frame.

        Returns:
            Tuple ``(encoder_inputs, decoder_inputs, decoder_targets)`` of
            tokenizer outputs.
        """
        # ['A', 'B', 'C'] -> '[BOS] A [SEP] B [SEP] C </s>'
        dialogs = dataframe['session_dialog'].apply(
            lambda raw: '[BOS] ' + ' [SEP] '.join(ast.literal_eval(raw)) + ' </s>')
        # ['A.', 'B.', 'C.'] -> '<s> A,B,C. </s>'
        personas = dataframe['session_persona'].apply(
            lambda raw: self._DECODER_PREFIX + ','.join(ast.literal_eval(raw)).replace('.', '') + '.' + ' </s>')
        # Target = decoder input without the leading '<s> ' (i.e. shifted by one token).
        persona_targets = personas.apply(lambda text: text[len(self._DECODER_PREFIX):])

        encoder_inputs = self.tokenizing(dialogs, max_len=self.encoder_max_len)
        decoder_inputs = self.tokenizing(personas, max_len=self.decoder_max_len)
        decoder_targets = self.tokenizing(persona_targets, max_len=self.decoder_max_len)

        return encoder_inputs, decoder_inputs, decoder_targets

    def _load_dataset(self, csv_path):
        """Read one CSV file and wrap its tokenized contents in a ``Dataset``."""
        encoder_inputs, decoder_inputs, decoder_targets = self.preprocessing(pd.read_csv(csv_path))
        return Dataset(encoder_inputs, decoder_inputs, decoder_targets)

    def setup(self, stage='fit'):
        """Create the datasets required for ``stage``.

        * ``'fit'``      -> train and validation datasets
        * ``'validate'`` -> validation dataset only
        * ``'test'``     -> test dataset only
        * ``'predict'``  -> predict dataset only
        * anything else  -> test and predict datasets
        """
        if stage in ('fit', 'validate'):
            if stage == 'fit':
                self.train_dataset = self._load_dataset(self.train_path)
            self.val_dataset = self._load_dataset(self.val_path)
            return

        if stage != 'predict':
            self.test_dataset = self._load_dataset(self.test_path)
        if stage != 'test':
            self.predict_dataset = self._load_dataset(self.predict_path)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=self.shuffle)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size)

    def predict_dataloader(self):
        return torch.utils.data.DataLoader(self.predict_dataset, batch_size=self.batch_size)

# Experiment configuration: model identity, optimisation settings and CSV locations.
config = {
    "model_name": 'gogamza/kobart-base-v2',
    # Spelling kept as-is: the checkpoint filename loaded later in this notebook contains it.
    "model_detail": "kobart-baeline",

    "batch_size": 16,
    "shuffle": True,
    "learning_rate": 1e-5,
    "epoch": 10,

    "train_path": './data/train/train.csv',
    "dev_path": './data/val/validation.csv',
    "test_path": './data/val/validation.csv',
    # Was './data/val/validation_csv' (underscore instead of dot), which is not a CSV path.
    "predict_path": './data/val/validation.csv',
}

In [21]:
# Build the data module from the shared config (the validation CSV is stored under 'dev_path').
dataloader = Dataloader(
    model_name=config['model_name'],
    batch_size=config['batch_size'],
    shuffle=config['shuffle'],
    train_path=config['train_path'],
    val_path=config['dev_path'],
    test_path=config['test_path'],
    predict_path=config['predict_path'],
)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [22]:
# Tokenize the train and validation CSVs ('fit' is the default stage).
dataloader.setup(stage='fit')

In [13]:
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
import re

class Model(pl.LightningModule):
    """BART sequence-to-sequence model that generates a persona summary from a dialogue.

    Args:
        model_name: Hugging Face model id of the pretrained BART checkpoint.
        lr: Learning rate for the Adam optimizer.
        tokenizer: Tokenizer (with the extra special tokens already added); it
            is used to resize the embeddings, to decode sequences for the
            metrics and to pick the padding id ignored by the loss.
    """

    def __init__(self, model_name, lr, tokenizer):
        super().__init__()
        self.save_hyperparameters()

        self.model = transformers.BartForConditionalGeneration.from_pretrained(model_name, cache_dir='./model')
        self.model.config.decoder_start_token_id = tokenizer.bos_token_id
        # The tokenizer may contain tokens the pretrained embedding matrix does not have.
        self.model.resize_token_embeddings(len(tokenizer))

        self.tokenizer = tokenizer
        self.lr = lr

        # Padding positions of the target do not contribute to the loss.
        self.loss = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)

    def _decode_prediction(self, token_ids) -> str:
        """Decode one predicted id sequence and keep only the text before the first ``</s>``.

        Returns ``' '`` when no ``</s>`` was predicted or nothing remains, because
        the ROUGE scorer rejects empty hypotheses.
        """
        decoded = self.tokenizer.decode(token_ids)
        text, eos, _ = decoded.partition('</s>')
        if not eos:
            return ' '
        return text.strip() or ' '

    def _decode_target(self, token_ids) -> str:
        """Decode one target id sequence without special tokens (``' '`` if empty)."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=True).strip() or ' '

    def _decode_batch(self, logits, target):
        """Return ``(pred_strings, target_strings)`` for a batch.

        logits : (batch_size, decoder_max_len, vocab_size)
        target : (batch_size, decoder_max_len)
        """
        pred_ids = torch.argmax(logits, dim=-1)  # (batch_size, decoder_max_len)
        pred_strings = [self._decode_prediction(pred_ids[row]) for row in range(pred_ids.shape[0])]
        target_strings = [self._decode_target(target[row]) for row in range(pred_ids.shape[0])]
        return pred_strings, target_strings

    def compute_rouge_score(self, logits, target) -> dict:
        """Average ROUGE-1 / ROUGE-2 / ROUGE-L over the batch.

        Returns a dict of the form
        ``{'rouge-1': {'f': .., 'p': .., 'r': ..}, 'rouge-2': {...}, 'rouge-l': {...}}``.
        """
        pred_strings, target_strings = self._decode_batch(logits, target)
        return Rouge().get_scores(pred_strings, target_strings, avg=True)

    def compute_bleu_score(self, logits, target) -> float:
        """Average unigram (BLEU-1) sentence score over the batch.

        ``sentence_bleu`` expects a list of tokenized references and a tokenized
        hypothesis, so both strings are split on whitespace before scoring.
        """
        pred_strings, target_strings = self._decode_batch(logits, target)

        total_bleu_score = 0
        for pred_string, target_string in zip(pred_strings, target_strings):
            total_bleu_score += sentence_bleu([target_string.split()], pred_string.split(), weights=(1, 0, 0, 0))

        return total_bleu_score / len(pred_strings)

    def _sequence_loss(self, logits, target_ids):
        """Token-level cross entropy between the logits and the target ids."""
        return self.loss(logits.reshape(-1, logits.shape[-1]), target_ids.reshape(-1))

    def _log_metrics(self, rouge_score, bleu_score, prefix=''):
        """Log rounded ROUGE and BLEU values under ``<prefix><metric>`` names."""
        metrics = {}
        for variant in ('rouge-1', 'rouge-2', 'rouge-l'):
            for stat_key, stat_name in (('r', 'recall'), ('p', 'precision'), ('f', 'f1')):
                metrics[f"{prefix}{variant}-{stat_name}"] = round(rouge_score[variant][stat_key], 3)
        metrics[f"{prefix}bleu_avg"] = round(bleu_score, 3)
        self.log_dict(metrics)

    def forward(self, **x):
        '''
        Run the encoder-decoder with teacher forcing.

        x = {"encoder_input_ids" : (batch_size, encoder_max_len),
             "encoder_attention_mask" : (batch_size, encoder_max_len),
             "decoder_input_ids" : (batch_size, decoder_max_len),
             "decoder_input_attention_mask" : (batch_size, decoder_max_len)}

        Extra keys (such as "decoder_target_ids") are accepted and ignored.
        '''

        outputs = self.model(input_ids=x['encoder_input_ids'], attention_mask=x['encoder_attention_mask'],
                             decoder_input_ids=x['decoder_input_ids'], decoder_attention_mask=x['decoder_input_attention_mask'])

        return outputs

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and metrics for one batch."""
        outputs = self(**batch)  # logits: (batch_size, decoder_max_len, vocab_size)
        loss = self._sequence_loss(outputs.logits, batch['decoder_target_ids'])
        self.log("train_loss", loss)

        rouge_score = self.compute_rouge_score(outputs.logits, batch['decoder_target_ids'])
        bleu_score = self.compute_bleu_score(outputs.logits, batch['decoder_target_ids'])
        self._log_metrics(rouge_score, bleu_score)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss and metrics for one batch."""
        outputs = self(**batch)
        # Score against the shifted targets, exactly like training_step does.
        loss = self._sequence_loss(outputs.logits, batch['decoder_target_ids'])
        self.log("val_loss", loss)

        rouge_score = self.compute_rouge_score(outputs.logits, batch['decoder_target_ids'])
        bleu_score = self.compute_bleu_score(outputs.logits, batch['decoder_target_ids'])
        self._log_metrics(rouge_score, bleu_score, prefix='val_')

        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch."""
        outputs = self(**batch)
        # forward() passes no labels to the model, so outputs.loss is None;
        # the loss has to be computed here from the logits.
        loss = self._sequence_loss(outputs.logits, batch['decoder_target_ids'])
        self.log("test_loss", loss)

        return loss

    def predict_step(self, batch, batch_idx):
        """Return the teacher-forced logits for one batch."""
        outputs = self(**batch)
        return outputs.logits

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

In [14]:
# Restore the fine-tuned weights; the constructor arguments are supplied explicitly.
model = Model.load_from_checkpoint(
    './checkpoints/gogamza/kobart-base-v2kobart-baeline-rouge-bleu-by-val_bleu_avg-v1.ckpt',
    model_name=config["model_name"],
    lr=config["learning_rate"],
    tokenizer=dataloader.tokenizer,
)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


## validation_predict.csv라는 이름으로 validation dataset에 대한 학습된 모델의 예측 결과를 저장
- column은 'id', 'session_dialog', 'predict_persona', 'session_persona'로 구성

In [8]:
## Load the validation dataset

val_df = pd.read_csv(config["dev_path"])

In [9]:
# Show the raw validation frame (columns: id, session_dialog, session_persona).
val_df

Unnamed: 0,id,session_dialog,session_persona
0,K2-07923-CL20177-CP22499-03-01-S2.json,"['벌써 6일이 지났네요. 그동안 어떻게 지내셨어요?', '운동 너무 좋죠! 저는 ...","['나는 중고차를 알아보려 한다.', '나는 첫차로 중고차를 사려고 한다.', '나..."
1,K2-20311-CL31505-CP51040-08-08-S2.json,"['안녕하세요 저는 30대 여성입니다.', '반갑습니다. 저는 동해에 거주해서 바닷...","['나는 30대 여성이다.', '나는 물 멍을 좋아한다. 나의 거주지는 동해이다.'..."
2,K2-26781-CL20066-CP22442-12-09-S2.json,"['오~ 재밌겠당. 어디로 캠핑 가셨는데요?', '재밌겠당.ㅎ 왜 놀러가서 화가 나...",['나는 글램핑을 안 해봤다.']
3,K2-25258-CL31722-CP52004-19-07-S2.json,"['안녕하세요 저는 30대 여성입니다.', '우와 멋져요~ 저는 사학과 전공했어요~...","['나는 30대 여성이다', '나는 사학과를 전공했다', '나는 주말에 청소를 하고..."
4,K2-00603-CL00969-CP04174-14-04-S2.json,"['안녕하세요. 저는 27살 경기도에 사는 남자입니다.', '오늘 1년만에 가족을 ...","['나는 27살 경기도에 사는 남자이다.', '1년 만에 가족을 만났다. 나는 바빠..."
...,...,...,...
8259,K2-32236-CL00289-CP01114-20-05-S2.json,"['20대 남자입니다! 반가워요 ㅎㅎ.', '황야의 아들들 이란 영화 아시나요? 제...","['나는 20대 남성이다.', '나는 황야의 아들들 이란 영화를 좋아한다.', '내..."
8260,K2-23009-CL13163-CP61924-08-09-S2.json,"['안녕하세요, 벌써 시간이 이렇게 지났네요! 이틀만이죠?', '네, 남녀공학인 학...","['나는 남녀공학 학교에 다녀서 이성 친구가 있다', '나는 얼른 자라서 결혼하고 ..."
8261,K2-28946-CL20593-CP22448-02-10-S2.json,"['안녕하세요? 1 시간 만인가요? 아까 대화 즐거웠어요.', '네. 근데 1 시간...","['나는 상대방에게 친근감을 느꼈다.', '요즘에는 주거 환경을 중요하게 생각한다...."
8262,K2-34376-CL21457-CP22584-17-01-S2.json,"['반갑습니다. 저는 화학부 전공하는 20대 여대생입니다.', '맞아요. 전 이과 ...","['나는 20대 여자이다. 나는 화학부 전공이다. 나는 대학생이다.', '나는 이과..."


In [48]:
# Use the GPU when one is available; fall back to the CPU instead of raising.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [51]:
## Generate predictions for the validation dataset and save them as a CSV

def predict_save_csv(model, dataloader, save_path, source_df=None):
    """Generate a persona for every validation sample and save it next to the source rows.

    Args:
        model: Module whose ``model`` attribute exposes ``generate``.
        dataloader: Data module providing ``val_dataloader()``, ``tokenizer``
            and ``val_path``.
        save_path: Destination CSV path.
        source_df: Frame the predictions are appended to as a
            ``predict_persona`` column.  Defaults to the CSV at
            ``dataloader.val_path``, so the function does not depend on
            notebook-global variables.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in ``source_df`` (the columns would be misaligned).
    """
    if source_df is None:
        source_df = pd.read_csv(dataloader.val_path)

    # Send the inputs to wherever the model lives instead of assuming 'cuda'.
    device = next(model.parameters()).device
    model.eval()

    preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader.val_dataloader()):
            outputs = model.model.generate(
                batch['encoder_input_ids'].to(device),
                # The inputs are padded to a fixed length; pass the mask explicitly.
                attention_mask=batch['encoder_attention_mask'].to(device),
                max_length=200,
                num_beams=4,
                early_stopping=True,
            )
            preds.extend(dataloader.tokenizer.decode(outputs[i], skip_special_tokens=True)
                         for i in range(outputs.shape[0]))

    if len(preds) != len(source_df):
        raise ValueError(
            f"Got {len(preds)} predictions for {len(source_df)} source rows; refusing to write misaligned output."
        )

    # Build the prediction column once instead of concatenating per batch.
    predict_df = pd.DataFrame({'predict_persona': preds})
    validation_predict_df = pd.concat([source_df.reset_index(drop=True), predict_df], axis=1)
    validation_predict_df.to_csv(save_path, index=False)

In [52]:
# Write the validation predictions next to the notebook.
predict_save_csv(model, dataloader, save_path='./validation_predict.csv')

100%|██████████| 517/517 [12:38<00:00,  1.47s/it]


In [53]:
# Reload the saved predictions and preview the first five rows.
validation_predict_df = pd.read_csv('./validation_predict.csv')
validation_predict_df.head(n=5)

Unnamed: 0,id,session_dialog,session_persona,predict_persona
0,K2-07923-CL20177-CP22499-03-01-S2.json,"['벌써 6일이 지났네요. 그동안 어떻게 지내셨어요?', '운동 너무 좋죠! 저는 ...","['나는 중고차를 알아보려 한다.', '나는 첫차로 중고차를 사려고 한다.', '나...","나는 중고 중고차를 알아볼 것이다,나는 중고차가 부담스럽다,나는 가을에 음악을 듣는..."
1,K2-20311-CL31505-CP51040-08-08-S2.json,"['안녕하세요 저는 30대 여성입니다.', '반갑습니다. 저는 동해에 거주해서 바닷...","['나는 30대 여성이다.', '나는 물 멍을 좋아한다. 나의 거주지는 동해이다.'...","나는 30대 여성이다,나는 동해에 거주한다,나는 가끔 만족스럽다,나는 단감을 자주 ..."
2,K2-26781-CL20066-CP22442-12-09-S2.json,"['오~ 재밌겠당. 어디로 캠핑 가셨는데요?', '재밌겠당.ㅎ 왜 놀러가서 화가 나...",['나는 글램핑을 안 해봤다.'],나는 글램핑을 해본 적이 없다.
3,K2-25258-CL31722-CP52004-19-07-S2.json,"['안녕하세요 저는 30대 여성입니다.', '우와 멋져요~ 저는 사학과 전공했어요~...","['나는 30대 여성이다', '나는 사학과를 전공했다', '나는 주말에 청소를 하고...","나는 30대 여성이다,나는 사학과 전공이다,나는 주말에 청소를 하고 치킨을 안 먹는..."
4,K2-00603-CL00969-CP04174-14-04-S2.json,"['안녕하세요. 저는 27살 경기도에 사는 남자입니다.', '오늘 1년만에 가족을 ...","['나는 27살 경기도에 사는 남자이다.', '1년 만에 가족을 만났다. 나는 바빠...","나는 27살 경기도에 사는 남자이다,나는 가족과 1년 만에 만난다,나는 일식을 즐겨..."
