# 사전 준비

In [None]:
!pip install SentencePiece
!pip install transformers
!pip install Korpora

In [None]:
from Korpora import Korpora

corpus = Korpora.load("korean_parallel_koen_news")

In [2]:
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="ko", tgt_lang="en")

epoch1_model = M2M100ForConditionalGeneration.from_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_M2M100_epoch1_lr=1e-4')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

In [None]:
epoch2_model = M2M100ForConditionalGeneration.from_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_M2M100_epoch2_lr=1e-4')

In [None]:
epoch3_model = M2M100ForConditionalGeneration.from_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_M2M100_epoch3_lr=1e-4')

# 테스트 데이터 구성

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset

class TranslateSet(Dataset):
    def __init__(self, docs, tokenizer, max_len, src_lang_code, tgt_lang_code, ignore_id = -100):
        super().__init__()
        self.docs = docs
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.src_lang_code = src_lang_code
        self.tgt_lang_code = tgt_lang_code
        self.eos = tokenizer.eos_token

        self.eos_id = tokenizer.eos_token_id
        self.pad_id = tokenizer.pad_token_id
        self.ignore_id = ignore_id

    # padding 및 데이터 size 일치화 함수 (input_ids)
    def add_padding(self, inputs):
        if len(inputs) < self.max_len:
            pad = np.array([self.pad_id] * (self.max_len - len(inputs)))
            inputs = np.concatenate([inputs, pad])
        else:
            inputs = inputs[:self.max_len]
            
        return inputs

    # padding(ignore_id) 및 데이터 size 일치화 함수 (labels)
    def add_ignored(self, inputs):
        if len(inputs) < self.max_len:
            ignored = np.array([self.ignore_id] * (self.max_len - len(inputs)))
            inputs = np.concatenate([inputs, ignored])
        else:
            inputs = inputs[:self.max_len]

        return inputs

    def __len__(self):  
        return len(self.docs)

    def __getitem__(self, idx):
        instance = self.docs[idx]

        src_text = instance.text.strip()
        tgt_text = instance.pair.strip()

        # [lang_code] X [eos]
        input_text = self.src_lang_code + src_text + self.eos
        input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
        input_ids = self.add_padding(input_ids)

        # [lang_code] X [eos]
        label_text = self.tgt_lang_code + tgt_text + self.eos
        labels = self.tokenizer.encode(label_text, add_special_tokens=False)

        decoder_input_ids = [self.eos_id] + labels[:-1]
        decoder_input_ids = self.add_padding(decoder_input_ids)

        labels = self.add_ignored(labels)

        return {'input_ids': np.array(input_ids, dtype=np.intc),
                'decoder_input_ids': np.array(decoder_input_ids, dtype=np.intc),
                'labels': np.array(labels, dtype=np.int_)}

In [None]:
from torch.utils.data import DataLoader

test_set = TranslateSet(corpus.test, tokenizer, src_lang_code='__ko__', tgt_lang_code='__en__', max_len=64)
test_dataloader = DataLoader(test_set, shuffle=False, num_workers=2, batch_size=8)

# 모델 평가

In [None]:
!pip install torchmetrics

In [None]:
from torchmetrics.text.rouge import ROUGEScore
from tqdm.auto import tqdm as tqdm_auto

def model_rouge(model, device, tokenizer=tokenizer, test_dataloader=test_dataloader):
    rouge = ROUGEScore()
    rouge_total = dict()

    rouge1_p = rouge1_r = rouge2_p = rouge2_r = rougeL_p = rougeL_r = 0.0
    
    model.to(device)
    model.eval()

    for batch in tqdm_auto(test_dataloader, mininterval=0.01, leave=True):

        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels']
        labels = labels.tolist()

        # labels의 ignore_id(-100)을 제거
        for i in range (0, len(labels)):
            while -100 in labels[i]:
                labels[i].remove(-100)
        
        with torch.no_grad():   # 학습 X (그래디언트 계산 X)
            outputs = model.generate(inputs['input_ids'], max_length=512)

            outputs = tokenizer.decode(outputs[0], skip_special_tokens=True)
            labels = tokenizer.decode(labels[0], skip_special_tokens=True)
            
            rouge_total = rouge(outputs, labels)
            rouge1_p += rouge_total['rouge1_precision'].item()
            rouge1_r += rouge_total['rouge1_recall'].item()
            rouge2_p += rouge_total['rouge2_precision'].item()
            rouge2_r += rouge_total['rouge2_recall'].item()
            rougeL_p += rouge_total['rougeL_precision'].item()
            rougeL_r += rouge_total['rougeL_recall'].item()

    print('ROUGE-1 precision: %.3f\n' % (rouge1_p/len(test_dataloader)))
    print('ROUGE-1 recall: %.3f\n' % (rouge1_r/len(test_dataloader)))
    print('ROUGE-2 precision: %.3f\n' % (rouge2_p/len(test_dataloader)))
    print('ROUGE-2 recall: %.3f\n' % (rouge2_r/len(test_dataloader)))
    print('ROUGE-L precision: %.3f\n' % (rougeL_p/len(test_dataloader)))
    print('ROUGE-L recall: %.3f\n' % (rougeL_r/len(test_dataloader)))

In [None]:
device_GPU = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model_rouge(epoch1_model, device_GPU)

  0%|          | 0/250 [00:00<?, ?it/s]

ROUGE-1 precision: 0.428

ROUGE-1 recall: 0.392

ROUGE-2 precision: 0.177

ROUGE-2 recall: 0.161

ROUGE-L precision: 0.353

ROUGE-L recall: 0.324



In [None]:
model_rouge(epoch2_model, device_GPU)

  0%|          | 0/250 [00:00<?, ?it/s]

ROUGE-1 precision: 0.428

ROUGE-1 recall: 0.392

ROUGE-2 precision: 0.177

ROUGE-2 recall: 0.161

ROUGE-L precision: 0.353

ROUGE-L recall: 0.324



In [None]:
model_rouge(epoch3_model, device_GPU)

  0%|          | 0/250 [00:00<?, ?it/s]

ROUGE-1 precision: 0.428

ROUGE-1 recall: 0.392

ROUGE-2 precision: 0.177

ROUGE-2 recall: 0.161

ROUGE-L precision: 0.353

ROUGE-L recall: 0.324



In [None]:
import gc 
gc.collect()
torch.cuda.empty_cache()

In [6]:
while 1:
    document = input("입력 > \n").strip()
    # quit 입력시 종료
    if document == "quit":
        break

    inputs = tokenizer(document, return_tensors="pt")

    output = epoch1_model.generate(inputs["input_ids"], num_beams=3, max_length=128)
    output = tokenizer.batch_decode(output, skip_special_tokens=True)

    print(f'결과 > \n{output}')

입력 > 
최근 자연어 처리에 관심이 높아지면서 정부와 기업은 물론 뜻있는 개인에 이르기까지 데이터를 무료로 공개하는 추세입니다.
결과 > 
['The government, and other people are getting the data free, as the current interest is increasing in the language process.']
입력 > 
하지만 데이터가 곳곳에 산재해 있다보니 품질 좋은 말뭉치임에도 그 존재조차 잘 알려지지 않은 경우가 많습니다.
결과 > 
["But it's often unknown even if it's a high-profile clown, as the data is out there."]
입력 > 
파일 포맷과 저장 형식 등이 각기 달라 사용이 쉽지 않습니다.
결과 > 
["It's not easy to use, different from file formats and storage forms."]
입력 > 
개별 사용자들은 다운로드나 전처리 코드를 그때그때 개발해서 써야 하는 수고로움이 있습니다.
결과 > 
['Individuals have a complaint for developing download code or pre-process code at the time.']
입력 > 
quit
