# 사전 준비

In [None]:
!pip install transformers
!pip install Korpora

**한국어-영어 병렬 말뭉치 데이터 불러오기**

In [2]:
from Korpora import Korpora

corpus = Korpora.load("korean_parallel_koen_news")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : KakaoBrain
    Repository : https://github.com/jungyeul/korean-parallel-corpora
    References :
        - Jungyeul Park, Jeen-Pyo Hong and Jeong-Won Cha (2016) Korean Language Resources for Everyone.
          In Proceedings of the 30th Pacific Asia Conference on Language, Information and Computation
          (PACLIC 30). October 28 - 30, 2016. Seoul, Korea. 
          (https://www.aclweb.org/anthology/Y16-2002/)

    # License
    Creative Commons Attribution Noncommercial No-Derivative-Works 3.0
    Details in https://creativecommons.org/licenses/by-nc-nd/3.0/



[korean_parallel] download korean-english-park.train.tar.gz: 8.72MB [00:00, 97.3MB/s]


decompress /root/Korpora/korean_parallel/korean-english-park.train.tar.gz


[korean_parallel] download korean-english-park.dev.tar.gz: 115kB [00:00, 2.81MB/s]


decompress /root/Korpora/korean_parallel/korean-english-park.dev.tar.gz


[korean_parallel] download korean-english-park.test.tar.gz: 238kB [00:00, 4.55MB/s]

decompress /root/Korpora/korean_parallel/korean-english-park.test.tar.gz





In [3]:
print(f"train data 구성: {corpus.train[0]}\ntrain data 길이: {len(corpus.train)}\n")    # 94123 = 61*1543, batch_size=16, 32 나머지 11/batch_size=64 나머지 43
print(f"dev data 구성: {corpus.dev[0]}\ndev data 길이: {len(corpus.dev)}\n")            # 1000 = 2^3*5^3, batch_size=8
print(f"test data 구성: {corpus.test[0]}\ntest data 길이: {len(corpus.test)}\n")        # 2000 = 2^4*5^3, batch_size=8

train data 구성: SentencePair(text='개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"', pair='Much of personal computing is about "can you top this?"')
train data 길이: 94123

dev data 구성: SentencePair(text='세계 에서 가장 강력한 수퍼컴퓨터를 1년 동안이나 독점해 오던 정부의 연구학자들이 이 1억1천만 달러 짜리 경이로운 설비를 공개하며, 이것이 핵전쟁으로부터 세계를 구하는데 도움이 될지도 모른다고 말했다.', pair="After keeping the world's most powerful supercomputer to themselves for a year, government researchers showed off the $110 million wonder and said it might help save the world from nuclear war.")
dev data 길이: 1000

test data 구성: SentencePair(text='토론에 참여한 사람들은 법 집행과 국가 안전보장에 대한 우려를 표명해야 할 필요성을 진지하게 받아 들이고 있습니다.', pair='Those involved in the discussions do take seriously the need to address concerns of law enforcement and national security.')
test data 길이: 2000



**KoBART 모델과 Tokenizer 불러오기**

In [4]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v2')
model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-base-v2')

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/496M [00:00<?, ?B/s]

# 데이터 구축

**데이터셋 구축**

In [5]:
import numpy as np
from torch.utils.data import Dataset

class TranslateSet(Dataset):
    def __init__(self, docs, tokenizer, max_len, ignore_id = -100):
        super().__init__()
        self.docs = docs
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.bos_id = tokenizer.bos_token_id
        self.eos_id = tokenizer.eos_token_id
        self.pad_id = tokenizer.pad_token_id
        self.ignore_id = ignore_id      # BartForConditionalGeneration의 labels 입력 index에 포함

    # padding 및 데이터 size 일치화 함수 (input_ids)
    def add_padding(self, inputs):
        if len(inputs) < self.max_len:
            pad = np.array([self.pad_id]*(self.max_len - len(inputs)))
            inputs = np.concatenate([inputs, pad])
        else:
            inputs = inputs[:self.max_len]
            
        return inputs

    # padding(ignore_id) 및 데이터 size 일치화 함수 (labels)
    def add_ignored(self, inputs):
        if len(inputs) < self.max_len:
            ignored = np.array([self.ignore_id]*(self.max_len - len(inputs)))
            inputs = np.concatenate([inputs, ignored])
        else:
            inputs = inputs[:self.max_len]

        return inputs

    def __len__(self):  
        return len(self.docs)

    def __getitem__(self, idx):
        instance = self.docs[idx]

        input_ids = self.tokenizer.encode(instance.text)
        input_ids = self.add_padding(input_ids)

        labels = self.tokenizer.encode(instance.pair)
        labels.append(self.eos_id)

        dec_input_ids = [self.eos_id]       # BART 디코더 입력의 시작은 eos토큰의 인덱스로 시작
        dec_input_ids += labels[:-1]
        dec_input_ids = self.add_padding(dec_input_ids)

        labels = self.add_ignored(labels)

        return {'input_ids': np.array(input_ids, dtype=np.int_),
                'decoder_input_ids': np.array(dec_input_ids, dtype=np.int_),
                'labels': np.array(labels, dtype=np.int_)}

In [6]:
train_set = TranslateSet(corpus.train, tokenizer, max_len=128)       # text의 평균 길이는 60.78, pair의 평균 길이는 126.1
val_set = TranslateSet(corpus.dev, tokenizer, max_len=128)
test_set = TranslateSet(corpus.test, tokenizer, max_len=128)

**데이터로더 구축**

In [7]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_set, shuffle=False, batch_size=32)
val_dataloader = DataLoader(val_set, shuffle=False, batch_size=8)
test_dataloader = DataLoader(test_set, shuffle=False, batch_size=8)

In [8]:
next(iter(train_dataloader))

{'input_ids': tensor([[15006, 11908, 17409,  ...,     3,     3,     3],
         [14537, 14354, 10496,  ...,     3,     3,     3],
         [14381, 17553, 14638,  ...,     3,     3,     3],
         ...,
         [23081,  9879, 13714,  ...,     3,     3,     3],
         [29654, 14987,     3,  ...,     3,     3,     3],
         [21126, 21639, 14364,  ...,     3,     3,     3]]),
 'decoder_input_ids': tensor([[    1, 14759,   316,  ...,     3,     3,     3],
         [    1, 18482,   310,  ...,     3,     3,     3],
         [    1, 14676,   304,  ...,     3,     3,     3],
         ...,
         [    1, 14128,   272,  ...,     3,     3,     3],
         [    1, 15073,   310,  ...,     3,     3,     3],
         [    1, 14603,   315,  ...,     3,     3,     3]]),
 'labels': tensor([[14759,   316, 17426,  ...,  -100,  -100,  -100],
         [18482,   310, 22911,  ...,  -100,  -100,  -100],
         [14676,   304,   306,  ...,  -100,  -100,  -100],
         ...,
         [14128,   272,  

# 모델 학습

**모델 파라미터 설정**

In [9]:
import torch

# GPU 가속을 사용할 수 있으면 device를 cuda로 설정하고, 아니면 cpu로 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 3
learning_rate = 5e-4

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

step = 0
eval_steps = len(train_dataloader)

In [10]:
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

**KoBART 학습 진행**

In [11]:
from tqdm.auto import tqdm as tqdm_auto

for epoch in range(num_epochs):
    loss = 0
    train_loss = 0.0
    
    model.train()
    for batch in tqdm_auto(train_dataloader, mininterval=0.01, leave=True):
        optimizer.zero_grad()     # 그래디언트 초기화

        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)   # 모형으로 결과 예측

        loss = outputs.loss
        train_loss += loss.item()
        
        loss.backward()
        optimizer.step()

        step += 1
        if step % eval_steps == 0:  # eval_steps 마다 loss를 출력

            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                model.eval()        # 평가모드로 전환

                for val_batch in tqdm_auto(val_dataloader, mininterval=0.01, leave=True):

                    # 배치를 GPU로 복사
                    inputs = {k: v.to(device) for k, v in batch.items()}
                    val_outputs = model(**inputs)     # 모형으로 결과 예측

                    loss = outputs.loss 
                    val_loss += loss.item()

                avg_val_loss = val_loss / len(val_dataloader)

            print('Step %d, validation loss: %.4f' % (step, avg_val_loss))
            
        avg_train_loss = train_loss / len(train_dataloader)

    print('epoch %d, train loss: %.4f \n' % (epoch, avg_train_loss))

  0%|          | 0/2942 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Step 2942, validation loss: 1.9034
epoch 0, train loss: 2.3681 



  0%|          | 0/2942 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Step 5884, validation loss: 1.7559
epoch 1, train loss: 2.0494 



  0%|          | 0/2942 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Step 8826, validation loss: 1.6034
epoch 2, train loss: 1.9528 



In [32]:
tokenizer.decode([21582, 18509, 300, 21514, 16556, 1700, 17223, 298, 
                  16651, 16802, 314, 17254, 313, 17762, 21235, 17065, 
                  15562, 16884, 15585, 1700, 17005, 16805, 314])

'hide new secretions from the parental units'