# 사전 준비

In [None]:
!pip install transformers
!pip install Korpora

**한국어-영어 병렬 말뭉치 데이터 불러오기**

In [None]:
from Korpora import Korpora

corpus = Korpora.load("korean_parallel_koen_news")

In [3]:
print(f"train data 구성: {corpus.train[0]}\ntrain data 길이: {len(corpus.train)}\n")    # 94123 = 61*1543, batch_size=16, 32 나머지 11/batch_size=64 나머지 43
print(f"dev data 구성: {corpus.dev[0]}\ndev data 길이: {len(corpus.dev)}\n")            # 1000 = 2^3*5^3, batch_size=8
print(f"test data 구성: {corpus.test[0]}\ntest data 길이: {len(corpus.test)}\n")        # 2000 = 2^4*5^3, batch_size=8

train data 구성: SentencePair(text='개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"', pair='Much of personal computing is about "can you top this?"')
train data 길이: 94123

dev data 구성: SentencePair(text='세계 에서 가장 강력한 수퍼컴퓨터를 1년 동안이나 독점해 오던 정부의 연구학자들이 이 1억1천만 달러 짜리 경이로운 설비를 공개하며, 이것이 핵전쟁으로부터 세계를 구하는데 도움이 될지도 모른다고 말했다.', pair="After keeping the world's most powerful supercomputer to themselves for a year, government researchers showed off the $110 million wonder and said it might help save the world from nuclear war.")
dev data 길이: 1000

test data 구성: SentencePair(text='토론에 참여한 사람들은 법 집행과 국가 안전보장에 대한 우려를 표명해야 할 필요성을 진지하게 받아 들이고 있습니다.', pair='Those involved in the discussions do take seriously the need to address concerns of law enforcement and national security.')
test data 길이: 2000



**multilingual BART**

In [None]:
!pip install SentencePiece

In [5]:
from transformers import MBartForConditionalGeneration, PreTrainedTokenizerFast, MBartTokenizerFast

tokenizer_1 = MBartTokenizerFast.from_pretrained('facebook/mbart-large-cc25')
model_1 = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

# 데이터 구축

**데이터셋 구축**

In [6]:
import numpy as np
from torch.utils.data import Dataset

class TranslateSet(Dataset):
    def __init__(self, docs, tokenizer, max_len, ignore_id = -100):
        super().__init__()
        self.docs = docs
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.bos_id = tokenizer.bos_token_id
        self.eos_id = tokenizer.eos_token_id
        self.pad_id = tokenizer.pad_token_id
        self.ignore_id = ignore_id      # BartForConditionalGeneration의 labels 입력 index에 포함

    # padding 및 데이터 size 일치화 함수 (input_ids)
    def add_padding(self, inputs):
        if len(inputs) < self.max_len:
            pad = np.array([self.pad_id]*(self.max_len - len(inputs)))
            inputs = np.concatenate([inputs, pad])
        else:
            inputs = inputs[:self.max_len]
            
        return inputs

    # padding(ignore_id) 및 데이터 size 일치화 함수 (labels)
    def add_ignored(self, inputs):
        if len(inputs) < self.max_len:
            ignored = np.array([self.ignore_id]*(self.max_len - len(inputs)))
            inputs = np.concatenate([inputs, ignored])
        else:
            inputs = inputs[:self.max_len]

        return inputs

    def __len__(self):  
        return len(self.docs)

    def __getitem__(self, idx):
        instance = self.docs[idx]

        input_ids = self.tokenizer.encode(instance.text)
        input_ids = self.add_padding(input_ids)

        labels = self.tokenizer.encode(instance.pair)
        labels.append(self.eos_id)

        dec_input_ids = [self.eos_id]       # BART 디코더 입력의 시작은 eos토큰의 인덱스로 시작
        dec_input_ids += labels[:-1]
        dec_input_ids = self.add_padding(dec_input_ids)

        labels = self.add_ignored(labels)

        return {'input_ids': np.array(input_ids, dtype=np.int_),
                'decoder_input_ids': np.array(dec_input_ids, dtype=np.int_),
                'labels': np.array(labels, dtype=np.int_)}

In [7]:
train_set = TranslateSet(corpus.train, tokenizer_1, max_len=64)
val_set = TranslateSet(corpus.dev, tokenizer_1, max_len=64)
test_set = TranslateSet(corpus.test, tokenizer_1, max_len=64)

**데이터로더 구축**

In [8]:
import torch
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_set, shuffle=False, num_workers=2, batch_size=4)
val_dataloader = DataLoader(val_set, shuffle=False, num_workers=2, batch_size=8)
test_dataloader = DataLoader(test_set, shuffle=False, num_workers=2, batch_size=8)

In [None]:
next(iter(train_dataloader))

{'input_ids': tensor([[ 34988,   5358,  82362,  10993,    367, 184711,  50177,    697,     44,
             469,  32657,  21491, 179747,  25999,   1020,  27849, 140053,   2953,
               2, 250004,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1],
         [ 12624,  51173,   5779, 134894,   2020, 183767,  29770,   1504,  51173,
            5779, 134894,   1048,  32326,   3394,      6,  71106,      6, 249976,
             769,   9170, 134894,      6, 218231,    688,  37996,   1083,  66780,
           80823,      5,      2, 250004,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1, 

# 모델 학습

**모델 파라미터 설정**

In [9]:
# GPU 가속을 사용할 수 있으면 device를 cuda로 설정하고, 아니면 cpu로 설정
device_GPU = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 1
learning_rate = 5e-4

optimizer = torch.optim.AdamW(model_1.parameters(), lr=learning_rate)

step = 0
eval_steps = len(train_dataloader)

In [10]:
# 256MB이상의 블록 분할 불가, GPU 메모리 50%이상 사용시 블록 회수를 시작
!PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.5,max_split_size_mb:256

In [None]:
model_1.to(device_GPU)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250027, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(250027, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerN

**KoBART 학습 진행**

In [11]:
import gc 
gc.collect()
torch.cuda.empty_cache()

In [None]:
from tqdm.auto import tqdm as tqdm_auto

accumulation_steps = 16  # Gradient Accumulation을 위한 스텝 수 (batch_size=64의 효과?)

for epoch in range(num_epochs):
    train_loss = 0.0

    model_1.train()
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm_auto(train_dataloader, mininterval=0.01, leave=True)):

        inputs = {k: v.to(device_GPU) for k, v in batch.items()}
        outputs = model_1(**inputs)   # 모형으로 결과 예측

        loss = outputs.loss
        loss = loss / accumulation_steps  # 그래디언트 업데이트 스텝 수로 나누어줍니다.
        loss.backward()

        if (step + 1) % accumulation_steps == 0:  # 스텝 수만큼 그래디언트를 업데이트합니다.
            optimizer.step()
            optimizer.zero_grad()

        train_loss += loss.item()

        if (step + 1) % eval_steps == 0:  # eval_steps 마다 loss를 출력

            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                model_1.eval()        # 평가모드로 전환

                for val_batch in tqdm_auto(val_dataloader, mininterval=0.01, leave=True):

                    # 배치를 GPU로 복사
                    inputs = {k: v.to(device_GPU) for k, v in batch.items()}
                    val_outputs = model_1(**inputs)     # 모형으로 결과 예측

                    val_loss += outputs.loss.item()

                avg_val_loss = val_loss / len(val_dataloader)

            print('Step %d, validation loss: %.4f' % ((step + 1) // accumulation_steps, avg_val_loss))
            
        if (step + 1) % (len(train_dataloader) // accumulation_steps) == 0:     # 1 batch가 끝나면 gpu 캐시 비우기
            gc.collect()
            torch.cuda.empty_cache()

    if (step + 1) % accumulation_steps != 0:  # 남은 그래디언트를 업데이트
        optimizer.step()
        optimizer.zero_grad()

    avg_train_loss = (train_loss * accumulation_steps) / len(train_dataloader)
    print('epoch %d, train loss: %.4f \n' % (epoch, avg_train_loss))

    gc.collect()
    torch.cuda.empty_cache()

In [None]:
model_1.save_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_Model_epoch1')

2 epoch 부터 실행

In [12]:
from transformers import MBartForConditionalGeneration, PreTrainedTokenizerFast

def load_model():       # 저장 되어있는 모델 불러오기
    saved_model = MBartForConditionalGeneration.from_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_Model_epoch2')

    return saved_model

In [13]:
tuning_model = load_model()

In [14]:
import gc 
from tqdm.auto import tqdm as tqdm_auto

tuning_model.to(device_GPU)
accumulation_steps = 16  # Gradient Accumulation을 위한 스텝 수 (batch_size=64의 효과?)

for epoch in range(num_epochs):
    train_loss = 0.0

    tuning_model.train()
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm_auto(train_dataloader, mininterval=0.01, leave=True)):

        inputs = {k: v.to(device_GPU) for k, v in batch.items()}
        outputs = tuning_model(**inputs)   # 모형으로 결과 예측

        loss = outputs.loss
        loss = loss / accumulation_steps  # 그래디언트 업데이트 스텝 수로 나누어줍니다.
        loss.backward()

        if (step + 1) % accumulation_steps == 0:  # 스텝 수만큼 그래디언트를 업데이트합니다.
            optimizer.step()
            optimizer.zero_grad()

        train_loss += loss.item()

        if (step + 1) % eval_steps == 0:  # eval_steps 마다 loss를 출력

            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                tuning_model.eval()        # 평가모드로 전환

                for val_batch in tqdm_auto(val_dataloader, mininterval=0.01, leave=True):

                    # 배치를 GPU로 복사
                    inputs = {k: v.to(device_GPU) for k, v in batch.items()}
                    val_outputs = tuning_model(**inputs)     # 모형으로 결과 예측

                    val_loss += outputs.loss.item()

                avg_val_loss = val_loss / len(val_dataloader)

            print('Step %d, validation loss: %.4f' % ((step + 1) // accumulation_steps, avg_val_loss))
            
        if (step + 1) % (len(train_dataloader) // accumulation_steps) == 0:     # 1 batch가 끝나면 gpu 캐시 비우기
            gc.collect()
            torch.cuda.empty_cache()

    if (step + 1) % accumulation_steps != 0:  # 남은 그래디언트를 업데이트
        optimizer.step()
        optimizer.zero_grad()

    avg_train_loss = (train_loss * accumulation_steps) / len(train_dataloader)
    print('epoch %d, train loss: %.4f \n' % (epoch, avg_train_loss))

    gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/23531 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Step 1470, validation loss: 2.6399
epoch 0, train loss: 3.2642 



In [15]:
tuning_model.save_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_Model_epoch3')

In [None]:
# epoch1 train loss: 3.9408 / validation loss: 2.8911
# epoch2 train loss: 3.2639 / validation loss: 2.6594
# epoch3 train loss: 3.2642 / validation loss: 2.6399