# 사전 준비

In [None]:
!pip install transformers
!pip install Korpora

**한국어-영어 병렬 말뭉치 데이터 불러오기**

In [None]:
from Korpora import Korpora

corpus = Korpora.load("korean_parallel_koen_news")

In [None]:
print(f"train data 구성: {corpus.train[0]}\ntrain data 길이: {len(corpus.train)}\n")    # 94123 = 61*1543, batch_size=16, 32 나머지 11/batch_size=64 나머지 43
print(f"dev data 구성: {corpus.dev[0]}\ndev data 길이: {len(corpus.dev)}\n")            # 1000 = 2^3*5^3, batch_size=8
print(f"test data 구성: {corpus.test[0]}\ntest data 길이: {len(corpus.test)}\n")        # 2000 = 2^4*5^3, batch_size=8

train data 구성: SentencePair(text='개인용 컴퓨터 사용의 상당 부분은 "이것보다 뛰어날 수 있느냐?"', pair='Much of personal computing is about "can you top this?"')
train data 길이: 94123

dev data 구성: SentencePair(text='세계 에서 가장 강력한 수퍼컴퓨터를 1년 동안이나 독점해 오던 정부의 연구학자들이 이 1억1천만 달러 짜리 경이로운 설비를 공개하며, 이것이 핵전쟁으로부터 세계를 구하는데 도움이 될지도 모른다고 말했다.', pair="After keeping the world's most powerful supercomputer to themselves for a year, government researchers showed off the $110 million wonder and said it might help save the world from nuclear war.")
dev data 길이: 1000

test data 구성: SentencePair(text='토론에 참여한 사람들은 법 집행과 국가 안전보장에 대한 우려를 표명해야 할 필요성을 진지하게 받아 들이고 있습니다.', pair='Those involved in the discussions do take seriously the need to address concerns of law enforcement and national security.')
test data 길이: 2000



**M2M100**

In [None]:
!pip install SentencePiece

In [4]:
from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="ko", tgt_lang="en")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

In [None]:
tokenizer.all_special_tokens

['<s>',
 '</s>',
 '<unk>',
 '<pad>',
 '__af__',
 '__am__',
 '__ar__',
 '__ast__',
 '__az__',
 '__ba__',
 '__be__',
 '__bg__',
 '__bn__',
 '__br__',
 '__bs__',
 '__ca__',
 '__ceb__',
 '__cs__',
 '__cy__',
 '__da__',
 '__de__',
 '__el__',
 '__en__',
 '__es__',
 '__et__',
 '__fa__',
 '__ff__',
 '__fi__',
 '__fr__',
 '__fy__',
 '__ga__',
 '__gd__',
 '__gl__',
 '__gu__',
 '__ha__',
 '__he__',
 '__hi__',
 '__hr__',
 '__ht__',
 '__hu__',
 '__hy__',
 '__id__',
 '__ig__',
 '__ilo__',
 '__is__',
 '__it__',
 '__ja__',
 '__jv__',
 '__ka__',
 '__kk__',
 '__km__',
 '__kn__',
 '__ko__',
 '__lb__',
 '__lg__',
 '__ln__',
 '__lo__',
 '__lt__',
 '__lv__',
 '__mg__',
 '__mk__',
 '__ml__',
 '__mn__',
 '__mr__',
 '__ms__',
 '__my__',
 '__ne__',
 '__nl__',
 '__no__',
 '__ns__',
 '__oc__',
 '__or__',
 '__pa__',
 '__pl__',
 '__ps__',
 '__pt__',
 '__ro__',
 '__ru__',
 '__sd__',
 '__si__',
 '__sk__',
 '__sl__',
 '__so__',
 '__sq__',
 '__sr__',
 '__ss__',
 '__su__',
 '__sv__',
 '__sw__',
 '__ta__',
 '__th__',
 '_

# 데이터 구축

**데이터셋 구축**

In [5]:
import numpy as np
from torch.utils.data import Dataset

class TranslateSet(Dataset):
    def __init__(self, docs, tokenizer, max_len, src_lang_code, tgt_lang_code, ignore_id = -100):
        super().__init__()
        self.docs = docs
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.src_lang_code = src_lang_code
        self.tgt_lang_code = tgt_lang_code
        self.eos = tokenizer.eos_token

        self.eos_id = tokenizer.eos_token_id
        self.pad_id = tokenizer.pad_token_id
        self.ignore_id = ignore_id

    # padding 및 데이터 size 일치화 함수 (input_ids)
    def add_padding(self, inputs):
        if len(inputs) < self.max_len:
            pad = np.array([self.pad_id] * (self.max_len - len(inputs)))
            inputs = np.concatenate([inputs, pad])
        else:
            inputs = inputs[:self.max_len]
            
        return inputs

    # padding(ignore_id) 및 데이터 size 일치화 함수 (labels)
    def add_ignored(self, inputs):
        if len(inputs) < self.max_len:
            ignored = np.array([self.ignore_id] * (self.max_len - len(inputs)))
            inputs = np.concatenate([inputs, ignored])
        else:
            inputs = inputs[:self.max_len]

        return inputs

    def __len__(self):  
        return len(self.docs)

    def __getitem__(self, idx):
        instance = self.docs[idx]

        src_text = instance.text.strip()
        tgt_text = instance.pair.strip()

        # [lang_code] X [eos]
        input_text = self.src_lang_code + src_text + self.eos
        input_ids = self.tokenizer.encode(input_text, add_special_tokens=False)
        input_ids = self.add_padding(input_ids)

        # [lang_code] X [eos]
        label_text = self.tgt_lang_code + tgt_text + self.eos
        labels = self.tokenizer.encode(label_text, add_special_tokens=False)

        decoder_input_ids = [self.eos_id] + labels[:-1]
        decoder_input_ids = self.add_padding(decoder_input_ids)

        labels = self.add_ignored(labels)

        return {'input_ids': np.array(input_ids, dtype=np.intc),
                'decoder_input_ids': np.array(decoder_input_ids, dtype=np.intc),
                'labels': np.array(labels, dtype=np.int_)}

In [6]:
train_set = TranslateSet(corpus.train, tokenizer, src_lang_code='__ko__', tgt_lang_code='__en__', max_len=64)
val_set = TranslateSet(corpus.dev, tokenizer, src_lang_code='__ko__', tgt_lang_code='__en__', max_len=64)
test_set = TranslateSet(corpus.test, tokenizer, src_lang_code='__ko__', tgt_lang_code='__en__', max_len=64)

**데이터로더 구축**

In [7]:
import torch
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_set, shuffle=False, num_workers=2, batch_size=8)
val_dataloader = DataLoader(val_set, shuffle=False, num_workers=2, batch_size=8)
test_dataloader = DataLoader(test_set, shuffle=False, num_workers=2, batch_size=8)

In [None]:
next(iter(train_dataloader))

{'input_ids': tensor([[128052,  46988,   5597, 102345,  12286,    526,   8805,   8900,  53784,
            1028,     33,    522,  42485,  28462, 108895,   2254,  29822,   1480,
           14399,  69986,  49178,   7858,      2,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1,      1,      1,
               1],
         [128052,  19916,  41724,   7805,  85866,   2992,   7710,  41115,  43128,
           26581,   1858,  41724,   7805,  85866,   1384,  37938,   3963,  11149,
             668,  75771,   1029,   7710,  85866,  37273,  98482,  32479,   1406,
           77392,  34695,   2273,      5,      2,      1,      1,      1,      1,
               1,      1,      1,      1,      1,      1,      1, 

# 모델 학습

**모델 파라미터 설정**

In [8]:
# GPU 가속을 사용할 수 있으면 device를 cuda로 설정하고, 아니면 cpu로 설정
device_GPU = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 1
learning_rate = 1e-4

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

step = 0
eval_steps = len(train_dataloader)

In [9]:
# 256MB이상의 블록 분할 불가, GPU 메모리 50%이상 사용시 블록 회수를 시작
!PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.5,max_split_size_mb:256

**1epoch 학습 진행**

In [10]:
import gc 
gc.collect()
torch.cuda.empty_cache()

In [None]:
from tqdm.auto import tqdm as tqdm_auto

model.to(device_GPU)
accumulation_steps = 8  # Gradient Accumulation을 위한 스텝 수 (batch_size=64의 효과?)

for epoch in range(num_epochs):
    train_loss = 0.0

    model.train()
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm_auto(train_dataloader, mininterval=0.01, leave=True)):

        inputs = {k: v.to(device_GPU) for k, v in batch.items()}
        outputs = model(**inputs)   # 모형으로 결과 예측

        loss = outputs.loss
        loss = loss / accumulation_steps  # 그래디언트 업데이트 스텝 수로 나누어줍니다.
        loss.backward()

        if (step + 1) % accumulation_steps == 0:  # 스텝 수만큼 그래디언트를 업데이트합니다.
            optimizer.step()
            optimizer.zero_grad()

        train_loss += loss.item()

        if (step + 1) % eval_steps == 0:  # eval_steps 마다 loss를 출력

            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                model.eval()        # 평가모드로 전환

                for val_batch in tqdm_auto(val_dataloader, mininterval=0.01, leave=True):

                    # 배치를 GPU로 복사
                    inputs = {k: v.to(device_GPU) for k, v in batch.items()}
                    val_outputs = model(**inputs)     # 모형으로 결과 예측

                    val_loss += outputs.loss.item()

                avg_val_loss = val_loss / len(val_dataloader)

            print('Step %d, validation loss: %.4f' % ((step + 1) // accumulation_steps, avg_val_loss))

        del loss
        del inputs
        del outputs
        gc.collect()
        torch.cuda.empty_cache()

    avg_train_loss = (train_loss * accumulation_steps) / len(train_dataloader)
    print('epoch %d, train loss: %.4f \n' % (epoch, avg_train_loss))

    gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/11766 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Step 1470, validation loss: 2.4386
epoch 0, train loss: 2.4775 



In [None]:
model.save_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_M2M100_epoch1_lr=1e-4')

2 epoch 부터 실행

In [11]:
from transformers import MBartForConditionalGeneration, PreTrainedTokenizerFast

def load_model():       # 저장 되어있는 모델 불러오기
    saved_model = MBartForConditionalGeneration.from_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_M2M100_epoch1_lr=1e-4')

    return saved_model

In [None]:
tuning_model = load_model()

In [14]:
from tqdm.auto import tqdm as tqdm_auto

tuning_model.to(device_GPU)
accumulation_steps = 8  # Gradient Accumulation을 위한 스텝 수 (batch_size=64의 효과?)

for epoch in range(num_epochs):
    train_loss = 0.0

    tuning_model.train()
    optimizer.zero_grad()
    for step, batch in enumerate(tqdm_auto(train_dataloader, mininterval=0.01, leave=True)):

        inputs = {k: v.to(device_GPU) for k, v in batch.items()}
        outputs = tuning_model(**inputs)   # 모형으로 결과 예측

        loss = outputs.loss
        loss = loss / accumulation_steps  # 그래디언트 업데이트 스텝 수로 나누어줍니다.
        loss.backward()

        if (step + 1) % accumulation_steps == 0:  # Gradient Accumulation 스텝 수만큼 그래디언트
            optimizer.step()
            optimizer.zero_grad()

        train_loss += loss.item()

        if (step + 1) % eval_steps == 0:  # eval_steps 마다 loss를 출력

            with torch.no_grad():   # 학습 X (그래디언트 계산 X)
                val_loss = 0
                tuning_model.eval()        # 평가모드로 전환

                for val_batch in tqdm_auto(val_dataloader, mininterval=0.01, leave=True):

                    # 배치를 GPU로 복사
                    inputs = {k: v.to(device_GPU) for k, v in batch.items()}
                    val_outputs = tuning_model(**inputs)     # 모형으로 결과 예측

                    val_loss += outputs.loss.item()

                avg_val_loss = val_loss / len(val_dataloader)

            print('Step %d, validation loss: %.4f' % ((step + 1), avg_val_loss))

        del loss
        del inputs
        del outputs
        gc.collect()
        torch.cuda.empty_cache()

    avg_train_loss = train_loss / len(train_dataloader)
    print('epoch %d, train loss: %.4f \n' % (epoch+2, avg_train_loss))

    gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/11766 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

Step 11766, validation loss: 13.2887
epoch 2, train loss: 1.7022 



In [None]:
tuning_model.save_pretrained('/content/drive/Othercomputers/내 컴퓨터/KO-EN_Translation/KO-EN_Translation/KO-EN_M2M100_epoch2_lr=1e-4')

In [None]:
# epoch1 train loss: 2.4775 / validation loss: 2.4386   (잘못 계산된 loss로 각 loss값에 8로 나눠야하는게 맞는것 같다)
# epoch2 train loss: 1.7022 / validation loss: 13.2887

In [15]:
while 1:
    document = input("입력 > \n").strip()
    # quit 입력시 종료
    if document == "quit":
        break

    inputs = tokenizer(document, return_tensors="pt")

    output = tuning_model.generate(inputs["input_ids"].to(device_GPU), num_beams=3, max_length=128)
    output = tokenizer.batch_decode(output, skip_special_tokens=True)

    print(f'결과 > \n{output}')

입력 > 
개인용 컴퓨터




결과 > 
['The The The The The The The The']
입력 > 
quit
