## 1. 드라이브 마운트

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 2. 필요한 패키지 install & import

In [2]:
!pip install accelerate>=0.20.1
!pip install torch
!pip install transformers
# !pip install evaluate
!pip install nltk
!pip install sentencepiece
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from transformers import (
    MarianTokenizer,
    MarianMTModel,
    MarianConfig,
    Trainer,
    TrainingArguments,
)
# import evaluate

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import pandas as pd
import random

## 3. 데이터 로드(train:validation = 9:1)

In [4]:
# 데이터 경로
data_path = '/content/drive/MyDrive/'

# train / validation 데이터파일 불러오기
train_df = pd.read_excel(data_path+'sum_tran_train.xlsx')
valid_df = pd.read_excel(data_path+'sum_tran_valid.xlsx')

In [5]:
# [Test] 데이터 확인
train_df.shape, valid_df.shape

((350, 2), (35, 2))

In [6]:
# [Test] 데이터 확인
train_df.head(), valid_df.head()

(                                                  원문  \
 0  코로나19 피해로 소득이 감소해 생계비를 빼고 나면 빚을 갚기 어려운 채무자를 위해...   
 1  카카오페이 이 부사장은 기자간담회를 열고 향후 사업 목표가 종이 없는 사회라고 밝혔...   
 2  샤오미가 유럽과 중남미 등에서 화훼이의 제재를 틈타 중국 안방과 글로벌 시장에서 영...   
 3  블랙록·뱅가드 등의 초재벌 기관투자자들은 도덕적 기준이 아닌 주식회사제도 원칙의 기...   
 4  네이버가 매달 구독료를 내면 네이버의 주요 서비스를 쓸 수 있는 구독 서비스를 시작한다.   
 
                                                  번역문  
 0  The suspension of repayment of the principal o...  
 1  Vice President Lee of Kakao Pay held a press c...  
 2  Counterpoint Research analyzed that Xiaomi is ...  
 3  Super chaebol institutional investors such as ...  
 4  Naver will launch a subscription service that ...  ,
                                                   원문  \
 0  정부가 낙태 관련 형법·모자보건법 개정안을 입법 예고하며 임신 초기인 14주까지 본...   
 1  일본 정부는 해양 방출은 전 세계에서 일상적으로 실시되고 있다는 IAEA의 해석을 ...   
 2    SUV의 인기가 높아지며 국민차로 불리던 현대자동차 쏘나타의 인기가 예전 같지 않다.   
 3  실제 90넌대 글로벌화 시기 모 기업의 사내 토익반 강사로 일했던 홍 작가 자신의 ...   
 4  뉴질랜드와 나이지리아 대사관 성추행 사건에 대해 강 장

## 4. 사용할 Model, Tokenizer 정의

In [7]:
# 모델명
model_name = "Helsinki-NLP/opus-mt-ko-en"

# 토크나이저 정의
tokenizer = MarianTokenizer.from_pretrained(model_name)

# 모델 정의
model = MarianMTModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [8]:
# [Test] tokenizer, model
print(tokenizer, model)

MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-ko-en', vocab_size=65001, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	65000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
} MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_

## 5. 사용자 정의 Dataset 정의

In [9]:
class CustomDataset(Dataset):

    """ 데이터를 토큰화 및 텐서로 변환하는 Dataset을 재정의한 클래스 """

    def __init__(self, max_length=128):
        self.max_length = max_length
        self.model_name = "Helsinki-NLP/opus-mt-ko-en"
        self.tokenizer = MarianTokenizer.from_pretrained(self.model_name)

    # tokenizer 정의
    def tokenizer_function(self, data):
        return self.tokenizer(
            data,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True
        )

    # 학습을 위한 Dataset
    def get_inputs(self, df):
        input_pairs = []  # 모델에 학습시킬 데이터(텐서)를 저장할 리스트

        for i in range(len(df)):
            # input 및 target 데이터 추출
            input_text = df.iloc[i, 0]
            target_text = df.iloc[i, 1]

            # input 및 target 데이터 토큰화
            input_ids = self.tokenizer_function(input_text)
            target_ids = self.tokenizer_function(target_text)

            # 모델에 학습시킬 데이터 -> 텐서로 변환
            input_pair = {'input_ids' : torch.LongTensor(input_ids['input_ids']),
                          'attention_mask' : torch.LongTensor(input_ids['attention_mask']),
                          'labels' : torch.LongTensor(target_ids['input_ids'])}

            input_pairs.append(input_pair)

        return input_pairs

    # 생성 및 성능측정의 위한 Dataset
    def get_input_ids(self, df):
        input_ids_list = []  # 모델에 생성 및 성능측정할 데이터(텐서)를 저장할 리스트

        for i in range(len(df)):
            # input 데이터 추출
            input_text = df.iloc[i, 0]

            # input 데이터 토큰화
            input_ids = self.tokenizer(input_text, padding="max_length", max_length=self.max_length, truncation=True)["input_ids"]
            input_ids_list.append(input_ids)

        return torch.tensor(input_ids_list)  # 텐서 변환

In [10]:
# Dataset 정의
custom_dataset = CustomDataset()

# Dataset 생성
dataset_train = custom_dataset.get_inputs(train_df)
dataset_val = custom_dataset.get_inputs(valid_df)

In [11]:
# [Test] Dataset
print(f'dataset_train len : {len(dataset_train)}')
dataset_train[0]

dataset_train len : 350


{'input_ids': tensor([ 2157,   131,   157,  9986, 16879,   131, 28690,    48, 13572,   253,
             9, 15009, 21955, 15604, 16491, 35778, 16149,   163,  7356,  4827,
          1085,  5468,   640,    40,  1787,   450,  2771,  1492,  1282,   989,
          4646, 34975, 29847,    62,   362, 22946,  1881,   168, 25604,  9870,
          1698,  1327,   450,  2771,   162,   541,  2469, 40264,   450,  2771,
           234, 36844,   161,     2,     0, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65

In [12]:
# [Test] Dataset
print(f'dataset_val len : {len(dataset_val)}')
dataset_val[0]

dataset_val len : 35


{'input_ids': tensor([15823, 23428,  7092,  4078,  1034,  4399, 23988, 13652,  1034,     9,
         39728,   518,    51,  2341,  1034,   539,    85,  1052,  9194,  7533,
           249,   977,   349,   887, 14582,    48,  7297,  1011, 10434,  1046,
         23428,    79, 14552,   255,  3975,  5547,  3280,   578,  9355,   151,
           161,     2,     0, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65

## 6. 사용자 정의 CustomModel 재정의

In [13]:
class CustomModel(nn.Module):

    """ fine-tuning하기 위해 pretrained 모델인 MarianMT 모델 재정의한 클래스 """

    def __init__(self, custom_dataset, model_name, decoder_layers=6, dropout=None):
        super().__init__()
        config = MarianConfig.from_pretrained(model_name)  # 모델 구성요소
        config.decoder_layers = decoder_layers  # 디코더
        self.custom_dataset = custom_dataset  # 데이터셋
        self.model = MarianMTModel.from_pretrained(model_name, config = config)  # 모델 정의

        # dropout
        if dropout:
            self.model.dropout = dropout

        # encoder freezing
        for param in self.model.get_encoder().parameters():
            param.requires_grad = False

        self.batch_size = -1
        self.batch_cnt = -1

    # 모델 학습
    def train(self, training_args, train_df, valid_df):
        # train / validation Dataset 로드
        train_dataset = self.custom_dataset.get_inputs(train_df)
        valid_dataset = self.custom_dataset.get_inputs(valid_df)

        # 학습
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
        )

        trainer.train()

    # 모델 생성
    def generate(self, df):
        # 생성 및 성능측정의 위한 Dataset 로드
        input_ids = self.custom_dataset.get_input_ids(df)

        # 문장 생성
        outputs = self.model.generate(input_ids.to(device), max_length=128)

        # 문장으로 변환
        output_sentences = []
        for output in outputs:
            output_sentences.append(tokenizer.decode(output, skip_special_tokens=True))

        return output_sentences

    # batch 단위 정확도 측정(BLEU)
    def batch_accuracy(self, df):
        self.batch_cnt += 1

        # 모델 생성 결과
        output_sentences = self.generate(df)

        # target(번역문)
        target_sentences = [sentence for sentence in df['번역문']]

        # 현재 배치 문장 개수, 정확도
        batch_sen_cnt, accuracy_cnt =  0, 0

        """
        BLEU: output 단어가 target에 포함된 정도
        ROUGE: target 단어가 output에 포함된 정도
        """

        # SmoothingFunction 정의
        # smoothing_function = SmoothingFunction().method1  # method1: 기본적인 smoothing 방법

        print(f'### {self.batch_cnt} batch start ###')
        for i in range(len(df)):
            batch_sen_cnt += 1

            output_sentence = output_sentences[i]  # 모델 생성 결과(번역문)
            target_sentence = target_sentences[i]  # target(번역문)

            # 문장을 토큰 단위로 나누기
            candidate = output_sentence.split()   # 생성된 문장을 토큰 단위로 나누기
            reference = [target_sentence.split()]   # 참조 문장(target)을 토큰 단위로 나누기

            # 1-gram, 2-gram, 3-gram, 4-gram 평가
            weights = (1.0, 1.0, 1.0, 1.0)

            # BLEU 점수 계산
            bleu_score = sentence_bleu(reference, candidate, weights=weights)
            accuracy_cnt += bleu_score

        # 배치 단위 평균 BLEU 점수 측정
        performance = accuracy_cnt / batch_sen_cnt
        print(f'{self.batch_cnt} batch BLEU performance : {performance}\n')

        return performance

    # # batch 단위 정확도 측정(똑같으면 1, 하나라도 다르면 0)
    # def batch_accuracy(self, df):
    #     self.batch_cnt += 1

    #     # 모델 생성 결과
    #     output_sentences = self.generate(df)

    #     # target(번역문)
    #     target_sentences = [sentence for sentence in df['번역문']]

    #     # 현재 배치 문장 개수, 정확도
    #     batch_sen_cnt, accuracy_cnt = 0, 0

    #     print(f'### {self.batch_cnt} batch start ###')
    #     for i in range(len(df)) :
    #         batch_sen_cnt += 1

    #         output_sentence = output_sentences[i]  # input으로 모델 생성 결과(번역문)
    #         target_sentence = target_sentences[i]  # target(번역문)

    #         # 정확도 카운트
    #         if output_sentence == target_sentence :
    #             accuracy_cnt += 1
    #         else :
    #             self.test_result[0].append(output_sentence)
    #             self.test_result[1].append(target_sentence)

    #     # batch 단위 정확도 측정
    #     performance = accuracy_cnt / batch_sen_cnt
    #     print(f'{self.batch_cnt} batch performance : {performance}\n')

    #     return performance

    # 총 정확도 측정
    def total_test_accuracy(self, df, batch_size):
        self.batch_size = batch_size
        self.batch_cnt = 0
        self.test_result = [[], []]

        start, end = 0, self.batch_size

        performance = 0
        while 1:
            if start >= len(df):
                break

            end = min(end, len(df))

            # 현재 배치에 대한 정확도를 측정하여 누적
            performance += self.batch_accuracy(df[start:end])

            start = end  # 시작 인덱스를 끝 인덱스로 업데이트
            end += self.batch_size  # 끝 인덱스를 배치 크기만큼 증가

        # 총 정확도 측정
        performance /= self.batch_cnt
        print(f'final performance : {performance}')

        return performance

    def return_model(self):
        return self.model

In [14]:
# 아래 실행하지 마시오!

In [15]:
# 아래 실행하지 마시오!

## 7. Fine-tuning

In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 모델 정의 및 초기화
model = CustomModel(custom_dataset, model_name)
model.return_model()

# 모델 저장 경로 설정
model_path = "/content/drive/MyDrive/model/"

# 학습 Epoch, Batch, Step 정의
num_train_epochs = 25
batch_size = 30
step = 500

# 학습 변수
training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_steps=step,
    save_steps=step,
    logging_steps=step,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    save_total_limit=3
    )

# 모델 학습(600 epoch)
performance_max = 0
try:
    for i in range(24):  # 25 * 24 = 600 epoch
        model.train(training_args, train_df, valid_df)
        performance = model.total_test_accuracy(train_df, batch_size=batch_size)  # 성능측정
        print(f'{num_train_epochs * (i+1)} epoch performance : {performance}')

        if performance_max < performance:  # 현재 성능이 이전 최고 성능보다 좋으면 모델 저장
            performance_max = performance
            print(f'{num_train_epochs * (i+1)} epoch is best model\n')
            # torch.save(model,"/content/drive/MyDrive/model/marian_fine_tunned_model_v1_3.pt")
            torch.save(model.return_model().state_dict(), "/content/drive/MyDrive/model/marian_fine_tunned_model_state_dict_v1_3.pt")
            print("Best model saved.")
        else:
            # 현재 성능이 이전 최고 성능보다 좋지 않으면 이전 모델로 저장하고 학습 중단
            print("Tunning Success.")
            break

except KeyboardInterrupt:
    if performance_max > 0:  # 이전 성능이 있을 경우에만 저장
        # torch.save(model, "/content/drive/MyDrive/model/marian_fine_tunned_model_v1_3.pt")
        torch.save(model.return_model().state_dict(), "/content/drive/MyDrive/model/marian_fine_tunned_model_state_dict_v1_3.pt")
        print("Best model saved.")



Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 8.036855140815083e-05



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### 2 batch start ###
2 batch BLEU performance : 0.000595318985056105

### 3 batch start ###
3 batch BLEU performance : 2.3651994461892806e-05

### 4 batch start ###
4 batch BLEU performance : 0.0002466941701014961

### 5 batch start ###
5 batch BLEU performance : 0.00014049407095816048

### 6 batch start ###
6 batch BLEU performance : 8.87666385669557e-05

### 7 batch start ###
7 batch BLEU performance : 6.929132510894393e-05

### 8 batch start ###
8 batch BLEU performance : 9.41364080175138e-06

### 9 batch start ###
9 batch BLEU performance : 0.0006024112688210565

### 10 batch start ###
10 batch BLEU performance : 4.070813455066736e-05

### 11 batch start ###
11 batch BLEU performance : 0.00038250431561550724

### 12 batch start ###
12 batch BLEU performance : 0.00017490378757383128

final performance : 0.0002045439069187099
25 epoch performance : 0.0002045439069187099
25 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.0008946055189552148



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### 2 batch start ###
2 batch BLEU performance : 0.0023058814138638556

### 3 batch start ###
3 batch BLEU performance : 0.0005853981308964397

### 4 batch start ###
4 batch BLEU performance : 0.003668207537738592

### 5 batch start ###
5 batch BLEU performance : 0.003983396001330227

### 6 batch start ###
6 batch BLEU performance : 0.0008711839345660415

### 7 batch start ###
7 batch BLEU performance : 0.0006682297085401975

### 8 batch start ###
8 batch BLEU performance : 0.0005604014107392209

### 9 batch start ###
9 batch BLEU performance : 0.0009818515620717938

### 10 batch start ###
10 batch BLEU performance : 0.0009314942384123361

### 11 batch start ###
11 batch BLEU performance : 0.00254358161329247

### 12 batch start ###
12 batch BLEU performance : 0.0015924269468752944

final performance : 0.0016322215014401402
50 epoch performance : 0.0016322215014401402
50 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.015141958461599768



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### 2 batch start ###
2 batch BLEU performance : 0.03720690614776855

### 3 batch start ###
3 batch BLEU performance : 0.011865745737870043

### 4 batch start ###
4 batch BLEU performance : 0.011329824344093232

### 5 batch start ###
5 batch BLEU performance : 0.010030178658256849

### 6 batch start ###
6 batch BLEU performance : 0.022536141693621035

### 7 batch start ###
7 batch BLEU performance : 0.01160570110766767

### 8 batch start ###
8 batch BLEU performance : 0.0034593782379337638

### 9 batch start ###
9 batch BLEU performance : 0.008873064074389536

### 10 batch start ###
10 batch BLEU performance : 0.040974573005548756

### 11 batch start ###
11 batch BLEU performance : 0.02273022694709739

### 12 batch start ###
12 batch BLEU performance : 0.016339233735183666

final performance : 0.017674411012585853
75 epoch performance : 0.017674411012585853
75 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.09411389259516051



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### 2 batch start ###
2 batch BLEU performance : 0.10775029225726486

### 3 batch start ###
3 batch BLEU performance : 0.035993365469332586

### 4 batch start ###
4 batch BLEU performance : 0.09005791659777525

### 5 batch start ###
5 batch BLEU performance : 0.08416035299087475

### 6 batch start ###
6 batch BLEU performance : 0.08915232754327258

### 7 batch start ###
7 batch BLEU performance : 0.0966790570631373

### 8 batch start ###
8 batch BLEU performance : 0.06328220665089607

### 9 batch start ###
9 batch BLEU performance : 0.07044720244527193

### 10 batch start ###
10 batch BLEU performance : 0.1047130584074243

### 11 batch start ###
11 batch BLEU performance : 0.08143235528234564

### 12 batch start ###
12 batch BLEU performance : 0.13219412662156219

final performance : 0.0874980128270265
100 epoch performance : 0.0874980128270265
100 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.3138237790408136



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### 2 batch start ###
2 batch BLEU performance : 0.34447822979165404

### 3 batch start ###
3 batch BLEU performance : 0.27228902760698415

### 4 batch start ###
4 batch BLEU performance : 0.3561611327771157

### 5 batch start ###
5 batch BLEU performance : 0.2663455068668228

### 6 batch start ###
6 batch BLEU performance : 0.39099665997370386



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### 7 batch start ###
7 batch BLEU performance : 0.26611211885231045

### 8 batch start ###
8 batch BLEU performance : 0.3144602506510052

### 9 batch start ###
9 batch BLEU performance : 0.36958401827940546

### 10 batch start ###
10 batch BLEU performance : 0.4180165139994948

### 11 batch start ###
11 batch BLEU performance : 0.3488656681475798

### 12 batch start ###
12 batch BLEU performance : 0.34488548093361565

final performance : 0.3338348655767088
125 epoch performance : 0.3338348655767088
125 epoch is best model



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.5507375304135199

### 2 batch start ###
2 batch BLEU performance : 0.5990324110180109

### 3 batch start ###
3 batch BLEU performance : 0.6004235580545644



The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


### 4 batch start ###
4 batch BLEU performance : 0.664938810723498

### 5 batch start ###
5 batch BLEU performance : 0.5273016055985253

### 6 batch start ###
6 batch BLEU performance : 0.6285646870836775

### 7 batch start ###
7 batch BLEU performance : 0.5310965100680582

### 8 batch start ###
8 batch BLEU performance : 0.545942180242578

### 9 batch start ###
9 batch BLEU performance : 0.6115025072988425

### 10 batch start ###
10 batch BLEU performance : 0.5026320594648634

### 11 batch start ###
11 batch BLEU performance : 0.5351338136626356

### 12 batch start ###
12 batch BLEU performance : 0.6198105650753712

final performance : 0.5764263532253454
150 epoch performance : 0.5764263532253454
150 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.706641214039298

### 2 batch start ###
2 batch BLEU performance : 0.7470705973411427

### 3 batch start ###
3 batch BLEU performance : 0.7589007670459795

### 4 batch start ###
4 batch BLEU performance : 0.6911384773328094

### 5 batch start ###
5 batch BLEU performance : 0.7088729450207469

### 6 batch start ###
6 batch BLEU performance : 0.7350406930474875

### 7 batch start ###
7 batch BLEU performance : 0.7309799958459776

### 8 batch start ###
8 batch BLEU performance : 0.7485273471678713

### 9 batch start ###
9 batch BLEU performance : 0.750105520787913

### 10 batch start ###
10 batch BLEU performance : 0.7198793854888217

### 11 batch start ###
11 batch BLEU performance : 0.71520738401241

### 12 batch start ###
12 batch BLEU performance : 0.6969151237693638

final performance : 0.7257732875749853
175 epoch performance : 0.7257732875749853
175 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.7558844030013456

### 2 batch start ###
2 batch BLEU performance : 0.7415264984522533

### 3 batch start ###
3 batch BLEU performance : 0.8186531000067466

### 4 batch start ###
4 batch BLEU performance : 0.7690759016146937

### 5 batch start ###
5 batch BLEU performance : 0.7989855341343919

### 6 batch start ###
6 batch BLEU performance : 0.8376069290906704

### 7 batch start ###
7 batch BLEU performance : 0.7707757732777025

### 8 batch start ###
8 batch BLEU performance : 0.8013166023527436

### 9 batch start ###
9 batch BLEU performance : 0.8049200970604161

### 10 batch start ###
10 batch BLEU performance : 0.7820778840569884

### 11 batch start ###
11 batch BLEU performance : 0.8471424616277683

### 12 batch start ###
12 batch BLEU performance : 0.8213843122386562

final performance : 0.7957791247428648
200 epoch performance : 0.7957791247428648
200 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.8046137114103246

### 2 batch start ###
2 batch BLEU performance : 0.7989026013434323

### 3 batch start ###
3 batch BLEU performance : 0.8458304660147332

### 4 batch start ###
4 batch BLEU performance : 0.7803041439456282

### 5 batch start ###
5 batch BLEU performance : 0.8343776437286646

### 6 batch start ###
6 batch BLEU performance : 0.8332298380654677

### 7 batch start ###
7 batch BLEU performance : 0.7154576956497717

### 8 batch start ###
8 batch BLEU performance : 0.8474668347795755

### 9 batch start ###
9 batch BLEU performance : 0.7673694485455979

### 10 batch start ###
10 batch BLEU performance : 0.867838681399079

### 11 batch start ###
11 batch BLEU performance : 0.8800521691738382

### 12 batch start ###
12 batch BLEU performance : 0.7928653204768007

final performance : 0.8140257128777427
225 epoch performance : 0.8140257128777427
225 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.8567583151054091

### 2 batch start ###
2 batch BLEU performance : 0.794135185025688

### 3 batch start ###
3 batch BLEU performance : 0.8688711417036129

### 4 batch start ###
4 batch BLEU performance : 0.7903144743674682

### 5 batch start ###
5 batch BLEU performance : 0.8175429259629624

### 6 batch start ###
6 batch BLEU performance : 0.8654909120550013

### 7 batch start ###
7 batch BLEU performance : 0.7830192016651313

### 8 batch start ###
8 batch BLEU performance : 0.8048734740129435

### 9 batch start ###
9 batch BLEU performance : 0.831490201177055

### 10 batch start ###
10 batch BLEU performance : 0.8622038556567969

### 11 batch start ###
11 batch BLEU performance : 0.9516491642978254

### 12 batch start ###
12 batch BLEU performance : 0.8465158948141486

final performance : 0.8394053954870034
250 epoch performance : 0.8394053954870034
250 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.8412922397602127

### 2 batch start ###
2 batch BLEU performance : 0.8215852782487131

### 3 batch start ###
3 batch BLEU performance : 0.8698891285576218

### 4 batch start ###
4 batch BLEU performance : 0.8534746160550709

### 5 batch start ###
5 batch BLEU performance : 0.8818268394585661

### 6 batch start ###
6 batch BLEU performance : 0.8263698991113946

### 7 batch start ###
7 batch BLEU performance : 0.85138432513369

### 8 batch start ###
8 batch BLEU performance : 0.8274206289150402

### 9 batch start ###
9 batch BLEU performance : 0.8993763205339872

### 10 batch start ###
10 batch BLEU performance : 0.9106843749419293

### 11 batch start ###
11 batch BLEU performance : 0.8468431028899247

### 12 batch start ###
12 batch BLEU performance : 0.8515713286891146

final performance : 0.8568098401912722
275 epoch performance : 0.8568098401912722
275 epoch is best model

Best model saved.


Step,Training Loss,Validation Loss


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


### 1 batch start ###
1 batch BLEU performance : 0.8857185225613569

### 2 batch start ###
2 batch BLEU performance : 0.8242985020564572

### 3 batch start ###
3 batch BLEU performance : 0.8428079739175018

### 4 batch start ###
4 batch BLEU performance : 0.8134973015851529

### 5 batch start ###
5 batch BLEU performance : 0.768028152492548

### 6 batch start ###
6 batch BLEU performance : 0.8523013393814376

### 7 batch start ###
7 batch BLEU performance : 0.8407412629905997

### 8 batch start ###
8 batch BLEU performance : 0.7958590694146356

### 9 batch start ###
9 batch BLEU performance : 0.9031515257907201

### 10 batch start ###
10 batch BLEU performance : 0.9171484968532532

### 11 batch start ###
11 batch BLEU performance : 0.8458522511931073

### 12 batch start ###
12 batch BLEU performance : 0.8077880616398394

final performance : 0.8414327049897175
300 epoch performance : 0.8414327049897175
Tunning Success.


## 8. Fine-tunning 결과 확인(Train Dataset)

In [26]:
# [Test] Fine-tunning 모델로 생성 및 타켓 평가(train dataset)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 데이터 불러오기
data_path = '/content/drive/MyDrive/'
train_df = pd.read_excel(data_path+'sum_tran_train.xlsx')[:15]

# 모델과 토크나이저 불러오기
model_name = "Helsinki-NLP/opus-mt-ko-en"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# 저장된 모델(state dict) 불러오기
model_path = "/content/drive/MyDrive/model/marian_fine_tunned_model_state_dict_v1_3.pt"
model.load_state_dict(torch.load(model_path))

# 모델을 평가 모드로 전환
model.to(device)
model.eval()

bleu_scores = []
for i in range(len(train_df)):
    inputs = tokenizer(train_df.iloc[i, 0], return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 생성 및 변환
    with torch.no_grad():
        generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=128)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # target
    target_text = train_df.iloc[i, 1]

    print(f"generated_text: {generated_text}")
    print(f"target_text: {target_text}")
    print(f"Match: {generated_text == target_text}\n")

    # 1-gram, 2-gram, 3-gram, 4-gram 평가를 위한 가중치
    weights = (0.25, 0.25, 0.25, 0.25)

    # BLEU 점수 계산
    candidate = generated_text.split()
    reference = [target_text.split()]
    bleu_score = sentence_bleu(reference, candidate, weights=weights)
    bleu_scores.append(bleu_score)
    print(f"BLEU Score: {bleu_score}\n")
    print("--------------------------------------------------")

avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("==================================================")
print(f"Average BLEU Score: {avg_bleu_score}")

generated_text: The suspension of repayment of the principal of household loans will be implemented in all financial sectors for debtors who are unable to pay off their debts after their income decreases due to COVID-19 damage, and only credit loans and financial loans for the working class are covered.
target_text: The suspension of repayment of the principal of household loans will be implemented in all financial sectors for debtors who are unable to pay off their debts after their income decreases due to COVID-19 damage, and only credit loans and financial loans for the working class are covered.
Match: True

BLEU Score: 1.0

--------------------------------------------------
generated_text: Vice President Lee of Kakao Pay held a press conference and said his future business goal is a paperless society, which aims to expand services that can receive various bills through Kakao Talk platforms.
target_text: Vice President Lee of Kakao Pay held a press conference and said his future bu

## 9. Fine-tunning 결과 확인(Validation Dataset)

In [27]:
# [Test] Fine-tunning 모델로 생성 및 타켓 평가(valid dataset)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 데이터 불러오기
data_path = '/content/drive/MyDrive/'
valid_df = pd.read_excel(data_path+'sum_tran_valid.xlsx')[:15]

# 모델과 토크나이저 불러오기
model_name = "Helsinki-NLP/opus-mt-ko-en"
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# 저장된 모델(state dict) 불러오기
model_path = "/content/drive/MyDrive/model/marian_fine_tunned_model_state_dict_v1_3.pt"
model.load_state_dict(torch.load(model_path))

# 모델을 평가 모드로 전환
model.to(device)
model.eval()

bleu_scores = []
for i in range(len(valid_df)):
    inputs = tokenizer(valid_df.iloc[i, 0], return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # 생성 및 변환
    with torch.no_grad():
        generated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=128)
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    # target
    target_text = valid_df.iloc[i, 1]

    print(f"generated_text: {generated_text}")
    print(f"target_text: {target_text}")
    print(f"Match: {generated_text == target_text}\n")

    # 1-gram, 2-gram, 3-gram, 4-gram 평가를 위한 가중치
    weights = (0.25, 0.25, 0.25, 0.25)

    # BLEU 점수 계산
    candidate = generated_text.split()
    reference = [target_text.split()]
    bleu_score = sentence_bleu(reference, candidate, weights=weights)
    bleu_scores.append(bleu_score)
    print(f"BLEU Score: {bleu_score}\n")
    print("--------------------------------------------------")

avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("==================================================")
print(f"Average BLEU Score: {avg_bleu_score}")

generated_text: It is oil to covernment the government's plan to enter the presidential and oppositional leader's party and lead measures to service events at the early 14 weeks of accountry.
target_text: The main point is that the government has announced legislative amendments to the abortion-related criminal law and maternal and child health laws, paving the way for abortion to be allowed unconditionally if requested by the first 14 weeks of pregnancy.
Match: False

BLEU Score: 1.4875769488812793e-78

--------------------------------------------------


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


generated_text: The Japan Mindropolitan Government revealed IAEA's announced crisis that is dayly imported in the worlds, causing disruptional prevention and expanding unique through on confirmed expression of political supply.
target_text: The Japanese government drew the IAEA's interpretation that marine discharge is routinely carried out around the world, insisting on the legitimacy of the discharge and highlighting transparent efforts to disclose information on the discharge of contaminated water.
Match: False

BLEU Score: 9.65821029960698e-232

--------------------------------------------------


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


generated_text: The hidden Choi Motors' Office is increasing, and the number of HUV is not in an official.
target_text: With the growing popularity of SUVs, Hyundai Motor's Sonata, which was called the national car, is not as popular as before.
Match: False

BLEU Score: 4.1054243192424385e-155

--------------------------------------------------
generated_text: Due to the time of debtinishment, Daek, which conducted his experience, which has been a diplobal instead of the company man, which has been a Daeju Church-kyeong team, and the  tastropany of Lee Jung-kyeung, a movicial company team that the administ
target_text: In fact, Hong, who worked as an in-house TOEIC instructor for a company during the globalization of the 90s, wrote the first draft, and director Lee's English TOEIC class, which was mixed with fictional settings, was released.
Match: False

BLEU Score: 3.9421554930094576e-155

--------------------------------------------------
generated_text: In the absence of the Nation

#### 내용 요약 & 결론: 지난 프로젝트 모델 튜닝에 BLEU 성능측정을 추가하여 복습하고자 했다. 그래서 시간 단축 및 컴퓨팅 자원 절약을 위해 적은 양의 train 데이터로 모델을 튜닝했다. 하지만 결과는 모델이 train 데이터에만 과적합되었다. 따라서 튜닝된 모델은 학습한 train 데이터의 패턴이 valid 데이터를 충분히 반영하지 못했고 이를 해결하기 위해서는 더 많은 데이터로 패턴을 학습시켜함을 느꼈다.