# 번역 데이터 준비

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sentencepiece as spm
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

import re
import os 
import random
import math

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

print(tf.__version__)

2.6.0


영어-스페인어 데이터 다운로드

In [2]:
zip_path = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True
)

중복된 데이터 확인

In [3]:
file_path = os.path.dirname(zip_path)+"/spa-eng/spa.txt"

with open(file_path, "r") as f:
    spa_eng_sentences = f.read().splitlines()

spa_eng_sentences = list(set(spa_eng_sentences)) 
total_sentence_count = len(spa_eng_sentences)
print("Example:", total_sentence_count)

for sen in spa_eng_sentences[0:100][::20]: 
    print(">>", sen)

Example: 118964
>> I hate silent movies.	Me carga ver cine mudo.
>> You forgot to tell me about that book you read.	Se te olvidó contarme sobre el libro que leíste.
>> You must finish this work in a week.	Debes terminar este trabajo en una semana.
>> Go and fetch Tom.	Ve a traer a Tom.
>> We gladly accept your offer.	Aceptamos con gusto tu oferta.


전처리

In [4]:
def preprocess_sentence(sentence):
    sentence = sentence.lower()# 소문자 변환
    sentence = re.sub(r'[" "]+', " ", sentence) # 연속된 여러 개의 공백을 하나의 공백으로 축소
    sentence = sentence.strip() # 문자열 양 끝 공백 제거
    
    return sentence 

In [5]:
spa_eng_sentences = list(map(preprocess_sentence, spa_eng_sentences))

전체 데이터의 0.5% 정도를 테스트용으로 사용

In [6]:
test_sentence_count = total_sentence_count // 200
print("Test Size: ", test_sentence_count)
print("\n")

train_spa_eng_sentences = spa_eng_sentences[:-test_sentence_count]
test_spa_eng_sentences = spa_eng_sentences[-test_sentence_count:]
print("Train Example:", len(train_spa_eng_sentences))
for sen in train_spa_eng_sentences[0:100][::20]: 
    print(">>", sen)
print("\n")
print("Test Example:", len(test_spa_eng_sentences))
for sen in test_spa_eng_sentences[0:100][::20]: 
    print(">>", sen)

Test Size:  594


Train Example: 118370
>> i hate silent movies.	me carga ver cine mudo.
>> you forgot to tell me about that book you read.	se te olvidó contarme sobre el libro que leíste.
>> you must finish this work in a week.	debes terminar este trabajo en una semana.
>> go and fetch tom.	ve a traer a tom.
>> we gladly accept your offer.	aceptamos con gusto tu oferta.


Test Example: 594
>> tom doesn't know.	tom no lo sabe.
>> i'll take care of the horses.	yo cuidaré a los caballos.
>> tom doesn't know what to expect.	tom no sabe qué esperar.
>> mary really takes after her mother.	mary se parece mucho a su madre.
>> tom doesn't know how to drive.	tom no sabe manejar.


한 줄에 포함되어 있는 영어와 스페인어를 분리 - > split('\t')를 사용   
tab 이전이 영어, 이후가 스페인어 문장

In [7]:
def split_spa_eng_sentences(spa_eng_sentences):
    spa_sentences = []
    eng_sentences = []
    for spa_eng_sentence in tqdm(spa_eng_sentences):
        eng_sentence, spa_sentence = spa_eng_sentence.split('\t')
        spa_sentences.append(spa_sentence)
        eng_sentences.append(eng_sentence)
    return eng_sentences, spa_sentences

In [8]:
def split_spa_eng_sentences(spa_eng_sentences):
    spa_sentences = []
    eng_sentences = []
    for spa_eng_sentence in tqdm(spa_eng_sentences):
        try:
            eng_sentence, spa_sentence = spa_eng_sentence.split('\t')
            spa_sentences.append(spa_sentence)
            eng_sentences.append(eng_sentence)
        except ValueError:
            print("Error in sentence:", spa_eng_sentence)

    return eng_sentences, spa_sentences

학습 데이터와 테스트 데이터 모두 영어와 스페인어 문장 분리

In [10]:
train_eng_sentences, train_spa_sentences = split_spa_eng_sentences(train_spa_eng_sentences)
print(len(train_eng_sentences))
print(train_eng_sentences[0])
print('\n')
print(len(train_spa_sentences))
print(train_spa_sentences[0])

  0%|          | 0/118370 [00:00<?, ?it/s]

118370
i hate silent movies.


118370
me carga ver cine mudo.


In [11]:
test_eng_sentences, test_spa_sentences = split_spa_eng_sentences(test_spa_eng_sentences)
print(len(test_eng_sentences))
print(test_eng_sentences[0])
print('\n')
print(len(test_spa_sentences))
print(test_spa_sentences[0])

  0%|          | 0/594 [00:00<?, ?it/s]

594
tom doesn't know.


594
tom no lo sabe.


토큰화 -> Sentencepiece 기반의 토크나이저

In [12]:
def generate_tokenizer(corpus,
                       vocab_size,
                       lang="spa-eng",
                       pad_id=0,   # pad token의 일련번호
                       bos_id=1,  # 문장의 시작을 의미하는 bos token(<s>)의 일련번호
                       eos_id=2,  # 문장의 끝을 의미하는 eos token(</s>)의 일련번호
                       unk_id=3):   # unk token의 일련번호
    file = "./%s_corpus.txt" % lang
    model = "%s_spm" % lang

    with open(file, 'w') as f:
        for row in corpus: f.write(str(row) + '\n')

    import sentencepiece as spm
    spm.SentencePieceTrainer.Train(
        '--input=./%s --model_prefix=%s --vocab_size=%d'\
        % (file, model, vocab_size) + \
        '--pad_id==%d --bos_id=%d --eos_id=%d --unk_id=%d'\
        % (pad_id, bos_id, eos_id, unk_id)
    )

    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load('%s.model' % model)

    return tokenizer

In [13]:
VOCAB_SIZE = 20000 # 단어사전수
tokenizer = generate_tokenizer(train_eng_sentences + train_spa_sentences, VOCAB_SIZE, 'spa-eng')
tokenizer.set_encode_extra_options("bos:eos")  # 문장 양 끝에 <s> , </s> 추가

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=././spa-eng_corpus.txt --model_prefix=spa-eng_spm --vocab_size=20000--pad_id==0 --bos_id=1 --eos_id=2 --unk_id=3
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ././spa-eng_corpus.txt
  input_format: 
  model_prefix: spa-eng_spm
  model_type: UNIGRAM
  vocab_size: 20000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
  bos_id: 1
  eos_id: 2
  pad_id: -1

True

os_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
}
normalizer_spec {
  name: nmt_nfkc
  add_dummy_prefix: 1
  remove_extra_whitespaces: 1
  escape_whitespaces: 1
  normalization_rule_tsv: 
}
denormalizer_spec {}
trainer_interface.cc(329) LOG(INFO) SentenceIterator is not specified. Using MultiFileSentenceIterator.
trainer_interface.cc(178) LOG(INFO) Loading corpus: ././spa-eng_corpus.txt
trainer_interface.cc(385) LOG(INFO) Loaded all 236740 sentences
trainer_interface.cc(400) LOG(INFO) Adding meta_piece: <s>
trainer_interface.cc(400) LOG(INFO) Adding meta_piece: </s>
trainer_interface.cc(400) LOG(INFO) Adding meta_piece: <unk>
trainer_interface.cc(405) LOG(INFO) Normalizing sentences...
trainer_interface.cc(466) LOG(INFO) all chars count=7876619
trainer_interface.cc(477) LOG(INFO) Done: 99.9546% characters are covered.
trainer_interface.cc(487) LOG(INFO) Alphabet size=43
trainer_interface.cc(488) LOG(INFO) Final character coverage=0.999546
trainer_interface.cc(520) LOG(INFO) Done! 

In [14]:
# 토콘화 해주는 함수 저의
def make_corpus(sentences, tokenizer):
    corpus = []
    for sentence in tqdm(sentences):
        tokens = tokenizer.encode_as_ids(sentence)
        corpus.append(tokens)
    return corpus

영어와 스페인어를 각각 토큰화    
훈련 데이터만 토큰화    
같은 토크나이저를 사용함   

In [15]:
eng_corpus = make_corpus(train_eng_sentences, tokenizer)
spa_corpus = make_corpus(train_spa_sentences, tokenizer)

  0%|          | 0/118370 [00:00<?, ?it/s]

  0%|          | 0/118370 [00:00<?, ?it/s]

In [16]:
print(train_eng_sentences[0])
print(eng_corpus[0])
print('\n')
print(train_spa_sentences[0])
print(spa_corpus[0])

i hate silent movies.
[1, 6, 670, 3351, 1534, 0, 2]


me carga ver cine mudo.
[1, 20, 4732, 261, 1731, 10081, 0, 2]


pad_sequences()를 이용해서 한문장의 토큰 길이가 50이 되도록 해서 데이터셋 만들기

In [17]:
MAX_LEN = 50
enc_ndarray = tf.keras.preprocessing.sequence.pad_sequences(eng_corpus, maxlen=MAX_LEN, padding='post')
dec_ndarray = tf.keras.preprocessing.sequence.pad_sequences(spa_corpus, maxlen=MAX_LEN, padding='post')

모델 훈련에 사용될 수 있도록 영어와 스페인어 데이터를 묶어 배치 크기의 텐서로 만들어 줌

In [18]:
BATCH_SIZE = 64
train_dataset = tf.data.Dataset.from_tensor_slices((enc_ndarray, dec_ndarray)).batch(batch_size=BATCH_SIZE)

# 트랜스포머 구현하기

Positional Encoding: 모델에 입력되는 단어의 상대적인 위치 정보를 반영하여 모델이 문장의 순서를 학습할 수 있도록 도움

In [20]:
def positional_encoding(pos, d_model):
    # 각 포지션과 차원에 대한 각도를 계산하는 함수
    def cal_angle(position, i):
        return position / np.power(10000, (2*(i//2)) / np.float32(d_model))

    # 각 포지션에 대한 각도 벡터를 생성하는 함수
    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    # 각 포지션과 차원에 대한 사인과 코사인 값을 가지는 테이블을 생성
    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    # 짝수 차원에는 사인 값을, 홀수 차원에는 코사인 값을 적용
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # 짝수 차원
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # 홀수 차원

    return sinusoid_table

마스크 생성

In [21]:
def generate_padding_mask(seq):
    # 입력 시퀀스에서 패딩(0)에 해당하는 위치를 찾아 마스크 생성
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_lookahead_mask(size):
    # 룩어헤드 마스크 생성: 현재 위치 이후의 위치에 대해 마스킹
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

def generate_masks(src, tgt):
    # 인코더 입력 패딩 마스크 생성
    enc_mask = generate_padding_mask(src)

    # 디코더-인코더 어텐션 마스크 생성 (인코더 입력의 패딩 위치 마스킹)
    dec_enc_mask = generate_padding_mask(src)

    # 디코더 마스크 생성 (패딩 위치와 룩어헤드 마스킹을 포함)
    dec_lookahead_mask = generate_lookahead_mask(tgt.shape[1])
    dec_tgt_padding_mask = generate_padding_mask(tgt)
    dec_mask = tf.maximum(dec_tgt_padding_mask, dec_lookahead_mask)

    return enc_mask, dec_enc_mask, dec_mask

Multi-head Attention

In [22]:
import tensorflow as tf

class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        self.depth = d_model // self.num_heads
        
        # 각각의 선형 변환을 수행하는 레이어들을 정의
        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)
        
        # 마지막에 사용되는 선형 변환 레이어
        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        # 스케일링된 점곱 어텐션을 수행하는 메소드
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        # 마스크 적용 (필요한 경우)
        if mask is not None:
            scaled_qk += (mask * -1e9)  

        # 소프트맥스 함수를 통해 어텐션 가중치 계산
        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        # 가중 평균을 통해 출력 계산
        out = tf.matmul(attentions, V)

        return out, attentions
        
    def split_heads(self, x):
        # 주어진 텐서를 여러 헤드로 나누는 메소드
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        # 나누어진 헤드를 결합하는 메소드
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x
    
    def call(self, Q, K, V, mask):
        # 주어진 Q, K, V에 대한 멀티헤드 어텐션을 수행하는 메소드
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)
        
        # 각각의 헤드로 나누기
        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)
        
        # 스케일드 닷 프로덕트 어텐션 수행
        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)
                        
        # 헤드 결합
        out = self.combine_heads(out)
        # 최종 선형 변환 수행
        out = self.linear(out)
            
        return out, attention_weights


Position-wise Feed Forward Network

In [23]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        # 첫 번째 fully-connected layer (linear layer) - 활성화 함수는 ReLU
        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        # 두 번째 fully-connected layer (linear layer)
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        # 주어진 입력을 첫 번째 fully-connected layer에 전달하고 ReLU 활성화 함수를 적용
        out = self.fc1(x)
        # 결과를 두 번째 fully-connected layer에 전달
        out = self.fc2(out)
            
        return out

Encoder Layer

In [24]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        # 멀티헤드 어텐션 레이어
        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        # 위치별 피드포워드 네트워크 레이어
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        # 각 레이어마다의 레이어 정규화 레이어
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # 드롭아웃 레이어
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        '''
        Multi-Head Attention
        '''
        # Residual Connection을 위한 원본 입력 저장
        residual = x
        # 첫 번째 레이어 정규화
        out = self.norm_1(x)
        # 멀티헤드 어텐션 레이어 수행
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        # 드롭아웃 적용
        out = self.do(out)
        # Residual Connection 수행
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        # Residual Connection을 위한 원본 입력 저장
        residual = out
        # 두 번째 레이어 정규화
        out = self.norm_2(out)
        # 위치별 피드포워드 네트워크 레이어 수행
        out = self.ffn(out)
        # 드롭아웃 적용
        out = self.do(out)
        # Residual Connection 수행
        out += residual
        
        return out, enc_attn

Decoder Layer

In [25]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        # 디코더 자체 어텐션 레이어
        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        # 디코더-인코더 어텐션 레이어
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        # 위치별 피드포워드 네트워크 레이어
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        # 각 레이어마다의 레이어 정규화 레이어
        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # 드롭아웃 레이어
        self.do = tf.keras.layers.Dropout(dropout)
    
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        '''
        Masked Multi-Head Attention
        '''
        # Residual Connection을 위한 원본 입력 저장
        residual = x
        # 첫 번째 레이어 정규화
        out = self.norm_1(x)
        # 디코더 자체 어텐션 레이어 수행
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        # 드롭아웃 적용
        out = self.do(out)
        # Residual Connection 수행
        out += residual

        '''
        Multi-Head Attention
        '''
        # Residual Connection을 위한 원본 입력 저장
        residual = out
        # 두 번째 레이어 정규화
        out = self.norm_2(out)
        # 디코더-인코더 어텐션 레이어 수행
        # Q, K, V 순서에 주의하세요!
        out, dec_enc_attn = self.enc_dec_attn(Q=out, K=enc_out, V=enc_out, mask=dec_enc_mask)
        # 드롭아웃 적용
        out = self.do(out)
        # Residual Connection 수행
        out += residual
        
        '''
        Position-Wise Feed Forward Network
        '''
        # Residual Connection을 위한 원본 입력 저장
        residual = out
        # 세 번째 레이어 정규화
        out = self.norm_3(out)
        # 위치별 피드포워드 네트워크 레이어 수행
        out = self.ffn(out)
        # 드롭아웃 적용
        out = self.do(out)
        # Residual Connection 수행
        out += residual

        return out, dec_attn, dec_enc_attn


Encoder

In [26]:
class Encoder(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super(Encoder, self).__init__()
        
        # 인코더 레이어를 n_layers 개수만큼 생성
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                           for _ in range(n_layers)]
        
        # 드롭아웃 레이어
        self.do = tf.keras.layers.Dropout(dropout)
        
    def call(self, x, mask):
        # 입력 데이터 초기화
        out = x
    
        # 각 인코더 레이어를 순회하면서 호출
        enc_attns = list()
        for i in range(self.n_layers):
            # 인코더 레이어 호출
            out, enc_attn = self.enc_layers[i](out, mask)
            # 어텐션 가중치 저장
            enc_attns.append(enc_attn)
        
        return out, enc_attns


Decoder

In [27]:
class Decoder(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout):
        super(Decoder, self).__init__()
        
        # 디코더 레이어를 n_layers 개수만큼 생성
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                           for _ in range(n_layers)]
                            
    def call(self, x, enc_out, dec_enc_mask, padding_mask):
        # 입력 데이터 초기화
        out = x
    
        # 각 디코더 레이어를 순회하면서 호출
        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            # 디코더 레이어 호출
            out, dec_attn, dec_enc_attn = \
                self.dec_layers[i](out, enc_out, dec_enc_mask, padding_mask)

            # 각 레이어에서 얻은 어텐션 가중치 저장
            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        # 최종 디코더 레이어의 출력과 각 레이어에서의 어텐션 가중치 리스트 반환
        return out, dec_attns, dec_enc_attns

Transformer 전체 모델 조립

In [28]:
class Transformer(tf.keras.Model):
    def __init__(self, n_layers, d_model, n_heads, d_ff, src_vocab_size, tgt_vocab_size, pos_len, dropout=0.2, shared_fc=True, shared_emb=False):
        super(Transformer, self).__init__()

        # 입력으로 받은 d_model을 float32 타입으로 변경
        self.d_model = tf.cast(d_model, tf.float32)

        # 공유 임베딩 여부에 따라 인코더와 디코더의 임베딩 레이어 생성
        if shared_emb:
            self.enc_emb = self.dec_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        # 위치 인코딩 레이어 생성
        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        # 인코더와 디코더 생성
        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        # 출력 레이어 생성
        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        # 공유 출력 레이어 여부에 따라 가중치 공유
        self.shared_fc = shared_fc
        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        # 입력에 임베딩 레이어 적용
        out = emb(x)

        # 공유된 출력 레이어를 사용할 경우 sqrt(d_model)로 스케일 조정
        if self.shared_fc:
            out *= tf.math.sqrt(self.d_model)

        # 위치 인코딩 추가
        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out

    def call(self, enc_in, dec_in, enc_mask, dec_enc_mask, dec_mask):
        # 인코더 입력과 디코더 입력에 임베딩 적용
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        # 인코더와 디코더 순전파
        enc_out, enc_attns = self.encoder(enc_in, enc_mask)
        dec_out, dec_attns, dec_enc_attns = self.decoder(dec_in, enc_out, dec_enc_mask, dec_mask)

        # 출력 레이어를 통과한 최종 로짓 반환
        logits = self.fc(dec_out)

        # 로짓, 인코더 어텐션, 디코더 어텐션, 디코더-인코더 어텐션 반환
        return logits, enc_attns, dec_attns, dec_enc_attns


모델 인스턴스 생성

In [29]:
transformer = Transformer(
    n_layers=2,
    d_model=512,
    n_heads=8,
    d_ff=2048,
    src_vocab_size=VOCAB_SIZE,
    tgt_vocab_size=VOCAB_SIZE,
    pos_len=200,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)

d_model = 512

Learning Rate Scheduler

In [30]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()
        
        # 모델의 임베딩 차원과 워머핑 스텝 수 초기화
        self.d_model = d_model
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        # 학습률을 조절하기 위한 함수 정의
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)
        
        # 두 가지 학습률 조절 방법 중 작은 값 선택
        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

Learning Rate & Optimizer

In [31]:
# LearningRateScheduler를 사용하여 학습률 스케줄링 설정
learning_rate = LearningRateScheduler(d_model)

# Adam 옵티마이저 설정
optimizer = tf.keras.optimizers.Adam(
    learning_rate,      # 학습률은 LearningRateScheduler에 의해 동적으로 결정됨
    beta_1=0.9,          # Adam 옵티마이저의 하이퍼파라미터 beta_1 설정
    beta_2=0.98,         # Adam 옵티마이저의 하이퍼파라미터 beta_2 설정
    epsilon=1e-9         # Adam 옵티마이저의 하이퍼파라미터 epsilon 설정
)

Loss Function 정의

In [32]:
# SparseCategoricalCrossentropy 손실 함수 객체 생성
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,     # 모델 출력이 로짓인 경우 True, 확률 분포인 경우 False
    reduction='none'      # 손실을 각 샘플에 대해 계산하고, 각 샘플의 손실을 모두 더하지 않음
)

def loss_function(real, pred):
    # 패딩된 부분을 제외하고 유효한 부분만 마스킹
    mask = tf.math.logical_not(tf.math.equal(real, 0))

    # 모델 예측과 실제 값 간의 손실 계산
    loss_ = loss_object(real, pred)

    # 패딩 부분에 대한 마스킹 적용
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # 마스킹된 부분의 손실을 총합하고, 마스킹의 총합으로 나눠 정규화
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

Train Step 정의

In [33]:
@tf.function()
def train_step(src, tgt, model, optimizer):
    '''
    src: 인코더 입력
    tgt: 디코더 입력
    model: 트랜스포머 모델
    optimizer: 사용할 최적화 알고리즘
    '''
    # Decoder의 입력과 최종 타겟 생성
    tgt_in = tgt[:, :-1]  # Decoder의 input
    gold = tgt[:, 1:]     # Decoder의 output과 비교하기 위해 right shift를 통해 생성한 최종 타겟

    # 마스크 생성
    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    # 그래디언트 계산을 위한 GradientTape 사용
    with tf.GradientTape() as tape:
        # 모델 순전파 수행
        predictions, enc_attns, dec_attns, dec_enc_attns = model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)

        # 손실 함수 계산
        loss = loss_function(gold, predictions)

    # 그래디언트 계산 및 모델 파라미터 업데이트
    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # 손실과 어텐션 가중치 반환
    return loss, enc_attns, dec_attns, dec_enc_attns


훈련을 시키자!

In [34]:
EPOCHS = 3

for epoch in range(EPOCHS):
    total_loss = 0
    
    # tqdm을 사용하여 진행 상황을 시각적으로 확인
    dataset_count = tf.data.experimental.cardinality(train_dataset).numpy()
    tqdm_bar = tqdm(total=dataset_count, desc=f'Epoch {epoch + 1}')

    # 훈련 데이터셋의 각 배치에 대해 훈련 수행
    for (batch, (src, tgt)) in enumerate(train_dataset):
        # 훈련 스텝 수행
        loss, _, _, _ = train_step(src, tgt, transformer, optimizer)
        total_loss += loss

        # tqdm 업데이트
        tqdm_bar.update(1)
        tqdm_bar.set_postfix({'loss': total_loss.numpy() / (batch + 1)})

    # epoch 종료 후 평균 손실 출력
    avg_loss = total_loss / dataset_count
    print(f'Epoch {epoch + 1}, Loss: {avg_loss.numpy()}')
    tqdm_bar.close()

Epoch 1:   0%|          | 0/1850 [00:00<?, ?it/s]

Epoch 1, Loss: 3.7447965145111084


Epoch 2:   0%|          | 0/1850 [00:00<?, ?it/s]

Epoch 2, Loss: 2.1256184577941895


Epoch 3:   0%|          | 0/1850 [00:00<?, ?it/s]

Epoch 3, Loss: 1.792917251586914


# 번역 성능 측정하기
## BLEU Score
-  N-gram으로 점수를 측정

NLTK를 활용한 BLEU Score

In [36]:
# 아래 두 문장을 바꿔가며 테스트 해보세요
reference = "많 은 자연어 처리 연구자 들 이 트랜스포머 를 선호 한다".split()
candidate = "적 은 자연어 학 개발자 들 가 트랜스포머 을 선호 한다 요".split()

print("원문:", reference)
print("번역문:", candidate)
print("BLEU Score:", sentence_bleu([reference], candidate))

원문: ['많', '은', '자연어', '처리', '연구자', '들', '이', '트랜스포머', '를', '선호', '한다']
번역문: ['적', '은', '자연어', '학', '개발자', '들', '가', '트랜스포머', '을', '선호', '한다', '요']
BLEU Score: 8.190757052088229e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU Score는 0~1 사이의 값을 가지지만, 100을 곱한 백분율 값으로 표기하는 경우도 많다   
BLEU Score가 50점을 넘는다는 것은 정말 멋진 번역을 생성했다는 의미이지만, N-gram별로 확인해 볼 필요가 있음

In [37]:
print("1-gram:", sentence_bleu([reference], candidate, weights=[1, 0, 0, 0]))
print("2-gram:", sentence_bleu([reference], candidate, weights=[0, 1, 0, 0]))
print("3-gram:", sentence_bleu([reference], candidate, weights=[0, 0, 1, 0]))
print("4-gram:", sentence_bleu([reference], candidate, weights=[0, 0, 0, 1]))

1-gram: 0.5
2-gram: 0.18181818181818182
3-gram: 2.2250738585072626e-308
4-gram: 2.2250738585072626e-308


BLEU 계산시 특정 N-gram이 0점이 나와서 BLEU가 너무 커지거나 작아지는 쪽으로 왜곡되는 문제를 보완하기 위해 SmoothingFunction() 을 사용하고 있다  
Smoothing 함수는 모든 Precision에 아주 작은 epsilon 값을 더해주는 역할을 하는데, 이로써 0점이 부여된 Precision도 완전한 0이 되지 않으니 점수를 1.0 으로 대체할 필요가 없어짐  

In [38]:
def calculate_bleu(reference, candidate, weights=[0.25, 0.25, 0.25, 0.25]):
    return sentence_bleu([reference],
                         candidate,
                         weights=weights,
                         smoothing_function=SmoothingFunction().method1)  # smoothing_function 적용

print("BLEU-1:", calculate_bleu(reference, candidate, weights=[1, 0, 0, 0]))
print("BLEU-2:", calculate_bleu(reference, candidate, weights=[0, 1, 0, 0]))
print("BLEU-3:", calculate_bleu(reference, candidate, weights=[0, 0, 1, 0]))
print("BLEU-4:", calculate_bleu(reference, candidate, weights=[0, 0, 0, 1]))

print("\nBLEU-Total:", calculate_bleu(reference, candidate))

BLEU-1: 0.5
BLEU-2: 0.18181818181818182
BLEU-3: 0.010000000000000004
BLEU-4: 0.011111111111111112

BLEU-Total: 0.05637560315259291


결과: 거의 의미 없는 번역

## 트랜스포머 모델의 번역 성능 알아보기

테스트셋으로 모델의 BLEU Score를 측정하는 함수 eval_bleu() 를 구현

In [39]:
def translate(tokens, model, src_tokenizer, tgt_tokenizer):
    # 입력 토큰을 패딩하여 모델에 전달
    padded_tokens = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                           maxlen=MAX_LEN,
                                                           padding='post')

    # 디코더 입력 초기화
    output = tf.expand_dims([tgt_tokenizer.bos_id()], 0)

    # 디코딩된 토큰을 저장할 리스트 초기화
    ids = []

    # 최대 길이(MAX_LEN)만큼 반복하여 번역 수행
    for i in range(MAX_LEN):
        # 마스크 생성
        enc_padding_mask, combined_mask, dec_padding_mask = \
        generate_masks(padded_tokens, output)

        # 모델에 입력하여 다음 토큰 예측
        predictions, _, _, _ = model(padded_tokens, 
                                      output,
                                      enc_padding_mask,
                                      combined_mask,
                                      dec_padding_mask)

        # 예측된 확률 분포에서 가장 높은 확률을 가진 토큰 선택
        predicted_id = tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        # 종료 토큰이 예측되면 번역 결과 반환
        if tgt_tokenizer.eos_id() == predicted_id:
            result = tgt_tokenizer.decode_ids(ids)  
            return result

        # 예측된 토큰을 결과에 추가
        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)

    # 최대 길이까지 종료 토큰이 나오지 않으면 현재까지 예측된 토큰 반환
    result = tgt_tokenizer.decode_ids(ids)  
    return result

번역한 문장의 BLEU Score를 평가할 수 있도록 함수를 작성

In [40]:
def eval_bleu_single(model, src_sentence, tgt_sentence, src_tokenizer, tgt_tokenizer, verbose=True):
    # 소스 문장 및 타겟 문장을 토큰화하여 인코딩
    src_tokens = src_tokenizer.encode_as_ids(src_sentence)
    tgt_tokens = tgt_tokenizer.encode_as_ids(tgt_sentence)

    # 최대 길이를 초과하는 경우 평가하지 않음
    if (len(src_tokens) > MAX_LEN) or (len(tgt_tokens) > MAX_LEN):
        return None

    # 타겟 문장을 기준으로 BLEU 점수 계산
    reference = tgt_sentence.split()
    candidate = translate(src_tokens, model, src_tokenizer, tgt_tokenizer).split()

    # BLEU 점수 계산 및 출력
    score = sentence_bleu([reference], candidate,
                          smoothing_function=SmoothingFunction().method1)

    if verbose:
        print("Source Sentence: ", src_sentence)
        print("Model Prediction: ", candidate)
        print("Real: ", reference)
        print("Score: %lf\n" % score)
        
    return score

In [41]:
test_idx = 0  # 테스트 인덱스

# 테스트 인덱스의 영어 문장과 스페인어 실제 번역 문장을 사용하여 BLEU 점수 평가
eval_bleu_single(transformer, 
                 test_eng_sentences[test_idx], 
                 test_spa_sentences[test_idx], 
                 tokenizer, 
                 tokenizer)

Source Sentence:  tom doesn't know.
Model Prediction:  ['tom', 'no', 'sabe', 'nada', 'de', 'él', 'a', 'mary?']
Real:  ['tom', 'no', 'lo', 'sabe.']
Score: 0.058739



0.05873949094699214

전체 테스트 데이터에 대해서 평가

In [42]:
def eval_bleu(model, src_sentences, tgt_sentences, src_tokenizer, tgt_tokenizer, verbose=True):
    total_score = 0.0
    sample_size = len(src_sentences)
    
    # 각각의 테스트 샘플에 대해 BLEU 점수 계산
    for idx in tqdm(range(sample_size)):
        score = eval_bleu_single(model, src_sentences[idx], tgt_sentences[idx], src_tokenizer, tgt_tokenizer, verbose)
        if not score: continue
        
        total_score += score
    
    # 전체 테스트 샘플에 대한 평균 BLEU 점수 출력
    print("Num of Sample:", sample_size)
    print("Total Score:", total_score / sample_size)

Transformer 모델을 사용하여 테스트 데이터셋의 모든 샘플에 대한 BLEU 점수를 평가하고, 평균 BLEU 점수를 출력

In [43]:
eval_bleu(transformer, test_eng_sentences, test_spa_sentences, tokenizer, tokenizer, verbose=False)

  0%|          | 0/594 [00:00<?, ?it/s]

Num of Sample: 594
Total Score: 0.09492075419921593


## Beam Search Decoder

In [44]:
def beam_search_decoder(prob, beam_size):
    sequences = [[[], 1.0]]  # 생성된 문장과 점수를 저장

    for tok in prob:
        all_candidates = []

        for seq, score in sequences:
            for idx, p in enumerate(tok): # 각 단어의 확률을 총점에 누적 곱
                candidate = [seq + [idx], score * -math.log(-(p-1))]
                all_candidates.append(candidate)

        ordered = sorted(all_candidates,
                         key=lambda tup:tup[1],
                         reverse=True) # 총점 순 정렬
        sequences = ordered[:beam_size] # Beam Size에 해당하는 문장만 저장 

    return sequences

In [45]:
vocab = {
    0: "<pad>",
    1: "까요?",
    2: "커피",
    3: "마셔",
    4: "가져",
    5: "될",
    6: "를",
    7: "한",
    8: "잔",
    9: "도",
}

prob_seq = [[0.01, 0.01, 0.60, 0.32, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.75, 0.01, 0.01, 0.17],
            [0.01, 0.01, 0.01, 0.35, 0.48, 0.10, 0.01, 0.01, 0.01, 0.01],
            [0.24, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.68],
            [0.01, 0.01, 0.12, 0.01, 0.01, 0.80, 0.01, 0.01, 0.01, 0.01],
            [0.01, 0.81, 0.01, 0.01, 0.01, 0.01, 0.11, 0.01, 0.01, 0.01],
            [0.70, 0.22, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
            [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01]]

prob_seq = np.array(prob_seq)
beam_size = 3

result = beam_search_decoder(prob_seq, beam_size)

for seq, score in result:
    sentence = ""

    for word in seq:
        sentence += vocab[word] + " "

    print(sentence, "// Score: %.4f" % score)

커피 를 가져 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 42.5243
커피 를 마셔 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 28.0135
마셔 를 가져 도 될 까요? <pad> <pad> <pad> <pad>  // Score: 17.8983


Beam Search를 생성 기법으로 구현할 때에는 분기를 잘 나눠줘야 한다.

### Beam Search Decoder 작성 및 평가하기
각 단어의 확률값을 계산하는 calc_prob()와 Beam Search를 기반으로 동작하는 beam_search_decoder() 를 구현하고 생성된 문장에 대해 BLEU Score를 출력하는 beam_bleu() 를 구현

In [46]:
def calc_prob(src_ids, tgt_ids, model):
    # 마스크 생성
    enc_padding_mask, combined_mask, dec_padding_mask = \
    generate_masks(src_ids, tgt_ids)

    # 모델에 입력하여 다음 토큰의 확률 분포를 예측
    predictions, _, _, _ = model(src_ids, 
                                 tgt_ids,
                                 enc_padding_mask,
                                 combined_mask,
                                 dec_padding_mask)
    
    # 소프트맥스 함수를 통해 확률로 변환하여 반환
    return tf.math.softmax(predictions, axis=-1)

In [47]:
def beam_search_decoder(sentence, 
                        src_len,
                        tgt_len,
                        model,
                        src_tokenizer,
                        tgt_tokenizer,
                        beam_size):
    # 소스 문장을 토큰으로 변환
    tokens = src_tokenizer.encode_as_ids(sentence)
    
    # 패딩 및 길이 조정
    src_in = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                            maxlen=src_len,
                                                            padding='post')

    # 예측 캐시 및 임시 예측 배열 초기화
    pred_cache = np.zeros((beam_size * beam_size, tgt_len), dtype=np.int64)
    pred_tmp = np.zeros((beam_size, tgt_len), dtype=np.int64)

    # EOS 플래그 및 스코어 초기화
    eos_flag = np.zeros((beam_size, ), dtype=np.int64)
    scores = np.ones((beam_size, ))

    # 첫 번째 타임 스텝의 시작 토큰 설정
    pred_tmp[:, 0] = tgt_tokenizer.bos_id()

    # 첫 번째 타임 스텝의 확률 계산
    dec_in = tf.expand_dims(pred_tmp[0, :1], 0)
    prob = calc_prob(src_in, dec_in, model)[0, -1].numpy()

    # 루프를 통한 각 타임 스텝에 대한 빔 서치
    for seq_pos in range(1, tgt_len):
        score_cache = np.ones((beam_size * beam_size, ))

        # 이전 타임 스텝의 예측을 캐시에 저장
        for branch_idx in range(beam_size):
            cache_pos = branch_idx * beam_size

            score_cache[cache_pos:cache_pos+beam_size] = scores[branch_idx]
            pred_cache[cache_pos:cache_pos+beam_size, :seq_pos] = \
            pred_tmp[branch_idx, :seq_pos]

        # 현재 타임 스텝에서의 각 Branch에 대한 예측 계산
        for branch_idx in range(beam_size):
            cache_pos = branch_idx * beam_size

            # 모든 Branch를 로 시작하는 경우를 방지
            if seq_pos != 1:
                dec_in = pred_cache[branch_idx, :seq_pos]
                dec_in = tf.expand_dims(dec_in, 0)

                # 현재 타임 스텝에서의 확률 계산
                prob = calc_prob(src_in, dec_in, model)[0, -1].numpy()

            # 각 Beam에 대한 확률 계산 및 최적 예측 저장
            for beam_idx in range(beam_size):
                max_idx = np.argmax(prob)

                score_cache[cache_pos+beam_idx] *= prob[max_idx]
                pred_cache[cache_pos+beam_idx, seq_pos] = max_idx

                prob[max_idx] = -1

        # 각 Branch에서의 최종 예측 및 스코어 업데이트
        for beam_idx in range(beam_size):
            if eos_flag[beam_idx] == -1: continue

            # 가장 높은 스코어를 갖는 예측 선택
            max_idx = np.argmax(score_cache)
            prediction = pred_cache[max_idx, :seq_pos+1]

            # 현재 Branch의 예측 및 스코어 업데이트
            pred_tmp[beam_idx, :seq_pos+1] = prediction
            scores[beam_idx] = score_cache[max_idx]
            score_cache[max_idx] = -1

            # EOS 토큰이 예측된 경우 EOS 플래그 설정
            if prediction[-1] == tgt_tokenizer.eos_id():
                eos_flag[beam_idx] = -1

    # 최종 예측 결과 정리
    pred = []
    for long_pred in pred_tmp:
        zero_idx = long_pred.tolist().index(tgt_tokenizer.eos_id())
        short_pred = long_pred[:zero_idx+1]
        pred.append(short_pred)
    return pred

In [49]:
def calculate_bleu(reference, candidate, weights=[0.25, 0.25, 0.25, 0.25]):
    """
    Calculate BLEU score between a reference and a candidate sentence.

    Args:
    - reference (list): A list containing the reference sentence as a list of tokens.
    - candidate (list): A list containing the candidate sentence as a list of tokens.
    - weights (list, optional): Weights for the n-gram scores (default is [0.25, 0.25, 0.25, 0.25]).

    Returns:
    - float: BLEU score between the reference and candidate sentences.
    """
    return sentence_bleu([reference],
                         candidate,
                         weights=weights,
                         smoothing_function=SmoothingFunction().method1)


In [48]:
def beam_bleu(reference, ids, tokenizer):
    """
    빔 서치를 사용하여 생성된 일련의 후보 시퀀스에 대한 평균 BLEU 점수를 계산합니다.

    Args:
    - reference (str): 참조 문장으로 문자열입니다.
    - ids (list): 각 후보 시퀀스의 토큰 ID를 포함하는 하위 목록이 있는 목록입니다.
    - tokenizer: 토큰 ID를 디코딩하는 데 사용되는 토크나이저 객체입니다.

    Returns:
    - float: 후보 시퀀스에 대한 평균 BLEU 점수입니다.
    """
    reference = reference.split()

    total_score = 0.0
    for _id in ids:
        candidate = tokenizer.decode_ids(_id.tolist()).split()
        score = calculate_bleu(reference, candidate)

        print("Reference:", reference)
        print("Candidate:", candidate)
        print("BLEU:", calculate_bleu(reference, candidate))

        total_score += score
        
    return total_score / len(ids)


In [50]:
test_idx = 1

ids = \
beam_search_decoder(test_eng_sentences[test_idx],
                    MAX_LEN,
                    MAX_LEN,
                    transformer,
                    tokenizer,
                    tokenizer,
                    beam_size=5)

bleu = beam_bleu(test_spa_sentences[test_idx], ids, tokenizer)
print(bleu)

Reference: ['tienes', 'que', 'decírmelo.']
Candidate: ['tenés', 'que', 'decirlo', 'a', 'mí?']
BLEU: 0.05372849659117709
Reference: ['tienes', 'que', 'decírmelo.']
Candidate: ['tienes', 'que', 'decirlo', 'a', 'mí?']
BLEU: 0.11362193664674995
Reference: ['tienes', 'que', 'decírmelo.']
Candidate: ['tenés', 'que', 'decirlo', 'a', 'mí', 'mismo']
BLEU: 0.040824829046386304
Reference: ['tienes', 'que', 'decírmelo.']
Candidate: ['tienes', 'que', 'decirlo', 'a', 'mí', 'mismo']
BLEU: 0.08633400213704504
Reference: ['tienes', 'que', 'decírmelo.']
Candidate: ['tenés', 'que', 'decirlo', 'por', 'mí?']
BLEU: 0.05372849659117709
0.0696475522025071


# 데이터 부풀리기
- Embedding을 활용한 Lexical Substitution 구현

- gensim 에 사전 훈련된 Embedding 모델을 불러오는 것은 두 가지 방법이 있습니다.

    1. 직접 모델을 다운로드해 load 하는 방법
    2. gensim 이 자체적으로 지원하는 downloader 를 활용해 모델을 load 하는 방법  
    
    
- 한국어는 gensim 에서 지원하지 않으므로 2번째 방법은 사용할 수 없음
- 대표적으로 사용되는 Embedding 모델은 word2vec-google-news-300 이지만 용량이 커서 다운로드에 많은 시간이 소요
- 이번에는 적당한 사이즈의 모델인 glove-wiki-gigaword-300 을 사용

In [53]:
# 사전 훈련된 Embedding 모델을 다운로드
import gensim.downloader as api

wv = api.load('glove-wiki-gigaword-300')

In [54]:
wv.most_similar("banana")

[('bananas', 0.6691170930862427),
 ('mango', 0.5804104208946228),
 ('pineapple', 0.5492372512817383),
 ('coconut', 0.5462778806686401),
 ('papaya', 0.541056752204895),
 ('fruit', 0.52181077003479),
 ('growers', 0.4877638816833496),
 ('nut', 0.48399588465690613),
 ('peanut', 0.48062023520469666),
 ('potato', 0.48061180114746094)]

주어진 데이터를 토큰 단위로 분리한 후, 랜덤하게 하나를 선정하여 해당 토큰과 가장 유사한 단어를 찾아 대치하면 그것으로 Lexical Substitution은 완성

In [55]:
sample_sentence = "you know ? all you need is attention ."
sample_tokens = sample_sentence.split()

selected_tok = random.choice(sample_tokens)

result = ""
for tok in sample_tokens:
    if tok is selected_tok:
        result += wv.most_similar(tok)[0][0] + " "

    else:
        result += tok + " "

print("From:", sample_sentence)
print("To:", result)

From: you know ? all you need is attention .
To: you know ? all you need is attention , 


In [65]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def lexical_sub(sentence, embedding_model, top_n=5):
    """
    주어진 문장에 대해 단어 임베딩을 기반으로 한 어휘 대체를 수행합니다.

    Args:
    - sentence (str): 어휘 대체를 수행할 입력 문장입니다.
    - embedding_model: 사전 훈련된 단어 임베딩 모델 (예: Word2Vec, FastText).
    - top_n (int, optional): 각 단어를 대체할 상위 후보의 수 (기본값은 5).

    Returns:
    - str: 어휘 대체 후의 증강된 문장입니다.
    """
    words = sentence.split()
    augmented_sentence = []

    for word in words:
        if word in embedding_model:
            word_embedding = np.array([embedding_model[word]])
            word_similarities = embedding_model.similar_by_vector(word_embedding.flatten(), topn=top_n)
            top_candidates = [candidate for candidate, _ in word_similarities]

            # 가장 유사한 후보로 단어를 대체합니다.
            augmented_sentence.append(random.choice(top_candidates))
        else:
            augmented_sentence.append(word)

    return ' '.join(augmented_sentence)

In [None]:
new_corpus = []

for old_src in tqdm(train_eng_sentences): 
    new_src = lexical_sub(old_src, wv)
    if new_src is not None: 
        new_corpus.append(new_src)
    # Augmentation이 없더라도 원본 문장을 포함시킵니다
    new_corpus.append(old_src)

print(new_corpus[:10])

### 정리

In [62]:
import random
import numpy as np
from gensim.models import KeyedVectors

def lexical_sub(sentence, embedding_model, top_n=5):
    words = sentence.split()
    augmented_sentence = []

    for word in words:
        if word in embedding_model:
            word_embedding = np.array([embedding_model[word]])
            word_similarities = embedding_model.similar_by_vector(word_embedding.flatten(), topn=top_n)
            top_candidates = [candidate for candidate, _ in word_similarities]

            # 가장 유사한 후보로 단어를 대체합니다.
            augmented_sentence.append(random.choice(top_candidates))
        else:
            augmented_sentence.append(word)

    return ' '.join(augmented_sentence)

# 예시 사용법
sample_sentence = "you know ? all you need is attention ."
augmented_sentence = lexical_sub(sample_sentence, wv)
print("Original:", sample_sentence)
print("Augmented:", augmented_sentence)

Original: you know ? all you need is attention .
Augmented: you what n't other ? must this focus ,
