# Sequence-to-Sequence, seq2seq
- https://wikidocs.net/24996
- 입력 시퀀스로부터 다른 도메인의 시퀀스를 출력하는 모델로, 다양한 분야에서 사용된다.
- 예: 챗봇(Chatbot), 기계 번역(Machine Translation), 내용 요약(Text Summarization), STT(Speech to Text) 등

# Character-Level Neural Machine Translation

## Import

In [1]:
import pandas as pd
import urllib3
import zipfile
import shutil
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import numpy as np

## Load data

In [2]:
# Download the fra-eng sentence-pair archive and unpack it into the current
# working directory.
http = urllib3.PoolManager()
url = 'http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)

# preload_content=False streams the response body instead of buffering it all.
with http.request('GET', url, preload_content=False) as r:
    # Fail fast on an error response: otherwise the error page would be saved
    # under the .zip name and only surface later as an opaque BadZipFile.
    if r.status != 200:
        raise RuntimeError(
            'Download of {} failed with HTTP status {}'.format(url, r.status)
        )
    with open(zipfilename, 'wb') as out_file:
        shutil.copyfileobj(r, out_file)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [7]:
# Read the tab-separated sentence pairs; the file has no header row, so the
# three columns are named explicitly. The third column ('lic') is not needed
# and is dropped right away.
lines = pd.read_csv('fra.txt', sep='\t', names=['src', 'tar', 'lic'])
lines = lines.drop(columns='lic')
print(len(lines.index))
lines.head()

178009


Unnamed: 0,src,tar
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


In [8]:
# Keep only the source/target columns and the first 60,000 sentence pairs.
# .copy() makes the subset an independent DataFrame, so assigning to one of
# its columns afterwards does not write into a slice of the full table (which
# can raise pandas' SettingWithCopyWarning).
lines = lines.loc[:, 'src':'tar']
lines = lines.iloc[:60000].copy()
lines.sample(10)

Unnamed: 0,src,tar
10711,"Help us, please.","Aide-nous, je te prie !"
58493,Tom looked really angry.,Tom parut furieux.
55087,I figured you'd be here.,Je pensais que tu serais ici.
7149,You're bright.,Vous êtes brillant.
45289,Tom's car caught fire.,La voiture de Tom a prit feu.
20534,Make up your mind.,Faites votre choix.
56651,It was truly depressing.,C'était vraiment déprimant.
56926,My brother doesn't swim.,Mon frère ne nage pas.
30073,I revised my theory.,J'ai revu ma théorie.
56097,I'd better clean up now.,Je ferais mieux de nettoyer maintenant.


## Text preprocessing

In [9]:
def _add_sequence_markers(sentence):
    """Surround a target sentence with a leading tab and a trailing newline."""
    return '\t ' + sentence + ' \n'


# Every target sentence gets a tab prepended and a newline appended, each
# separated from the sentence by a single space.
lines['tar'] = lines['tar'].apply(_add_sequence_markers)
lines.sample(10)

Unnamed: 0,src,tar
2214,I feel safe.,\t Je me sens en sécurité. \n
45826,Where are the victims?,\t Où sont les victimes ? \n
45058,Tom has a lot of toys.,\t Tom a beaucoup de jouets. \n
34554,Have a great weekend.,\t Passe un bon week-end. \n
1213,Get to bed.,\t Au lit ! \n
56515,I've never seen the sea.,\t Je n'ai jamais vu la mer. \n
1344,I listened.,\t J'ai écouté. \n
3545,How arrogant!,\t Quelle arrogance ! \n
31087,Let's keep in touch.,\t On garde contact. \n
30149,I think she is sick.,\t Je crois qu'elle est malade. \n


In [10]:
# Character-level vocabularies: the set of distinct characters that occur in
# the source sentences and in the target sentences.
src_vocab = {char for sentence in lines.src for char in sentence}
tar_vocab = {char for sentence in lines.tar for char in sentence}

# One more than the number of distinct characters.
src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)
print(src_vocab_size, tar_vocab_size, sep='\n')

79
106


In [11]:
# Turn the character sets into sorted lists so every character has a stable
# position, then peek at a slice of each.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)
for vocab in (src_vocab, tar_vocab):
    print(vocab[45:75])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w']


In [12]:
# Map each character to an integer id; numbering starts at 1, so 0 is never
# assigned to a real character.
src_to_index = {char: idx for idx, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: idx for idx, char in enumerate(tar_vocab, start=1)}
print(src_to_index)
print(tar_to_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, 'é': 76, '’': 77, '€': 78}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 3

In [13]:
from tqdm import tqdm

In [14]:
# Integer-encode the source side: each sentence becomes the list of ids of
# its characters, in order.
encoder_input = [
    [src_to_index[char] for char in sentence]
    for sentence in tqdm(lines.src)
]
print(encoder_input[:5])

100%|██████████| 60000/60000 [00:00<00:00, 294016.97it/s]

[[30, 64, 10], [31, 58, 10], [31, 58, 10], [41, 70, 63, 2], [41, 70, 63, 2]]





In [15]:
# Integer-encode the target side (the full sentence, first character included).
decoder_input = [
    [tar_to_index[char] for char in sentence]
    for sentence in tqdm(lines.tar)
]
print(decoder_input[:5])

100%|██████████| 60000/60000 [00:00<00:00, 170189.53it/s]

[[1, 3, 48, 53, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 14, 3, 2], [1, 3, 29, 67, 73, 70, 71, 105, 4, 3, 2], [1, 3, 29, 67, 73, 70, 57, 78, 105, 4, 3, 2]]





In [17]:
# The decoder target is the target sentence without its first character, i.e.
# the integer-encoded sentence shifted left by one position.
decoder_target = [
    [tar_to_index[char] for char in sentence[1:]]
    for sentence in tqdm(lines.tar)
]
print(decoder_target[:5])

100%|██████████| 60000/60000 [00:00<00:00, 123541.14it/s]

[[3, 48, 53, 3, 4, 3, 2], [3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [3, 45, 53, 64, 73, 72, 14, 3, 2], [3, 29, 67, 73, 70, 71, 105, 4, 3, 2], [3, 29, 67, 73, 70, 57, 78, 105, 4, 3, 2]]





In [18]:
# Longest sentence (in characters) on each side.
max_src_len = max(map(len, lines.src))
max_tar_len = max(map(len, lines.tar))
print(max_src_len, max_tar_len, sep='\n')

24
76


In [19]:
# Pad every sequence to the maximum length of its side (padding='post').
# Both decoder arrays share the target-side maximum length.
encoder_input = pad_sequences(encoder_input, padding='post', maxlen=max_src_len)
decoder_input, decoder_target = [
    pad_sequences(sequences, padding='post', maxlen=max_tar_len)
    for sequences in (decoder_input, decoder_target)
]
(encoder_input.shape, decoder_input.shape, decoder_target.shape)

((60000, 24), (60000, 76), (60000, 76))

In [20]:
# One-hot encode the integer sequences.
# num_classes is passed explicitly: left unset, to_categorical infers the depth
# from the largest id present in each array. Tying the depth to the vocabulary
# sizes keeps the arrays' last dimension equal to src_vocab_size / tar_vocab_size
# no matter which ids happen to occur in a given array.
encoder_input = to_categorical(encoder_input, num_classes=src_vocab_size)
decoder_input = to_categorical(decoder_input, num_classes=tar_vocab_size)
decoder_target = to_categorical(decoder_target, num_classes=tar_vocab_size)
encoder_input.shape, decoder_input.shape, decoder_target.shape

((60000, 24, 79), (60000, 76, 106), (60000, 76, 106))

## Teacher forcing
- RNN의 모든 시점에서 이전 시점의 예측값 대신 실제값(정답)을 입력으로 주는 방법을 Teacher forcing이라고 한다.
- 이전 시점의 디코더 셀의 예측이 틀렸는데 이를 현재 시점의 디코더 셀의 입력으로 사용하면, 현재 시점의 디코더 셀의 예측도 잘못될 가능성이 높고, 이는 연쇄 작용으로 디코더 전체의 예측을 어렵게 할 수 있다.

## seq2seq 기계 번역기 훈련시키기

In [23]:
# Encoder: takes a variable-length sequence (time dimension None) of vectors
# of width src_vocab_size.
encoder_inputs = Input(shape=(None, src_vocab_size))
# return_state=True makes the LSTM call return its final hidden and cell
# states in addition to its output.
encoder_lstm = LSTM(units=256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
# Only the two final states are collected here; encoder_outputs is not used
# in this cell.
encoder_states = [state_h, state_c] # context vector

In [24]:
# Decoder: takes a variable-length sequence (time dimension None) of vectors
# of width tar_vocab_size.
decoder_inputs = Input(shape=(None, tar_vocab_size))
# return_sequences=True yields an output at every time step; return_state=True
# additionally returns the final states, which are discarded below.
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
# The decoder LSTM starts from the encoder's final hidden/cell states.
decoder_outputs, _, _= decoder_lstm(decoder_inputs, initial_state=encoder_states)
# Softmax layer applied to the decoder outputs, producing tar_vocab_size
# values per time step.
decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

In [25]:
# Training model: two inputs (encoder sequence, decoder sequence) and one
# output (the decoder's per-step softmax).
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Categorical cross-entropy is used as the loss, optimised with RMSprop.
model.compile(optimizer="rmsprop", loss="categorical_crossentropy")
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None, 79)]   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None, 106)]  0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 344064      input_2[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 256),  371712      input_3[0][0]                    
                                                                 lstm_1[0][1]          

In [None]:
# Train on the encoder/decoder input arrays against decoder_target for
# 50 epochs in batches of 64; validation_split=0.2 sets aside 20% of the
# samples for validation.
model.fit(
    x=[encoder_input, decoder_input], 
    y=decoder_target, 
    batch_size=64, 
    epochs=50, 
    validation_split=0.2
)

Epoch 1/50