In [1]:
import os
path = "C:/pytest/data/eng-kor/"
os.chdir(path)

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('eng-kor_small.txt', names = ['source', 'target'], sep = '\t', encoding = 'utf-8')

In [4]:
print(len(data), type(data), data.shape, data.sample(5), sep = '\n')

1000
<class 'pandas.core.frame.DataFrame'>
(1000, 2)
                source      target
99           Tom lied.   톰이 거짓말했어.
445      Please leave.    제발 떠나 줘.
785  Watch me closely.  나를 가까이서 봐.
120         Don't lie.   거짓말 하지 마.
87           Sit down!         앉아!


In [5]:
print(len(data.target), type(data.target), data.target.shape, data.target.sample(5), sep = '\n')

1000
<class 'pandas.core.series.Series'>
(1000,)
298       다시 확인해.
138         약속할게.
946       즉시 돌아와.
204      이거 확인해봐.
528    아무 말이나 해봐.
Name: target, dtype: object


In [6]:
# 시작 부호와 종료 부호 부착
# 데이터 3종 필요
# source 언어 : encoder_input 1개, target 언어 : decoder_input, decoder_target 2개
# decoder_input 데이터의 시작 : '\t' , 문장의 끝 : '\n'(음절 기반이므로)
# decoder_target데이터는 '\n'만 필요
data.target_input = data.target.apply(lambda x: '\t'+x+'\n')
data.target_target = data.target.apply(lambda x : x+'\n')
print(data.target_input, data.target_target, sep = '\n')

0                 \t가.\n
1                \t안녕.\n
2                \t뛰어!\n
3                \t뛰어.\n
4                \t누구?\n
             ...        
995     \t노래하는 거 좋아해요?\n
996      \t노래하는 거 좋아해?\n
997    \t고양이를 좋아하지 않아?\n
998      \t꿈은 이루어질 거야.\n
999     \t모두 그녀를 사랑한다.\n
Name: target, Length: 1000, dtype: object
0                 가.\n
1                안녕.\n
2                뛰어!\n
3                뛰어.\n
4                누구?\n
            ...       
995     노래하는 거 좋아해요?\n
996      노래하는 거 좋아해?\n
997    고양이를 좋아하지 않아?\n
998      꿈은 이루어질 거야.\n
999     모두 그녀를 사랑한다.\n
Name: target, Length: 1000, dtype: object


  
  import sys


In [7]:
print(len(data.target_input), type(data.target_input), data.target_input.shape, data.target_input.sample(5), sep = '\n')

1000
<class 'pandas.core.series.Series'>
(1000,)
931    \t난 심지어 여기 없었어.\n
427      \t계속 춤추고 있어봐.\n
510        \t난 겨울이 좋아.\n
993         \t요리 좋아해요?\n
785       \t나를 가까이서 봐.\n
Name: target, dtype: object


In [8]:
# padding 에 사용할 문장의 길이 : maxlen
max_src_len = data.source.apply(lambda x:len(x)).max()
print(max_src_len)

20


In [9]:
# target 문장의 최대 길이
# max_tar_len = data.target_input.apply(lambda x: len(x)).max()-2 # '\t','\n'의 길이 제외
max_tar_len = data.target_input.apply(lambda x: len(x)).max()
print(max_tar_len)

21


In [10]:
# Tokenizing
from keras.preprocessing.text import Tokenizer
# 음절 기반
tokenizer_source = Tokenizer(num_words= None, char_level= True, lower= False)
tokenizer_source.fit_on_texts(data.source)
word_index_source = tokenizer_source.word_index
print(len(word_index_source), word_index_source)

64 {' ': 1, 'e': 2, 'o': 3, '.': 4, 'a': 5, 't': 6, 'i': 7, 's': 8, 'n': 9, 'r': 10, 'l': 11, 'd': 12, 'm': 13, 'h': 14, 'y': 15, 'u': 16, 'T': 17, 'g': 18, 'I': 19, 'c': 20, 'p': 21, 'w': 22, 'k': 23, "'": 24, 'v': 25, 'b': 26, 'f': 27, '?': 28, 'S': 29, '!': 30, 'W': 31, 'H': 32, 'C': 33, 'D': 34, 'E': 35, 'K': 36, 'A': 37, 'G': 38, 'Y': 39, 'N': 40, 'x': 41, 'F': 42, 'B': 43, 'L': 44, 'M': 45, 'q': 46, ',': 47, 'P': 48, 'R': 49, 'O': 50, 'z': 51, 'J': 52, 'j': 53, 'Q': 54, '-': 55, '7': 56, ':': 57, '4': 58, '5': 59, 'U': 60, '2': 61, '0': 62, '1': 63, '3': 64}


In [11]:
# target Tokenizing
tokenizer_target = Tokenizer(num_words= None, char_level=True, lower=False)
# 시작, 종료 기호가 있는 target_input으로 가져옴
tokenizer_target.fit_on_texts(data.target_input)
word_index_target = tokenizer_target.word_index
print(len(word_index_target), word_index_target)

558 {' ': 1, '\t': 2, '\n': 3, '.': 4, '어': 5, '이': 6, '톰': 7, '해': 8, '아': 9, '은': 10, '다': 11, '그': 12, '가': 13, '는': 14, '?': 15, '나': 16, '거': 17, '!': 18, '고': 19, '하': 20, '을': 21, '했': 22, '요': 23, '있': 24, '야': 25, '지': 26, '었': 27, '사': 28, '난': 29, '말': 30, '들': 31, '기': 32, '게': 33, '리': 34, '를': 35, '니': 36, '도': 37, '우': 38, '좋': 39, '와': 40, '내': 41, '에': 42, '람': 43, '무': 44, '자': 45, '마': 46, '서': 47, '봐': 48, '한': 49, '계': 50, '안': 51, '네': 52, '시': 53, '속': 54, '너': 55, '수': 56, '모': 57, '만': 58, '짓': 59, '라': 60, '두': 61, '누': 62, '일': 63, '세': 64, '정': 65, '웃': 66, '로': 67, '않': 68, '줘': 69, '았': 70, '제': 71, '렸': 72, '걸': 73, '없': 74, '려': 75, '물': 76, '미': 77, '저': 78, '여': 79, '건': 80, '죽': 81, '으': 82, '워': 83, '조': 84, '주': 85, '린': 86, '신': 87, '것': 88, '의': 89, '져': 90, '심': 91, '좀': 92, '운': 93, '인': 94, '구': 95, '진': 96, '렇': 97, '날': 98, '입': 99, '래': 100, '울': 101, '전': 102, '빨': 103, '피': 104, '히': 105, '할': 106, '보': 107, '러': 108, '파': 109, '소': 110, '

In [12]:
# Data Sequencing
# 배당된 숫자를 이용하여 각 문장의 문자를 숫자로 치환한다.
# source 언어 Sequencing
encoder_input = tokenizer_source.texts_to_sequences(data.source)

print(data.source[0],encoder_input[0])
print(data.source[1],encoder_input[1])
print(data.source[2],encoder_input[2])
print(data.source[3],encoder_input[3])

Go. [38, 3, 4]
Hi. [32, 7, 4]
Run! [49, 16, 9, 30]
Run. [49, 16, 9, 4]


In [13]:
# target 언어 Sequencing
decoder_input = tokenizer_target.texts_to_sequences(data.target_input)
decoder_target = tokenizer_target.texts_to_sequences(data.target_target)
print(data.target_input[0], decoder_input[0])
print(data.target_input[1], decoder_input[1])
print(data.target_input[2], decoder_input[2])

print(data.target_target[0], decoder_target[0])
print(data.target_target[1], decoder_target[1])
print(data.target_target[2], decoder_target[2])

	가.
 [2, 13, 4, 3]
	안녕.
 [2, 51, 223, 4, 3]
	뛰어!
 [2, 272, 5, 18, 3]
가.
 [13, 4, 3]
안녕.
 [51, 223, 4, 3]
뛰어!
 [272, 5, 18, 3]


In [14]:
# Tokenizing의 결과는 리스트 타입
print(type(data.source), type(encoder_input), data.source, encoder_input, sep = '\n')

<class 'pandas.core.series.Series'>
<class 'list'>
0                       Go.
1                       Hi.
2                      Run!
3                      Run.
4                      Who?
               ...         
995    Do you like singing?
996    Do you like singing?
997    Don't you like cats?
998    Dreams do come true.
999    Everybody loves her.
Name: source, Length: 1000, dtype: object
[[38, 3, 4], [32, 7, 4], [49, 16, 9, 30], [49, 16, 9, 4], [31, 14, 3, 28], [31, 3, 22, 30], [42, 7, 10, 2, 30], [32, 2, 11, 21, 30], [52, 16, 13, 21, 30], [52, 16, 13, 21, 4], [31, 5, 7, 6, 30], [31, 5, 7, 6, 30], [31, 5, 7, 6, 4], [43, 2, 18, 7, 9, 4], [32, 2, 11, 11, 3, 30], [19, 1, 8, 2, 2, 4], [19, 1, 6, 10, 15, 4], [19, 1, 22, 3, 9, 30], [50, 14, 1, 9, 3, 30], [49, 2, 11, 5, 41, 4], [29, 14, 3, 3, 6, 30], [29, 13, 7, 11, 2, 4], [37, 6, 6, 5, 20, 23, 30], [37, 6, 6, 5, 20, 23, 30], [42, 10, 2, 2, 51, 2, 30], [38, 2, 6, 1, 16, 21, 4], [38, 3, 6, 1, 7, 6, 30], [32, 16, 18, 1, 13, 2, 4], [19

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
encoder_input = pad_sequences(encoder_input,maxlen = max_src_len, padding = 'post')
decoder_input = pad_sequences(decoder_input, maxlen = max_tar_len, padding = 'post')
decoder_target = pad_sequences(decoder_target, maxlen = max_tar_len, padding = 'post')
print(data.target_input[0],decoder_input[0])

	가.
 [ 2 13  4  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [16]:
print(len(decoder_input), type(decoder_input), decoder_input.shape)

1000 <class 'numpy.ndarray'> (1000, 21)


In [17]:
from tensorflow.keras.utils import to_categorical
# to_categorical() 함수는 차원을 하나 더 만들어 출력
encoder_input = to_categorical(encoder_input, num_classes=len(word_index_source)+1)
decoder_input = to_categorical(decoder_input, num_classes=len(word_index_target)+1)
decoder_target = to_categorical(decoder_target, num_classes=len(word_index_target)+1)

In [18]:
print(decoder_input.shape, data.target_input[0], decoder_input[0])

(1000, 21, 559) 	가.
 [[0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [19]:
print(len(decoder_input), type(decoder_input), decoder_input.shape)

1000 <class 'numpy.ndarray'> (1000, 21, 559)


In [20]:
from keras.models import Model
from keras import layers

In [21]:
# 훈련용 Encoder
# Encoder - Source
# 입력문의 길이는 문장마다 다르므로 None. OHE 출력 결과는 len(word_index_source)+1
encoder_inputs = layers.Input(shape = (None, len(word_index_source)+1))
# encoder 내부 상태(은닉상태, 셀상태)를 decoder로 넘겨주기 위해 return_state = True
encoder_lstm = layers.LSTM(256, return_state = True)

# return_state = True로 만들어진 모델이므로 은닉상태와 셀상태를 받음
# encoder_outputs는 받지만, 사용하지는 않는다.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

In [22]:
# Decoder - Input
decoder_inputs = layers.Input(shape = (None, len(word_index_target)+1))
# Decoder - Output
# 256 unit으로 된 encoder_states를 받아야 함
decoder_lstm = layers.LSTM(256, return_sequences = True, return_state = True)

# Decoder의 은닉 상태와 셀 상태는 훈련 과정에서 사용하지 않음
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = layers.Dense(len(word_index_target)+1, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
import tensorflow as tf

In [24]:
# epochs는 50회 정도 해야 함
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)
model.compile(optimizer='rmsprop', loss = 'categorical_crossentropy')
model.fit(x = [encoder_input, decoder_input], y = decoder_target,batch_size = 64,epochs = 200, validation_split = 0.2)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155

Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x21dcdf87dc8>

In [25]:
# 예측용 encoder
encoder_model = Model(inputs = encoder_inputs, outputs = encoder_states)

In [26]:
# 예측용 decoder
# hidden state와 cell state를 받아서 사용함
# Encoder Context Vector의 두 내부상태를 받기 위한 정의
decoder_state_input_h = layers.Input(shape= (256,))
decoder_state_input_c = layers.Input(shape= (256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state= decoder_states_inputs)
decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(inputs = [decoder_inputs] + decoder_states_inputs, outputs = [decoder_outputs] + decoder_states)

In [27]:
# word로부터 index를 얻은 것을 index로부터 word를 얻는 것으로 바꿈
index_to_src = dict((i,char) for char, i in word_index_source.items())
index_to_tar = dict((i,char) for char, i in word_index_target.items())
print(index_to_tar)

{1: ' ', 2: '\t', 3: '\n', 4: '.', 5: '어', 6: '이', 7: '톰', 8: '해', 9: '아', 10: '은', 11: '다', 12: '그', 13: '가', 14: '는', 15: '?', 16: '나', 17: '거', 18: '!', 19: '고', 20: '하', 21: '을', 22: '했', 23: '요', 24: '있', 25: '야', 26: '지', 27: '었', 28: '사', 29: '난', 30: '말', 31: '들', 32: '기', 33: '게', 34: '리', 35: '를', 36: '니', 37: '도', 38: '우', 39: '좋', 40: '와', 41: '내', 42: '에', 43: '람', 44: '무', 45: '자', 46: '마', 47: '서', 48: '봐', 49: '한', 50: '계', 51: '안', 52: '네', 53: '시', 54: '속', 55: '너', 56: '수', 57: '모', 58: '만', 59: '짓', 60: '라', 61: '두', 62: '누', 63: '일', 64: '세', 65: '정', 66: '웃', 67: '로', 68: '않', 69: '줘', 70: '았', 71: '제', 72: '렸', 73: '걸', 74: '없', 75: '려', 76: '물', 77: '미', 78: '저', 79: '여', 80: '건', 81: '죽', 82: '으', 83: '워', 84: '조', 85: '주', 86: '린', 87: '신', 88: '것', 89: '의', 90: '져', 91: '심', 92: '좀', 93: '운', 94: '인', 95: '구', 96: '진', 97: '렇', 98: '날', 99: '입', 100: '래', 101: '울', 102: '전', 103: '빨', 104: '피', 105: '히', 106: '할', 107: '보', 108: '러', 109: '파', 110: '소', 111: 

In [28]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1,len(word_index_target)+1))
    target_seq[0,0,word_index_target['\t']] = 1.
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        ouput_tokens, h, c = decoder_model.predict([target_seq]+ states_value)
        sampled_token_index = np.argmax(ouput_tokens)
        
        if(sampled_token_index ==0):
            sampled_token_index = 1
        
        sampled_char = index_to_tar[sampled_token_index]
        decoded_sentence += sampled_char
        
        if(sampled_char == '\n' or len(decoded_sentence)> max_tar_len):
            stop_condition = True
        
        target_seq = np.zeros((1,1,len(word_index_target)+1))
        target_seq[0,0,sampled_token_index] = 1.
        states_value = [h,c]
    return decoded_sentence

In [29]:
import numpy as np

In [30]:
for seq_index in [998,997,996]:
    input_seq = encoder_input[seq_index:seq_index+1]
    decoded_sentence = decode_sequence(input_seq)
    
    print(35 * '-')
    print('입력 문장:', data.source[seq_index])
    print('정답 문장:', data.target[seq_index][:len(data.target[seq_index])])
    print('번역기가 번역한 문장:', decoded_sentence[:len(decoded_sentence)-1])

-----------------------------------
입력 문장: Dreams do come true.
정답 문장: 꿈은 이루어질 거야.
번역기가 번역한 문장: 물 좀 마셔.
-----------------------------------
입력 문장: Don't you like cats?
정답 문장: 고양이를 좋아하지 않아?
번역기가 번역한 문장: 저희에게 거짓말 하지 마세요.
-----------------------------------
입력 문장: Do you like singing?
정답 문장: 노래하는 거 좋아해?
번역기가 번역한 문장: 생선 좋아해요?


In [33]:
text = ['Swing in a miss!']

In [34]:
# 전처리
input_seq1_tokenizing = tokenizer_source.texts_to_sequences([text])
# padding
input_seq1_padding = pad_sequences(input_seq1_tokenizing, maxlen= max_src_len, padding = 'post')
# OHE
input_seq1_1hot = to_categorical(input_seq1_padding, num_classes= len(word_index_source)+1)

# predict
decoded_sentence_1 = decode_sequence(input_seq1_1hot)
print(decoded_sentence_1)

안녕.

