In [1]:
import os
path = 'C:/pytest/data/kor-eng/'
os.chdir(path)

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
data = pd.read_csv('kor-eng.txt', names = ['source','target'],sep = '\t', encoding = 'utf-8')[:1000]

In [4]:
len(data)

1000

In [5]:
# 시작 부호와 종료 부호 부착
data.target_input = data.target.apply(lambda x: '\t'+x+'\n')
data.target_target = data.target.apply(lambda x: x+'\n')
data.target_input

  
  This is separate from the ipykernel package so we can avoid doing imports until


0       \tI go to the attic every evening to meet Bat.\n
1        \tSir, I don't understand this sentence here.\n
2      \tTime flies when you start using the computer.\n
3         \tI'm going back to Korea today at midnight.\n
4             \tI go to bathroom as soon as I wake up.\n
                             ...                        
995        \tIf you were mine, I will be really happy.\n
996    \tWe have lots in common because we are studen...
997    \tI cannot open it because I have no authority...
998           \tI think we are alike in personalities.\n
999    \tAnd if we have something to talk about, let'...
Name: target, Length: 1000, dtype: object

In [6]:
# padding 에 사용할 문장의 길이
# source
max_src_len = data.source.apply(lambda x:len(x)).max()
# target
max_tar_len = data.target.apply(lambda x: len(x)).max()

In [7]:
# Tokenizing
from keras.preprocessing.text import Tokenizer
tokenizer_source = Tokenizer(num_words= None, filters = '',lower= False)
tokenizer_source.fit_on_texts(data.source)
word_index_source = tokenizer_source.word_index

In [8]:
# target Tokenizing
tokenizer_target = Tokenizer(num_words= None,filters = '', lower=False)
tokenizer_target.fit_on_texts(data.target_input)
word_index_target = tokenizer_target.word_index

In [9]:
# data sequencing
encoder_input  = tokenizer_source.texts_to_sequences(data.source)

# target sequencing
decoder_input = tokenizer_target.texts_to_sequences(data.target_input)
decoder_target= tokenizer_target.texts_to_sequences(data.target_target)

In [10]:
# padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
encoder_input = pad_sequences(encoder_input, maxlen = max_src_len, padding = 'post')
decoder_input = pad_sequences(decoder_input, maxlen = max_tar_len, padding = 'post')
decoder_target = pad_sequences(decoder_target, maxlen = max_tar_len, padding = 'post')

In [12]:
from keras.models import Model
from keras import layers

In [13]:
# 훈련용 encoder
encoder_inputs = layers.Input(shape=(None, len(word_index_source)+1))
encoder_embedding = layers.Embedding()
encoder_lstm = layers.LSTM(256, return_state = True)

encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

In [14]:
# 훈련용 decoder
decoder_inputs = layers.Input(shape = (None, len(word_index_target)+1))
# decoder - output
decoder_lstm = layers.LSTM(256, return_sequences = True, return_state = True)

decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state= encoder_states)
decoder_dense = layers.Dense(len(word_index_target)+1, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [15]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy')
model.fit(x = [encoder_input, decoder_input], y = decoder_target, batch_size = 64, epochs = 100, validation_split = 0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x207c789e808>

In [16]:
# prediction encoder
encoder_model = Model(inputs = encoder_inputs, outputs = encoder_states)

In [17]:
# prediction decoder
decoder_state_input_h = layers.Input(shape = (256,))
decoder_state_input_c = layers.Input(shape = (256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state= decoder_states_inputs)
decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(inputs = [decoder_inputs]+ decoder_states_inputs, outputs = [decoder_outputs]+decoder_states)

In [18]:
index_to_src = dict((i,char) for char, i in word_index_source.items())
index_to_tar = dict((i,char) for char, i in word_index_target.items())

In [22]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq, verbose = 0)
    target_seq = np.zeros((1,1,len(word_index_target)+1))
    target_seq[0,0,word_index_target['\t']] = 1.
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq]+states_value, verbose = 0)
        sampled_token_index = np.argmax(output_tokens)
        if (sampled_token_index ==0):
            sampled_token_index = 1
        sampled_char = index_to_tar[sampled_token_index]
        decoded_sentence += sampled_char
        
        if(sampled_char == '\n' or len(decoded_sentence) > max_tar_len):
            stop_condition = True
        target_seq = np.zeros((1,1,len(word_index_target)+1))
        target_seq[0,0,sampled_token_index] = 1.
        states_value = [h,c]
    return decoded_sentence

In [23]:
for seq_index in [450,451,452]:
    input_seq = encoder_input[seq_index:seq_index+1]
    decoded_sentence = decode_sequence(input_seq)
    
    print(35*'-')
    print('입력 문장 : ', data.source[seq_index])
    print('정답 문장 : ', data.target[seq_index][:len(data.target[seq_index])])
    print('번역기가 번역한 문장 : ',decoded_sentence[:len(decoded_sentence)-1])

-----------------------------------
입력 문장 :  난 오늘 밥을 먹고 공원에 갔어.
정답 문장 :  I went to the park after eating today.
번역기가 번역한 문장 :  I went to the beater thas will we hove in the  oou.
-----------------------------------
입력 문장 :  즐거운 마음을 가지고 학교에 갔어.
정답 문장 :  I went to the school with pleasure.
번역기가 번역한 문장 :  I went to the sippork with my friends to the monning.
-----------------------------------
입력 문장 :  오늘은 쇼핑을 하러 동대문을 갔어.
정답 문장 :  I went to Dongdaemun to do some shopping.
번역기가 번역한 문장 :  I went to the sippork with my friends to the monning.
