## sequence-to-sequence learning in keras / 공백 제거 후 학습

In [33]:
from __future__ import print_function

from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense
from keras.utils.vis_utils import plot_model
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG

import numpy as np

In [34]:
batch_size = 64  # Batch size for training
epochs = 100  # Number of epochs to train for
latent_dim = 256  # Latent dimensionality of the encoding space.

input_data_path = 'data/aion_train_v.txt'
target_data_path = "data/aion_train_t.txt"

input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

with open(input_data_path, "r", encoding="utf-8") as f:
    reader = f.readlines()
    for i in reader:
        i = i.replace(" ", "")
        input_texts.append(i)
        for char in i:
            if char not in input_characters:
                input_characters.add(char)
            
with open(target_data_path, "r", encoding="utf-8") as f:
    reader = f.readlines()
    for i in reader:
        # '\t'  = start sequence
        # '\n' = end sequence
        i = i.replace(" ", "")
        target_text = '\t' + i
        target_texts.append(target_text)
        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)

In [35]:
print(input_texts)

['감정표현동작하자.\n', '에레슈키갈의사도변신을시작해줘.\n', '정신큐빅을시작하소서.\n', '황금깡통변신을하게나.\n', '최근퀘스트빨리하지.\n', '카이시넬변신을빨리합시다.\n', '감정표현동작하여라.\n', '루나파견대변신을빨리하게.\n', '광대변신을빨리하여라.\n', '징표동작지금좀.\n', '웨다변신을빨리해주세요.\n', '작은투명지도설정을지금하게나.\n', '감정표현동작빨리하게나.\n', '실험체변신지금합시다.\n', '루나파견대변신을지금하지.\n', '루나파견대변신을지금좀.\n', '승리의비룡변신을빨리하라.\n', '그렌달변신을하겠니.\n', '포메라니안변신하라.\n', '이성큐빅을시작하라고.\n', '슈퍼슈고변신시작하시오.\n', '초강력팬더변신을지금하라니까.\n', '챈가룽야영지거점이동빨리하라니까.\n', '레파르혁명단변신을시작해주세요.\n', '챈가룽야영지거점이동빨리하자.\n', '타하바타변신을지금해라.\n', '스켈레톤변신지금하자.\n', '지혜큐빅시작하자.\n', '웨다변신을시작해라.\n', '깡통변신지금하게나.\n', '작은투명지도설정하세요.\n', '비룡변신을시작하라.\n', '불량토끼변신하세요.\n', '이성큐빅을빨리합시다.\n', '몽실이변신을하렴.\n', '맹세큐빅하여라.\n', '일반채팅지금해줘.\n', '티아마트의사도변신지금해라.\n', '침묵큐빅지금좀.\n', '몽실이변신을지금하여라.\n', '헤라나스변신을하라고.\n', '인테르디카요새거점이동을빨리합시다.\n', '옐로우펭귄변신지금하게.\n', '테그라크변신지금하거라.\n', '명예큐빅시작하여라.\n', '광기큐빅을합시다.\n', '타르하크랄변신을지금하자.\n', '슈퍼슈고변신을시작하시오.\n', '광기큐빅을빨리하여라.\n', '인테르디카결계탑거점이동을빨리하라고.\n', '타하바타변신을빨리하소서.\n', '아그우드변신을하거라.\n', '맹세큐빅시작하세.\n', '토벌대전진기지거점이동을지금해.\n', '불량토끼변신을지금하렴.\n', '키샤르감시기지거점이동을빨리하시오.

In [36]:
print(target_texts)

['\tmotion(감정표현).\n', '\ttransform(에레슈키갈의사도).\n', '\tcubic(정신).\n', '\ttransform(황금깡통).\n', '\tquest(최근).\n', '\ttransform(카이시넬).\n', '\tmotion(감정표현).\n', '\ttransform(루나파견대).\n', '\ttransform(광대).\n', '\tmotion(징표).\n', '\ttransform(웨다).\n', '\tmap(작은투명).\n', '\tmotion(감정표현).\n', '\ttransform(실험체).\n', '\ttransform(루나파견대).\n', '\ttransform(루나파견대).\n', '\ttransform(승리의비룡).\n', '\ttransform(그렌달).\n', '\ttransform(포메라니안).\n', '\tcubic(이성).\n', '\ttransform(슈퍼슈고).\n', '\ttransform(초강력팬더).\n', '\tteleport(챈가룽야영지).\n', '\ttransform(레파르혁명단).\n', '\tteleport(챈가룽야영지).\n', '\ttransform(타하바타).\n', '\ttransform(스켈레톤).\n', '\tcubic(지혜).\n', '\ttransform(웨다).\n', '\ttransform(깡통).\n', '\tmap(작은투명).\n', '\ttransform(비룡).\n', '\ttransform(불량토끼).\n', '\tcubic(이성).\n', '\ttransform(몽실이).\n', '\tcubic(맹세).\n', '\tchat(일반).\n', '\ttransform(티아마트의사도).\n', '\tcubic(침묵).\n', '\ttransform(몽실이).\n', '\ttransform(헤라나스).\n', '\tteleport(인테르디카요새).\n', '\ttransform(옐로우펭귄).\n', '\ttransform(테그라크).\n', '\tcubic(명예)

In [37]:
input_characters = sorted(list(input_characters)) ## character를 순서대로 정리 ㄱ~ㅎ
target_characters = sorted(list(target_characters))  ## character를 순서대로 정리 a~z ㄱ~ㅎ
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])  ## input texts에 있는 문장 길이를 순서대로 다른 리스트에 등록하고 최대 찾기
max_decoder_seq_length = max([len(txt) for txt in target_texts])

In [38]:
print(input_characters)

['\n', '.', '가', '각', '갈', '감', '강', '거', '검', '게', '겠', '견', '결', '계', '고', '곰', '관', '광', '귄', '그', '근', '금', '기', '까', '깡', '끼', '나', '냥', '네', '넬', '능', '니', '다', '단', '달', '대', '더', '데', '도', '동', '둠', '드', '디', '라', '랄', '랑', '랙', '량', '레', '렌', '려', '력', '렴', '로', '룡', '루', '룬', '룽', '르', '를', '리', '린', '마', '막', '만', '망', '맹', '메', '명', '모', '몽', '무', '묵', '민', '바', '반', '백', '벌', '베', '벨', '변', '보', '복', '불', '브', '블', '비', '빅', '빛', '빨', '사', '상', '새', '생', '샤', '서', '설', '성', '세', '셀', '소', '수', '쉬', '슈', '스', '승', '시', '신', '실', '심', '십', '아', '악', '안', '야', '양', '어', '에', '여', '역', '영', '예', '옐', '오', '와', '요', '용', '우', '웨', '유', '은', '을', '의', '이', '인', '일', '자', '작', '적', '전', '점', '정', '제', '좀', '주', '줘', '지', '진', '집', '징', '채', '챈', '천', '체', '초', '최', '추', '축', '치', '침', '카', '칸', '켈', '쿠', '퀘', '큐', '크', '키', '타', '탑', '테', '토', '톤', '통', '투', '트', '티', '팅', '파', '팬', '퍼', '펭', '포', '표', '프', '플', '픽', '핑', '하', '한', '합', '해', '험', '헤', '혁', '현', '혜', '호', '황']


In [39]:
print(target_characters)

['\t', '\n', '(', ')', '.', 'a', 'b', 'c', 'e', 'f', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', '가', '각', '갈', '감', '강', '검', '견', '결', '계', '고', '곰', '관', '광', '귄', '그', '근', '금', '기', '깡', '끼', '나', '냥', '네', '넬', '능', '니', '다', '단', '달', '대', '더', '데', '도', '동', '둠', '드', '디', '라', '랄', '랑', '랙', '량', '레', '렌', '력', '로', '룡', '루', '룬', '룽', '르', '리', '린', '마', '막', '만', '망', '맹', '메', '명', '모', '몽', '묵', '민', '바', '반', '백', '벌', '베', '벨', '보', '복', '불', '브', '블', '비', '빛', '사', '상', '새', '생', '샤', '성', '세', '셀', '수', '쉬', '슈', '스', '승', '시', '신', '실', '심', '아', '악', '안', '야', '양', '어', '에', '여', '역', '영', '예', '옐', '오', '와', '요', '용', '우', '웨', '유', '은', '을', '의', '이', '인', '일', '작', '적', '전', '정', '제', '지', '진', '집', '징', '채', '챈', '천', '체', '초', '최', '추', '축', '치', '침', '카', '칸', '켈', '쿠', '크', '키', '타', '탑', '테', '토', '톤', '통', '투', '트', '티', '파', '팬', '퍼', '펭', '포', '표', '프', '플', '픽', '핑', '하', '한', '해', '험', '헤', '혁', '현', '혜', '호', '황']


In [40]:
print('Number of samples:', len(input_texts))
print('Number of unique input characters:', num_encoder_tokens)
print('Number of unique output characters:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 15182
Number of unique input characters: 199
Number of unique output characters: 195
Max sequence length for inputs: 22
Max sequence length for outputs: 22


In [41]:
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

In [42]:
## one hot vector 생성 - [0,1,...0,...0,0]
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

In [43]:
encoder_inputs = Input(shape=(None, num_encoder_tokens)) ##[문장길이, one-hot vector길이]
encoder = LSTM(latent_dim, return_state=True) ## latent_dim == size
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [44]:
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) ## return_sequences -> output이 다음 state의 input으로 들어감
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

## 모델 저장시 Warning 발생하지만 무시. // keras 버그 인듯

In [45]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)
# Save model
model.save('model_save/aion_slu_lstm.h5')

Train on 12145 samples, validate on 3037 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100


Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


  '. They will not be included '


In [46]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                     [decoder_outputs] + decoder_states)

reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

In [47]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == "\n" or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

## Training Data

In [48]:
for seq_index in range(10):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: 감정표현동작하자.

Decoded sentence: motion(감정표현).

-
Input sentence: 에레슈키갈의사도변신을시작해줘.

Decoded sentence: transform(에레슈키갈의사도).

-
Input sentence: 정신큐빅을시작하소서.

Decoded sentence: cubic(정신).

-
Input sentence: 황금깡통변신을하게나.

Decoded sentence: transform(황금깡통).

-
Input sentence: 최근퀘스트빨리하지.

Decoded sentence: quest(최근).

-
Input sentence: 카이시넬변신을빨리합시다.

Decoded sentence: transform(카이시넬).

-
Input sentence: 감정표현동작하여라.

Decoded sentence: motion(감정표현).

-
Input sentence: 루나파견대변신을빨리하게.

Decoded sentence: transform(루나파견대).

-
Input sentence: 광대변신을빨리하여라.

Decoded sentence: transform(광대).

-
Input sentence: 징표동작지금좀.

Decoded sentence: motion(징표).



## Test Data

In [27]:
test_input_texts = []

with open("data/aion_test_v.txt", "r", encoding='utf=8') as f:
    reader =f.readlines()
    for i in reader:
        test_input_texts.append(i)

test_encoder_input_data = np.zeros((len(test_input_texts), max_encoder_seq_length, num_encoder_tokens), dtype='float32')

for i, test_input_text in enumerate(test_input_texts):
    for t, char in enumerate(test_input_text):
        test_encoder_input_data[i, t, input_token_index[char]] = 1.
        
for seq_index in range(28, 30):
    input_seq = test_encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', test_input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: 레파르 혁명단 변신 시작 하렴.

Decoded sentence: transform(레파르 혁명단).
-
Input sentence: 레드 펭귄 변신를 지금 하십시오.

Decoded sentence: transform(레드 펭귄).
