In [None]:
import numpy as np
import os

# My own imports
from helper import *
from Classes.Seq2Seq import Seq2Seq
from Classes.TokenizerWrap import TokenizerWrap

# Keras imports
from keras.optimizers import RMSprop
from keras.callbacks import *

# Get necessary parameters
from Config import *

In [None]:
class TestCallback(Callback):
    """Keras callback that prints a few sample predictions after every epoch.

    At the end of each epoch a random subset of the (source, destination)
    sentence pairs is fed through ``s2s.predict_greedy`` and the result is
    handed to ``helper.print_prediction`` together with the source and the
    reference sentence.

    Note:
        ``on_epoch_end`` reads the notebook-level ``s2s`` object, so that
        name must exist by the time training starts.

    Args:
        src: Sized sequence of source sentences.
        dest: Sequence of reference sentences, aligned index-by-index
            with ``src``.
        num_predictions: Maximum number of pairs to show per epoch.
            Capped at ``len(src)``.
    """

    def __init__(self, src, dest, num_predictions=5):
        super(TestCallback, self).__init__()
        self.src = src
        self.dest = dest
        # Honour the caller's value (the old code always fell back to a
        # hard-coded 5) and never ask for more samples than there are
        # source sentences; negative requests are treated as zero.
        self.num_predictions = max(0, min(num_predictions, len(src)))

    def on_epoch_end(self, epoch, logs=None):
        """Print predictions for a random sample of the stored pairs.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict supplied by Keras (unused).
        """
        # Imported locally so the callback does not depend on ``random``
        # leaking into the namespace through one of the star imports.
        import random

        print('\n')
        pairs = list(zip(self.src, self.dest))

        # zip() stops at the shorter input, so cap the sample size by the
        # number of pairs actually available; random.sample raises
        # ValueError when asked for more items than the population holds.
        sample_size = min(self.num_predictions, len(pairs))

        for src_seq, dest_seq in random.sample(pairs, k=sample_size):
            prediction = s2s.predict_greedy(str(src_seq))
            helper.print_prediction(src_seq, prediction, dest_seq)
        print('\n')

try:
    # Prefer the validation files referenced by the configuration.
    valid_src = collect.get_data_nmt_dataset(VALIDATION_SRC)
    valid_dest = collect.get_data_nmt_dataset(VALIDATION_DEST)
except FileNotFoundError:
    print("No validation set in your directory...")

    # Validation files are missing: fall back to two hand-written
    # sentence pairs (source sentences first, references second).
    valid_src, valid_dest = (
        ["thank you very much", "i like her"],
        ["cảm ơn rất nhiều", "tôi thích cô ấy"],
    )

In [None]:
# Load both sides of the corpus; the destination loader is additionally
# given the start/end marker words.
data_src = collect.get_data_nmt_dataset(DATASET_SRC)
data_dest = collect.get_data_nmt_dataset(DATASET_DEST, start=START_WORD, end=END_WORD)

# Source side: pre-padded and reversed.
tokenizer_src = TokenizerWrap(
    texts=data_src,
    padding='pre',
    reverse=True,
    num_words=NUM_WORDS,
    max_tokens=SEQ_LEN,
)
# Destination side: post-padded, original order.
tokenizer_dest = TokenizerWrap(
    texts=data_dest,
    padding='post',
    reverse=False,
    num_words=NUM_WORDS,
    max_tokens=SEQ_LEN,
)

# Training data: the padded tokens, available under both the
# encoder/decoder names and the x/y names.
x_train = encoder_input_data = tokenizer_src.tokens_padded
y_train = decoder_output_data = tokenizer_dest.tokens_padded

print("Input shape: {}".format(x_train.shape))
print("Output shape: {}".format(y_train.shape))

# Create, build and compile the sequence-to-sequence model.
s2s = Seq2Seq(tokenizer_src, tokenizer_dest, START_WORD, END_WORD)
s2s.build(NUM_WORDS, EMBEDDING_SIZE, STATE_SIZE, LAYERS, DROPOUT_RATE)
s2s.compile('rmsprop')

In [None]:
# Inference smoke check on a couple of hand-written examples.
x_dummy = ["thank you very much", "i like her"]
y_dummy = ["cảm ơn rất nhiều", "tôi thích cô ấy"]

# Pair every input sentence with its reference before looping.
dummy_pairs = list(zip(x_dummy, y_dummy))
for input_text, actual_text in dummy_pairs:
    # Greedy prediction for the input, printed alongside the reference.
    predict_text = s2s.predict_greedy(str(input_text))
    helper.print_prediction(input_text, predict_text, actual_text)
print('\n')