In [None]:
import pandas as pd
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
from models import Seq2Seq

In [None]:
# Tab-separated corpus read into three named columns. 'src' holds the English
# text and 'tar' the French text (see the tokenizers fitted on them below);
# the third column, 'lic', is not used and is dropped straight away.
lines = (
    pd.read_csv('fra-eng/fra.txt', names=['src', 'tar', 'lic'], sep='\t')
    .drop(columns=['lic'])
)

In [None]:
# Keep only the first 10000 sentence pairs.
lines = lines.head(10000)

In [None]:
# Frame every French target with '\t' (start marker) and '\n' (end marker).
# Rows that already carry both markers are left alone, so re-running this cell
# does not stack extra markers onto the text.
lines['tar'] = lines['tar'].apply(
    lambda text: text
    if text.startswith('\t') and text.endswith('\n')
    else '\t' + text + '\n'
)

In [None]:
def _fit_char_tokenizer(texts):
    """Build a character-level Keras Tokenizer (num_words=100) and fit it on `texts`."""
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=100, char_level=True)
    tokenizer.fit_on_texts(texts)
    return tokenizer


# One tokenizer per language: English on the source column, French on the target column.
tokenizer_eng = _fit_char_tokenizer(lines.src)
tokenizer_fra = _fit_char_tokenizer(lines.tar)

# character -> integer id lookups, reused later to build the reverse maps.
word_index_eng = tokenizer_eng.word_index
word_index_fra = tokenizer_fra.word_index

# Encoder input, decoder input, and decoder target. The target is the decoder
# input with its first character removed, i.e. shifted one step to the left.
seq_eng = tokenizer_eng.texts_to_sequences(lines.src)
seq_fra = tokenizer_fra.texts_to_sequences(lines.tar)
seq_fra_tar = tokenizer_fra.texts_to_sequences(lines.tar.apply(lambda text: text[1:]))

In [None]:
# Longest tokenised sequence per language; every sample is post-padded up to it.
max_seq_eng_len = max(len(seq) for seq in seq_eng)
max_seq_fra_len = max(len(seq) for seq in seq_fra)

padded_eng = pad_sequences(seq_eng, maxlen=max_seq_eng_len, padding='post')
padded_fra = pad_sequences(seq_fra, maxlen=max_seq_fra_len, padding='post')
padded_fra_tar = pad_sequences(seq_fra_tar, maxlen=max_seq_fra_len, padding='post')

# Pin ONE one-hot depth for both French arrays. Left to itself, to_categorical
# infers the depth from the largest id present in each array separately, so the
# decoder input and the decoder target could come out with different widths
# (the target never contains the leading start marker). The shared depth equals
# what dataset_fra would have been given on its own.
num_fra_classes = int(max(padded_fra.max(), padded_fra_tar.max())) + 1

dataset_eng = to_categorical(padded_eng)
dataset_fra = to_categorical(padded_fra, num_classes=num_fra_classes)
dataset_fra_tar = to_categorical(padded_fra_tar, num_classes=num_fra_classes)

In [None]:
# Seq2Seq comes from the local `models` module. Its two positional arguments are
# the sizes of axis 2 of the English and French one-hot arrays.
eng_depth = dataset_eng.shape[2]
fra_depth = dataset_fra.shape[2]
model = Seq2Seq(eng_depth, fra_depth, state_dim=256)

In [None]:
# Reverse lookups (integer id -> character) for turning model output back into text.
index_to_eng = {idx: char for char, idx in word_index_eng.items()}
index_to_fra = {idx: char for char, idx in word_index_fra.items()}

def sampling(model, x):
    """Greedily decode a French string for one one-hot encoded English input.

    The encoder state seeds the decoder, which is fed the '\\t' start marker and
    then, step by step, its own most likely character from the previous step.

    Args:
        model: object exposing ``encode(x)`` (returns a pair whose second item
            holds two state tensors) and ``decode(dec_in, h, c)`` (returns the
            prediction plus the two updated state tensors).
        x: encoder input for a single sample, passed to ``model.encode`` as is.

    Returns:
        The decoded text without start/end markers. Decoding stops at the
        '\\n' end marker, at a predicted id that has no character (for example
        the padding id 0), or after ``max_seq_fra_len - 2`` characters.

    Relies on the notebook globals ``dataset_fra``, ``word_index_fra``,
    ``index_to_fra`` and ``max_seq_fra_len``.
    """
    vocab_size = dataset_fra.shape[2]
    # max_seq_fra_len is measured on targets that include the '\t' and '\n'
    # markers, hence the -2 for the longest possible payload.
    max_chars = max_seq_fra_len - 2

    _, context_state = model.encode(x)
    state_h, state_c = context_state[0], context_state[1]

    dec_in = np.zeros((1, 1, vocab_size))
    dec_in[0, 0, word_index_fra['\t']] = 1.

    decoded = []
    # Bounded loop: a `==` check on the length can be skipped past, `<` cannot.
    while len(decoded) < max_chars:
        y_pred, state_h, state_c = model.decode(dec_in, state_h, state_c)
        target_word_idx = int(np.argmax(y_pred[0, 0, :]))

        # index_to_fra is built from the tokenizer's word_index, which has no
        # entry for id 0 (the padding value). The padded training targets teach
        # the model to emit 0, so treat any unmapped id as end-of-sequence
        # instead of raising KeyError.
        char = index_to_fra.get(target_word_idx)
        if char is None or char == '\n':
            break
        decoded.append(char)

        # Feed the chosen character back in as the next decoder input.
        dec_in = np.zeros((1, 1, vocab_size))
        dec_in[0, 0, target_word_idx] = 1.

    return "".join(decoded)

In [None]:
def compute_loss(model, x, y, y_true):
    """Run encoder then decoder and score the decoder output against ``y_true``.

    Args:
        model: object exposing ``encode`` and ``decode`` as used below.
        x: encoder input batch.
        y: decoder input batch.
        y_true: one-hot targets compared with the decoder prediction.

    Returns:
        The tensor produced by ``tf.losses.categorical_crossentropy``; no
        further reduction is applied here.
    """
    # NOTE(review): categorical_crossentropy is called with its defaults, so the
    # decoder output is presumably already a probability distribution - confirm
    # against models.Seq2Seq.
    _, enc_state = model.encode(x)
    y_pred, _dec_h, _dec_c = model.decode(y, enc_state[0], enc_state[1])
    return tf.losses.categorical_crossentropy(y_true, y_pred)

@tf.function
def train_step(model, x, y, y_true, optimizer):
    """Apply one optimizer update to the encoder and decoder variables.

    Args:
        model: object exposing ``encoder`` and ``decoder`` sub-models.
        x: encoder input batch.
        y: decoder input batch.
        y_true: one-hot decoder targets.
        optimizer: optimizer whose ``apply_gradients`` performs the update.
    """
    with tf.GradientTape() as tape:
        loss = compute_loss(model, x, y, y_true)

    # Collected only after the forward pass, in case the variables are created
    # lazily on first call.
    trainable = model.encoder.trainable_variables + model.decoder.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

In [None]:
batch_size = 32

# Slice the three aligned arrays (encoder input, decoder input, decoder target)
# sample by sample, then group the samples into batches.
_aligned_arrays = (dataset_eng, dataset_fra, dataset_fra_tar)
train_dataset = tf.data.Dataset.from_tensor_slices(_aligned_arrays)
train_dataset = train_dataset.batch(batch_size)

In [None]:
# Take the first sample of the first batch as a fixed probe for test_sample.
# The [:1] slices keep the leading batch axis.
eng, fra, fra_tar = next(iter(train_dataset.take(1)))
test_eng, test_fra, test_fra_tar = eng[:1], fra[:1], fra_tar[:1]

In [None]:
# RMSprop built with its default arguments; handed to train_step for every batch update.
optimizer = tf.keras.optimizers.RMSprop()

In [None]:
def test_sample(epoch, test_eng, test_fra):
    """Print the probe English sentence next to the model's current translation.

    Args:
        epoch: value printed as the line prefix.
        test_eng: one-hot English input; only its first sample is decoded back
            to text, and the whole array is passed to ``sampling``.
        test_fra: accepted for call compatibility; not used in the body.

    Uses the notebook globals ``model``, ``index_to_eng`` and ``sampling``.
    """
    char_ids = np.argmax(test_eng[0, :], axis=1)

    # Rebuild the English text, stopping at the first padding id (0).
    chars = []
    for char_id in char_ids:
        if char_id == 0:
            break
        chars.append(index_to_eng[char_id])
    eng_sentence = "".join(chars)

    fra_sentence = sampling(model, test_eng)

    print("{}. {} : {}".format(epoch, eng_sentence, fra_sentence))

In [None]:
# range(1, 2) yields a single epoch, numbered 1. After each epoch the probe
# sentence is translated and printed.
for epoch in range(1, 2):
    for eng_batch, fra_in_batch, fra_tar_batch in train_dataset:
        train_step(model, eng_batch, fra_in_batch, fra_tar_batch, optimizer)
    test_sample(epoch, test_eng, test_fra)

In [None]:
# model.save_weights('./checkpoints/0610')