In [3]:
import collections

import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import keras

from tensorflow.python.client import device_lib

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

from sklearn.model_selection import train_test_split

In [None]:
#PREPROCESSING FUNCTIONS

def tokenize(x):
    """
    Fit a new Keras Tokenizer on x and encode x with it.

    x: List of sentences/strings to be tokenized
    Returns a Tuple of (x encoded by the tokenizer, the fitted tokenizer)
    """
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(x)
    encoded = text_tokenizer.texts_to_sequences(x)
    return encoded, text_tokenizer


def pad(x, length=None):
    """
    Post-pad a List of sequences with pad_sequences.

    x: List of sequences.
    length: Target length forwarded as maxlen. Defaults to None, in which case
            pad_sequences pads every sequence to the longest one in x.
    Returns Padded numpy array of sequences
    """
    return pad_sequences(x, maxlen=length, padding='post')


def preprocess(x, y):
    """
    Preprocess x: Feature List and y: Label List
    Applying tokenize() and pad()

    Each side is tokenized with its own tokenizer and post-padded to the length
    of its own longest sequence.
    Returns a Tuple of (padded x, padded y with a trailing axis of size 1,
    x tokenizer, y tokenizer)
    """
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    # pad() takes the target length as its second argument; passing None
    # explicitly lets pad_sequences pad to the longest sequence on each side.
    preprocess_x = pad(preprocess_x, None)
    preprocess_y = pad(preprocess_y, None)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk


def logits_to_text(logits, tokenizer):
    """
    Decode network output into a space-separated string of words.

    logits: Logits from a neural network; the argmax along axis 1 selects
            the word id for each row
    tokenizer: Keras Tokenizer fit on the labels
    Returns a String that represents the text of the logits
    """
    # Invert word -> id into id -> word, then reserve id 0 for padding.
    words_by_index = dict((index, word) for word, index in tokenizer.word_index.items())
    words_by_index[0] = '<PAD>'

    decoded_words = []
    for best_index in np.argmax(logits, 1):
        decoded_words.append(words_by_index[best_index])
    return ' '.join(decoded_words)

In [None]:
# MODEL FUNCTION

def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and compile (but do not train) a model that incorporates embedding,
    encoder-decoder, and bidirectional RNN layers.

    input_shape: Tuple of input shape; input_shape[1] is used as the input sequence length
    output_sequence_length: Length of output sequence
    english_vocab_size: Passed to the Embedding layer as input_dim
    french_vocab_size: Number of units of the final softmax layer
    Returns the compiled Keras model
    """
    model = Sequential()

    # input_length is the number of time steps per sample (not the vocabulary
    # size), so it must agree with input_shape[1:].
    model.add(
        Embedding(input_dim=english_vocab_size,
                  output_dim=256,
                  input_length=input_shape[1],
                  input_shape=input_shape[1:]))

    # Encoder: without return_sequences the layer emits a single vector per sample.
    # NOTE(review): the encoder's unit count is output_sequence_length, which
    # looks accidental (a sequence length used as a layer width) - confirm.
    model.add(Bidirectional(LSTM(output_sequence_length)))
    # Add repeatvector to fix vectors shape: one copy of the encoding per output step
    model.add(RepeatVector(output_sequence_length))
    model.add(Dropout(0.5))

    # Decoder: one output per time step, then a per-step classifier over the target vocabulary
    model.add(Bidirectional(LSTM(512, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(Dense(french_vocab_size, activation='softmax'))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(0.002),
                  metrics=['accuracy'])

    return model


def final_predictions(x, y, x_tk, y_tk, batch_size=1024, epochs=20, validation_split=0.2):
    """
    Trains the final model on (x, y) and prints two sample translations.

    x: Preprocessed English data
    y: Preprocessed French data
    x_tk: English tokenizer
    y_tk: French tokenizer
    batch_size: Batch size forwarded to model.fit
    epochs: Number of epochs forwarded to model.fit
    validation_split: Fraction of the data forwarded to model.fit as validation_split
    Returns None; the samples are printed
    """
    model = model_final(x.shape, y.shape[1], len(x_tk.word_index)+1, len(y_tk.word_index)+1)

    model.fit(x, y, batch_size=batch_size, epochs=epochs, validation_split=validation_split)

    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    # Encode the sample through the tokenizer itself rather than indexing
    # word_index by hand: the text gets the same normalisation as the training
    # data, and a word missing from the vocabulary is skipped instead of
    # raising KeyError.
    sentence = 'he saw an old yellow truck'
    sentence = x_tk.texts_to_sequences([sentence])
    sentence = pad_sequences(sentence, maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(step)] for step in predictions[1]]))
    # Reference for sample 2: np.max pulls the id out of each label entry of y[0]
    print(' '.join([y_id_to_word[np.max(label)] for label in y[0]]))

In [None]:
# Load input data
# NOTE(review): unfinished placeholder - the right-hand side is missing, so this
# cell is a SyntaxError until a value is filled in. Later cells call .split() on
# every element and pass the whole collection to preprocess(), so this should be
# a list of sentence strings.
english_sentences = 
# Load output data
# NOTE(review): same unfinished placeholder as the assignment before it.
# Presumably french_sentences[i] is the translation of english_sentences[i] - confirm.
french_sentences =

In [None]:
# Word-frequency tables over the raw sentences, split on whitespace only.
english_words_counter = collections.Counter()
english_words_counter.update(
    word for sentence in english_sentences for word in sentence.split())

french_words_counter = collections.Counter()
french_words_counter.update(
    word for sentence in french_sentences for word in sentence.split())

In [None]:
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(english_sentences, french_sentences)

# Axis 1 of each preprocessed array is read as the sequence length.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]

# Sizes of the tokenizers' word_index mappings. Note that the model-building
# calls in this notebook pass len(word_index)+1 instead of these values.
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

In [None]:
# Train on the full preprocessed corpus and print the two sample translations.
final_predictions(
    x=preproc_english_sentences,
    y=preproc_french_sentences,
    x_tk=english_tokenizer,
    y_tk=french_tokenizer,
)

In [None]:
## Train / test split
### The test set has to be encoded with the tokenizers fitted on the training
### set and padded to the training lengths; fitting separate tokenizers on the
### test sentences yields different word ids and array shapes than the model
### was trained on.

# Split the raw sentence pairs; 20% is held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    english_sentences, french_sentences, test_size=0.2, random_state=42)

x, y, x_tk, y_tk = preprocess(X_train, y_train)

# Reuse the training tokenizers for the test data (names kept for later cells).
test_english_tokenizer, test_french_tokenizer = x_tk, y_tk

# Pad to the training sequence lengths so the arrays match the model's input
# and output shapes; pad_sequences truncates any longer test sentence.
test_english_sentences = pad(test_english_tokenizer.texts_to_sequences(X_test), x.shape[1])
test_french_sentences = pad(test_french_tokenizer.texts_to_sequences(y_test), y.shape[1])
# Same trailing axis of size 1 that preprocess() adds to the training labels.
test_french_sentences = test_french_sentences.reshape(*test_french_sentences.shape, 1)

model = model_final(x.shape, y.shape[1], len(x_tk.word_index)+1, len(y_tk.word_index)+1)

model.fit(x, y, batch_size=1024, epochs=20)

y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
y_id_to_word[0] = '<PAD>'

# Encode the sample through the tokenizer so a word that did not land in the
# training split is skipped instead of raising KeyError.
sentence = 'he saw an old yellow truck'
sentence = x_tk.texts_to_sequences([sentence])
sentence = pad_sequences(sentence, maxlen=x.shape[-1], padding='post')
sentences = np.array([sentence[0], x[0]])
predictions = model.predict(sentences, len(sentences))

In [None]:
# Evaluate the model from the previous cell on the held-out arrays; score is
# whatever model.evaluate returns for the compiled loss and metrics.
score = model.evaluate(test_english_sentences, test_french_sentences, batch_size=1024)

print('Test Score: ', score)

# Sample 1: the hand-written sentence, followed by its reference translation.
print('Sample 1:')
print(' '.join(y_id_to_word[np.argmax(step)] for step in predictions[0]))
print('Il a vu un vieux camion jaune')

# Sample 2: the first training sentence, followed by its label sequence.
print('Sample 2:')
print(' '.join(y_id_to_word[np.argmax(step)] for step in predictions[1]))
print(' '.join(y_id_to_word[np.max(label)] for label in y[0]))