In [1]:
import numpy as np

In [2]:
import re
import string
from unicodedata import normalize
def clean(lines):
    """Turn raw text lines into lists of lowercase, purely alphabetic tokens.

    Per line: decompose unicode (NFD) and drop everything that is not ASCII,
    split on whitespace, lowercase, delete punctuation and non-printable
    characters, then discard any token that is not entirely alphabetic
    (which also discards tokens left empty by the earlier steps).

    Args:
        lines: iterable of str.

    Returns:
        list[list[str]]: one token list per input line. The tokens are not
        joined back into a string.
    """
    # Matches any character that is not in string.printable.
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # Translation table that deletes every punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _tokens(text):
        # NFD splits accented letters into base letter + combining mark; the
        # ASCII encode with 'ignore' then drops the mark and keeps the base.
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        scrubbed = (
            non_printable.sub('', token.lower().translate(strip_punctuation))
            for token in ascii_text.split()
        )
        # Keep only tokens made up solely of letters.
        return [token for token in scrubbed if token.isalpha()]

    return [_tokens(text) for text in lines]

In [3]:
import glob
# Read the tab-separated parallel corpus: column 0 goes to documentsV and
# column 1 to documentsR. Any further columns on a line are ignored.
documentsV = []
documentsR = []
with open('fra.txt', encoding="utf8") as f:
    for line in f:
        data = line.split("\t")
        # A blank or tab-less line (e.g. a trailing newline at end of file)
        # has no second column; skip it instead of failing with IndexError.
        if len(data) < 2:
            continue
        documentsV.append(data[0])
        documentsR.append(data[1])
# Replace the raw strings with their cleaned token lists.
documentsV = clean(documentsV)
documentsR = clean(documentsR)

In [4]:
from keras.preprocessing.text import Tokenizer
# Build the word -> integer index for the documentsV side of the corpus.
tokenizerV = Tokenizer(filters=' ')
tokenizerV.fit_on_texts(documentsV)

Using TensorFlow backend.


In [5]:
# Build the word -> integer index for the documentsR side of the corpus.
tokenizerR = Tokenizer(filters=' ')
tokenizerR.fit_on_texts(documentsR)

In [6]:
def max_length(docs):
    """Return the length of the longest item in ``docs``.

    Raises:
        ValueError: if ``docs`` is empty (propagated from ``max``).
    """
    return max(map(len, docs))

In [7]:
# Token count of the longest document on each side.
rlength = max_length(documentsR)
vlength = max_length(documentsV)
# Vocabulary sizes are the number of indexed words plus one.
# NOTE(review): the +1 presumably reserves id 0 for padding — confirm against the Tokenizer docs.
r_vocab_size = len(tokenizerR.word_index) + 1
v_vocab_size = len(tokenizerV.word_index) + 1

In [8]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad every sequence to ``length``.

    Args:
        tokenizer: object whose ``texts_to_sequences`` maps ``lines`` to
            integer sequences.
        length: value passed to ``pad_sequences`` as ``maxlen``.
        lines: the documents to encode.

    Returns:
        The result of ``pad_sequences`` with padding applied at the end of
        each sequence (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [9]:
from numpy import array

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """Expand integer-encoded sequences into per-step categorical vectors.

    Args:
        sequences: 2-D array of integer ids; its ``shape[0]`` and ``shape[1]``
            give the first two output dimensions.
        vocab_size: number of classes handed to ``to_categorical``.

    Returns:
        Array of shape ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    # Encode row by row, then stack the per-row results into one array.
    stacked = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return stacked.reshape(n_rows, n_steps, vocab_size)

In [10]:
# prepare training data
# Inputs: documentsV, integer-encoded and padded to vlength.
trainX = encode_sequences(tokenizerV, vlength, documentsV)
# Targets: documentsR, integer-encoded and padded to rlength ...
trainY = encode_sequences(tokenizerR, rlength, documentsR)
# ... then expanded to one categorical vector of size r_vocab_size per step.
trainY = encode_output(trainY, r_vocab_size)

In [11]:
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) sequence-to-sequence Sequential model.

    Args:
        src_vocab: input vocabulary size for the Embedding layer.
        tar_vocab: number of units in the softmax output layer.
        src_timesteps: ``input_length`` of the Embedding layer.
        tar_timesteps: how many times the first LSTM's output is repeated
            before being fed to the second LSTM.
        n_units: embedding width and unit count of both LSTM layers.

    Returns:
        The Sequential model with all five layers added, not yet compiled.
    """
    stack = (
        # mask_zero=True tells downstream layers to treat input id 0 as padding.
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        # return_sequences=True so there is one output per repeated step.
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    )
    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model
# define model
model = define_model(v_vocab_size, r_vocab_size, vlength, rlength, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3, 256)            82176     
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 7, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 7, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 7, 684)            175788    
Total params: 1,308,588
Trainable params: 1,308,588
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
# Train on the full prepared data set: 20 epochs, batches of 25.
# No validation data is passed to this call.
model.fit(trainX, trainY, epochs=20, batch_size=25)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x98d9bce160>

In [19]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` is ``integer``.

    Returns:
        The first matching word in ``word_index`` iteration order, or None
        when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

In [20]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for one encoded source into a string.

    Args:
        model: object whose ``predict(source, verbose=0)`` returns a batch of
            per-step score vectors; only the first batch element is decoded.
        tokenizer: object exposing ``word_index`` (word -> integer id).
        source: encoded input, passed to ``model.predict`` unchanged.

    Returns:
        str: the predicted words joined by single spaces. Decoding stops at
        the first step whose best-scoring id has no word in ``word_index``.
    """
    prediction = model.predict(source, verbose=0)[0]
    # Use the notebook-wide ``np`` import rather than a bare ``argmax`` name
    # that is only bound by a later cell.
    integers = [int(np.argmax(vector)) for vector in prediction]
    # Build the id -> word map once instead of rescanning the vocabulary for
    # every step. setdefault keeps the first word seen for an id, which is
    # what a front-to-back scan of word_index would return.
    id_to_word = {}
    for word, index in tokenizer.word_index.items():
        id_to_word.setdefault(index, word)
    target = []
    for i in integers:
        word = id_to_word.get(i)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [21]:
from numpy import argmax
def evaluate_model(model, tokenizer, sources, raw_src, raw_target):
    """Print the source, reference target and predicted text for each source.

    Args:
        model: passed through to ``predict_sequence``.
        tokenizer: tokenizer used by ``predict_sequence`` to turn predicted
            ids back into words.
        sources: iterable of 1-D encoded source arrays.
        raw_src: indexable; ``raw_src[i]`` is printed as the source of item i.
        raw_target: indexable; ``raw_target[i]`` is printed as its target.

    Returns:
        None. One line per source is written to stdout.
    """
    for i, source in enumerate(sources):
        # translate encoded source text
        # Reshape to (1, timesteps) so the model receives a batch of one.
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        print('src=[%s], target=[%s], predicted=[%s]' % (raw_src[i], raw_target[i], translation))

In [22]:
# Print predictions for the training inputs (trainX) next to their cleaned
# source/target token lists; tokenizerR maps predicted ids back to words.
evaluate_model(model, tokenizerR, trainX, documentsV, documentsR)

src=[['go']], target=[['va']], predicted=[detendstoi]
src=[['run']], target=[['cours']], predicted=[fantastique]
src=[['run']], target=[['courez']], predicted=[fantastique]
src=[['fire']], target=[['au', 'feu']], predicted=[a a]
src=[['help']], target=[['a', 'laide']], predicted=[repondezmoi]
src=[['jump']], target=[['saute']], predicted=[genial]
src=[['stop']], target=[['ca', 'suffit']], predicted=[stop]
src=[['stop']], target=[['stop']], predicted=[stop]
src=[['stop']], target=[['arretetoi']], predicted=[stop]
src=[['wait']], target=[['attends']], predicted=[attendez]
src=[['wait']], target=[['attendez']], predicted=[attendez]
src=[['go', 'on']], target=[['poursuis']], predicted=[poursuis]
src=[['go', 'on']], target=[['continuez']], predicted=[poursuis]
src=[['go', 'on']], target=[['poursuivez']], predicted=[poursuis]
src=[['i', 'see']], target=[['je', 'comprends']], predicted=[je comprends]
src=[['i', 'try']], target=[['jessaye']], predicted=[je]
src=[['i', 'won']], target=[['jai', 