In [6]:
import numpy as np

In [7]:
import re
import string
from unicodedata import normalize
def clean(lines):
    """Normalize raw text lines into lists of lowercase alphabetic tokens.

    Each line is reduced to ASCII (accented characters are decomposed with
    NFD and the non-ASCII remainder is dropped), split on whitespace,
    lowercased, and stripped of ASCII punctuation and non-printable
    characters. Tokens that are not purely alphabetic afterwards (tokens
    containing digits, or tokens left empty) are discarded.

    Args:
        lines: iterable of str, one entry per text line.

    Returns:
        list of list of str: one token list per input line, in input order.
    """
    # matches any character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes every ASCII punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)

    def _tokens(text):
        # decompose accented characters, then keep only the ASCII part
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        scrubbed = (
            non_printable.sub('', token.lower().translate(strip_punct))
            for token in ascii_text.split()
        )
        # isalpha() is False for empty strings and for tokens with digits
        return [token for token in scrubbed if token.isalpha()]

    return [_tokens(text) for text in lines]

In [10]:
import glob
# Read the tab-separated parallel corpus. Column 0 feeds documentsV (the
# model input side), column 1 feeds documentsR (the model target side).
raw_documentsV = []
raw_documentsR = []
with open('fra.txt', encoding="utf8") as f:
    for line in f:
        # A blank line (e.g. a trailing newline at end of file) contains no
        # tab, so indexing data[1] would raise IndexError; skip it.
        if not line.strip():
            continue
        data = line.rstrip("\n").split("\t")
        # A non-blank line without a tab still fails loudly on data[1].
        raw_documentsV.append(data[0])
        raw_documentsR.append(data[1])
# clean() turns every raw line into a list of lowercase alphabetic tokens.
documentsV = clean(raw_documentsV)
documentsR = clean(raw_documentsR)

In [20]:
from keras.preprocessing.text import Tokenizer
# Vocabulary for the V (input) side. documentsV is the output of clean(),
# i.e. one token list per line. filters=' ' replaces the Tokenizer's default
# set of filtered characters with just the space character.
tokenizerV = Tokenizer(filters=' ')
tokenizerV.fit_on_texts(documentsV)

In [21]:
# Vocabulary for the R (target) side, built the same way as the V-side
# tokenizer: token lists from clean(), with only the space character filtered.
tokenizerR = Tokenizer(filters=' ')
tokenizerR.fit_on_texts(documentsR)

In [22]:
def max_length(docs):
    """Return the length of the longest document in ``docs``.

    Args:
        docs: non-empty iterable of sized items (e.g. token lists).

    Returns:
        int: the largest ``len(d)`` over all ``d`` in ``docs``.

    Raises:
        ValueError: if ``docs`` is empty.
    """
    return max(map(len, docs))

In [23]:
# Sequence length per side = token count of that side's longest line.
rlength = max_length(documentsR)
vlength = max_length(documentsV)
# Vocabulary size is the number of known words plus one; presumably the
# extra slot is index 0, which pad_sequences uses for padding — TODO confirm.
r_vocab_size = len(tokenizerR.word_index) + 1
v_vocab_size = len(tokenizerV.word_index) + 1

In [24]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` and pad the result to a common length.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: value passed to ``pad_sequences`` as ``maxlen``.
        lines: the texts (or token lists) to encode.

    Returns:
        The output of ``pad_sequences`` with ``padding='post'``, i.e. the
        padding is appended after each sequence rather than before it.
    """
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')

In [25]:
from numpy import array

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode integer-encoded target sequences.

    Args:
        sequences: array with at least two dimensions (its ``shape[0]`` and
            ``shape[1]`` are read); it is iterated row by row.
        vocab_size: number of classes handed to ``to_categorical``.

    Returns:
        Array reshaped to
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    # encode each row separately, then stack the per-row results
    one_hot = array([to_categorical(row, num_classes=vocab_size) for row in sequences])
    return one_hot.reshape(n_samples, n_steps, vocab_size)

In [26]:
# prepare training data
# Model inputs: padded integer ids for the V side.
trainX = encode_sequences(tokenizerV, vlength, documentsV)
# Model targets: padded integer ids for the R side, expanded to one-hot
# vectors over the R vocabulary.
trainY = encode_output(
    encode_sequences(tokenizerR, rlength, documentsR),
    r_vocab_size,
)

In [27]:
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder LSTM model.

    Args:
        src_vocab: input dimension of the embedding layer.
        tar_vocab: width of the softmax output at every target timestep.
        src_timesteps: ``input_length`` of the embedding layer.
        tar_timesteps: how many times the encoder output is repeated for
            the decoder.
        n_units: embedding output size and unit count of both LSTM layers.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    layers = [
        # mask_zero=True marks index 0 as a masked (padding) value
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        # encoder: returns only its final output
        LSTM(n_units),
        # feed that output to the decoder once per target timestep
        RepeatVector(tar_timesteps),
        # decoder: emits one output per timestep
        LSTM(n_units, return_sequences=True),
        # per-timestep softmax over the target vocabulary
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layers)
# define model
# Size shared by the embedding vectors and both LSTM layers.
N_UNITS = 256
model = define_model(v_vocab_size, r_vocab_size, vlength, rlength, N_UNITS)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 147, 256)          14336     
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 192, 256)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 192, 256)          525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 192, 57)           14649     
Total params: 1,079,609
Trainable params: 1,079,609
Non-trainable params: 0
_________________________________________________________________
None


In [28]:
# Train on the full encoded corpus for 120 epochs in batches of 25; no
# validation data or callbacks are passed to fit() here.
model.fit(trainX, trainY, epochs=120, batch_size=25)

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

<keras.callbacks.History at 0xe3d417c6a0>

In [29]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Look up the word that ``tokenizer.word_index`` maps to ``integer``.

    Args:
        integer: the index to search for.
        tokenizer: object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The first word whose index equals ``integer``, or ``None`` when no
        word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

In [30]:
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sequence into a space-joined string.

    Args:
        model: object whose ``predict(source, verbose=0)`` returns a batch of
            predictions; each prediction holds one score vector per timestep.
        tokenizer: target-side tokenizer; only its ``word_index`` mapping
            (word -> index) is read.
        source: encoded input batch; only the first prediction is decoded.

    Returns:
        str: the predicted words joined by single spaces. Decoding stops at
        the first timestep whose best-scoring index has no word in
        ``word_index``.
    """
    prediction = model.predict(source, verbose=0)[0]
    # Best-scoring index per timestep. Using np.argmax (np is imported at the
    # top of the notebook) avoids depending on a bare `argmax` name that is
    # only imported in a later cell.
    integers = np.argmax(prediction, axis=-1).tolist()
    # Invert word_index once instead of scanning it for every timestep.
    # setdefault keeps the first word seen for an index, matching a
    # front-to-back scan of word_index.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)
    target = []
    for i in integers:
        word = index_to_word.get(i)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [31]:
from numpy import argmax
def evaluate_model(model, tokenizer, sources, raw_src, raw_target):
    """Print source, reference and predicted text for every encoded source.

    Args:
        model: trained model, forwarded to ``predict_sequence``.
        tokenizer: target-side tokenizer, forwarded to ``predict_sequence``.
        sources: iterable of 1-D encoded source arrays (each must support
            ``reshape`` and ``shape``).
        raw_src: indexable collection of the raw source items, aligned with
            ``sources``.
        raw_target: indexable collection of the raw reference items, aligned
            with ``sources``.

    Returns:
        None. The comparison lines are only printed.
    """
    for i, source in enumerate(sources):
        # add a leading batch axis: (timesteps,) -> (1, timesteps)
        batch = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, batch)
        print('src=[%s], target=[%s], predicted=[%s]' % (raw_src[i], raw_target[i], translation))

In [32]:
# Prints one src/target/predicted line per example. The inputs are trainX,
# the same data the model was fitted on, so this is not a held-out evaluation.
evaluate_model(model, tokenizerR, trainX, documentsV, documentsR)

src=[ <? php  if ( ! empty ( $ _POST [ 'name' ] ) ) {  echo  '<b>' ;  echo  $ _POST [ 'name' ] ;  echo  '</b>' ;  }  ?>], target=[ <? php  if ( ! empty ( $ _POST [ 'name' ] ) ) {  echo  '<b>' ;  echo  HTMLSpecialChars ( $ _POST [ 'name' ] ) ;  echo  '</b>' ;  }  ?>], predicted=[$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ ; ; ; ; ; ; ; ;]
src=[ <? php  echo  $ _POST [ 'name' ] ;  ?>], target=[ <? php  echo  HTMLSpecialChars ( $ _POST [ 'name' ] ) ;  ?>], predicted=[$ $ $ $ $ $ $ $ $ $ $ $ $ ;]
src=[ <? php  $ name  =  $ _POST [ 'name' ] ;  echo  ' Welcome, '  .  $ name  .  '</div>' ;  ?>], target=[ <? php  $ name  =  $ _POST [ 'name' ] ;  echo  ' Welcome, '  .   HTMLSpecialChars ( $ name )  .  '</div>' ;  ?>], predicted=[$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ ; ; ; ;]
src=[ <? php  $ name  =  $ _POST [ 'name' ] ;  echo  "Welcome" ;  echo  "$name"  ?>], target=[ <? php  $ name  =  $ _POST [ 'name' ] ;  echo  "Welcome" ;  $ name  =  HTMLSpecialChars ( $ name ) ;  echo  "$name"  ?>], predic