In [98]:
import numpy as np

from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Embedding
from keras.optimizers import Adam

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [99]:
# Load the parallel corpus: one sentence per line, line i of the English file
# is paired with line i of the French file.
# `with` guarantees each handle is closed as soon as its text has been read
# (the previous version left both files open).
with open('small_vocab_en.txt', 'r') as f:
    en = f.read()

with open('small_vocab_fr.txt', 'r') as f:
    fr = f.read()

# Split the raw text into lists of sentences.
en = en.split('\n')
fr = fr.split('\n')

In [100]:
# Sanity check: show the first sentence of each corpus (English, then French).
for corpus in (en, fr):
    print(corpus[0])

new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


In [101]:
# One word-level tokenizer per language, each fitted on its own corpus.
en_tk = Tokenizer(char_level=False)
fr_tk = Tokenizer(char_level=False)

en_tk.fit_on_texts(en)
fr_tk.fit_on_texts(fr)

# Replace every sentence by its list of integer word ids.
en_enc, fr_enc = en_tk.texts_to_sequences(en), fr_tk.texts_to_sequences(fr)

In [102]:
# Show the first sentence pair next to its integer encoding.
# Each print receives a single pre-built string so the cell runs, and prints
# exactly the same text, on both Python 2 and Python 3.
print("French:  " + fr[0])
print("English:  " + en[0])
print("")
print("Encoded French :  " + str(fr_enc[0]))
print("Encoded English:  " + str(en_enc[0]))

French:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
English:  new jersey is sometimes quiet during autumn , and it is snowy in april .

Encoded French :  [35, 34, 1, 8, 67, 37, 11, 24, 6, 3, 1, 112, 2, 50]
Encoded English:  [17, 23, 1, 8, 67, 4, 39, 7, 3, 1, 55, 2, 44]


In [103]:
def get_max_len(enc):
    """Return the length of the longest sequence in ``enc``.

    Parameters
    ----------
    enc : iterable
        Iterable of sized objects (e.g. the lists of word ids produced by
        ``Tokenizer.texts_to_sequences``).

    Returns
    -------
    int
        The largest ``len()`` found among the items, or 0 when ``enc`` is
        empty (``max`` on an empty list would otherwise raise ValueError).
    """
    lengths = [len(seq) for seq in enc]
    return max(lengths) if lengths else 0

# Longest sentence actually present in each corpus. These measured values are
# replaced immediately below and so have no effect on the rest of the notebook.
max_english_sequence_length = get_max_len(en_enc)
max_french_sequence_length = get_max_len(fr_enc)

# Fixed window of 10 tokens for both languages; this is the value every
# later cell sees.
max_french_sequence_length = max_english_sequence_length = 10

In [104]:
# Bring every sentence to a fixed length. With pad_sequences' defaults, short
# sentences are zero-padded at the front and long ones lose their first tokens.
# Each language uses its own length variable (the English data was previously
# padded with the French length, which only worked because both are equal).
fr_enc = pad_sequences(fr_enc, max_french_sequence_length)
en_enc = pad_sequences(en_enc, max_english_sequence_length)

In [105]:
# Number of distinct words seen by each tokenizer (index 0 is not counted).
# Single-string print calls give identical output on Python 2 and Python 3.
print("French  VocabSize:  " + str(len(fr_tk.word_index)))
print("English VocabSize:  " + str(len(en_tk.word_index)))

French  VocabSize:  345
English VocabSize:  199


In [106]:
# Shape the arrays for Keras. The number of samples is inferred (-1) instead of
# being hard-coded, so the cell works for any corpus size.
#   inputs : (samples, french_len)
#   targets: (samples, english_len, 1) -- sparse_categorical_crossentropy
#            expects one integer class id per time step in a trailing axis.
fr_enc = np.reshape(fr_enc, (-1, max_french_sequence_length))
en_enc = np.reshape(en_enc, (-1, max_english_sequence_length, 1))

In [109]:
# French to English Translator
#
# Embedding -> GRU (one output per time step) -> per-step softmax over the
# English vocabulary. The network emits exactly one English token per French
# input position, so max_french_sequence_length and
# max_english_sequence_length must be equal for the targets to line up.

# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
french_vocab_size  = len(fr_tk.word_index) + 1
english_vocab_size = len(en_tk.word_index) + 1

model = Sequential()

# The input is the French sentence, so its length (not the English one)
# defines the Embedding's input_length.
model.add(Embedding(french_vocab_size, 64, input_length=max_french_sequence_length))
model.add(GRU(64, return_sequences=True, activation="tanh"))
model.add(TimeDistributed(Dense(english_vocab_size, activation="softmax")))

# Targets are integer word ids, hence the sparse variant of the loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(1e-3), metrics=['accuracy'])

In [110]:
# Training settings: 10 passes over the data in batches of 1024 sentences,
# with 10% of the samples held out for validation.
fit_options = dict(batch_size=1024, epochs=10, validation_split=0.1)
model.fit(fr_enc, en_enc, **fit_options)

Train on 124074 samples, validate on 13787 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1c4f0b7f90>

In [124]:
# Qualitative check: decode the first 10 sentence pairs and the model's
# predictions back into words.
# NOTE(review): Keras takes the validation split from the END of the data, so
# these first rows were presumably part of the training set -- confirm before
# reading the result as generalisation.

# Index -> word lookups. Work on copies so the tokenizers' own dictionaries
# are not modified; index 0 (padding) is rendered as a single space.
fr_index_word = dict(fr_tk.index_word)
fr_index_word[0] = ' '

en_index_word = dict(en_tk.index_word)
en_index_word[0] = ' '

for ind in range(10):

    # Slice (rather than index) to keep the leading batch axis: (1, seq_len).
    sent = fr_enc[ind:ind+1]

    pred = model.predict(sent)

    # Most probable English word at every time step of the single prediction.
    pre = [en_index_word[np.argmax(step)] for step in pred[0]]

    # Input sentence; the length comes from the data, not a hard-coded 10.
    inp = [fr_index_word[idx] for idx in sent[0]]

    # Reference translation; en_enc carries a trailing axis of size 1.
    act = [en_index_word[step[0]] for step in en_enc[ind]]

    # Single-string print calls: same output on Python 2 and Python 3.
    print('----------------------------------------------------------')

    print('')
    print("French :  " + ' '.join(inp))
    print("Actual :  " + ' '.join(act))
    print("Pred   :  " + ' '.join(pre))
    print('')

----------------------------------------------------------

French :  calme pendant l' automne et il est neigeux en avril
Actual :  sometimes quiet during autumn and it is snowy in april
Pred   :  quiet quiet during autumn and it is snowy in april

----------------------------------------------------------

French :  généralement froid en juillet et il gèle habituellement en novembre
Actual :  chilly during july and it is usually freezing in november
Pred   :  usually chilly during july and it never freezing in november

----------------------------------------------------------

French :  calme en mars et il est généralement chaud en juin
Actual :  quiet during march and it is usually hot in june
Pred   :  quiet during march and it is usually hot in june

----------------------------------------------------------

French :  parfois légère en juin et il fait froid en septembre
Actual :  sometimes mild during june and it is cold in september
Pred   :  sometimes during during june and it

In [55]:
# Persist the trained model to disk.
# NOTE(review): the file name says "En2Fr" but the model above is trained with
# French input and English targets -- confirm the intended name before renaming,
# since other code may load this exact path.
MODEL_PATH = 'Simple_En2Fr.mdl'
model.save(MODEL_PATH)