In [18]:
import numpy as np

from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.optimizers import Adam

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [19]:
# Load the parallel English/French corpora.
# `with` guarantees each file handle is closed as soon as it has been read,
# instead of leaking the handle until the kernel shuts down.
# NOTE(review): no encoding is given, so on Python 3 the platform default is
# used -- confirm the files decode correctly (the French text has accents).
with open('small_vocab_en.txt', 'r') as f:
    en = f.read()

with open('small_vocab_fr.txt', 'r') as f:
    fr = f.read()

# One sentence per line. A trailing newline at the end of a file produces a
# final empty string in the resulting list.
en = en.split('\n')
fr = fr.split('\n')

In [20]:
# Show the first sentence of each corpus: English first, then French.
for sentence in (en[0], fr[0]):
    print(sentence)

new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


In [21]:
# Build one word-level tokenizer per language (char_level=False) and learn
# each vocabulary from its own corpus.
en_tk, fr_tk = Tokenizer(char_level=False), Tokenizer(char_level=False)
for tokenizer, corpus in ((en_tk, en), (fr_tk, fr)):
    tokenizer.fit_on_texts(corpus)

# Convert every sentence into its list of integer word indices.
en_enc = en_tk.texts_to_sequences(en)
fr_enc = fr_tk.texts_to_sequences(fr)

In [22]:
# Show the first sentence pair next to its integer encoding.
# Each line is built as a single string and passed to print(...) so the cell
# runs unchanged on Python 2 and Python 3; the original print statements are
# a SyntaxError on Python 3. The two spaces after each label reproduce the
# spacing the original statements emitted.
print("English:  " + en[0])
print("French:  " + fr[0])
print("Encoded English:  " + str(en_enc[0]))
print("Encoded French :  " + str(fr_enc[0]))

English:  new jersey is sometimes quiet during autumn , and it is snowy in april .
French:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
Encoded English:  [17, 23, 1, 8, 67, 4, 39, 7, 3, 1, 55, 2, 44]
Encoded French :  [35, 34, 1, 8, 67, 37, 11, 24, 6, 3, 1, 112, 2, 50]


In [23]:
# Force every encoded sentence to exactly 10 tokens so the data forms a
# rectangular array. With the Keras defaults, shorter sentences are
# zero-padded and longer ones are cut, both at the front of the sequence.
en_enc = pad_sequences(en_enc, maxlen=10)
fr_enc = pad_sequences(fr_enc, maxlen=10)

In [24]:
# Report how many distinct words each tokenizer learned.
# The message is built as one string: under Python 2, print("label", value)
# prints the repr of a tuple rather than a readable line.
print("English VocabSize:  " + str(len(en_tk.word_index)))
print("French  VocabSize:  " + str(len(fr_tk.word_index)))

('English VocabSize: ', 199)
('French  VocabSize: ', 345)


In [25]:
# Add a trailing feature axis of size 1: (samples, 10) -> (samples, 10, 1),
# matching the (10, 1) input shape the model is built with.
# The sample count is taken from the data rather than hard-coded, so the cell
# works for any corpus size. The explicit 10 still makes reshape fail loudly
# if the sequences were not padded to length 10.
# Re-running the cell is harmless: an array already in this shape is
# reshaped to the same shape.
en_enc = np.reshape(en_enc, (len(en_enc), 10, 1))
fr_enc = np.reshape(fr_enc, (len(fr_enc), 10, 1))

In [26]:
# English to French Translator
#
# A single GRU layer reads the 10-step English sequence and emits one hidden
# state per step; a shared Dense layer followed by softmax turns each state
# into a probability distribution over French word indices.
input_dim = (10, 1)
input_seq = Input(shape=input_dim)

rnn = GRU(64, return_sequences=True)(input_seq)

# One output unit per French word index, plus one extra slot for index 0
# (presumably the padding value -- Keras word indices start at 1).
fr_vocab_size = len(fr_tk.word_index) + 1
logits = TimeDistributed(Dense(fr_vocab_size))(rnn)
probabilities = Activation('softmax')(logits)

model = Model(input_seq, probabilities)
model.compile(
    optimizer=Adam(1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [27]:
# Train for 10 epochs with the padded English ids as input and the padded
# French ids as per-timestep targets. The History object returned by fit()
# is left as the cell's displayed value.
model.fit(x=en_enc, y=fr_enc, epochs=10, batch_size=1024)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0xb31713590>

In [54]:
def _decode(tokenizer, token_ids):
    """Turn a sequence of integer token ids into a space-joined sentence.

    Id 0 is skipped. It is assumed to be the padding value (pad_sequences
    pads with 0 by default and Keras word indices start at 1), so it has no
    entry in ``tokenizer.index_word`` and a direct lookup would raise
    KeyError for any padded position or any step where the model predicts
    class 0.
    """
    words = []
    for token_id in token_ids:
        token_id = int(token_id)
        if token_id == 0:
            continue
        words.append(tokenizer.index_word[token_id])
    return ' '.join(words)


# Compare the model's translation with the reference for the first 10 samples.
for ind in range(10):

    # predict() expects a batch axis, so pass a one-sample slice and take the
    # single result back out: one row of class probabilities per timestep.
    pred = model.predict(en_enc[ind:ind + 1])[0]

    # Most probable French word index at each timestep.
    pre = _decode(fr_tk, np.argmax(pred, axis=-1))
    # The trailing axis has size 1 (see the model's (10, 1) input shape), so
    # [ind, :, 0] recovers the flat sequence of token ids for this sample.
    inp = _decode(en_tk, en_enc[ind, :, 0])
    act = _decode(fr_tk, fr_enc[ind, :, 0])

    # Single-argument print(...) calls run on both Python 2 and Python 3;
    # print('') emits the blank line the bare `print` statement used to.
    print('----------------------------------------------------------')

    print('')
    print("English:  " + inp)
    print("Actual :  " + act)
    print("Pred   :  " + pre)
    print('')


----------------------------------------------------------

English:  sometimes quiet during autumn and it is snowy in april
Actual :  calme pendant l' automne et il est neigeux en avril
Pred   :  fruit mois en mars et il est agréable en avril

----------------------------------------------------------

English:  chilly during july and it is usually freezing in november
Actual :  généralement froid en juillet et il gèle habituellement en novembre
Pred   :  mois en juillet et il est généralement agréable en novembre

----------------------------------------------------------

English:  quiet during march and it is usually hot in june
Actual :  calme en mars et il est généralement chaud en juin
Pred   :  mois en mars et il est généralement humide en automne

----------------------------------------------------------

English:  sometimes mild during june and it is cold in september
Actual :  parfois légère en juin et il fait froid en septembre
Pred   :  fruit mois en juin et il est à en s