In [3]:
import numpy as np

from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.optimizers import Adam

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
# Load the two corpus files as raw text. The context managers close each
# file handle as soon as its contents have been read (the handles were
# previously left open).
with open('small_vocab_en.txt', 'r') as f:
    en = f.read()

with open('small_vocab_fr.txt', 'r') as f:
    fr = f.read()

# One sentence per line. Splitting on '\n' (rather than splitlines()) is kept
# on purpose so the number of sentences stays exactly what later cells expect.
# NOTE(review): en[i] and fr[i] are presumably translations of each other
# (line-aligned files) -- confirm against the data source.
en = en.split('\n')
fr = fr.split('\n')

In [5]:
# Sanity check: first English sentence, then the first French sentence,
# one per line.
print('\n'.join((en[0], fr[0])))

new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


In [6]:
# One tokenizer per language; char_level=False asks Keras for word tokens
# rather than character tokens.
en_tk = Tokenizer(char_level=False)
fr_tk = Tokenizer(char_level=False)

# Build each vocabulary from its own corpus.
en_tk.fit_on_texts(en)
fr_tk.fit_on_texts(fr)

# Turn every sentence into a list of integer word indices.
en_enc, fr_enc = en_tk.texts_to_sequences(en), fr_tk.texts_to_sequences(fr)

In [7]:
# Show the first sentence pair next to its integer encoding.
# Each print receives a single pre-built string, so the cell runs unchanged
# (and prints the same text) on both Python 2 and Python 3. The doubled space
# after each label reproduces the separator the print statement used to add.
print("English:  " + en[0])
print("French:  " + fr[0])
print("Encoded English:  " + str(en_enc[0]))
print("Encoded French :  " + str(fr_enc[0]))

English:  new jersey is sometimes quiet during autumn , and it is snowy in april .
French:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
Encoded English:  [17, 23, 1, 8, 67, 4, 39, 7, 3, 1, 55, 2, 44]
Encoded French :  [35, 34, 1, 8, 67, 37, 11, 24, 6, 3, 1, 112, 2, 50]


In [8]:
# Force every encoded sentence to exactly 10 tokens.
# With the Keras defaults (padding='pre', truncating='pre') shorter sentences
# are left-padded with 0 and longer ones lose their leading tokens.
en_enc = pad_sequences(en_enc, maxlen=10)
fr_enc = pad_sequences(fr_enc, maxlen=10)

In [9]:
# Number of distinct words each tokenizer has seen.
# A single string is passed to print so the line reads "label  N" on both
# Python 2 and Python 3; print("label", n) prints a tuple under Python 2.
print("English VocabSize:  " + str(len(en_tk.word_index)))
print("French  VocabSize:  " + str(len(fr_tk.word_index)))

('English VocabSize: ', 199)
('French  VocabSize: ', 345)


In [10]:
# Append a trailing feature axis of size 1: (samples, timesteps) ->
# (samples, timesteps, 1), the layout consumed by the recurrent model and by
# sparse_categorical_crossentropy targets.
# The sample count is taken from the arrays themselves instead of being
# hard-coded, so the cell keeps working if the corpus size changes; -1 lets
# numpy infer the timestep axis. Re-running the cell on already-reshaped
# arrays is a no-op.
en_enc = np.reshape(en_enc, (len(en_enc), -1, 1))
fr_enc = np.reshape(fr_enc, (len(fr_enc), -1, 1))

In [12]:
# Per-sample input shape: 10 timesteps, one feature (the token id) per step.
input_dim = (10, 1)

# Single GRU layer emitting one hidden state per timestep, followed by a
# dense projection applied independently at every timestep.
input_seq = Input(shape=input_dim)
rnn = GRU(units=64, return_sequences=True)(input_seq)

# Output width is the English vocabulary size plus one, because Keras
# tokenizer indices start at 1 and index 0 is used for padding.
logits = TimeDistributed(Dense(units=len(en_tk.word_index) + 1))(rnn)
model = Model(inputs=input_seq, outputs=Activation('softmax')(logits))

# Targets are integer word ids, hence the sparse variant of the loss.
model.compile(
    optimizer=Adam(1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# French to English Translator

In [13]:
# Train French -> English: French token ids are the inputs, English token
# ids the per-timestep targets. Left as a bare expression so the returned
# History object is still displayed by the cell.
model.fit(x=fr_enc, y=en_enc, epochs=10, batch_size=1024)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0xb2a0596d0>

In [18]:
def _decode_tokens(token_ids, index_word, pad_token='<PAD>'):
    """Translate a sequence of integer token ids back into words.

    Args:
        token_ids: iterable of integer ids (plain ints or numpy integers).
        index_word: a tokenizer's ``index_word`` mapping (id -> word).
        pad_token: placeholder returned for ids missing from ``index_word``.
            Index 0 is never assigned to a word by the Keras tokenizer, so
            padded positions (and a predicted 0) land here instead of raising
            KeyError.

    Returns:
        A list of words, one per id, in the same order as ``token_ids``.
    """
    return [index_word.get(int(token_id), pad_token) for token_id in token_ids]


# Print the source sentence, the reference translation and the model's
# prediction for a handful of samples.
for ind in range(5, 10):

    # Predict one sentence at a time: the slice keeps the batch axis the model
    # expects, and [0] drops it again -> (timesteps, vocab) probabilities.
    pred = model.predict(fr_enc[ind:ind + 1])[0]

    # Most likely English word at every timestep.
    pre = _decode_tokens(np.argmax(pred, axis=-1), en_tk.index_word)
    # [ind, :, 0] selects all timesteps of sample `ind` and strips the
    # trailing feature axis.
    inp = _decode_tokens(fr_enc[ind, :, 0], fr_tk.index_word)
    act = _decode_tokens(en_enc[ind, :, 0], en_tk.index_word)

    # Single-string print calls behave identically on Python 2 and Python 3.
    print('----------------------------------------------------------')

    print('')
    print("French :  " + ' '.join(inp))
    print("Actual :  " + ' '.join(act))
    print("Pred   :  " + ' '.join(pre))
    print('')


----------------------------------------------------------

French :  fruit préféré est l'orange mais mon préféré est le raisin
Actual :  fruit is the orange but my favorite is the grape
Pred   :  is favorite during orange but my least is the grape

----------------------------------------------------------

French :  relaxant en décembre mais il est généralement froid en juillet
Actual :  relaxing during december but it is usually chilly in july
Pred   :  dry during february but it is usually pleasant in september

----------------------------------------------------------

French :  occupé au printemps et il est jamais chaude en mars
Actual :  busy during spring and it is never hot in march
Pred   :  snowy during spring and it is never snowy in winter

----------------------------------------------------------

French :  aimé le citron mais mon moins aimé est le raisin
Actual :  is the lemon but my least liked is the grape
Pred   :  is the lemon but my least liked is the lemon

-----