Reading the English and French text files

In [None]:
# Load the two corpus files, one sentence per line.
# The encoding is passed explicitly because the French text contains accented
# characters and the platform default encoding is not UTF-8 everywhere.
# NOTE(review): assumes both files are UTF-8 encoded — confirm.
with open('small_vocab_en.txt', 'r', encoding='utf-8') as f:
    eng_sentences = f.read().split('\n')

with open('small_vocab_fr.txt', 'r', encoding='utf-8') as f:
    fre_sentences = f.read().split('\n')

Importing the pandas library

In [None]:
import pandas as pd

Converting the lists of English and French sentences into a DataFrame

In [None]:
# One-column frame holding the English sentences; the French column is added next.
table = pd.DataFrame({'English words/sentences': eng_sentences})

In [None]:
# Add the French sentences as a second column, row-aligned with the English ones.
table['French words/sentences']=fre_sentences

In [None]:
# Preview the first rows to check that the English/French pairs line up.
table.head()

Unnamed: 0,English words/sentences,French words/sentences
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les états-unis est généralement froid en juill...
2,"california is usually quiet during march , and...","california est généralement calme en mars , et..."
3,the united states is sometimes mild during jun...,"les états-unis est parfois légère en juin , et..."
4,"your least liked fruit is the grape , but my l...","votre moins aimé fruit est le raisin , mais mo..."


In [None]:
# Number of sentence pairs loaded.
len(table)

137861

Adding start and end tokens to the French sentences

In [None]:
# Wrap every French sentence in '<SOS> ... <EOS>' markers.
# Rows that already start with the marker are left alone, so re-running this
# cell does not stack a second pair of markers onto the column.
table['French words/sentences']=table['French words/sentences'].apply(
    lambda x: x if x.startswith('<SOS> ') else '<SOS> '+x+' <EOS>')
# Collapse double spaces (e.g. the one produced when the sentence itself is empty).
table['French words/sentences']=table['French words/sentences'].str.replace('  ',' ',regex=False)

In [None]:
# Keep only the first 100,000 sentence pairs for training.
table = table.iloc[:100000]

In [None]:
# Preview again: the French column should now start with the '<SOS>' marker.
table.head()

Unnamed: 0,English words/sentences,French words/sentences
0,"new jersey is sometimes quiet during autumn , ...",<SOS> new jersey est parfois calme pendant l' ...
1,the united states is usually chilly during jul...,<SOS> les états-unis est généralement froid en...
2,"california is usually quiet during march , and...",<SOS> california est généralement calme en mar...
3,the united states is sometimes mild during jun...,<SOS> les états-unis est parfois légère en jui...
4,"your least liked fruit is the grape , but my l...","<SOS> votre moins aimé fruit est le raisin , m..."


Importing **Tokenizer** (which assigns an integer index to every unique word in the vocabulary) and **pad_sequences** (which brings every sequence to the same length, here the maximum sequence length in the dataset, by appending zeros after the sequence ends)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

Creating a Tokenizer with default settings for the English sentences, and a Tokenizer for the French sentences with one change: '<' and '>' are removed from the list of filtered characters, so that our start and end tokens are left intact

In [None]:
# English side: default Tokenizer settings.
eng_token=Tokenizer()
# French side: the filter list omits '<' and '>' so the '<SOS>'/'<EOS>' markers
# are not stripped. lower=False keeps the markers in the exact casing that the
# later lookups (fre_index['<SOS>'], the '<EOS>' stop check) rely on.
# NOTE(review): the corpus rows shown are already lower-case, so this should not
# change the rest of the French vocabulary — confirm on the full file.
fre_token=Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',lower=False)

Fitting the English and French sentences on their respective tokenizers

In [None]:
# Build each tokenizer's vocabulary from its own column of the table.
eng_token.fit_on_texts(table['English words/sentences'])
fre_token.fit_on_texts(table['French words/sentences'])

Saving the word-to-index and index-to-word dictionaries

In [None]:
# French lookup tables used at inference time:
# fre_index maps word -> integer index, reverse_fre_index maps index -> word.
fre_index=fre_token.word_index
reverse_fre_index=fre_token.index_word

In [None]:
# Sanity check: the end marker must be present in the French vocabulary.
fre_index['<EOS>']

3

In [None]:
# Sanity check: the start marker must be present in the French vocabulary.
fre_index['<SOS>']

2

In [None]:
# The reverse lookup should give back the end marker for its index.
reverse_fre_index[3]

'<EOS>'

In [None]:
# The reverse lookup should give back the start marker for its index.
reverse_fre_index[2]

'<SOS>'

Maximum sentence length (in words) across all English sentences

In [None]:
# Longest English sentence, counted in whitespace-separated words.
max(len(sentence.split()) for sentence in table['English words/sentences'])

17

Maximum sentence length (in words) across all French sentences

In [None]:
# Longest French sentence, counted in whitespace-separated words (markers included).
max(len(sentence.split()) for sentence in table['French words/sentences'])

25

In [None]:
# Padding widths: the longest sentence on each side, counted in
# whitespace-separated words.
max_eng_length = max(
    len(sentence.split()) for sentence in table['English words/sentences']
)
max_fre_length = max(
    len(sentence.split()) for sentence in table['French words/sentences']
)

Converting all English and French sentences into sequences of integer indices

In [None]:
# Replace every word with its integer index from the fitted tokenizer.
eng_seq=eng_token.texts_to_sequences(table['English words/sentences'])
fre_seq=fre_token.texts_to_sequences(table['French words/sentences'])

Padding both sets of sequences to their maximum lengths

In [None]:
# padding='post' appends zeros after each sequence so every row has width maxlen.
# NOTE(review): max_eng_length/max_fre_length were counted with str.split(), which
# may not match the tokenizer's own word count (filtered punctuation, hyphens);
# any sequence longer than maxlen would be truncated — confirm none are.
pad_eng_seq=pad_sequences(eng_seq,maxlen=max_eng_length,padding='post')
pad_fre_seq=pad_sequences(fre_seq,maxlen=max_fre_length,padding='post')

Total vocabulary size, including padding. We add 1 to each tokenizer's vocabulary count because padding introduces an extra index, '0'.

In [None]:
# (English, French) vocabulary sizes; the +1 accounts for padding index 0.
len(eng_token.word_index)+1,len(fre_token.word_index)+1

(200, 344)

In [None]:
# Width of the decoder's softmax layer: every French word plus padding index 0.
fre_vocab_size=len(fre_token.word_index)+1

In [None]:
# Encoder input: the padded English index sequences.
X1=pad_eng_seq

In [None]:
# Teacher-forcing pair built from the padded French sequences:
#   X2 drops the last time step (decoder input, starts at '<SOS>'),
#   y  drops the first time step (target, i.e. X2 shifted one step ahead).
# The trailing None adds a size-1 last axis, giving (samples, max_fre_length-1, 1).
X2 = pad_fre_seq[:, :-1, None]
y = pad_fre_seq[:, 1:, None]

In [None]:
# Shapes of the encoder input, decoder input and decoder target.
X1.shape,X2.shape,y.shape

((100000, 17), (100000, 24, 1), (100000, 24, 1))

In [None]:
# First training sample: y[0] should be X2[0] shifted one step ahead.
X1[0],X2[0],y[0]

(array([17, 20,  1,  8, 65,  4, 36,  7,  3,  1, 42,  2, 46,  0,  0,  0,  0],
       dtype=int32),
 array([[  2],
        [ 36],
        [ 35],
        [  1],
        [ 10],
        [ 69],
        [ 39],
        [ 13],
        [ 26],
        [  8],
        [  5],
        [  1],
        [114],
        [  4],
        [ 54],
        [  3],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0]], dtype=int32),
 array([[ 36],
        [ 35],
        [  1],
        [ 10],
        [ 69],
        [ 39],
        [ 13],
        [ 26],
        [  8],
        [  5],
        [  1],
        [114],
        [  4],
        [ 54],
        [  3],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0]], dtype=int32))

In [None]:
import numpy as np

In [None]:
from keras.layers import Input,Embedding,LSTM,Dense
from keras import Model
from keras.losses import sparse_categorical_crossentropy

Making the Encoder-Decoder model and training it

In [None]:
# --- Encoder: English index sequence -> final LSTM states ---
encoder_input=Input(shape=(None,))
# Vocabulary size is +1 for padding index 0; mask_zero=True marks that index as padding.
encoder_embedding=Embedding(len(eng_token.word_index)+1,200,mask_zero=True)
encoder_embed=encoder_embedding(encoder_input)
encoder_lstm=LSTM(256,return_state=True)
# The per-sequence output is discarded; only the hidden and cell states are kept.
_,h,c=encoder_lstm(encoder_embed)
states=[h,c]

# --- Decoder: French index sequence -> next-word distribution per time step ---
# The decoder gets the raw token index as a single feature per time step
# (shape (None, 1)); there is no embedding layer on this side.
decoder_input=Input(shape=(None,1))
decoder_lstm=LSTM(256,return_state=True,return_sequences=True)
# Start the decoder from the encoder's final states; its own states are unused during training.
decoder_output,_,__=decoder_lstm(decoder_input,initial_state=states)

# Softmax over the French vocabulary at every time step.
dense_layers=Dense(fre_vocab_size,activation='softmax')
output=dense_layers(decoder_output)

# The sparse loss is used because the targets y are integer indices, not one-hot vectors.
model=Model([encoder_input,decoder_input],output)
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Train with teacher forcing: X1 feeds the encoder, X2 feeds the decoder, y is the target.
# validation_split=0.1 holds out 10% of the samples for validation.
model.fit([X1,X2],y,epochs=20,validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7d8c8c2dfdc0>

After training, we split the combined model into an encoder model and a decoder model

* The encoder model takes an English sequence as input and outputs the LSTM states for that sequence

In [None]:
# Inference encoder: reuses the trained encoder layers and outputs the [h, c] states.
encoder_model=Model(encoder_input,states)

* The decoder model takes a single token index and the initial states as input, and outputs the softmax distribution and the new states

In [None]:
# Stand-alone decoder for step-by-step inference. It reuses the trained
# decoder_lstm and dense_layers, but takes the LSTM states as explicit inputs
# and returns the updated states, so each step's states can be fed back in.
# The state inputs get their own names so the encoder state tensors h/c are not
# overwritten, and their width is read from the layer rather than hard-coded.
dec_h_in=Input(shape=(decoder_lstm.units,))
dec_c_in=Input(shape=(decoder_lstm.units,))
dec_state=[dec_h_in,dec_c_in]
lst,h2,c2=decoder_lstm(decoder_input,initial_state=dec_state)
new_state=[h2,c2]
out=dense_layers(lst)

# Inputs: [token, h, c]; outputs: [softmax over French vocab, new h, new c].
decoder_model=Model([decoder_input]+dec_state,[out]+new_state)

Checking the encoder and decoder model

In [None]:
# Manual greedy decoding of one sentence, to check the encoder/decoder pair.
sentence="new jersey is sometimes quiet during autumn and it is snowy in april"
sentence=eng_token.texts_to_sequences([sentence])
sentence=pad_sequences(sentence,maxlen=max_eng_length,padding='post')
# verbose=0: predict runs once per generated word, so progress bars would flood the output.
sent_states=encoder_model.predict(sentence,verbose=0)
translate=[]
start="<SOS>"
# The decoder takes one token at a time, shaped (batch=1, steps=1, features=1).
token=np.zeros((1,1,1))
token[0,0,0]=fre_index[start]
while len(translate)<max_fre_length:
  trans,hid,cell=decoder_model.predict([token,*sent_states],verbose=0)
  ob_idx=int(np.argmax(trans[0,0,:]))
  ob_word=reverse_fre_index.get(ob_idx)
  if ob_word is None:
    # Index 0 is padding and has no word; stop instead of raising KeyError.
    break
  translate.append(ob_word)
  if ob_word=='<EOS>':
    break
  # Feed the predicted word and the updated states into the next step.
  token[0,0,0]=ob_idx
  sent_states=[hid,cell]

" ".join(translate)



"new jersey est parfois calme pendant l' automne et il est neigeux en avril <EOS>"

Defining a function that translates an English sentence into French

In [None]:
def translator(sentence):
  """Translate an English sentence into French by greedy decoding.

  The sentence is converted to indices with `eng_token`, padded to
  `max_eng_length` and passed through `encoder_model`. The decoder then runs
  one token at a time, starting from '<SOS>' and feeding each predicted index
  back in. Decoding stops when '<EOS>' is produced, when an index with no word
  (padding index 0) is predicted, or after `max_fre_length` words.

  Args:
    sentence: English text as a single string.

  Returns:
    The predicted French words joined by single spaces. The trailing '<EOS>'
    is included when the decoder produced it.
  """
  encoded=eng_token.texts_to_sequences([sentence])
  encoded=pad_sequences(encoded,maxlen=max_eng_length,padding='post')
  # verbose=0: predict runs once per generated word, so progress bars would flood the output.
  sent_states=encoder_model.predict(encoded,verbose=0)
  translate=[]
  # The decoder takes one token at a time, shaped (batch=1, steps=1, features=1).
  token=np.zeros((1,1,1))
  token[0,0,0]=fre_index["<SOS>"]
  while len(translate)<max_fre_length:
    trans,hid,cell=decoder_model.predict([token,*sent_states],verbose=0)
    ob_idx=int(np.argmax(trans[0,0,:]))
    ob_word=reverse_fre_index.get(ob_idx)
    if ob_word is None:
      # Index 0 is padding and has no word; stop instead of raising KeyError.
      break
    translate.append(ob_word)
    if ob_word=='<EOS>':
      break
    # Feed the predicted word and the updated states into the next step.
    token[0,0,0]=ob_idx
    sent_states=[hid,cell]
  return " ".join(translate)

Now translating english sentences into french

In [None]:
# Weather-style sentence with commas and a final period, like the corpus preview rows.
translator("india is sometimes cold during march , but it is sometimes hot in january .")



"l' inde est parfois froid au mois de mars mais il est parfois chaud en janvier <EOS>"

In [None]:
# Fruit-preference sentence typed without any punctuation.
translator("our least liked fruit is the lemon but her least liked is the pear")



'notre fruit est moins aimé le citron mais elle est moins aimé la poire <EOS>'

In [None]:
# Short sentence with a different structure from the previewed corpus rows.
translator("she disliked a rusty yellow car .")



"elle n'aimait pas une voiture jaune rouillée <EOS>"

In [None]:
# Weather-style sentence using 'never' and 'usually'.
translator("paris is never hot during summer , and it is usually mild in winter .")



"paris est jamais chaude pendant l' été et il est généralement doux en hiver <EOS>"

In [None]:
# Weather-style sentence with no frequency adverb in the first clause.
translator("china is hot during july , but it is never pleasant in january .")



'chine est chaud en juillet mais il est jamais agréable en janvier <EOS>'

In [None]:
# The final period is attached to 'favorite' with no space before it.
# NOTE(review): eng_token uses the default filters, which presumably strip '.', — confirm.
translator("the strawberry is their least favorite fruit , but the apple is our least favorite.")



'la fraise est leur fruit préféré moins mais la pomme est notre moins préféré <EOS>'

In [None]:
# Single-word input.
translator("lemons")



'citrons <EOS>'