In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.layers import *
from keras.models import Model, load_model
import re
import string
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

In [0]:
df=pd.read_csv("english_to_hindi.txt",sep='\t')

In [0]:
#df=df.drop(columns=["Unnamed: 0"])

In [0]:
df.columns=["English","Hindi"]

In [0]:
df=shuffle(df)

In [118]:
df

Unnamed: 0,English,Hindi
20524,Deleting removed messages from server,कैश से मिटाए संदेश हटा रहे
14027,Set of fields to display for this dictionary type,इस शब्दकोश किस्म को प्रदर्शित करने के लिए क्षे...
11008,Hiragana,हिरागाना
19481,DNS Servers,डीएनएस सर्वर्स
14091,Stack overflow,स्टैक ओवरफ़्लो
...,...,...
2069,The cat slowly approached the mouse.,बिल्ली धीरे से चूहे की तरफ़ बढ़ी।
6294,ASCII extensions,एक्सटेंशनः
28940,Kooka Gallery,कूका दीर्घा
12561,The default image file,डिफ़ॉल्ट छवि फ़ाइल


In [0]:
df["English"]=df["English"].astype(str)
df["English"]=df["English"].apply(lambda x: x.lower())

In [0]:
df["Hindi"]=df["Hindi"].apply(lambda x : x.lower())

In [0]:
df["English"]=df["English"].apply(lambda x : re.sub("'",'',x))
df["Hindi"]=df["Hindi"].apply(lambda x : re.sub("'",'',x))

In [0]:
df["English"]=df["English"].apply(lambda x: re.sub("[0-9]","",x))
df["Hindi"]=df["Hindi"].apply(lambda x: re.sub("[0-9]","",x))

In [0]:
spl_chars=string.punctuation
df["English"]=df["English"].apply(lambda x: ''.join(ch for ch in x if ch not in spl_chars))
df["Hindi"]=df["Hindi"].apply(lambda x: ''.join(ch for ch in x if ch not in spl_chars))

In [0]:
df["English"]=df["English"].apply(lambda x : re.sub(" +"," ",x))
df["Hindi"]=df["Hindi"].apply(lambda x : re.sub(" +"," ",x))

In [0]:
df["Hindi"]=df["Hindi"].apply(lambda x: "startseq "+x+" endseq")

In [0]:
tokenizer_eng=Tokenizer(10000)
tokenizer_hin=Tokenizer(10000)

In [0]:
train_eng=df["English"]
train_hin=df["Hindi"]

In [0]:
#XT,Xt,YT,Yt=train_test_split(train_eng,train_hin,test_size=0.1)
#XT=train_eng
#YT=train_hin
XT=train_eng
YT=train_hin

In [129]:
XT.shape,YT.shape

((29414,), (29414,))

In [0]:
tokenizer_eng.fit_on_texts(XT)

In [0]:
tokenizer_hin.fit_on_texts(YT)

In [0]:
VOCAB_SIZE_ENG=len(tokenizer_eng.word_index)+1

In [0]:
VOCAB_SIZE_HIN=len(tokenizer_hin.word_index)+1

In [134]:
VOCAB_SIZE_ENG,VOCAB_SIZE_HIN

(10653, 11935)

In [0]:
eng_word_to_idx=dict(tokenizer_eng.word_index)
hin_word_to_idx=dict(tokenizer_hin.word_index)
eng_idx_to_word=dict(tokenizer_eng.index_word)
hin_idx_to_word=dict(tokenizer_hin.index_word)

In [0]:
max_len_eng=0
for text in XT:
    if(len(text.split())>max_len_eng):
        max_len_eng=len(text.split())

In [0]:
max_len_hin=0
for text in YT:
    if(len(text.split())>max_len_hin):
        max_len_hin=len(text.split())

In [138]:
max_len_eng,max_len_hin

(38, 44)

In [0]:
latent_dim=128
encoder_inputs=Input(shape=(None,))
enc_emb=Embedding(VOCAB_SIZE_ENG,latent_dim,mask_zero=True)(encoder_inputs)
encoder_lstm=LSTM(latent_dim,return_state=True)
encoder_outputs, state_h, state_c=encoder_lstm(enc_emb)
encoder_states=[state_h,state_c]

In [0]:
decoder_inputs=Input(shape=(None,))
dec_emb_layer=Embedding(VOCAB_SIZE_HIN,latent_dim,mask_zero=True)
dec_emb=dec_emb_layer(decoder_inputs)
decoder_lstm=LSTM(latent_dim,return_state=True,return_sequences=True)
decoder_outputs,_,_=decoder_lstm(dec_emb,initial_state=encoder_states)


decoder_dense=Dense(VOCAB_SIZE_HIN,activation='softmax')
decoder_outputs=decoder_dense(decoder_outputs)

model=Model([encoder_inputs,decoder_inputs],decoder_outputs)

In [141]:
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, None, 128)    1363584     input_11[0][0]                   
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, None, 128)    1527680     input_12[0][0]                   
____________________________________________________________________________________________

In [0]:
model.compile(optimizer='adam',loss='categorical_crossentropy')

In [0]:
def generate_batch(X=XT , Y=YT, batch_size=32):
    while True:
        for i in range(0,len(X),batch_size):
            encoder_input_data=np.zeros((batch_size,max_len_eng),dtype='float32')
            decoder_input_data=np.zeros((batch_size,max_len_hin),dtype='float32')
            decoder_output_data=np.zeros((batch_size,max_len_hin,VOCAB_SIZE_HIN),dtype='float32')
            for j ,(eng_text,hin_text) in enumerate(zip(X[i:i+batch_size],Y[i:i+batch_size])):
                for t,word in enumerate(eng_text.split()):
                    encoder_input_data[j,t]=eng_word_to_idx[word]
                
                for t,word in enumerate(hin_text.split()):
                    if(t<len(hin_text)-1):
                        decoder_input_data[j,t]=hin_word_to_idx[word]
                    if(t>0):
                        decoder_output_data[j,t-1,hin_word_to_idx[word]]=1
            
            yield ([encoder_input_data,decoder_input_data],decoder_output_data)

In [144]:
epochs=10
batch_size=32
steps=len(XT)/batch_size
generator=generate_batch()

for i in range(epochs):
    generator=generate_batch()
    model.fit_generator(generator=generator,steps_per_epoch=steps,epochs=1,verbose=1)

#model.save("the_final_model.h5")

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [0]:
model.save("the_final_model.h5")

In [0]:
encoder_model=Model(encoder_inputs,encoder_states)

decoder_state_h=Input(shape=(latent_dim,))
decoder_state_c=Input(shape=(latent_dim,))
decoder_states_inputs=[decoder_state_h,decoder_state_c]

dec_emb2=dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2=decoder_lstm(dec_emb2,initial_state=decoder_states_inputs)
decoder_states2=[state_h2,state_c2]

decoder_outputs2=decoder_dense(decoder_outputs2)

decoder_model=Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs2] + decoder_states2)

In [0]:
#encoder_model.save("encoder_model.h5")
#decoder_model.save("decoder_model.h5")

In [0]:
#model.save("model_translate.h5")

In [149]:
decoder_model.summary()

Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, None, 128)    1527680     input_12[0][0]                   
__________________________________________________________________________________________________
input_13 (InputLayer)           (None, 128)          0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, 128)          0                                            
___________________________________________________________________________________________

In [0]:
def decode_sequence(input_seq):
    states_value=encoder_model.predict(input_seq)
    target_seq=np.zeros((1,1))
    target_seq[0,0]=hin_word_to_idx["startseq"]
    
    stop_condition=False
    decoded_sentence=""
    
    while stop_condition is False:
        output_seq,h,c=decoder_model.predict([target_seq]+states_value)
        
        idx=np.argmax(output_seq[0,-1,:])
        ch=hin_idx_to_word[idx]
        decoded_sentence+=" "+ch
        
        if(ch=="endseq" or len(decoded_sentence)>40):
            stop_condition=True
        
        target_seq=np.zeros((1,1))
        target_seq[0,0]=idx
        
        states_value=[h,c]
    
    return decoded_sentence

In [0]:
train_gen=generate_batch(XT,YT,batch_size=1)
k=-1

In [0]:
def data_clean(sentence):
  sentence=sentence.lower()
  sentence=re.sub("'",'',sentence)
  sentence=re.sub("[0-9]","",sentence)
  spl_chars=string.punctuation
  sentence=''.join(ch for ch in sentence if ch not in spl_chars)
  sentence=re.sub(" +"," ",sentence)
  return sentence

In [0]:
def input_sentence(sentence):
  sentence=data_clean(sentence)
  seq=np.zeros((1,max_len_eng),dtype='float32')
  for t,word in enumerate(sentence.split()):
    seq[0,t]=eng_word_to_idx.get(word)
  return decode_sequence(seq)


In [155]:
for i in range(20):
  k+=1
  (input_seq,actual_output),_=next(train_gen)
  decoded_sentence=decode_sequence(input_seq)
  print("Input= ",XT[k:k+1].values[0])
  print("Predicted= ",decoded_sentence[:-6])
  print()

Input=  download first
Predicted=   प्रथम डाउनलोड 

Input=  negative float
Predicted=   नेगेटिव फ्लोट 

Input=  he ordered them to release the prisoners
Predicted=   उसने पुलिस अफ़सर को सुलाना करना चाहिए। 

Input=  height
Predicted=   ऊँचाई 

Input=  maximum lifespan
Predicted=   अधिकतम जीवनकाल 

Input=  roll or double
Predicted=   एक वृत्त का सीमा 

Input=  filter the list by city name
Predicted=   नाम नाम के लिए नाम चुनें 

Input=  i wish i had this problem
Predicted=   मुझे मुझे इस घर को मुझे मिलना चाहिए। 

Input=  lock icons in place
Predicted=   प्रारंभ में प्रतीक सक्षम करें 

Input=  select to beginning of document
Predicted=   दस्तावेज़ के साथ चुनें 

Input=  pitfalls
Predicted=   डिजाइनेबल 

Input=  started late
Predicted=   प्रारंभ 

Input=  create new calendar file if one does not exist
Predicted=   यदि कोई एक फ़ाइलें नहीं है या कोई नही

Input=  server information
Predicted=   सर्वर जानकारी 

Input=  autocompletion for forms
Predicted=   फ़ॉर्म के लिए रूप में दिखाएँ 

Input= 

In [164]:
sen="i am not available"
print("Input= ",sen)
print(input_sentence(sen)[:-6])

Input=  i am not available
 मैं नहीं मिला 
