In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import numpy as np
import pandas as pd


import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model



In [3]:
df=pd.read_csv("/content/drive/MyDrive/Dataset/Machine_Translation/Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [4]:
 df.head(20)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
5,tides,The then Governor of Kashmir resisted transfer...,कश्मीर के तत्कालीन गवर्नर ने इस हस्तांतरण का व...
6,indic2012,In this lies the circumstances of people befor...,इसमें तुमसे पूर्व गुज़रे हुए लोगों के हालात हैं।
7,ted,"And who are we to say, even, that they are wrong",और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
8,indic2012,“”Global Warming“” refer to warming caused in ...,ग्लोबल वॉर्मिंग से आशय हाल ही के दशकों में हुई...
9,tides,You may want your child to go to a school that...,हो सकता है कि आप चाहते हों कि आप का नऋर्नमेनटे...


In [5]:
pd.isnull(df).sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

In [6]:
df=df[~pd.isnull(df['english_sentence'])]

In [7]:
df.drop_duplicates(inplace=True)

* ### Let us pick any 25000 rows from the dataset.

In [8]:
df=df.sample(n=50000,random_state=42)
df = df.reset_index(drop= True)
df.shape

(50000, 3)

In [9]:
# Lowercase all characters
df['english_sentence']=df['english_sentence'].apply(lambda x: x.lower())
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.lower())

In [10]:
# Remove quotes
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub("'", '', x))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [11]:
exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [12]:
# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
df['english_sentence']=df['english_sentence'].apply(lambda x: x.translate(remove_digits))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
df['english_sentence']=df['english_sentence'].apply(lambda x: x.strip())
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.strip())
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))


In [13]:
# Add start and end tokens to target sequences
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x : 'START_ '+ x + ' _END')

In [14]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,indic2012,islam is word from arabic and it full word is ...,START_ इस्लाम शब्द अरबी भाषा का शब्द है जिसका ...
1,ted,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END
2,tides,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END
3,tides,race equality new laws,START_ नये कानून नस्ली समानता _END
4,tides,the provision would not affect the power of pa...,START_ व्यवसायों आदि से होने वाली आय के बारे म...


In [15]:
### Get English and Hindi Vocabulary
all_eng_words=set()
for eng in df['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in df['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [16]:
len(all_eng_words)

45291

In [17]:
all_eng_words

{'code',
 'satisfactions',
 'muhammed',
 'miniaturization',
 'riverbanks',
 'acidity',
 'fatehpur',
 'hiroshima',
 'workmen',
 'preface',
 'hindi',
 'timereal',
 'resolves',
 'fourth',
 'prided',
 'quantified',
 'sponges',
 'churidars',
 'mandakineand',
 'taiselesion',
 'thaugust',
 'nicobar',
 'junaghar',
 'rc',
 'walkers',
 'feild',
 'sankrit',
 'vaikunthaperumal',
 'manasarovar',
 'nasik',
 'department',
 'thomsoncsf',
 'anonymity',
 'gelatin',
 'insufficient',
 'retrospective',
 'clash',
 'padded',
 'manna',
 'tongues',
 'narrative',
 'germanys',
 'backyards',
 'photo',
 'jawahar',
 'kamayani',
 'scadenavian',
 'melt',
 '“hes',
 'cheetah',
 'leapt',
 'brutality',
 'prabodh',
 'fringe',
 'schandrapentagon',
 'pt',
 'mouthpiece',
 'turn',
 'jubilee',
 'cool”',
 'muhammadpbuhhas',
 'speechprresentation',
 'mcdowells',
 'takeover',
 'de',
 'legend',
 'equalled',
 'drifted',
 'treachery',
 'cattleshed',
 'lifegiving',
 'bazarcobbler',
 'kalash',
 'morbidity',
 'mahabalipuram',
 'spindle

In [18]:
len(all_hindi_words)

52937

In [19]:
all_hindi_words

{'केंद्रो',
 'चढ़ाना',
 'कूड़ेदान',
 'गालीगलौज',
 'acidity',
 'लुईज़ियाना',
 'जाएलेकिन',
 'पूछता',
 'डिफेंस',
 'मांड़वी',
 'कामकोटि',
 'परदादा',
 'तस',
 'ईं',
 'मैगिनौट',
 'hindi',
 'मायेर',
 'हैइनके',
 'कम्प्यूटिंग',
 'अलाउद्दीन',
 'किलॊमीटर',
 'धनमऋऊण्श्छ्ष्माल',
 'अशासकीय',
 'धातुविषाक्तता',
 'fourth',
 'गिरफ्त',
 'बालगंगाधर',
 'देवदार',
 'जरावों',
 'conductनिषिद्ध',
 'हैंफिर',
 'लीज',
 'पाचवी',
 'एबालसुब्रमण्यन',
 'प्रारुपों',
 'मध्याह्न',
 'इंजनों',
 'department',
 'विद्यापीठ',
 'बोधिसत्व',
 'तारिक',
 'sauicapuna',
 'बाफला',
 'सं',
 'घटनामंच',
 'फैला',
 'बाँगलादेश',
 'निypaxana',
 'खेलिए',
 'ओबी',
 'पंचवाद्य',
 'पैठे',
 'शॉर्टकट',
 'कल्लेश्वर',
 'फस',
 'योग्यता',
 'मूल्य',
 'बहादूर',
 'शतवार्षिकी',
 'संज्ञेय',
 'पोटैशियम',
 'पकिर',
 'अभियोक्ता',
 'दंजिग',
 'फायरफॉक्स',
 'दुकानों',
 'सुमित्रानन्दन',
 'तकरीरों',
 'परिचर्याअभियान',
 'मंडपम',
 'अञ्चल',
 'सन्तुष्ट',
 'ओरियन्टेशन',
 'मेल्टिंग',
 'शादियों',
 'फिजूलखर्ची',
 'रग्बी',
 'बत्रा',
 'पथनिर्देश',
 'अतऋऊण्श्छ्ष्याधिक',
 'कौसिलजनर

In [20]:
df['length_eng_sentence']=df['english_sentence'].apply(lambda x:len(x.split(" ")))
df['length_hin_sentence']=df['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [21]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,indic2012,islam is word from arabic and it full word is ...,START_ इस्लाम शब्द अरबी भाषा का शब्द है जिसका ...,14,21
1,ted,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END,7,9
2,tides,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END,6,9
3,tides,race equality new laws,START_ नये कानून नस्ली समानता _END,4,6
4,tides,the provision would not affect the power of pa...,START_ व्यवसायों आदि से होने वाली आय के बारे म...,22,24


In [22]:
df[df['length_eng_sentence']>30].shape

(4905, 5)

In [23]:
df=df[df['length_eng_sentence']<=20]
df=df[df['length_hin_sentence']<=20]

In [24]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
1,ted,everything is reliant on these computers working,START_ इन कंप्यूटरों पर सब कुछ निर्भर है _END,7,9
2,tides,parliament does not control the government,START_ संसद का सरकार पपर नियंत्रण नपहीं रहता _END,6,9
3,tides,race equality new laws,START_ नये कानून नस्ली समानता _END,4,6
6,ted,there was lasagna there was casseroles,START_ वहां लाजान्या था कैसेरोल थे _END,6,7
7,indic2012,super power india source google writer vedprat...,START_ महाशक्ति भारत गूगल पुस्तक लेखक वेदप्रता...,8,9


In [25]:
df.shape

(32971, 5)

In [26]:
print("maximum length of Hindi Sentence ",max(df['length_hin_sentence']))
print("maximum length of English Sentence ",max(df['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [27]:
max_length_src=max(df['length_hin_sentence'])
max_length_tar=max(df['length_eng_sentence'])

In [28]:
print(max_length_src)

20


In [29]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(45291, 52937)

In [30]:
num_decoder_tokens += 1 #for zero padding


In [31]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [32]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [33]:
df = shuffle(df)
df.head(10)

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
2977,tides,today s geoeconomics and china s success story...,START_ आज के भूअर्थशास्त्र और चीनी सफलता ने दू...,13,14
30070,indic2012,bikaner a place,START_ बीकानेर _END,3,3
36265,ted,most western intellectuals,START_ कई पाश्चात्य बुद्धिजीवी _END,3,5
38707,ted,and lets talk about jobs,START_ और हम नौकरियों के बारे में बात करते हैं...,5,11
4892,ted,i was born in mexico i grew up in mexico,START_ मैं मेक्सिको में पैदा हुआ मेक्सिको में ...,10,11
41217,tides,you and the adviser will make this agreement a...,START_ आप और सलाहकर दोनों मिल के यह करारनामा त...,17,19
42720,ted,and improvements,START_ और सुधार _END,2,4
24692,indic2012,introduction of cricket,START_ क्रिकेट का परिचय _END,3,5
25051,ted,it was abraham path day,START_ उसे अब्राहम पथ दिवस कहा जाता है। _END,5,9
39821,ted,he had some very complex models of how to survive,START_ उसके पास जंगल में जीवन को बचाने के बहुत...,10,15


### Split the data into train and test

In [34]:
X, y = df['english_sentence'], df['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((26376,), (6595,))

In [35]:
X_train

15888                                            dineelahi
32120    but tell me that you study trabeculae which is...
17754                          it represents the beginning
39430                               he talks like this now
49187    dhutarashtra was born blind so pandu had been ...
                               ...                        
28464        theres the punk on the ground puddle of blood
10334    she made calls to her cells at frequent interv...
271      submit it to reddit and the community of your ...
4872                                  so back to economics
16893          and this is what the world looks like today
Name: english_sentence, Length: 26376, dtype: object

In [36]:
y_train

15888                                START_ दीनएइलाही _END
32120    START_ ये भी बताईये कि आप trabeculae का अध्धयन...
17754    START_ यह पश्चिम को अफ़्रीका की कहानियाँ बताने...
39430                  START_ अब वह ऐसे बात करते हैं। _END
49187    START_ धृतराष्ट्र जन्म से ही नेत्रहीन थे अतः उ...
                               ...                        
28464    START_ वो गुंडा ज़मीन पर पड़ा था खून से सना हु...
10334    START_ उसने बिना इल्लियां लाए थोड़े थोड़े अंतर...
271      START_ उसे रेडिट पे डाल दें और आपके साथियों का...
4872       START_ तो वापस चलते हैं अर्थशास्त्र की तरफ _END
16893    START_ और ये है जैसा कि आज विश्व दिख रहा है। _END
Name: hindi_sentence, Length: 26376, dtype: object

### Let us save this data

In [37]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder-Decoder Architecture

In [38]:
latent_dim=300

In [39]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [40]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [41]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [42]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 300)            1358730   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 embedding_1 (Embedding)     (None, None, 300)            1588140   ['input_2[0][0]']         

In [49]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 12000
epochs = 80

In [None]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

model.save('/content/drive/MyDrive/Dataset/Machine_Translation/models/english_to_hindi_translator.h5')

  model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),


In [None]:
import pickle
with open('/content/drive/MyDrive/Dataset/Machine_Translation/models/english_to_hindi_translator2.pkl', 'wb') as file:
    pickle.dump(model, file)



In [None]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)


In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [None]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1


In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])