In [1]:
from numpy import array
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
import random

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the transliteration pairs: one "<source word>\t<Hindi word>" pair per line.
# The file is opened in a context manager so the handle is closed promptly, and the
# encoding is explicit because the targets are Devanagari text (assumes the file is
# UTF-8 -- TODO confirm) instead of relying on the platform's default codec.
with open('train_temp.txt', encoding='utf-8') as dataset_file:
    dataset = dataset_file.read().split('\n')

# NOTE(review): the shuffle is unseeded, so the fixed-index train/validation split
# taken later in the notebook differs between kernel restarts; call random.seed(...)
# first if runs must be reproducible.
random.shuffle(dataset)
eng, hin = list(), list()
eng_characters, hin_characters = set(), set()

for tup in dataset:
    # Skip blank lines (e.g. the empty string left after a trailing newline).
    if tup == '':
        continue

    eng_and_hindi = tup.split('\t')

    # Drop one specific row, identified by its first field.
    # NOTE(review): field 0 is treated as the English side everywhere else, yet this
    # filter compares it with a Devanagari string -- presumably a known-bad row in
    # the data file; confirm.
    if eng_and_hindi[0].strip() == 'डॅम्प्सकीबेसेल्सकॅबेट':
        continue

    eng.append(eng_and_hindi[0])
    # '\n' marks the end of the target word; predict_sequence stops decoding on it.
    hin.append(eng_and_hindi[1] + '\n')

    # Collect the character vocabularies (sets ignore duplicates on their own).
    eng_characters.update(eng_and_hindi[0])
    hin_characters.update(eng_and_hindi[1])

# The end-of-word marker is part of the target vocabulary as well.
hin_characters.add('\n')

In [3]:
# Vocabularies in sorted order (so the character -> index assignment is deterministic)
# and the tensor dimensions derived from them.
input_characters = sorted(eng_characters)
target_characters = sorted(hin_characters)
num_encoder_tokens, num_decoder_tokens = len(input_characters), len(target_characters)

# Longest word on each side; target words already include the trailing '\n' marker.
max_encoder_seq_length = max(len(word) for word in eng)
max_decoder_seq_length = max(len(word) for word in hin)

In [4]:
# Dataset summary, one statistic per output line.
print(
    'Number of samples: {} {}'.format(len(eng), len(hin)),
    'Number of unique input tokens: {}'.format(num_encoder_tokens),
    'Number of unique output tokens: {}'.format(num_decoder_tokens),
    'Max sequence length for inputs: {}'.format(max_encoder_seq_length),
    'Max sequence length for outputs: {}'.format(max_decoder_seq_length),
    sep='\n',
)

Number of samples: 107206 107206
Number of unique input tokens: 61
Number of unique output tokens: 107
Max sequence length for inputs: 24
Max sequence length for outputs: 27


In [5]:
# Forward lookups (character -> one-hot column) used when building the tensors.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# Reverse lookups (one-hot column -> character) used when decoding predictions.
input_index_token = dict(enumerate(input_characters))
target_index_token = dict(enumerate(target_characters))

## Model 1: LSTM encoder + LSTM decoder

In [6]:
# Baseline encoder-decoder:
#   1. the encoder LSTM reads the one-hot input word and emits one 512-d summary vector,
#   2. RepeatVector copies that vector once per output time step,
#   3. the decoder LSTM produces a 512-d state for every step,
#   4. a per-step softmax picks one target character out of num_decoder_tokens.
model = Sequential([
    LSTM(512, input_shape=(max_encoder_seq_length, num_encoder_tokens)),
    RepeatVector(max_decoder_seq_length),
    LSTM(512, return_sequences=True),
    TimeDistributed(Dense(num_decoder_tokens, activation='softmax')),
])

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [7]:
def encode(inp, seq_length=None, token_index=None):
    """One-hot encode a batch of input words for the encoder.

    Args:
        inp: sequence of strings, one word per sample.
        seq_length: number of time steps in the output tensor. Defaults to the
            notebook-level ``max_encoder_seq_length``.
        token_index: mapping from character to one-hot column. Defaults to the
            notebook-level ``input_token_index`` (with ``num_encoder_tokens``
            columns); for a custom mapping the width is its largest index + 1.

    Returns:
        float32 array of shape ``(len(inp), seq_length, num_tokens)``. Time steps
        past the end of a word are left all-zero.

    Raises:
        IndexError: a word is longer than ``seq_length``.
        KeyError: a word contains a character missing from ``token_index``.
    """
    if seq_length is None:
        seq_length = max_encoder_seq_length
    if token_index is None:
        token_index = input_token_index
        num_tokens = num_encoder_tokens
    else:
        # default=-1 keeps an empty mapping valid (zero-width one-hot axis).
        num_tokens = max(token_index.values(), default=-1) + 1

    encoder_input_data = np.zeros((len(inp), seq_length, num_tokens),
                                  dtype='float32')

    for i, input_text in enumerate(inp):
        # Fail with a readable message instead of numpy's bare out-of-bounds error.
        if len(input_text) > seq_length:
            raise IndexError(
                'input {!r} has {} characters but the encoder accepts at most {}'
                .format(input_text, len(input_text), seq_length))
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, token_index[char]] = 1.

    return encoder_input_data

In [8]:
def decoder(inp, seq_length=None, token_index=None):
    """One-hot encode a batch of target words (the decoder's training targets).

    Despite its name this function does not decode anything; it builds the
    target tensor in the same way ``encode`` builds the input tensor.

    Args:
        inp: sequence of strings, one target word per sample.
        seq_length: number of time steps in the output tensor. Defaults to the
            notebook-level ``max_decoder_seq_length``.
        token_index: mapping from character to one-hot column. Defaults to the
            notebook-level ``target_token_index`` (with ``num_decoder_tokens``
            columns); for a custom mapping the width is its largest index + 1.

    Returns:
        float32 array of shape ``(len(inp), seq_length, num_tokens)``. Time steps
        past the end of a word are left all-zero.

    Raises:
        IndexError: a word is longer than ``seq_length``.
        KeyError: a word contains a character missing from ``token_index``.
    """
    if seq_length is None:
        seq_length = max_decoder_seq_length
    if token_index is None:
        token_index = target_token_index
        num_tokens = num_decoder_tokens
    else:
        # default=-1 keeps an empty mapping valid (zero-width one-hot axis).
        num_tokens = max(token_index.values(), default=-1) + 1

    decoder_target_data = np.zeros((len(inp), seq_length, num_tokens),
                                   dtype='float32')

    for i, target_text in enumerate(inp):
        # Fail with a readable message instead of numpy's bare out-of-bounds error.
        if len(target_text) > seq_length:
            raise IndexError(
                'target {!r} has {} characters but the decoder accepts at most {}'
                .format(target_text, len(target_text), seq_length))
        for t, char in enumerate(target_text):
            decoder_target_data[i, t, token_index[char]] = 1.

    return decoder_target_data

In [9]:
# Materialise the dense one-hot tensors for the whole corpus in one go:
# inputs are (len(eng), max_encoder_seq_length, num_encoder_tokens) and
# targets are (len(hin), max_decoder_seq_length, num_decoder_tokens), both float32.
encoder_input_data = encode(inp=eng)
decoder_target_data = decoder(inp=hin)

In [10]:
# Keep only the epoch with the lowest validation loss on disk.
filename = 'model.h5'
checkpoint = ModelCheckpoint(
    filepath=filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [11]:
# Hold-out split by position: the first 80000 (already shuffled) samples train the
# models, everything after that is used as validation data during fitting.
trainX, testX = encoder_input_data[:80000], encoder_input_data[80000:]
trainY, testY = decoder_target_data[:80000], decoder_target_data[80000:]

In [161]:
# Train the baseline model; `checkpoint` writes the best-val_loss epoch to disk.
# NOTE(review): the prediction cells use the in-memory `model` as it stands after the
# last epoch, not the checkpointed best epoch -- reload the checkpoint file if the
# best weights are wanted.
model.fit(
    x=trainX,
    y=trainY,
    batch_size=64,
    epochs=15,
    verbose=2,
    callbacks=[checkpoint],
    validation_data=(testX, testY),
)

Train on 80000 samples, validate on 27437 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 0.58171, saving model to model.h5
 - 139s - loss: 0.5880 - val_loss: 0.5817
Epoch 2/15

Epoch 00002: val_loss improved from 0.58171 to 0.44805, saving model to model.h5
 - 137s - loss: 0.5393 - val_loss: 0.4481
Epoch 3/15

Epoch 00003: val_loss improved from 0.44805 to 0.31281, saving model to model.h5
 - 135s - loss: 0.3864 - val_loss: 0.3128
Epoch 4/15

Epoch 00004: val_loss improved from 0.31281 to 0.23194, saving model to model.h5
 - 135s - loss: 0.2732 - val_loss: 0.2319
Epoch 5/15

Epoch 00005: val_loss improved from 0.23194 to 0.18166, saving model to model.h5
 - 135s - loss: 0.2041 - val_loss: 0.1817
Epoch 6/15

Epoch 00006: val_loss improved from 0.18166 to 0.15553, saving model to model.h5
 - 135s - loss: 0.1630 - val_loss: 0.1555
Epoch 7/15

Epoch 00007: val_loss improved from 0.15553 to 0.14196, saving model to model.h5
 - 135s - loss: 0.1392 - val_loss: 0.1420
Epoch 8/1

<keras.callbacks.History at 0x7fc5b8d422b0>

## Predict

In [16]:
def predict_sequence(model, source):
    """Greedily decode the first sample of ``source`` into target characters.

    Runs ``model.predict`` on ``source``, takes the most probable token at every
    time step of the first sample and maps it through ``target_index_token``,
    stopping at the first ``'\\n'`` (the end-of-word marker).

    Returns:
        A ``(text, characters)`` tuple: the decoded characters joined with single
        spaces, and the same characters as a list.
    """
    # Only sample 0 is decoded, even when `source` holds a larger batch.
    best_token_ids = np.argmax(model.predict(source, verbose=0)[0], axis=-1)

    decoded = []
    for token_id in best_token_ids:
        symbol = target_index_token[token_id]
        if symbol == '\n':
            break
        decoded.append(symbol)

    return ' '.join(decoded), decoded

In [163]:
# Spot-check model 1 on 'publics'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['publics'])))

('प स ् ् ि क', ['प', 'स', '्', '्', 'ि', 'क'])


In [164]:
# Spot-check model 1 on 'mobile'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['mobile'])))

('म ो ब ा इ ल', ['म', 'ो', 'ब', 'ा', 'इ', 'ल'])


In [165]:
# Spot-check model 1 on 'anshul'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['anshul'])))

('आ ं स ु ल', ['आ', 'ं', 'स', 'ु', 'ल'])


In [166]:
# Spot-check model 1 on 'anurag'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['anurag'])))

('अ ं ु र ा', ['अ', 'ं', 'ु', 'र', 'ा'])


In [167]:
# Spot-check model 1 on 'anubha'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['anubha'])))

('अ ं ु र ा', ['अ', 'ं', 'ु', 'र', 'ा'])


In [168]:
# Spot-check model 1 on 'aakansha'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['aakansha'])))

('आ ँ ा क न', ['आ', 'ँ', 'ा', 'क', 'न'])


In [169]:
# Spot-check model 1 on 'akanksha'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['akanksha'])))

('अ क ा क क श', ['अ', 'क', 'ा', 'क', 'क', 'श'])


In [170]:
# Spot-check model 1 on 'ayush'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['ayush'])))

('आ य ु ष ् न', ['आ', 'य', 'ु', 'ष', '्', 'न'])


In [171]:
# Spot-check model 1 on 'deepanshu'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['deepanshu'])))

('द ी ं द ु स ् स', ['द', 'ी', 'ं', 'द', 'ु', 'स', '्', 'स'])


In [172]:
# Spot-check model 1 on 'randy'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['randy'])))

('र ं ं द द', ['र', 'ं', 'ं', 'द', 'द'])


In [174]:
# Spot-check model 1 on 'barcelona'; prints (space-joined text, character list).
print(predict_sequence(model, encode(['barcelona'])))

('ब ा र ् ल ् ्', ['ब', 'ा', 'र', '्', 'ल', '्', '्'])


## Model 2: bidirectional LSTM encoder + LSTM decoder

In [12]:
# Same pipeline as the baseline, except that the encoder reads the input word in
# both directions (Bidirectional wrapper around the encoder LSTM).
model2 = Sequential([
    Bidirectional(LSTM(512), input_shape=(max_encoder_seq_length, num_encoder_tokens)),
    RepeatVector(max_decoder_seq_length),
    LSTM(512, return_sequences=True),
    TimeDistributed(Dense(num_decoder_tokens, activation='softmax')),
])

model2.compile(loss='categorical_crossentropy', optimizer='adam')

In [13]:
# Re-point the shared `filename` / `checkpoint` names at model 2's weight file;
# only the epoch with the lowest validation loss is kept on disk.
filename = 'model2.h5'
checkpoint = ModelCheckpoint(
    filepath=filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [14]:
# Train model 2 on the same split; `checkpoint` must currently point at model2.h5.
# NOTE(review): later predictions use the in-memory `model2` from the last epoch,
# not the checkpointed best epoch.
model2.fit(
    x=trainX,
    y=trainY,
    batch_size=64,
    epochs=15,
    verbose=2,
    callbacks=[checkpoint],
    validation_data=(testX, testY),
)

Train on 80000 samples, validate on 27206 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 0.13622, saving model to model2.h5
 - 219s - loss: 0.2588 - val_loss: 0.1362
Epoch 2/15

Epoch 00002: val_loss improved from 0.13622 to 0.11367, saving model to model2.h5
 - 198s - loss: 0.1198 - val_loss: 0.1137
Epoch 3/15

Epoch 00003: val_loss improved from 0.11367 to 0.10258, saving model to model2.h5
 - 198s - loss: 0.0986 - val_loss: 0.1026
Epoch 4/15

Epoch 00004: val_loss improved from 0.10258 to 0.09651, saving model to model2.h5
 - 197s - loss: 0.0856 - val_loss: 0.0965
Epoch 5/15

Epoch 00005: val_loss improved from 0.09651 to 0.09601, saving model to model2.h5
 - 198s - loss: 0.0765 - val_loss: 0.0960
Epoch 6/15

Epoch 00006: val_loss improved from 0.09601 to 0.09352, saving model to model2.h5
 - 197s - loss: 0.0681 - val_loss: 0.0935
Epoch 7/15

Epoch 00007: val_loss did not improve
 - 197s - loss: 0.0611 - val_loss: 0.0949
Epoch 8/15

Epoch 00008: val_loss did not impr

<keras.callbacks.History at 0x7f1159cfce10>

In [17]:
# Spot-check model 2 on 'deepanshu'; prints (space-joined text, character list).
print(predict_sequence(model2, encode(['deepanshu'])))

('द ी प श ं श', ['द', 'ी', 'प', 'श', 'ं', 'श'])


In [18]:
# Spot-check model 2 on 'publics'; prints (space-joined text, character list).
print(predict_sequence(model2, encode(['publics'])))

('प ब ् ल ि क ् स', ['प', 'ब', '्', 'ल', 'ि', 'क', '्', 'स'])


In [19]:
# Spot-check model 2 on 'sachin'; prints (space-joined text, character list).
print(predict_sequence(model2, encode(['sachin'])))

('स च ि न', ['स', 'च', 'ि', 'न'])


In [20]:
# Spot-check model 2 on 'india'; prints (space-joined text, character list).
print(predict_sequence(model2, encode(['india'])))

('इ ं ड ि य ा', ['इ', 'ं', 'ड', 'ि', 'य', 'ा'])


In [21]:
# Spot-check model 2 on 'anshul'; prints (space-joined text, character list).
print(predict_sequence(model2, encode(['anshul'])))

('अ ं श ु ल', ['अ', 'ं', 'श', 'ु', 'ल'])


In [22]:
# Spot-check model 2 on 'anurag'; prints (space-joined text, character list).
print(predict_sequence(model2, encode(['anurag'])))

('अ न ु र ा ग', ['अ', 'न', 'ु', 'र', 'ा', 'ग'])


In [23]:
# Spot-check model 2 on 'anubha'; prints (space-joined text, character list).
print(predict_sequence(model2, encode(['anubha'])))

('अ न ु ब ा', ['अ', 'न', 'ु', 'ब', 'ा'])


In [24]:
# Spot-check model 2 on 'akanksha'; prints (space-joined text, character list).
print(predict_sequence(model2, encode(['akanksha'])))

('अ क ं ं ् ा', ['अ', 'क', 'ं', 'ं', '्', 'ा'])


## Model 3: bidirectional LSTM encoder + bidirectional LSTM decoder

In [25]:
# Model 3: both the encoder and the decoder LSTM are wrapped in Bidirectional;
# the repeat step and the per-step softmax are the same as in the earlier models.
model3 = Sequential([
    Bidirectional(LSTM(512), input_shape=(max_encoder_seq_length, num_encoder_tokens)),
    RepeatVector(max_decoder_seq_length),
    Bidirectional(LSTM(512, return_sequences=True)),
    TimeDistributed(Dense(num_decoder_tokens, activation='softmax')),
])

model3.compile(loss='categorical_crossentropy', optimizer='adam')

# Re-point the shared `filename` / `checkpoint` names at model 3's weight file;
# only the epoch with the lowest validation loss is kept on disk.
filename = 'model3.h5'
checkpoint = ModelCheckpoint(
    filepath=filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [26]:
# Train model 3 on the same split; `checkpoint` must currently point at model3.h5.
# NOTE(review): later predictions use the in-memory `model3` from the last epoch,
# not the checkpointed best epoch.
model3.fit(
    x=trainX,
    y=trainY,
    batch_size=64,
    epochs=15,
    verbose=2,
    callbacks=[checkpoint],
    validation_data=(testX, testY),
)

Train on 80000 samples, validate on 27206 samples
Epoch 1/15

Epoch 00001: val_loss improved from inf to 0.14292, saving model to model3.h5
 - 322s - loss: 0.2500 - val_loss: 0.1429
Epoch 2/15

Epoch 00002: val_loss improved from 0.14292 to 0.11478, saving model to model3.h5
 - 320s - loss: 0.1254 - val_loss: 0.1148
Epoch 3/15

Epoch 00003: val_loss improved from 0.11478 to 0.10344, saving model to model3.h5
 - 322s - loss: 0.1022 - val_loss: 0.1034
Epoch 4/15

Epoch 00004: val_loss improved from 0.10344 to 0.09916, saving model to model3.h5
 - 321s - loss: 0.0887 - val_loss: 0.0992
Epoch 5/15

Epoch 00005: val_loss improved from 0.09916 to 0.09598, saving model to model3.h5
 - 319s - loss: 0.0791 - val_loss: 0.0960
Epoch 6/15

Epoch 00006: val_loss improved from 0.09598 to 0.09501, saving model to model3.h5
 - 319s - loss: 0.0707 - val_loss: 0.0950
Epoch 7/15

Epoch 00007: val_loss improved from 0.09501 to 0.09419, saving model to model3.h5
 - 322s - loss: 0.0640 - val_loss: 0.0942
Ep

<keras.callbacks.History at 0x7f1158218438>

In [27]:
# Spot-check model 3 on 'deepanshu'; prints (space-joined text, character list).
print(predict_sequence(model3, encode(['deepanshu'])))

('द ी प ा श ु श', ['द', 'ी', 'प', 'ा', 'श', 'ु', 'श'])


In [28]:
# Spot-check model 3 on 'Anshul'; prints (space-joined text, character list).
# NOTE(review): capitalised here but lowercase ('anshul') in the checks of the other
# models, so the outputs are not a like-for-like comparison.
print(predict_sequence(model3, encode(['Anshul'])))

('अ ं श ु ल', ['अ', 'ं', 'श', 'ु', 'ल'])


In [29]:
# Spot-check model 3 on 'Anurag'; prints (space-joined text, character list).
# NOTE(review): capitalised here but lowercase ('anurag') in the checks of the other
# models, so the outputs are not a like-for-like comparison.
print(predict_sequence(model3, encode(['Anurag'])))

('अ ु न ु र ा ग', ['अ', 'ु', 'न', 'ु', 'र', 'ा', 'ग'])


In [30]:
# Spot-check model 3 on 'Anubha'; prints (space-joined text, character list).
# NOTE(review): capitalised here but lowercase ('anubha') in the checks of the other
# models, so the outputs are not a like-for-like comparison.
print(predict_sequence(model3, encode(['Anubha'])))

('अ ु भ ह ा ा', ['अ', 'ु', 'भ', 'ह', 'ा', 'ा'])


In [31]:
# Spot-check model 3 on 'laptop'; prints (space-joined text, character list).
print(predict_sequence(model3, encode(['laptop'])))

('ल प प ट प', ['ल', 'प', 'प', 'ट', 'प'])


In [32]:
# Spot-check model 3 on 'akanksha'; prints (space-joined text, character list).
print(predict_sequence(model3, encode(['akanksha'])))

('आ क क ं ं ् ष ा', ['आ', 'क', 'क', 'ं', 'ं', '्', 'ष', 'ा'])


## Model 4: bidirectional LSTM encoder + stacked LSTM decoder with dropout

In [33]:
# Model 4: bidirectional encoder followed by a two-layer decoder stack
# (LSTM 1024 -> LSTM 512), with 20% dropout after each decoder layer and the usual
# per-step softmax over the target characters.
model4 = Sequential([
    Bidirectional(LSTM(512, return_sequences=False),
                  input_shape=(max_encoder_seq_length, num_encoder_tokens)),
    RepeatVector(max_decoder_seq_length),
    LSTM(1024, return_sequences=True),
    Dropout(0.2),
    LSTM(512, return_sequences=True),
    Dropout(0.2),
    TimeDistributed(Dense(num_decoder_tokens, activation='softmax')),
])

model4.compile(loss='categorical_crossentropy', optimizer='adam')

# Re-point the shared `filename` / `checkpoint` names at model 4's weight file;
# only the epoch with the lowest validation loss is kept on disk.
filename = 'model4.h5'
checkpoint = ModelCheckpoint(
    filepath=filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [34]:
# Train model 4 (12 epochs rather than 15) on the same split; `checkpoint` must
# currently point at model4.h5.
# NOTE(review): later predictions use the in-memory `model4` from the last epoch,
# not the checkpointed best epoch.
model4.fit(
    x=trainX,
    y=trainY,
    batch_size=64,
    epochs=12,
    verbose=2,
    callbacks=[checkpoint],
    validation_data=(testX, testY),
)

Train on 80000 samples, validate on 27206 samples
Epoch 1/12

Epoch 00001: val_loss improved from inf to 0.16262, saving model to model4.h5
 - 382s - loss: 0.3620 - val_loss: 0.1626
Epoch 2/12

Epoch 00002: val_loss improved from 0.16262 to 0.11648, saving model to model4.h5
 - 379s - loss: 0.1403 - val_loss: 0.1165
Epoch 3/12

Epoch 00003: val_loss improved from 0.11648 to 0.10448, saving model to model4.h5
 - 379s - loss: 0.1099 - val_loss: 0.1045
Epoch 4/12

Epoch 00004: val_loss improved from 0.10448 to 0.09794, saving model to model4.h5
 - 376s - loss: 0.0943 - val_loss: 0.0979
Epoch 5/12

Epoch 00005: val_loss improved from 0.09794 to 0.09490, saving model to model4.h5
 - 380s - loss: 0.0837 - val_loss: 0.0949
Epoch 6/12

Epoch 00006: val_loss improved from 0.09490 to 0.09356, saving model to model4.h5
 - 377s - loss: 0.0748 - val_loss: 0.0936
Epoch 7/12

Epoch 00007: val_loss did not improve
 - 376s - loss: 0.0678 - val_loss: 0.0947
Epoch 8/12

Epoch 00008: val_loss improved fro

<keras.callbacks.History at 0x7f11582180f0>

In [35]:
# Spot-check model 4 on 'deepanshu'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['deepanshu'])))

('द ी प ा न ू', ['द', 'ी', 'प', 'ा', 'न', 'ू'])


In [36]:
# Spot-check model 4 on 'anshul'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['anshul'])))

('अ ं श ु ल', ['अ', 'ं', 'श', 'ु', 'ल'])


In [37]:
# Spot-check model 4 on 'anurag'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['anurag'])))

('अ न ु र ा ग', ['अ', 'न', 'ु', 'र', 'ा', 'ग'])


In [38]:
# Spot-check model 4 on 'ayush'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['ayush'])))

('आ य ू ष', ['आ', 'य', 'ू', 'ष'])


In [39]:
# Spot-check model 4 on 'anubha'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['anubha'])))

('अ न ू भ ा', ['अ', 'न', 'ू', 'भ', 'ा'])


In [40]:
# Spot-check model 4 on 'akanksha'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['akanksha'])))

('अ क ा श ् क ा', ['अ', 'क', 'ा', 'श', '्', 'क', 'ा'])


In [41]:
# Spot-check model 4 on 'aakanksha'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['aakanksha'])))

('आ क ा श क क ा', ['आ', 'क', 'ा', 'श', 'क', 'क', 'ा'])


In [42]:
# Spot-check model 4 on 'india'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['india'])))

('इ ं ड ि य ा', ['इ', 'ं', 'ड', 'ि', 'य', 'ा'])


In [43]:
# Spot-check model 4 on 'pooja'; prints (space-joined text, character list).
print(predict_sequence(model4, encode(['pooja'])))


('प ू ज ा', ['प', 'ू', 'ज', 'ा'])
