In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
%matplotlib inline

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import RNN, GRU, LSTM, Dense, Input, Embedding, Dropout, Activation, concatenate
from keras.layers import Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Model



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
english_data = []
with open('dict.te-en.en.txt', 'r', encoding='utf-8') as f:
    line = f.readlines()
    for word in line:
        english_data.append(word)
        
        
telugu_data = []
with open('dict.te-en.te.txt', 'r', encoding='utf-8') as f:
    line = f.readlines()
    for word in line:
        telugu_data.append(word)
        

In [3]:
data = {'english': english_data, 'telugu': telugu_data}
data_frame = pd.DataFrame(data)

In [4]:
data_frame.english = [re.sub(r'\s','', i) for i in data_frame.english]
data_frame.telugu = [re.sub(r'\s','', i) for i in data_frame.telugu]

In [5]:
import string
exclude = set(string.punctuation)
data_frame.english = data_frame.english.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
data_frame.telugu = data_frame.telugu.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [6]:
data_frame.telugu = data_frame.telugu.apply(lambda x : 'START_ '+ x + ' _END')


In [7]:
data_frame.head()

Unnamed: 0,english,telugu
0,Chitfundcircle,START_ చిట్‌ఫండుసర్కిలు _END
1,chitfundcircle,START_ చిట్‌ఫండుసర్కిలు _END
2,Poetically,START_ కవిత్వంచే _END
3,kavitwamchey,START_ కవిత్వంచే _END
4,poetry,START_ కవిత్వంచే _END


In [8]:
all_eng_words=set()
for eng in data_frame.english:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
    
all_tel_words=set()
for tel in data_frame.telugu:
    for word in tel.split():
        if word not in all_tel_words:
            all_tel_words.add(word)

In [9]:
len(all_eng_words), len( all_tel_words)

(26121, 12191)

In [10]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_tel_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_tel_words)

In [11]:
num_encoder_tokens, num_decoder_tokens

(26121, 12191)

In [32]:
max_eng_length = []
for word in data_frame.english:
    #print(word)
    max_eng_length.append(len(word.split(" ")))
    
np.max(max_eng_length)


max_tel_length = []
for tel in data_frame.telugu:
    #print(word)
    max_tel_length.append(len(tel.split(" ")))
    
np.max(max_tel_length)

3

In [33]:
np.max(max_eng_length)

1

In [12]:
input_token_index = dict(
    [(word, i) for i, word in enumerate(input_words)])
target_token_index = dict(
    [(word, i) for i, word in enumerate(target_words)])

In [13]:
input_token_index['definitely']

14497

In [14]:
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(data_frame.english)

english_index = eng_tokenizer.word_index

In [25]:
len( english_index)

21068

In [16]:
tel_tokenizer = Tokenizer(filters='')
tel_tokenizer.fit_on_texts(data_frame.telugu)

telugu_index = tel_tokenizer.word_index

In [54]:
len(telugu_index)

12191

In [17]:
len(english_index), len(telugu_index)

(21068, 12191)

In [29]:
embedding_file = 'glove.6B/glove.6B.200d.txt'
embeddings_index = {}
f = open(os.path.join('',embedding_file), encoding='utf8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [16]:
'''
#https://fasttext.cc/docs/en/english-vectors.html
import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data
    
    
telugu_vectors = load_vectors('cc.te.300.vec')


tel_num_words = len(all_tel_words)+1
telugu_embedding_matrix = np.zeros((tel_num_words, 300))
for word, i in telugu_index.items():
    if i > tel_num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        telugu_embedding_matrix[i] = embedding_vector
        

telugu_embedding_layer = Embedding(
    tel_num_words,
    300,
    weights=[telugu_embedding_matrix],
    input_length = num_decoder_tokens)
'''

In [27]:
len(all_eng_words)

26121

In [30]:
eng_num_words = len(all_eng_words)+1
english_embedding_matrix = np.zeros((eng_num_words, 200))
for word, i in english_index.items():
    if i > eng_num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        english_embedding_matrix[i] = embedding_vector

In [31]:
eng_num_words

26122

In [56]:
english_embedding_layer = Embedding(
    eng_num_words,
    200,
    weights=[english_embedding_matrix],
    input_length = num_encoder_tokens)


In [35]:
encoder_input_data = np.zeros(
    (len(data_frame.english), 1),
    dtype='float16')
decoder_input_data = np.zeros(
    (len(data_frame.telugu), 3),
    dtype='float16')
decoder_target_data = np.zeros(
    (len(data_frame.telugu), 3, num_decoder_tokens),
    dtype='float16')


predict_encoder_data = np.zeros(
    (1, 1),
    dtype='float16')

In [36]:
for i, (input_text, target_text) in enumerate(zip(data_frame.english, data_frame.telugu)):
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = input_token_index[word]
    for t, word in enumerate(target_text.split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = target_token_index[word]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[word]] = 1.

In [37]:
encoder_input_data.shape, decoder_target_data.shape

((38532, 1), (38532, 3, 12191))

In [207]:
encoder_input_data[:2]

array([[ 76.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.],
       [123., 140., 340.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.]], dtype=float32)

### Rough

In [139]:
## Rough 
for i, (eng, tel) in enumerate(zip(lines.eng, lines.tel)):
    for t, word in enumerate(eng.split()):
        #print(t)
        encoder_input_data[i,t] = w2i[word]
        

In [151]:
encoder_input_data[1,2] = 6

In [152]:
encoder_input_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 6., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [155]:
#------------------------------------------

In [38]:
# Model
encoder_inputs = Input(shape=(None,))
en_x=  Embedding(num_encoder_tokens, 200)(encoder_inputs)
encoder = LSTM(50, return_state=True)
encoder_outputs, state_h, state_c = encoder(en_x)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [39]:
decoder_inputs = Input(shape=(None,))
dex=  Embedding(num_decoder_tokens, 200)
final_dex= dex(decoder_inputs)

decoder_lstm = LSTM(50, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(final_dex,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [40]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 200)    5224200     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 200)    2438200     input_4[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LS

In [41]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=128,
          epochs=20,
          validation_split=0.03)

Train on 37376 samples, validate on 1156 samples
Epoch 1/20
 2368/37376 [>.............................] - ETA: 6:04 - loss: 4.8343 - acc: 0.3274

InvalidArgumentError: indices[4,1] = 12192 is not in [0, 12191)
	 [[Node: embedding_4/embedding_lookup = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@training_1/RMSprop/Assign_3"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_4/embeddings/read, embedding_4/Cast, training_1/RMSprop/gradients/embedding_4/embedding_lookup_grad/concat/axis)]]

In [24]:
for i, val in eng_tokenizer.word_index.items():
    if val == 12190:
        print(i)

trivikramarao


In [23]:
for i, val in tel_tokenizer.word_index.items():
    if val == 12190:
        print(i)
    

ఆయనపై


In [51]:
len(tel_tokenizer.word_index), len(eng_tokenizer.word_index)

(12191, 21068)

In [58]:
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 383, 300)          115200    
_________________________________________________________________
lstm_1 (LSTM)                [(None, 50), (None, 50),  70200     
Total params: 185,400
Trainable params: 185,400
Non-trainable params: 0
_________________________________________________________________


In [175]:
h = encoder_input_data[1031: 1032]

In [59]:
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

final_dex2= dex(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [60]:

def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 52):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [70]:
test = 'i dislike you'
for t, word in enumerate(test.split()):
    predict_encoder_data[0, t] = input_token_index[word]

In [71]:
decode_sequence(predict_encoder_data)

' నువ్వు బాగోలేదని విన్నాను _END'

In [185]:
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', lines.eng[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: definitely!
Decoded sentence:  తప్పకుండా _END
-
Input sentence: he hung up.
Decoded sentence:  మాకు ఫ్రెంచి అర్ధం కాదు _END
-
Input sentence: i ran home.
Decoded sentence:  నేను ఇంటికి పరిగెత్తాను _END
-
Input sentence: who are we?
Decoded sentence:  మేము ఎవరము ? _END
-
Input sentence: are you mad?
Decoded sentence:  కోపమొచ్చిందా ? _END
-
Input sentence: he touched me.
Decoded sentence:  ఈ పెన్సిళ్లు ఒకే రంగులో _END
-
Input sentence: my head hurts.
Decoded sentence:  నాకు గంట మోగటం వినపడింది _END
-
Input sentence: i drank coffee.
Decoded sentence:  నేను కాఫీ తాగాను _END
-
Input sentence: how tall is she?
Decoded sentence:  నువ్వు అది చూసావా ? _END
-
Input sentence: they're animals.
Decoded sentence:  కోపమొచ్చిందా ? _END
-
Input sentence: can you see that?
Decoded sentence:  నువ్వు చాలా సంతోషంగా ఉన్నట్లున్నావ్ _END
-
Input sentence: i began to speak.
Decoded sentence:  నేను మాట్లాడటం మొదలుపెట్టాను _END
-
Input sentence: i dislike coffee.
Decoded sentence:  మేము వినాలని

-
Input sentence: my sister is crazy about tennis.
Decoded sentence:  మా అక్కకి టెన్నిసంటే పిచ్చి _END
-
Input sentence: do you live in this neighborhood?
Decoded sentence:  నువ్వు చాలా సంతోషంగా ఉన్నట్లున్నావ్ _END
-
Input sentence: have you ever had a heart attack?
Decoded sentence:  నేను నువ్వైతే ఈరోజు అక్కడికి వెళ్ళను _END
-
Input sentence: i can't keep you here any longer.
Decoded sentence:  అది ఇక్కడ ఇంకా ఎక్కువ సమయం వుంచలేను _END
-
Input sentence: she refuses to say more about it.
Decoded sentence:  అది అంత సులభం ఏం కాదు _END
-
Input sentence: these pencils are the same color.
Decoded sentence:  ఈ పెన్సిళ్లు ఒకే రంగులో ఉన్నాయి _END


In [None]:
input_seq = 'Hello world'

encoder_input_data[seq_index: seq_index + 1]
decoded_sentence = decode_sequence(input_seq)
print('-')
print('Input sentence:', lines.eng[seq_index])
print('Decoded sentence:', decoded_sentence)

In [160]:
encoder_input_data[2: 3]

array([[143., 243., 138.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.]], dtype=float32)

In [3]:
input_text = []
target_input_text = []
target_text = []

for line in open('tel.txt', encoding='utf-8'):
    text = line.split('\t')
    source_text = text[0]
    translation = text[1]
    
    #target_translation = translation+'<eos>'
    #target_translation_input = '<sos>'+translation
    
    input_text.append(source_text)
    target_text.append(translation)
    target_input_text.append(translation)
    
    
output_input_text = target_input_text
output_text = target_text 
    
#target_input_text = target_input_text.insert('<sos>')
#target_text = target_text + '<eos>'

dummy = []
for i in target_input_text:
    dummy.append('<sos>'+str(i)+'<eos>')

print(len(input_text))
print(len(target_input_text))
print(len(target_text))

134
134
134


In [4]:
#input tokenizer
tokenizer_input = Tokenizer()
tokenizer_input.fit_on_texts(input_text)
input_sequences = tokenizer_input.texts_to_sequences(input_text)

In [5]:
input_index = tokenizer_input.word_index
len(input_index)

341

In [57]:
import operator
sorted(input_index.items(), key=operator.itemgetter(1))

[('you', 1),
 ('i', 2),
 ('to', 3),
 ('the', 4),
 ('is', 5),
 ('that', 6),
 ('a', 7),
 ('are', 8),
 ("don't", 9),
 ('do', 10),
 ('she', 11),
 ('he', 12),
 ('we', 13),
 ('it', 14),
 ('was', 15),
 ('this', 16),
 ('me', 17),
 ('my', 18),
 ('can', 19),
 ('in', 20),
 ('your', 21),
 ('very', 22),
 ('have', 23),
 ('about', 24),
 ('how', 25),
 ("i'm", 26),
 ('not', 27),
 ('where', 28),
 ('know', 29),
 ('of', 30),
 ('out', 31),
 ('for', 32),
 ("it's", 33),
 ('need', 34),
 ('really', 35),
 ('her', 36),
 ('all', 37),
 ('did', 38),
 ('no', 39),
 ('who', 40),
 ('coffee', 41),
 ('speak', 42),
 ('want', 43),
 ('what', 44),
 ('more', 45),
 ('be', 46),
 ('anything', 47),
 ('at', 48),
 ('made', 49),
 ('with', 50),
 ('there', 51),
 ('as', 52),
 ('go', 53),
 ('time', 54),
 ('when', 55),
 ('going', 56),
 ('say', 57),
 ('here', 58),
 ('one', 59),
 ('eat', 60),
 ('up', 61),
 ('home', 62),
 ("what's", 63),
 ('which', 64),
 ('help', 65),
 ('came', 66),
 ('make', 67),
 ("wasn't", 68),
 ('fault', 69),
 ("she's",

In [6]:
max_input_length = max(len(s) for s in input_sequences)
max_input_length

13

In [7]:
#output tokenizer
tokenizer_output = Tokenizer(filters='')
tokenizer_output.fit_on_texts(target_input_text + target_text)

target_sequences = tokenizer_output.texts_to_sequences(target_text)
target_input_sequences = tokenizer_output.texts_to_sequences(target_input_text)

401

In [8]:
max_output_length = max(len(s) for s in target_sequences)
max_output_length

10

In [9]:
target_input_length = max(len(s) for s in target_input_sequences)
target_input_length

10

In [49]:
#telugu index 
telugu_tokenizer = Tokenizer()
telugu_tokenizer.fit_on_texts(target_text)

In [10]:
output_index = tokenizer_output.word_index

In [33]:
len(output_index)+1

402

In [11]:
#padding
encoder_inputs = pad_sequences(input_sequences, max_input_length)
decoder_inputs = pad_sequences(target_input_sequences, max_output_length, padding='post')
decoder_output = pad_sequences(target_sequences, max_output_length, padding='post')

In [35]:
encoder_inputs.shape, decoder_inputs.shape, decoder_output.shape

((134, 13), (134, 10), (134, 10))

In [177]:
embedding_file = 'C:/Users/cvenkatanagasatya/Pictures/LazyProgrammer/machine_learning_examples/large_files/glove.6B/glove.6B.300d.txt'
embeddings_index = {}
f = open(os.path.join('',embedding_file), encoding='utf8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [195]:
num_words = len(input_index)+1
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in input_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

ValueError: could not broadcast input array from shape (300) into shape (100)

In [14]:
embedding_layer = Embedding(
    num_words,
    embed_size,
    weights=[embedding_matrix],
    input_length = max_input_length)

In [1]:
len(input_text), max_input_length, len(output_index)+1

NameError: name 'input_text' is not defined

In [15]:
decoder_targets_one_hot = np.zeros(
  (
    len(input_text),
    max_input_length,
    num_words+1
  ),
  dtype='float32'
)


In [16]:

# assign the values
for i, d in enumerate(decoder_output):
    for t, word in enumerate(d):
        if word != 0:
            decoder_targets_one_hot[i, t, word] = 1

IndexError: index 343 is out of bounds for axis 2 with size 343

In [38]:
import tensorflow as tf
tf.executing_eagerly()

False

In [165]:
test = []
for i,d in enumerate(decoder_output):
   # print(i)
    for t, word in enumerate(d):
        test.append(word)
        #decoder_target_one_hot[i, t, word] = 1

In [37]:
type(max_input_length)

int

In [40]:
#model
encoder_input = Input(shape=(None, ))
embed = embedding_layer(encoder_input)
encoder = LSTM(100, return_state = True)
encoder_output, h, c = encoder(embed)

encoder_states = [h,c]

In [41]:
decoder_input = Input(shape=(None, ))
embed_decoder = embedding_layer(decoder_input)
decoder_lstm = LSTM(100, return_state=True)
decoder_target, _, _ = decoder_lstm(embed_decoder, initial_state = encoder_states)

decoder_dense = Dense(max_output_length, activation='softmax')
decoder_target = decoder_dense(decoder_target)

In [42]:
model = Model([encoder_input, decoder_input], decoder_target)

In [43]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 13, 100)      34200       input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, 100), (None, 80400       embedding_3[0][0]                
__________

In [44]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.fit([encoder_inputs, decoder_inputs], decoder_output,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Train on 107 samples, validate on 27 samples
Epoch 1/10


InvalidArgumentError: indices[13,3] = 342 is not in [0, 342)
	 [[Node: embedding_3_1/embedding_lookup = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_3/embeddings/read, embedding_3_1/Cast, embedding_3/embedding_lookup/axis)]]