<a href="https://colab.research.google.com/github/Bindhya-K/Language-Translation/blob/master/LanguageTranslation_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
from keras.models import Model
from keras.layers import TimeDistributed
from keras.optimizers import Adam
from keras.layers import *
from keras.preprocessing.text import Tokenizer
import numpy as np
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Sentence-pair file, given relative to the current working directory.
data_path = "fra-eng.txt"
# Upper bound on the number of lines taken from the file.
num_samples = 100_000

In [3]:
# Reading the data
# Every usable line carries a source sentence and its translation separated by
# a tab. Lines with more than two tab-separated fields (for instance a trailing
# attribution column) or with fewer than two (blank/malformed lines) used to
# raise ValueError on tuple unpacking; now only the first two fields are used
# and lines without a pair are skipped.
input_texts = []
target_texts = []
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
# `len(lines) - 1` leaves out the last element of the split, which is the empty
# string whenever the file ends with a newline.
for line in lines[: min(num_samples, len(lines) - 1)]:
    fields = line.split('\t')
    if len(fields) < 2:
        # Nothing to pair up on this line.
        continue
    input_text, target_text = fields[0], fields[1]
    input_texts.append(input_text)
    target_texts.append(target_text)

In [4]:
# Number of elements obtained by splitting the file content on '\n'
# (displayed as the cell output).
len(lines)

82685

In [5]:
#Cleaning the data
def clean_data(sentence):
  """Lower-case `sentence` and drop every character that is neither a word
  character nor whitespace.

  Returns:
    A one-element list wrapping the cleaned string.
  """
  non_word_or_space = re.compile(r'[^\w\s]')
  return [non_word_or_space.sub('', sentence.lower())]

In [6]:
# tokenizing the data
def tokenize(sentence):
    """Fit a fresh Tokenizer on `sentence` and encode those same texts with it.

    Returns:
        A pair `(sequences, tokenizer)`: the result of
        `tokenizer.texts_to_sequences(sentence)` and the fitted tokenizer.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentence)
    encoded = fitted_tokenizer.texts_to_sequences(sentence)
    return encoded, fitted_tokenizer

In [7]:
# Run clean_data over every source and target sentence.
cleaned_eng_sentence = list(map(clean_data, input_texts))
cleaned_lan1_sentence = list(map(clean_data, target_texts))

In [8]:
# One tokenizer per language, each fitted on (and used to encode) its own texts.
# Note: the raw `input_texts` / `target_texts` are passed here.
eng_text_tokenized, eng_text_tokenizer = tokenize(input_texts)
lan1_text_tokenized, lan1_text_tokenizer = tokenize(target_texts)

In [9]:
# English and language1 vocabulary size
# (the +1 presumably accounts for index 0, which is not a key of word_index)
eng_vocab = 1 + len(eng_text_tokenizer.word_index)
lan1_vocab = 1 + len(lan1_text_tokenizer.word_index)

# Maximum sentence length, i.e. the length of the longest encoded sequence
eng_sentence_max_length = max(len(tokens) for tokens in eng_text_tokenized)
lan1_sentence_max_length = max(len(tokens) for tokens in lan1_text_tokenized)

print('English Vocabulary size:', eng_vocab)
print('Language1 vocab size:', lan1_vocab)
print('English sentence max length:', eng_sentence_max_length)
print('Language1 sentence max length:', lan1_sentence_max_length)

English Vocabulary size: 8038
Language1 vocab size: 17502
English sentence max length: 8
Language1 sentence max length: 15


In [10]:
# Pad each language's sequences at the end ('post') up to that language's
# maximum sentence length.
eng_pad_sequence = pad_sequences(
    eng_text_tokenized,
    maxlen=eng_sentence_max_length,
    padding='post',
)
lan1_pad_Sequence = pad_sequences(
    lan1_text_tokenized,
    maxlen=lan1_sentence_max_length,
    padding='post',
)

In [11]:
# Show the first padded source row and the first padded target row, one per line.
print(eng_pad_sequence[0], lan1_pad_Sequence[0], sep='\n')

[37  0  0  0  0  0  0  0]
[96  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [12]:
# building the model
# Encoder side: token ids -> 128-dimensional embeddings -> last output of a
# 64-unit LSTM (return_sequences=False keeps a single vector per sentence).
input_sequence = Input(shape=(eng_sentence_max_length,))
embedding = Embedding(input_dim=eng_vocab, output_dim=128)(input_sequence)
encoder = LSTM(units=64, return_sequences=False)(embedding)

# Decoder side: the encoder vector is repeated once per target position and
# run through a second LSTM that emits one output per position.
r_vec = RepeatVector(n=lan1_sentence_max_length)(encoder)
decoder = LSTM(units=64, return_sequences=True, dropout=0.2)(r_vec)

# The same Dense layer, sized to the target vocabulary, is applied at every position.
logits = TimeDistributed(layer=Dense(units=lan1_vocab))(decoder)

In [13]:
# Assemble and compile the model (no training happens in this cell):
# a softmax activation is applied on top of `logits` to form the model output.
model = Model(inputs=input_sequence, outputs=Activation('softmax')(logits))
model.compile(
    optimizer=Adam(0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 8)]               0         
                                                                 
 embedding (Embedding)       (None, 8, 128)            1028864   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 repeat_vector (RepeatVecto  (None, 15, 64)            0         
 r)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 15, 64)            33024     
                                                                 
 time_distributed (TimeDist  (None, 15, 17502)         1137630   
 ributed)                                                    

In [None]:
# Fit on the padded source sequences (inputs) against the padded target
# sequences (labels); the return value of fit() is kept in `results`.
results = model.fit(
    x=eng_pad_sequence,
    y=lan1_pad_Sequence,
    batch_size=30,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30

In [88]:
# Translate one sentence of the data set and print it next to its reference.
# Fixes: the input() prompt was not quoted (syntax error), the final print used
# the misspelled name `mapped_preiction` (NameError), and predict() was run on
# the whole data set just to read a single row.

# Reverse lookup for the target language: token id -> word.
index_words = {idx: word for word, idx in lan1_text_tokenizer.word_index.items()}
# Id 0 is not a key of word_index, so give it a visible placeholder.
index_words[0] = '<empty>'

index = int(input('enter the index: '))

# Predict only the requested row. Indexing with a one-element list keeps the
# batch axis and, unlike the slice index:index+1, still accepts negative indices.
predictions = np.argmax(model.predict(eng_pad_sequence[[index]])[0], axis=1)
mapped_prediction = [index_words[i] for i in predictions]

print('The English Sentence :', input_texts[index])
print('The French sentence :', target_texts[index])
print('The predicted french sentence:', ' '.join(mapped_prediction))

te vous tu confiance <empty> <empty> <empty> <empty> <empty> <empty>


In [86]:
# Reference target sentence stored at position 9999 (displayed as the cell output).
target_texts[9999]

'Me faites-vous confiance ?'

In [81]:
# sentence prediction
def output_sentence(model,tokenizer):
  """Return the module-level `index_words` object.

  NOTE(review): both parameters are currently ignored and the body relies on
  a global `index_words` defined elsewhere in the notebook. This looks like an
  unfinished stub. TODO: decode predictions into a sentence, or remove.

  Args:
    model: unused.
    tokenizer: unused.

  Returns:
    The global `index_words`.
  """

  return index_words

# Disabled usage example kept as a bare string literal. Because it is the last
# expression of the cell, the notebook echoes the string as the cell output.
'''index=10
print('the English sentence:',input_texts[index])
print('the French sentence:',target_texts[index])
print('The predicted French sentence:')
print(output_sentence(model.predict(eng_pad_sequence[index:index+1][0]),lan1_text_tokenizer))'''

"index=10\nprint('the English sentence:',input_texts[index])\nprint('the French sentence:',target_texts[index])\nprint('The predicted French sentence:')\nprint(output_sentence(model.predict(eng_pad_sequence[index:index+1][0]),lan1_text_tokenizer))"