<a href="https://colab.research.google.com/github/Aayush360/Natural_langauge_processing/blob/main/Language_Translation(Seq2Seq).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Data Source](http://www.manythings.org/anki/)

In [52]:
import numpy as np
import pandas as pd
import string
import re
import io

from unicodedata import normalize
import keras, tensorflow

from keras.models import Model
from keras.layers import Input, LSTM,Dense

In [53]:
# reading a data

def read_data(file):
  data=[]
  with io.open(file,'r') as file:
    for entry in file:
      entry = entry.strip()
      data.append(entry)
  return data

In [54]:
data = read_data('bilingual_pairs.txt')

In [55]:
data[139990:140000]

['Never choose a vocation just because the hours are short.\tNe choisissez jamais une profession juste parce que les heures y sont courtes.',
 "No other mountain in the world is so high as Mt. Everest.\tAucune montagne au monde n'atteint la hauteur du Mont Everest.",
 "No sooner had he met his family than he burst into tears.\tÀ peine avait-il rencontré sa famille qu'il éclata en sanglots.",
 "Nothing is more disappointing than to lose in the finals.\tRien n'est plus décevant que de perdre en finale.",
 "Now that he is old, it is your duty to go look after him.\tÀ présent qu'il est vieux, c'est ton devoir de veiller sur lui.",
 "Now that you've decided to quit your job, you look happy.\tMaintenant que vous avez décidé de quitter votre emploi, vous avez l'air heureux.",
 "Now that you've decided to quit your job, you look happy.\tMaintenant que tu as décidé de quitter ton emploi, tu as l'air heureux.",
 "Now that you've decided to quit your job, you look happy.\tMaintenant que vous avez

In [56]:
len(data)

145437

In [57]:
data[500:501]

['I beg you.\tJe te prie.']

In [58]:
# let us use 140000 english-french sentence pair

In [59]:
data= data[:140000]

In [60]:
# separate data into english french list 

In [61]:
def build_english_french_sen(data):
  english_sentences = []
  french_sentences = []
  for datapoint in data:
    english_sentences.append(datapoint.split('\t')[0])
    french_sentences.append(datapoint.split('\t')[1])
  return english_sentences, french_sentences


eng_sen, french_sen = build_english_french_sen(data)

In [62]:
eng_sen[:10]

['Go.',
 'Run!',
 'Run!',
 'Wow!',
 'Fire!',
 'Help!',
 'Jump.',
 'Stop!',
 'Stop!',
 'Stop!']

In [63]:
french_sen[:10]

['Va !',
 'Cours\u202f!',
 'Courez\u202f!',
 'Ça alors\u202f!',
 'Au feu !',
 "À l'aide\u202f!",
 'Saute.',
 'Ça suffit\u202f!',
 'Stop\u202f!',
 'Arrête-toi !']

In [64]:
# data cleaning

In [65]:
def clean_sentences(sentence):
  # prepare regex for char filtering
  re_print = re.compile('[^%s]'%  re.escape(string.printable)) # removes non-printable characters
  # prepare translation table for removing punctuation
  table = str.maketrans('','',string.punctuation)
  cleaned_sent = normalize('NFD', sentence).encode('ascii','ignore')
  cleaned_sent = cleaned_sent.decode('UTF-8')
  cleaned_sent = cleaned_sent.split()
  cleaned_sent = [word.lower() for word in cleaned_sent] # case-folding
  cleaned_sent = [word.translate(table) for word in cleaned_sent]
  cleaned_sent = [re_print.sub('',w) for w in cleaned_sent]
  cleaned_sent = [word for word in cleaned_sent if word.isalpha()] # keeps only alphabetic word
  return ' '.join(cleaned_sent)



In [66]:
re_print = re.compile('[^%s]'%  re.escape(string.printable))
re_print

re.compile(r'[^0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"\#\$%\&\'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~\ \\t\\n\\r\\x0b\\x0c]',
re.UNICODE)

In [67]:
s = "string. With. Punctuation?" # Sample string 
out = s.translate(str.maketrans("","", string.punctuation))
out

'string With Punctuation'

In [68]:
my_var = "this is a string"
my_var2 = " Esta es una oración que está en español "
my_var3 = normalize('NFD', my_var2).encode('ascii', 'ignore').decode('utf8')
output = my_var + my_var3
print(output)

this is a string Esta es una oracion que esta en espanol 


In [69]:
re_print.sub('','/punct,si?s.')

'/punct,si?s.'

In [70]:
# building clean engish french sentence

def build_clean_eng_french_sentence(eng_sent, french_sent):
  french_sent_cleaned = []
  eng_sent_cleaned = []

  for sent in french_sent:
    french_sent_cleaned.append(clean_sentences(sent))
  for sent in eng_sent:
    eng_sent_cleaned.append(clean_sentences(sent))
  
  return eng_sent_cleaned, french_sent_cleaned


eng_sent_cleaned, french_sent_cleaned = build_clean_eng_french_sentence(eng_sen,french_sen)

In [71]:
# now we should build our vocabulary and add token that convey start and end of a sequence as required by our decoder

In [72]:
# here instead of dwelling on word level, we will go to character level to build vocabulary

In [73]:
def build_data(eng_sent_cleaned,french_sent_cleaned):
  input_datasets=[]
  target_datasets=[]

  input_characters = set()
  target_characters = set()

  for french_sent in french_sent_cleaned:
    input_datapoint = french_sent
    input_datasets.append(input_datapoint)

    for char in input_datapoint:
      input_characters.add(char) # list of unique input characters
    
  for eng_sent in eng_sent_cleaned:
    target_datapoint = '\t'+eng_sent+'\n' # to convey strat and end of the sentence to the decoder
    target_datasets.append(target_datapoint)
    for char in target_datapoint:
      target_characters.add(char) # list of unique output characters
  return input_datasets,target_datasets,sorted(input_characters),sorted(target_characters)

input_datasets, target_datasets, input_characters, target_characters = build_data(eng_sent_cleaned,french_sent_cleaned)


In [74]:
# input_datasets=[]
# input_characters=set()
# for french_sent in french_sent_cleaned:
#     input_datapoint = french_sent
#     input_datasets.append(input_datapoint)

#     for char in input_datapoint:
#       input_characters.add(char)

In [75]:
input_datasets[50:54]

['hors de question', 'vraiment', 'vrai', 'ah bon']

In [76]:
# for char in input_datapoint:
#   print(char)

In [77]:
# target_datasets=[]
# target_characters=set()
# for eng_sent in eng_sent_cleaned:
#     target_datapoint = '\t'+eng_sent+'\n'
#     target_datasets.append(target_datapoint)
#     for char in target_datapoint:
#        target_characters.add(char)

In [78]:
# target_datapoint

In [79]:
target_datasets[:4]

['\tgo\n', '\trun\n', '\trun\n', '\twow\n']

In [80]:
print(input_characters)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [81]:
print(target_characters)

['\t', '\n', ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


Our input and output vocabulary may not be the same for tasks such as natural language translation. In fact, at times, our character set may not be the same either. For example, we might be trying to translate between English and Nepali, which have different character sets altogether.

In [82]:
# also, input and output sequence may not be of same length as well

In [83]:
# let us find out some metadata about the data

In [84]:
len(target_characters), len(input_characters)

(29, 27)

In [85]:
def build_metadata(input_datasets,target_datasets,input_characters,target_characters):
  num_Encoder_tokens = len(input_characters)
  num_Decoder_tokens = len(target_characters)
  max_Encoder_Sequence_len = max(len(datapoint) for datapoint in input_datasets) # max character length of input sentence
  max_Decoder_Sequence_len = max(len(datapoint) for datapoint in target_datasets) # max char length in target sentences
  print('Number of datapoints',len(input_datasets))
  print('Number of unique input tokens',num_Encoder_tokens)
  print('Number of unique target tokens',num_Decoder_tokens)
  print('Maxumum sequence length for inputs:', max_Encoder_Sequence_len)
  print('Maximum sequence length for outputs: ', max_Decoder_Sequence_len)
  return num_Encoder_tokens, num_Decoder_tokens, max_Encoder_Sequence_len, max_Decoder_Sequence_len

num_Encoder_tokens, num_Decoder_tokens, max_Encoder_Sequence_len, max_Decoder_Sequence_len = build_metadata(input_datasets,target_datasets,input_characters,target_characters)

Number of datapoints 140000
Number of unique input tokens 27
Number of unique target tokens 29
Maxumum sequence length for inputs: 117
Maximum sequence length for outputs:  58


In [86]:
# build character to indices mapping and vice-versa

In [87]:
def build_indices(input_characters,target_characters):
  input_char_to_idx = {}
  input_idx_to_char = {}
  target_char_to_idx ={}
  target_idx_to_char = {}
  
  for i, char in enumerate(input_characters):
    input_char_to_idx[char]=i
    input_idx_to_char[i]=char
  
  for i,char in enumerate(target_characters):
    target_char_to_idx[char]=i
    target_idx_to_char[i]=char
  
  return input_char_to_idx,input_idx_to_char,target_char_to_idx, target_idx_to_char 

input_char_to_idx,input_idx_to_char,target_char_to_idx, target_idx_to_char = build_indices(input_characters,target_characters)

In [88]:
# now let us build datastructure based on the metadata information we obtained

In [89]:
len(input_datasets)

140000

In [90]:
def build_data_structure(len_input_dataset, max_Encoder_Sequence_len, max_Decoder_Sequence_len, num_Encoder_tokens, num_Decoder_tokens):
  Encoder_input_data = np.zeros((len_input_dataset,max_Encoder_Sequence_len,num_Encoder_tokens),dtype='float32')
  Decoder_input_data = np.zeros((len_input_dataset,max_Decoder_Sequence_len,num_Decoder_tokens),dtype='float32')
  Decoder_target_data = np.zeros((len_input_dataset,max_Decoder_Sequence_len,num_Decoder_tokens),dtype='float32')

  print('Dimensionality of encoder input data is: ',Encoder_input_data.shape)
  print('Dimensionality of Decoder input data is: ', Decoder_input_data.shape)
  print('Dimensionality of Decoder target data is: ', Decoder_target_data.shape)

  return Encoder_input_data, Decoder_input_data, Decoder_target_data
Encoder_input_data, Decoder_input_data, Decoder_target_data = build_data_structure(len(input_datasets),max_Encoder_Sequence_len, max_Decoder_Sequence_len, num_Encoder_tokens, num_Decoder_tokens)

Dimensionality of encoder input data is:  (140000, 117, 27)
Dimensionality of Decoder input data is:  (140000, 58, 29)
Dimensionality of Decoder target data is:  (140000, 58, 29)


The dimensionality of the input data is (140000, 117, 27):
The first dimension caters to the number of data points we have:
140,000.
The second dimension caters to the maximum length of our input sequence: 117.
The third dimension caters to the number of unique inputs we can have or the size of our input character set: 27.

In [91]:
# now that we have our datastructure ready let us add some data to our datastructure

In [92]:
def add_data_to_data_structure(input_datasets, target_datasets, Encoder_input_data, Decoder_input_data, Decoder_target_data):
  for i, (input_datapoint,target_datapoint) in enumerate(zip(input_datasets,target_datasets)):
    for t, char in enumerate(input_datapoint):
      Encoder_input_data[i,t,input_char_to_idx[char]]=1
    for t, char in enumerate(target_datapoint):
      Decoder_input_data[i,t,target_char_to_idx[char]]=1
      # since decoder target data is ahead of decoder input data by one timestamp
      if t>0: #  when building the decoder target data, we do not include anything for the <start> token
        Decoder_target_data[i,t-1,target_char_to_idx[char]] = 1
  return Encoder_input_data, Decoder_input_data, Decoder_target_data
Encoder_input_data, Decoder_input_data, Decoder_target_data= add_data_to_data_structure(input_datasets,target_datasets,Encoder_input_data, Decoder_input_data, Decoder_target_data)

In [93]:
# Our decoder target data is the same as the decoder input data, except that it is offset by one timestep.

In [94]:
# defining hyperparameters

batch_size = 256
epochs = 100
latent_dim = 256

In [95]:
# let us bring the Encoder into existence
# The encoder's job is to provide a context vector where it captures the context or thought in the input sentence.

In [96]:
Encoder_inputs = Input(shape=(None,num_Encoder_tokens))
Encoder = LSTM(latent_dim, return_state=True) # decoder returns us the last hidden state and memory, which will form the context vector
Encoder_outputs, state_h, state_c = Encoder(Encoder_inputs)
Encoder_states = [state_h,state_c]  # context_vector 

The encoder learns from the performance of the decoder, which happens further down the line. The decoder's error flows back and that's how the backpropagation in the encoder works and it learns.

In [97]:
## let us define the decoder

In [98]:
Decoder_inputs = Input(shape=(None, num_Decoder_tokens))
Decoder_LSTM = LSTM(latent_dim, return_sequences=True, return_state=True) 
# return_sequences, want an output from the decoder at every timestep and that is why we set this parameter to True.
Decoder_outputs,_,_ = Decoder_LSTM(Decoder_inputs, initial_state= Encoder_states)
Decoder_dense = Dense(num_Decoder_tokens,activation='softmax')
Decoder_outputs = Decoder_dense(Decoder_outputs) # prob vector of length 29


during training, the decoder is provided both the input data and the target data and is asked to predict the input data with an offset of 1. This helps the decoder to understand, given a context vector from the encoder, what it should be predicting. This method of learning is referred to as teacher forcing.

In [99]:
model = Model(inputs=[Encoder_inputs,Decoder_inputs], outputs=Decoder_outputs)

In [100]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None, 27)]   0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None, 29)]   0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, 256), (None, 290816      input_3[0][0]                    
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, None, 256),  292864      input_4[0][0]                    
                                                                 lstm_2[0][1]               

In [None]:
model.fit([Encoder_input_data,Decoder_input_data], Decoder_target_data,batch_size=batch_size, epochs=epochs,validation_split=0.2)

In [104]:
model.save('LangTras_fr_en.h5')

In [105]:
## define encoder and decoder model for inferencing

In [128]:
Encoder_model = Model(Encoder_inputs,Encoder_states) # initially encoder takes input and contex vector

# these are the initial input to decoder model
Decoder_state_input_c = Input(shape=(latent_dim,)) # since LSTM ouputs vector of length 256
Decoder_state_input_h = Input(shape=(latent_dim,))
Decoder_states_input = [Decoder_state_input_h, Decoder_state_input_c]  # merge to form context vector

Decoder_outputs, state_h, state_c = Decoder_LSTM(Decoder_inputs,initial_state= Decoder_states_input)

print(state_h.shape)
Decoder_states = [state_h, state_c]
Decoder_ouputs = Decoder_dense(Decoder_outputs) # prob vector of length 29
print(Decoder_ouputs.shape)
print(Decoder_states)

Decoder_model = Model([Decoder_inputs]+Decoder_states_input,
                      [Decoder_outputs]+Decoder_states)






(None, 256)
(None, None, 29)
[<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm_3')>, <KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm_3')>]


In [119]:
def decode_sequence(input_seq):
  state_value = Encoder_model.predict(input_seq)
  target_seq = np.zeros((1,1,num_Decoder_tokens))
  target_seq[0,0,target_char_to_idx['\t']]=1 # vector representation of start of sequence

  stop_cond = False
  decoded_sentence = ''

  while not stop_cond:
    output_tokens, h,c = Decoder_model.predict([target_seq]+state_value)
    print(output_tokens)
    print(output_tokens.shape)
    sampled_token_index = np.argmax(output_tokens[0,-1,:]) # see which character index has max prob of occurence
    print(sampled_token_index)
    sampled_char = target_idx_to_char[sampled_token_index]
    decoded_sentence+=sampled_char

    if (sampled_char=='\n' or len(decoded_sentence)>max_Decoder_Sequence_len):
      stop_cond = True
    
    target_seq = np.zeros((1,1,num_Decoder_tokens))
    target_seq[0,0,sampled_token_index] = 1
    state_value = [h,c]
  
  return decoded_sentence





In [120]:
# let us decode 


def decode(seq_index):
  input_seq = Encoder_input_data[seq_index:seq_index+1]
  decoded_sentence = decode_sequence(input_seq)
  print("::")

  print('Input sentence: ', input_datasets[seq_index])
  print('Decoded sentence: ', decoded_sentence)
  

In [121]:
decode(100)

[[[-8.79764020e-01 -9.13472056e-01 -6.66511655e-02  5.94963312e-01
   -5.70606589e-02 -6.94866627e-02 -2.55385280e-01  5.40829360e-01
   -4.18999642e-01 -2.68171757e-01  8.49566340e-01  8.20760489e-01
   -9.80789363e-01  8.92872810e-02  8.89609694e-01 -1.77664340e-01
   -4.11141843e-01 -2.54551291e-01 -6.01851521e-03 -2.32506990e-01
   -2.95446903e-01  8.70186627e-01 -5.36578119e-01  9.93433714e-01
    9.87477779e-01 -7.97906160e-01 -9.33349252e-01 -6.78995192e-01
    3.40876788e-01 -3.19709405e-02  5.55941761e-01 -1.26072377e-01
    9.84864831e-01  2.63091266e-01 -5.02285123e-01  8.78601894e-02
   -2.96545893e-01 -7.09043071e-02 -6.38931453e-01  7.50173926e-02
    9.68646884e-01  3.15000534e-01 -5.10170281e-01 -1.89677268e-01
   -7.30107054e-02 -9.97550189e-01  8.02389026e-01 -9.81229663e-01
    5.30606925e-01 -2.28877679e-01 -1.08366348e-01 -6.12667799e-01
   -7.26666212e-01  7.93497622e-01  7.55149007e-01 -8.41312826e-01
   -1.31731282e-03 -7.03657150e-01  1.83979601e-01  3.95318903

KeyError: ignored