Importing Required Libraries


In [1]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model,save_model
from tensorflow.keras.layers import Embedding,LSTM,TimeDistributed,RepeatVector,Input,Dense,Flatten

Reading Data From Drive


In [2]:
# Paths to the parallel corpus: one sentence per line, English and Danish files
# aligned line by line (the printed counts below should therefore match).
# NOTE(review): the relative "drive/MyDrive" paths presumably assume a mounted
# Google Drive in Colab -- confirm before running elsewhere.
source1 = "drive/MyDrive/dataset/en.txt"
source2 = "drive/MyDrive/dataset/dan.txt"
# Read explicitly as UTF-8: the Danish text contains non-ASCII letters and the
# platform default encoding is not UTF-8 everywhere.
with open(source1, "r", encoding="utf-8") as f:
  eng = f.read().split("\n")
print("this is english", len(eng)) # Total english sentences
with open(source2, "r", encoding="utf-8") as f:
  dan = f.read().split("\n")
print("this is danish", len(dan)) # Total danish sentences

this is english 137860
this is danish 137860


Adding Start and End tags in Danish sentences


In [3]:
# Wrap every Danish sentence in "start"/"end" marker words so the decoder has
# an explicit first input token and an explicit token to stop on.
dan_modified = ["start " + sentence + " end" for sentence in dan]

Defining Helper Functions

In [4]:
# Tokenizing the data, i.e. converting text data into integers
def tokenize(text):
  """Fit a fresh Keras Tokenizer on `text` and encode `text` with it.

  Returns a pair: the integer sequences for `text`, and the fitted
  tokenizer's word -> index dictionary.
  """
  word_tokenizer = Tokenizer()
  word_tokenizer.fit_on_texts(text)
  encoded = word_tokenizer.texts_to_sequences(text)
  return encoded, word_tokenizer.word_index
# Calculating maximum and minimum length of sentences in the dataset
def maximum_and_minimum(data):
  """Return (longest, shortest) item length found in `data`."""
  lengths = list(map(len, data))
  return max(lengths), min(lengths)
# Making all the sentences of equal length by adding zeros at end
def padding(sequences, maxLen):
  """Pad `sequences` to length `maxLen` using post-padding."""
  return pad_sequences(sequences, maxlen=maxLen, padding="post")
# Pre-processing the data set
def preprocess(language):
  """Tokenize `language` and pad every sentence to the longest one.

  Returns (tokenized sentences, vocabulary, padded sequences, max length).
  """
  tokenized_sentences, vocab = tokenize(language)
  # Only the maximum length is needed here; the minimum is discarded.
  longest, _shortest = maximum_and_minimum(tokenized_sentences)
  padded = padding(tokenized_sentences, longest)
  return tokenized_sentences, vocab, padded, longest

In [5]:
# Tokenize and pad both sides of the corpus. The Danish side uses the
# sentences that were wrapped in start/end tags.
(eng_sentences_tokinzed,
 eng_vocab,
 eng_padded_sequences,
 eng_max_len) = preprocess(eng)
(dan_sentences_tokinzed,
 dan_vocab,
 dan_padded_sequences,
 dan_max_len) = preprocess(dan_modified)

print("Length of english vocabulary", len(eng_vocab))
print("Length of danish vocabulary", len(dan_vocab))

# Dividing the dataset into training and testing datasets:
# 20% held out for testing, fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    eng_padded_sequences,
    dan_padded_sequences,
    test_size=0.2,
    random_state=42,
)

Length of english vocabulary 199
Length of danish vocabulary 286


Defining Neural Network Model


In [7]:
# Encoder-decoder (seq2seq) translation model, trained with teacher forcing.
# NOTE: the order in which layers are created and wired is kept unchanged on
# purpose -- the inference cell fetches layers by position via model.layers[...].
# NOTE: `input` shadows the Python builtin; the name is kept because the
# inference cell refers to it.

# Encoder input: one padded English sequence of length eng_max_len per sample.
input = Input(shape=(eng_max_len,))
# Vocabulary size + 1 because index 0 is the padding value added by padding().
embed_eng = Embedding(input_dim=len(eng_vocab)+1, output_dim=128)(input)
# Encoder
lstm1 = LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = lstm1(embed_eng)

# Only the final hidden and cell states are handed to the decoder;
# encoder_outputs itself is not used.
context_vec = [state_h, state_c]

# Decoder
# Variable-length input so the same layers can later be run one token at a time.
decoder_inputs = Input(shape=(None,))
embed_dan = Embedding(input_dim=len(dan_vocab)+1, output_dim=128)(decoder_inputs)
lstm2 = LSTM(512, return_sequences=True, return_state=True)
output,_,_ = lstm2(embed_dan, initial_state=context_vec)

# Dense layers
# Per-timestep softmax over the Danish vocabulary (plus the padding index).
dense = TimeDistributed(Dense(len(dan_vocab)+1, activation="softmax"))
output = dense(output)

model = Model([input,decoder_inputs], output)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that would emit an extra "None" line).
model.summary()
# Teacher forcing: the decoder is fed the target without its last token and is
# trained to predict the target shifted one step left (without its first token).
# The reshape adds a trailing axis of size 1 to the integer labels.
model.fit([X_train, y_train[:,:-1]], y_train.reshape(y_train.shape[0], y_train.shape[1],1)[:,1:], epochs=15, validation_split=0.2)
model.save("seq2seq.h5")

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 15, 128)      25600       input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 128)    36736       input_4[0][0]                    
____________________________________________________________________________________________

Inference model

In [8]:
# Build separate encoder and decoder models for step-by-step inference,
# reusing the layers (and therefore the weights) of the trained `model`.

# Encoder: maps the English input to the LSTM state pair [state_h, state_c].
encoder_model = Model(input, context_vec)
encoder_model.save("encoder.h5")
print("Encoder model Saved!")

# The decoder's states become explicit inputs so they can be fed back in on
# every decoding step. 512 matches the LSTM size used in the training model.
decoder_state_h = Input(shape=(512,))
decoder_state_c = Input(shape=(512,))
decoder_state_inputs = [decoder_state_h, decoder_state_c]

# Layers are fetched by position in model.layers. Per the printed summary,
# index 1 is the second Input layer and index 3 the second Embedding layer,
# i.e. the decoder-side ones.
decoder_inputs = model.layers[1].output
dec_emb_layer = model.layers[3]
embed_dan2 = dec_emb_layer(decoder_inputs)

# NOTE(review): index 5 is presumably the decoder LSTM -- the summary output is
# truncated before this layer, so confirm against model.layers.
lstm2 = model.layers[5]
decoder_outputs, state_h, state_c = lstm2(embed_dan2, initial_state= decoder_state_inputs)
decoder_states = [state_h, state_c]

# NOTE(review): index 6 is presumably the TimeDistributed softmax layer -- confirm.
dense = model.layers[6]
decoder_outputs = dense(decoder_outputs)

# Inputs: [token ids, state_h, state_c]; outputs: [token probabilities, new state_h, new state_c].
decoder_model = Model(
    [decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states
)
decoder_model.save("decoder.h5")
print("Decoder model saved!")

Encoder model Saved!
Decoder model saved!


In [15]:
def decode_sequence(seq):
  """Greedily translate one encoded English sentence into Danish text.

  Args:
    seq: input passed straight to `encoder_model.predict`; the caller in this
      notebook supplies a single padded sequence with a leading batch axis of 1.

  Returns:
    The decoded words as one string. The string starts with a space and every
    word is preceded by a further space (e.g. "  kina har"), matching the
    format this function has always produced.

  Decoding stops when the model predicts the padding index 0, predicts the
  "end" tag, or `dan_max_len` words have been produced.
  """
  # Reverse lookup built once per call instead of once per decoding step.
  index_to_word = {index: word for word, index in dan_vocab.items()}

  states_value = encoder_model.predict(seq)
  # starting the target sequence with start
  target_seq = np.zeros((1,1))
  target_seq[0,0] = dan_vocab["start"]

  decoded_words = []
  while True:
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    # Greedy choice: most probable token at the last (only) time step.
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
    if sampled_token_index == 0:
      break
    sampled_token = index_to_word[sampled_token_index]
    # The end tag must terminate decoding. Previously this check sat inside the
    # "token is not end" branch and could never fire, so decoding ran on past
    # the tag and could loop forever if the model kept predicting it.
    if sampled_token == 'end':
      break
    decoded_words.append(sampled_token)
    if len(decoded_words) >= dan_max_len:
      break
    # Feed the chosen token and the updated states into the next step.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = sampled_token_index
    states_value = [h, c]
  return ' ' + ''.join(' ' + word for word in decoded_words)


To convert tokenized integers back to text


In [12]:
def seq2word(seq, vocab):
  """Turn a sequence of token ids back into a space-separated string.

  Args:
    seq: iterable of integer token ids; id 0 (padding) is skipped.
    vocab: word -> index dictionary whose keys are ordered so that the word
      with index k sits at position k - 1.

  Returns:
    A string that starts with a space and has a space after every word,
    e.g. " hej verden ". A sequence with no non-zero ids yields " ".
  """
  # Build the position -> word list once; it used to be rebuilt per token.
  words = list(vocab.keys())
  return " " + "".join(words[token_id - 1] + " " for token_id in seq if token_id != 0)


Testing model on test data 


In [16]:
# Show the first few held-out sentences: source, reference and model output.
# min() guards against a test split with fewer than 10 rows.
for i in range(min(10, len(X_test))):
  original_eng = seq2word(X_test[i], eng_vocab)
  print("Original english sentence:", original_eng)
  original_dan = seq2word(y_test[i], dan_vocab)
  # Blank out the start/end tags as whole words only. A plain str.replace
  # would also damage any word that merely contains "start" or "end".
  # Splitting and joining on a single space keeps the spacing of the output
  # exactly as before.
  original_dan = " ".join(
      " " if word in ("start", "end") else word
      for word in original_dan.split(" ")
  )
  print("Original danish sentence :", original_dan)
  # reshape(1, -1) adds the batch axis without hard-coding the sequence length.
  predicted_sequence = decode_sequence(X_test[i].reshape(1, -1))
  print("Predicted sentence.      :", predicted_sequence)
  print()

Original english sentence:  china is usually busy during september but it is sometimes cold in spring 
Original danish sentence :    kina har normalt travlt i september men det er undertiden koldt om foråret   
Predicted sentence.      :   kina har normalt travlt i september men det er undertiden koldt om foråret

Original english sentence:  he dislikes pears and peaches 
Original danish sentence :    han kan ikke lide pærer og ferskner   
Predicted sentence.      :   han kan ikke lide pærer og ferskner

Original english sentence:  the united states is sometimes rainy during january but it is mild in may 
Original danish sentence :    usa er nogle gange regnfuldt i januar men det er mildt i maj   
Predicted sentence.      :   usa er undertiden regnfuldt i januar men det er mildt i maj

Original english sentence:  california is mild during march but it is sometimes rainy in october 
Original danish sentence :    californien er mild i marts men det er undertiden regnfuldt i oktober   
Pr