### Importing required libraries

In [1]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import RepeatVector
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import corpus_bleu
smoothing = SmoothingFunction().method4
!wget http://www.manythings.org/anki/fra-eng.zip
!unzip ./fra-eng.zip
import warnings
warnings.filterwarnings("ignore")

--2021-06-09 00:47:11--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 172.67.173.198, 104.21.55.222, 2606:4700:3036::ac43:adc6, ...
Connecting to www.manythings.org (www.manythings.org)|172.67.173.198|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6413399 (6.1M) [application/zip]
Saving to: ‘fra-eng.zip’


2021-06-09 00:47:13 (5.24 MB/s) - ‘fra-eng.zip’ saved [6413399/6413399]

Archive:  ./fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


### Data cleaning and train_test_split

In [2]:
data_path = '/content/fra.txt' 
num_sentences = 20000 

# opening the text file and getting the data 
with open(data_path,'r') as f:
    lines = f.read().split('\n')
#count the number of sentences
c=0 

# data cleaning
source_texts,target_texts = [],[]
for line in lines: # going through each lines
    if c == num_sentences: # if we have 20000 sentences than we will get out of this loop
        break 
    elif '\t' in line:
        op_data,ip_data,_ = line.lower().rstrip().split('\t') # lowering the data and then spliting the data
        # to remove the punctuation we did not include last character
        source_text = ip_data[:-1].strip()
        target_text = op_data[:-1].strip()
        # removing the unprintable character
        # for english and french we will take anly alphabets of brespective languages and numbers
        target_text = re.sub("[^a-z 1-9\'-]","",target_text) 
        source_text = re.sub("[^a-zàâãçéèêëîïôœùûüÿ 1-9\'-]","",source_text) 
        source_texts.append(source_text)
        target_texts.append(target_text)
        c+=1

# train_test_split of the source and target data
source_train,source_test,target_train,target_test = train_test_split(source_texts,target_texts,test_size = 0.2, random_state= 0)

### Making the required functions for the data preprocessing

In [3]:
# tokenizer for data
def create_tokenizer(texts):
    tokenizer = Tokenizer(oov_token='<UNK>')
    tokenizer.fit_on_texts(texts)
    return tokenizer

# one_hot encoding of the target data
def one_hot(pad_seq,max_sent_length,num_vocab):
    target_data_one_hot = np.zeros((len(pad_seq),max_sent_length,num_vocab))
    for i,w in enumerate(pad_seq):
        for j,d in enumerate(w):
            target_data_one_hot[i,j,d] = 1
    return target_data_one_hot

# for padding the data
def encoding_text(tokenizer,text,max_length):
    text_seq = tokenizer.texts_to_sequences(text)
    pad_seq = pad_sequences(text_seq,maxlen= max_length)
    return pad_seq

# to find the maximum length of the sentence from data
def max_length(text):
    return max(len(l.split()) for l in text)
    

### Preparing training and testing data

In [4]:
# preparing source tokenizer and getting relevant information
source_tokenizer = create_tokenizer(source_train)
source_vocab = source_tokenizer.word_index
num_source_vocab = len(source_vocab)+1
max_source_length = max_length(source_train)
print("Number of Source Vocabulary :",num_source_vocab)
print("Maximum Source Length :",max_source_length)

Number of Source Vocabulary : 6052
Maximum Source Length : 11


In [5]:
# preparing target tokenizer and getting relevant information
target_tokenizer = create_tokenizer(target_train)
target_vocab = target_tokenizer.word_index
num_target_vocab = len(target_vocab)+1
max_target_length = max_length(target_train)
print("Number of Target Vocabulary :",num_target_vocab)
print("Maximum Target Length :",max_target_length)

Number of Target Vocabulary : 3201
Maximum Target Length : 5


In [6]:
# preparing the training data
# padding of the source sentences
source_train_seq_pad = encoding_text(source_tokenizer,source_train,max_source_length) 
# padding of the target sentences
target_train_seq_pad = encoding_text(target_tokenizer,target_train,max_target_length) 
# one hot encoding of the padded target senteces
target_train_seq_pad = one_hot(target_train_seq_pad,max_target_length,num_target_vocab) 
print("-------------------------------------")
print("Padded train source")
print(source_train_seq_pad)
print("-------------------------------------")
print("Padded train target")
print(target_train_seq_pad)
print("-------------------------------------")

-------------------------------------
Padded train source
[[   0    0    0 ...    7   77   40]
 [   0    0    0 ...   50 1537   59]
 [   0    0    0 ...    0    0 2058]
 ...
 [   0    0    0 ...   39   10   95]
 [   0    0    0 ...   23   34  417]
 [   0    0    0 ...    0 6051   23]]
-------------------------------------
Padded train target
[[[1. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0.

In [7]:
# preparing the test data
# padding of the source sentences
source_test_seq_pad = encoding_text(source_tokenizer,source_test,max_source_length) 
# padding of the target sentences
target_test_seq_pad = encoding_text(target_tokenizer,target_test,max_target_length) 
# one hot encoding of the padded target senteces
target_test_seq_pad = one_hot(target_test_seq_pad,max_target_length,num_target_vocab) 
print("-------------------------------------")
print("Padded test source")
print(source_train_seq_pad)
print("-------------------------------------")
print("Padded test target")
print(target_train_seq_pad)
print("-------------------------------------")

-------------------------------------
Padded test source
[[   0    0    0 ...    7   77   40]
 [   0    0    0 ...   50 1537   59]
 [   0    0    0 ...    0    0 2058]
 ...
 [   0    0    0 ...   39   10   95]
 [   0    0    0 ...   23   34  417]
 [   0    0    0 ...    0 6051   23]]
-------------------------------------
Padded test target
[[[1. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0

### Preparing and running the Autoencoder model 

In [8]:
model = Sequential()
model.add(Input(shape=(max_source_length,)))
model.add(Embedding(num_source_vocab,512,mask_zero=True))
model.add(LSTM(512,return_sequences = False))
model.add(RepeatVector(max_target_length))
model.add(LSTM(512,return_sequences = True))
model.add(TimeDistributed(Dense(num_target_vocab,activation = 'softmax')))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11, 512)           3098624   
_________________________________________________________________
lstm (LSTM)                  (None, 512)               2099200   
_________________________________________________________________
repeat_vector (RepeatVector) (None, 5, 512)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 512)            2099200   
_________________________________________________________________
time_distributed (TimeDistri (None, 5, 3201)           1642113   
Total params: 8,939,137
Trainable params: 8,939,137
Non-trainable params: 0
_________________________________________________________________


In [9]:
es = EarlyStopping(monitor='val_acc',patience= 5,min_delta=0.01)
filepath = './french-english.h5' 
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 

In [10]:
history = model.fit(source_train_seq_pad, target_train_seq_pad, epochs= 100, batch_size=64, validation_data = (source_test_seq_pad,target_test_seq_pad), verbose=1,callbacks=[checkpoint,es])

Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.42610, saving model to ./french-english.h5
Epoch 2/100

Epoch 00002: val_acc improved from 0.42610 to 0.49275, saving model to ./french-english.h5
Epoch 3/100

Epoch 00003: val_acc improved from 0.49275 to 0.52950, saving model to ./french-english.h5
Epoch 4/100

Epoch 00004: val_acc improved from 0.52950 to 0.55850, saving model to ./french-english.h5
Epoch 5/100

Epoch 00005: val_acc improved from 0.55850 to 0.57945, saving model to ./french-english.h5
Epoch 6/100

Epoch 00006: val_acc improved from 0.57945 to 0.59580, saving model to ./french-english.h5
Epoch 7/100

Epoch 00007: val_acc improved from 0.59580 to 0.60845, saving model to ./french-english.h5
Epoch 8/100

Epoch 00008: val_acc improved from 0.60845 to 0.62855, saving model to ./french-english.h5
Epoch 9/100

Epoch 00009: val_acc improved from 0.62855 to 0.63730, saving model to ./french-english.h5
Epoch 10/100

Epoch 00010: val_acc improved from 0.63730 to 0.64550

In [11]:
# loading the weights from the best saved model
model.load_weights(filepath)

### Making the functions to predict the sequence and BLEU_sccore

In [12]:
# a dictionary having key is a token number for a particular word and value is a word
# this will required to decode the predicted sequence
target_vocab_idx = {v:k for k,v in target_tokenizer.word_index.items()}

# function to predict the decoded sequence
def predict_sequence(model,sent,vocab_idx):
    prediction = model.predict(sent.reshape(1,max_source_length))[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        if i != 0:
            word = vocab_idx[i]
            if word is None:
                break
            target.append(word)
    return ' '.join(target)

# for evaluation of the model through BLEU_score
def bleu_score(model,ip,ip_raw,op_raw,vocab_idx):
    prediction,actual = [],[]
    for i,sent in enumerate(ip):
        if i%10 == 0:
            print('\rprogress ',(i+1)*100//len(ip),'%',sep='',end='',flush = True)
        translation = predict_sequence(model,sent,vocab_idx)
        prediction.append(translation)
        actual.append(op_raw[i])
    print()
    # printing the first ten sentences
    for i in range(10):
        print("--------------------------------------------------------------------------------------------------------------------------------------------------")
        print('FRENCH -->',ip_raw[i],' || ','ACTUAL ENGLISH -->',op_raw[i],' || ','PREDICTED ENGLISH -->',prediction[i])
        print("--------------------------------------------------------------------------------------------------------------------------------------------------")
    print()
    # printing the BLEU_score
    print("----------------------------------")
    print('Printing BLEU SCORE...')
    print("----------------------------------")
    print('First BLEU score --> %f' % corpus_bleu(actual, prediction, weights=(1.0, 0, 0, 0),smoothing_function=smoothing,auto_reweigh=False))
    print('Second BLEU score --> %f' % corpus_bleu(actual, prediction, weights=(0.5, 0.5, 0, 0),smoothing_function=smoothing,auto_reweigh=False))
    print('Third BLEU score --> %f' % corpus_bleu(actual, prediction, weights=(0.3, 0.3, 0.3, 0),smoothing_function=smoothing,auto_reweigh=False))
    print('Fourth BLEU score --> %f' % corpus_bleu(actual, prediction, weights=(0.25, 0.25, 0.25, 0.25),smoothing_function=smoothing,auto_reweigh=False))
    print("----------------------------------")

### Evaluating the model on training data

In [13]:
bleu_score(model,source_train_seq_pad,source_train,target_train,target_vocab_idx)

progress 99%
--------------------------------------------------------------------------------------------------------------------------------------------------
FRENCH --> je ne vois pas très bien  ||  ACTUAL ENGLISH --> i can't see well  ||  PREDICTED ENGLISH --> i can't see well
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
FRENCH --> je fais rarement cela  ||  ACTUAL ENGLISH --> i seldom do that  ||  PREDICTED ENGLISH --> i don't do that
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
FRENCH --> cal

### Evaluating the model on test data

In [14]:
bleu_score(model,source_test_seq_pad,source_test,target_test,target_vocab_idx)

progress 99%
--------------------------------------------------------------------------------------------------------------------------------------------------
FRENCH --> laissez ça tranquille  ||  ACTUAL ENGLISH --> leave that alone  ||  PREDICTED ENGLISH --> leave that alone
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
FRENCH --> j'irai te prendre  ||  ACTUAL ENGLISH --> i'll get you  ||  PREDICTED ENGLISH --> i'll get you
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
FRENCH --> vous temporisez  