### Importing required libraries

In [15]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import Dropout
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate.bleu_score import corpus_bleu
smoothing = SmoothingFunction().method4
!wget http://www.manythings.org/anki/spa-eng.zip
!unzip ./spa-eng.zip
import warnings
warnings.filterwarnings("ignore")

--2021-06-09 01:34:23--  http://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.55.222, 172.67.173.198, 2606:4700:3036::ac43:adc6, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.55.222|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5084241 (4.8M) [application/zip]
Saving to: ‘spa-eng.zip’


2021-06-09 01:34:25 (4.64 MB/s) - ‘spa-eng.zip’ saved [5084241/5084241]

Archive:  ./spa-eng.zip
replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: yes
  inflating: _about.txt              
  inflating: spa.txt                 


### Data cleaning and train_test_split

In [16]:
data_path = '/content/spa.txt' 
num_sentences = 20000 

# opening the text file and getting the data 
with open(data_path,'r') as f:
    lines = f.read().split('\n')
#count the number of sentences
c=0 

# data cleaning
source_texts,target_texts = [],[]
for line in lines: # going through each lines
    if c == num_sentences: # if we have 20000 sentences than we will get out of this loop
        break 
    elif '\t' in line:
        op_data,ip_data,_ = line.lower().rstrip().split('\t') # lowering the data and then spliting the data
        # to remove the punctuation we did not include last character
        source_text = ip_data[:-1].strip()
        target_text = op_data[:-1].strip()
        # removing the unprintable character
        # for english and french we will take anly alphabets of brespective languages and numbers
        target_text = re.sub("[^a-z 1-9\'-]","",target_text) 
        source_text = re.sub("[^a-zàâãçéèêëîïôœùûüÿ 1-9\'-]","",source_text) 
        source_texts.append(source_text)
        target_texts.append(target_text)
        c+=1

# train_test_split of the source and target data
source_train,source_test,target_train,target_test = train_test_split(source_texts,target_texts,test_size = 0.2, random_state= 0)

### Making the required functions for the data preprocessing

In [17]:
# tokenizer for data
def create_tokenizer(texts):
    tokenizer = Tokenizer(oov_token='<UNK>')
    tokenizer.fit_on_texts(texts)
    return tokenizer

# one_hot encoding of the target data
def one_hot(pad_seq,max_sent_length,num_vocab):
    target_data_one_hot = np.zeros((len(pad_seq),max_sent_length,num_vocab))
    for i,w in enumerate(pad_seq):
        for j,d in enumerate(w):
            target_data_one_hot[i,j,d] = 1
    return target_data_one_hot

# for padding the data
def encoding_text(tokenizer,text,max_length):
    text_seq = tokenizer.texts_to_sequences(text)
    pad_seq = pad_sequences(text_seq,maxlen= max_length)
    return pad_seq

# to find the maximum length of the sentence from data
def max_length(text):
    return max(len(l.split()) for l in text)
    

### Preparing training and testing data

In [18]:
# preparing source tokenizer and getting relevant information
source_tokenizer = create_tokenizer(source_train)
source_vocab = source_tokenizer.word_index
num_source_vocab = len(source_vocab)+1
max_source_length = max_length(source_train)
print("Number of Source Vocabulary :",num_source_vocab)
print("Maximum Source Length :",max_source_length)

Number of Source Vocabulary : 6530
Maximum Source Length : 15


In [19]:
# preparing target tokenizer and getting relevant information
target_tokenizer = create_tokenizer(target_train)
target_vocab = target_tokenizer.word_index
num_target_vocab = len(target_vocab)+1
max_target_length = max_length(target_train)
print("Number of Target Vocabulary :",num_target_vocab)
print("Maximum Target Length :",max_target_length)

Number of Target Vocabulary : 3468
Maximum Target Length : 6


In [20]:
# preparing the training data
# padding of the source sentences
source_train_seq_pad = encoding_text(source_tokenizer,source_train,max_source_length) 
# padding of the target sentences
target_train_seq_pad = encoding_text(target_tokenizer,target_train,max_target_length) 
# one hot encoding of the padded target senteces
target_train_seq_pad = one_hot(target_train_seq_pad,max_target_length,num_target_vocab) 
print("-------------------------------------")
print("Padded train source")
print(source_train_seq_pad)
print("-------------------------------------")
print("Padded train target")
print(target_train_seq_pad)
print("-------------------------------------")

-------------------------------------
Padded train source
[[   0    0    0 ...  258    3  148]
 [   0    0    0 ...   33   12  595]
 [   0    0    0 ...    0   24 1229]
 ...
 [   0    0    0 ...   64  127    2]
 [   0    0    0 ...   64  133 1913]
 [   0    0    0 ...    0  106 1685]]
-------------------------------------
Padded train target
[[[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0.

In [21]:
# preparing the test data
# padding of the source sentences
source_test_seq_pad = encoding_text(source_tokenizer,source_test,max_source_length) 
# padding of the target sentences
target_test_seq_pad = encoding_text(target_tokenizer,target_test,max_target_length) 
# one hot encoding of the padded target senteces
target_test_seq_pad = one_hot(target_test_seq_pad,max_target_length,num_target_vocab) 
print("-------------------------------------")
print("Padded test source")
print(source_train_seq_pad)
print("-------------------------------------")
print("Padded test target")
print(target_train_seq_pad)
print("-------------------------------------")

-------------------------------------
Padded test source
[[   0    0    0 ...  258    3  148]
 [   0    0    0 ...   33   12  595]
 [   0    0    0 ...    0   24 1229]
 ...
 [   0    0    0 ...   64  127    2]
 [   0    0    0 ...   64  133 1913]
 [   0    0    0 ...    0  106 1685]]
-------------------------------------
Padded test target
[[[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 ...

 [[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[1. 0. 0. ... 0. 0

### Preparing and running the Autoencoder model 

In [22]:
model = Sequential()
model.add(Input(shape=(max_source_length,)))
model.add(Embedding(num_source_vocab,512,mask_zero=True))
model.add(LSTM(512,return_sequences = False))
model.add(RepeatVector(max_target_length))
model.add(LSTM(512,return_sequences = True))
model.add(TimeDistributed(Dense(num_target_vocab,activation = 'softmax')))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 15, 512)           3343360   
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 6, 512)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 6, 512)            2099200   
_________________________________________________________________
time_distributed_1 (TimeDist (None, 6, 3468)           1779084   
Total params: 9,320,844
Trainable params: 9,320,844
Non-trainable params: 0
_________________________________________________________________


In [24]:
es = EarlyStopping(monitor='val_acc',patience= 5,min_delta=0.01)
filepath = './spanish-english.h5' 
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') 

In [25]:
history = model.fit(source_train_seq_pad, target_train_seq_pad, epochs= 100, batch_size=64, validation_data = (source_test_seq_pad,target_test_seq_pad), verbose=1,callbacks=[checkpoint,es])

Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.48354, saving model to ./spanish-english.h5
Epoch 2/100

Epoch 00002: val_acc improved from 0.48354 to 0.51267, saving model to ./spanish-english.h5
Epoch 3/100

Epoch 00003: val_acc improved from 0.51267 to 0.53783, saving model to ./spanish-english.h5
Epoch 4/100

Epoch 00004: val_acc improved from 0.53783 to 0.56604, saving model to ./spanish-english.h5
Epoch 5/100

Epoch 00005: val_acc improved from 0.56604 to 0.59254, saving model to ./spanish-english.h5
Epoch 6/100

Epoch 00006: val_acc improved from 0.59254 to 0.60992, saving model to ./spanish-english.h5
Epoch 7/100

Epoch 00007: val_acc improved from 0.60992 to 0.62329, saving model to ./spanish-english.h5
Epoch 8/100

Epoch 00008: val_acc improved from 0.62329 to 0.63421, saving model to ./spanish-english.h5
Epoch 9/100

Epoch 00009: val_acc improved from 0.63421 to 0.64283, saving model to ./spanish-english.h5
Epoch 10/100

Epoch 00010: val_acc improved from 0.64283 t

In [26]:
# loading the weights from the best saved model
model.load_weights(filepath)

### Making the functions to predict the sequence and BLEU_sccore

In [29]:
# a dictionary having key is a token number for a particular word and value is a word
# this will required to decode the predicted sequence
target_vocab_idx = {v:k for k,v in target_tokenizer.word_index.items()}

# function to predict the decoded sequence
def predict_sequence(model,sent,vocab_idx):
    prediction = model.predict(sent.reshape(1,max_source_length))[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        if i != 0:
            word = vocab_idx[i]
            if word is None:
                break
            target.append(word)
    return ' '.join(target)

# for evaluation of the model through BLEU_score
def bleu_score(model,ip,ip_raw,op_raw,vocab_idx):
    prediction,actual = [],[]
    for i,sent in enumerate(ip):
        if i%10 == 0:
            print('\rprogress ',(i+1)*100//len(ip),'%',sep='',end='',flush = True)
        translation = predict_sequence(model,sent,vocab_idx)
        prediction.append(translation)
        actual.append(op_raw[i])
    print()
    # printing the first ten sentences
    for i in range(10):
        print("--------------------------------------------------------------------------------------------------------------------------------------------------")
        print('SPANISH -->',ip_raw[i],' || ','ACTUAL ENGLISH -->',op_raw[i],' || ','PREDICTED ENGLISH -->',prediction[i])
        print("--------------------------------------------------------------------------------------------------------------------------------------------------")
    print()
    # printing the BLEU_score
    print("----------------------------------")
    print('Printing BLEU SCORE...')
    print("----------------------------------")
    print('First BLEU score --> %f' % corpus_bleu(actual, prediction, weights=(1.0, 0, 0, 0),smoothing_function=smoothing,auto_reweigh=False))
    print('Second BLEU score --> %f' % corpus_bleu(actual, prediction, weights=(0.5, 0.5, 0, 0),smoothing_function=smoothing,auto_reweigh=False))
    print('Third BLEU score --> %f' % corpus_bleu(actual, prediction, weights=(0.3, 0.3, 0.3, 0),smoothing_function=smoothing,auto_reweigh=False))
    print('Fourth BLEU score --> %f' % corpus_bleu(actual, prediction, weights=(0.25, 0.25, 0.25, 0.25),smoothing_function=smoothing,auto_reweigh=False))
    print("----------------------------------")

### Evaluating the model on training data

In [30]:
bleu_score(model,source_train_seq_pad,source_train,target_train,target_vocab_idx)

progress 99%
--------------------------------------------------------------------------------------------------------------------------------------------------
SPANISH --> cul es bueno  ||  ACTUAL ENGLISH --> which one is good  ||  PREDICTED ENGLISH --> which one is good
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
SPANISH --> te gusta el colegio  ||  ACTUAL ENGLISH --> do you like school  ||  PREDICTED ENGLISH --> do you like school
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
SPANISH --> soy gem

### Evaluating the model on test data

In [31]:
bleu_score(model,source_test_seq_pad,source_test,target_test,target_vocab_idx)

progress 99%
--------------------------------------------------------------------------------------------------------------------------------------------------
SPANISH --> tengo una tos seca  ||  ACTUAL ENGLISH --> i have a dry cough  ||  PREDICTED ENGLISH --> i have a black eye
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
SPANISH --> no es broma  ||  ACTUAL ENGLISH --> i kid you not  ||  PREDICTED ENGLISH --> isn't no joke
--------------------------------------------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------------------------------------------
SPANISH --> firme aqu por fav