In [1]:
import io
import re
import pickle
import numpy as np 
import pandas as pd
from tqdm import tqdm, tqdm_gui
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
random_state = 20

In [3]:
eng, fr = [], []
with open('../../../Downloads/fra-eng/fra.txt', 'r') as file:
    for line in file.readlines():
        lines = line.split('\t')[:2]
        eng.append(lines[0])
        fr.append(' '.join(lines[1].split("\u202f")))

        
# Convert to pandas dataframe
data = pd.DataFrame({'english': eng, 'french': fr})

In [4]:
data.head()

Unnamed: 0,english,french
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


In [5]:
data.shape

(175623, 2)

In [6]:
def clean_en_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'s", "", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"' ", " ", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]$%&^_\\", " ", text)
    text = " ".join(text.split())
    
    return text

def remove_symb(text):
    for symb in list("[-()\"#/@;:<>{}`+=~|.!?,]$%&^_\\*"):
        text = text.replace(symb, ' ')
    return ' '.join(text.split())

def tagger(text):
    """
    Add <BOS> and <EOS> at beginning and 
    end of sentence.
    """
    return f'<BOS> {text} <EOS>'

In [7]:
# Clean text data
data.english = data.english.apply(clean_en_text)
data.english = data.english.apply(remove_symb)
data.french = data.french.apply(remove_symb)
data.french = data.french.str.lower()

# Apply tagger and insert (<EOS> and <BOS>) to decoder texts
data.french = data.french.apply(tagger)

### Split data
We will be reserving the last 1000 rows for testing.

In [8]:
train_en, test_en, train_fr, test_fr = train_test_split(data.english.values,
                                                      data.french.values,
                                                      test_size=0.001,
                                                      random_state=random_state)

In [9]:
len(test_en), len(test_fr)

(176, 176)

### Tokenization

In [10]:
# Initialize tokenizers
tokenizer_en = Tokenizer(num_words=70000)
tokenizer_fr = Tokenizer(num_words=70000)

# Fit tokenizer
tokenizer_en.fit_on_texts(train_en)
tokenizer_fr.fit_on_texts(train_fr)

# Create word and index dictionary for english
en_word_to_int = tokenizer_en.word_index
int_to_en_word = tokenizer_en.index_word

# Create word and index dictionary for french
fr_word_to_int = tokenizer_fr.word_index
int_to_fr_word = tokenizer_fr.index_word

# Add BOS and EOS to encoder vocabolary
max_indx = sorted(int_to_fr_word)[-1]
fr_word_to_int['<BOS>'] = max_indx + 1
fr_word_to_int['<EOS>'] = max_indx + 2

int_to_fr_word[max_indx + 1] = '<BOS>'
int_to_fr_word[max_indx + 2] = '<EOS>'

In [11]:
# Create vocabulary word list
vocab_list_en = list(en_word_to_int.keys())
vocab_list_fr = list(fr_word_to_int.keys())

# Set parameters
vocab_size_en = len(vocab_list_en) + 1
vocab_size_fr = len(vocab_list_fr) + 1
embedding_dim = 100
samples_size = len(train_en)

### Word Embedding

In [12]:
def load_vectors(fname, vocab):
    """
    Loads fasttext embedding and return only words and vectors 
    in the vocabulary list.
    """
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    dataset = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        if tokens[0] in vocab:
            dataset[tokens[0]] = list(map(float, tokens[1:embedding_dim + 1]))
    return dataset

In [13]:
# # Load embedding vectors
# data_en = load_vectors('../../../Downloads/cc.en.300.vec', vocab_list_en)
# print('English is done.')
# data_fr = load_vectors('../../../Downloads/cc.fr.300.vec', vocab_list_fr)
# print('French is done.')

In [14]:
# # Save embedded dictionary
# with open('./Data/Embeddings/fast_text_french_word_vec.pickle', 'wb') as f:
#     pickle.dump(data_fr, f)

# with open('./Data/Embeddings/fast_text_english_word_vec.pickle', 'wb') as f:
#     pickle.dump(data_en, f)

In [15]:
# Load embedded dictionary
with open('./Data/Embeddings/fast_text_french_word_vec.pickle', 'rb') as p:
    embed_vec_fr = pickle.load(p)


with open('./Data/Embeddings/fast_text_english_word_vec.pickle', 'rb') as p:
    embed_vec_en = pickle.load(p)

In [16]:
# create a weight matrix for words in training docs
embedding_matrix_en = np.zeros((vocab_size_en, embedding_dim))
embedding_matrix_fr = np.zeros((vocab_size_fr, embedding_dim))

for en_word, i in en_word_to_int.items():
    if en_word in embed_vec_en:
        embedding_matrix_en[i] = embed_vec_en[en_word]
    
    
for fr_word, j in fr_word_to_int.items():
    if fr_word in embed_vec_fr:
        embedding_matrix_fr[j] = embed_vec_fr[fr_word]

In [17]:
# Maximun length of sententence
max_len_en = max(len(seq.split()) for seq in train_en)
max_len_fr = max(len(seq.split()) for seq in train_fr)

# max_len_en = max(train.english.apply(lambda x: len(x.split())))
# max_len_fr = max(train.french.apply(lambda x: len(x.split())))

# # Inputs
# encoder_inputs = tokenizer_en.texts_to_sequences(corpus_en)
# decoder_inputs = tokenizer_fr.texts_to_sequences(corpus_fr)

# # Pad Sentence
# encoder_inputs = pad_sequences(encoder_inputs, padding='post', maxlen=max_len_en)
# decoder_inputs = pad_sequences(decoder_inputs, padding='post', maxlen=max_len_fr)

In [18]:
# # initiate numpy arrays to hold the data that our seq2seq model will use:
# encoder_input_data = np.zeros(
#                             (samples_size, max_len_en),
#                             dtype='float32')
# decoder_input_data = np.zeros(
#                             (samples_size, max_len_fr),
#                             dtype='float32')
# decoder_target_data = np.zeros(
#                             (samples_size, max_len_fr, vocab_size_fr),
#                             dtype='float32')

In [19]:
# # Process samples, to get input, output, target data:
# for i, (input_text, target_text) in tqdm(enumerate(zip(corpus_en, corpus_fr))):
# #     print(input_text)
#     for t, word in enumerate(input_text.split()):
        
#         encoder_input_data[i, t] = en_word_to_int[word]
        
#     for t, word in enumerate(target_text.split()):
#         # decoder_target_data is ahead of decoder_input_data by one timestep
#         decoder_input_data[i, t] = fr_word_to_int[word]
#         if t > 0:
#             # decoder_target_data will be ahead by one timestep
#             # and will not include the start word.
#             decoder_target_data[i, t - 1, fr_word_to_int[word]] = 1.

### Extract validation data

In [20]:
# Slicing training and validation
X_train = train_en[:-1000]
y_train = train_fr[:-1000]
val_en = train_en[-1000:]
val_fr = train_fr[-1000:]

In [21]:
def extract_batch_data(X, y, batch):
    
    while True:
        for j in range(0, len(X), batch):
            # initiate numpy arrays to hold the data that our seq2seq model will use:
            encoder_input_data1 = np.zeros((batch, max_len_en),
                                        dtype='float32')
            decoder_input_data1 = np.zeros((batch, max_len_fr),
                                        dtype='float32')
            decoder_target_data1 = np.zeros((batch, max_len_fr, vocab_size_fr),
                                        dtype='float32')

            # Process samples, to get input, output, target data:
            for i, (input_text1, target_text1) in enumerate(zip(X[j:j+batch], y[j:j+batch])):
            #  print(input_text)
                for t, word in enumerate(input_text1.split()):

                    encoder_input_data1[i, t] = en_word_to_int[word]

                for t, word in enumerate(target_text1.split()):
                    # decoder_target_data is ahead of decoder_input_data by one timestep
                    decoder_input_data1[i, t] = fr_word_to_int[word]
                    if t > 0:
                        # decoder_target_data will be ahead by one timestep
                        # and will not include the start word.
                        decoder_target_data1[i, t - 1, fr_word_to_int[word]] = 1.
                        
            yield([encoder_input_data1, decoder_input_data1], decoder_target_data1)

In [22]:
# shuffled_data.english[shuffled_data.english.str.contains('offs')].values

In [23]:
# train.english[train.english.str.contains('offs')].values

In [24]:
# test.english[test.english.str.contains('offs')]#.values

In [75]:
# Model parameters
num_units = 128 
batch_size = 100
epochs = 3

In [26]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
encoder_embedding_layer = Embedding(vocab_size_en, embedding_dim,
                               weights=[embedding_matrix_en],
                               input_length=max_len_en,
                               trainable=False)

# encoder_embedding(encoder_embedding_inputs)
encoder_embedding_inputs = encoder_embedding_layer(encoder_inputs)


encoder = LSTM(num_units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedding_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(vocab_size_fr, embedding_dim,
                               weights=[embedding_matrix_fr],
                               input_length=max_len_fr,
                               trainable=False)

decoder_embedding_inputs = decoder_embedding_layer(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder = LSTM(num_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_embedding_inputs,
                                initial_state=encoder_states)
decoder_dense = Dense(vocab_size_fr, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [27]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 100)    1410900     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    2729800     input_2[0][0]                    
______________________________________________________________________________________________

In [76]:
# Compile model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [77]:
val_samples = test_en.shape[0]

In [None]:
history=model.fit_generator(generator=extract_batch_data(X_train, y_train, batch=batch_size),
                    steps_per_epoch=samples_size//batch_size,
                    epochs=epochs,
                    validation_data=extract_batch_data(val_en, val_fr, batch=batch_size),
                    validation_steps = val_samples//batch_size)

Epoch 1/3
 109/1754 [>.............................] - ETA: 4:14:43 - loss: 0.6073

In [67]:
# # Save models
# model.save('./Data/models/model.h5')
# model.save_weights('./Data/models/weight.h5')

In [None]:
# # Model training
# model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
#           batch_size=batch_size,
#           epochs=epochs,
#           validation_split=0.2)

### Inference model

In [43]:
# Encoder inference model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model
decoder_state_input_h = Input(shape=(num_units,))
decoder_state_input_c = Input(shape=(num_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder(
                                    decoder_embedding_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                        [decoder_outputs] + decoder_states)

### Decoding sequence

In [56]:
def decode_sequence(input_seq, max_decoder_seq_length=70):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first word of target sequence with the BOS.
    target_seq[0, 0] = fr_word_to_int['<BOS>']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = int_to_fr_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_word

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == '<EOS>' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence

### Results

In [74]:
test_data_sequence = extract_batch_data(test_en, test_fr, batch=1)
k=-1
#Adam
k+=1
(input_seq, actual_output), _ = next(test_data_sequence)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence as per data =>>', test_en[k:k+1][0])
print('Actual French Translation as per data =>>', test_fr[k:k+1][0][6:-4])
print('Predicted French Translation predicted by model =>>', decoded_sentence[:-4])

Input English sentence as per data =>> would it be ethical to sacrifice one person to save many
Actual French Translation as per data =>> serait il moral de sacrifier une personne pour en sauver plusieurs <
Predicted French Translation predicted by model =>>  il pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas pas
