In [2]:
import io
import re
import pickle
import numpy as np 
import pandas as pd
from tqdm import tqdm, tqdm_gui
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Seed for reproducible shuffling (passed to DataFrame.sample further down).
random_state = 20

In [4]:
# Read the tab-separated sentence pairs. Only the first two fields of each
# line are used: field 0 is the English sentence, field 1 the French one.
eng, fr = [], []
# The encoding is given explicitly so that accented characters and the narrow
# no-break space (U+202F) decode the same way on every platform.
# NOTE(review): the file is assumed to be UTF-8 -- confirm against the download.
with open('../../../Downloads/fra-eng/fra.txt', 'r', encoding='utf-8') as file:
    # Iterate lazily instead of materialising every line with readlines().
    for line in file:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Blank or malformed line: there is no French side to pair with.
            continue
        eng.append(fields[0])
        # Replace narrow no-break spaces (placed before French punctuation)
        # with ordinary spaces.
        fr.append(fields[1].replace('\u202f', ' '))

# Convert to pandas dataframe
data = pd.DataFrame({'english': eng, 'french': fr})

In [5]:
# Preview the first rows of the raw sentence pairs.
data.head()

Unnamed: 0,english,french
0,Go.,Va !
1,Hi.,Salut !
2,Hi.,Salut.
3,Run!,Cours !
4,Run!,Courez !


In [6]:
# (rows, columns) of the raw dataset.
data.shape

(175623, 2)

In [7]:
# Ordered (pattern, replacement) pairs used by clean_en_text. Order matters:
# the specific forms ("won't", "can't") must be rewritten before the generic
# "n't" rule, and the explicit "<word>'s" forms before the catch-all "'s" rule.
_EN_CONTRACTIONS = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"'ll", " will"),
    (r"'ve", " have"),
    (r"'re", " are"),
    (r"'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"'s", ""),
    (r"n'", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
    (r"' ", " "),
)

# One character class holding every punctuation symbol to blank out. The
# square brackets and the backslash are escaped so they are members of the
# class instead of terminating it early.
_EN_PUNCTUATION = re.compile(r'[-()"#/@;:<>{}`+=~|.!?,$%&^_\\\[\]]')


def clean_en_text(text):
    '''Lower-case an English sentence, expand contractions and strip punctuation.

    Parameters
    ----------
    text : str
        Raw English sentence.

    Returns
    -------
    str
        The lower-cased sentence with the contractions listed in
        ``_EN_CONTRACTIONS`` rewritten, the symbols in ``_EN_PUNCTUATION``
        replaced by spaces, and runs of whitespace collapsed to one space.
    '''
    text = text.lower()

    for pattern, replacement in _EN_CONTRACTIONS:
        text = re.sub(pattern, replacement, text)

    # Replace punctuation by spaces (not by nothing) so that words separated
    # only by a symbol, e.g. "a-b", do not get glued together.
    text = _EN_PUNCTUATION.sub(" ", text)

    return " ".join(text.split())

def remove_symb(text):
    """
    Replace every punctuation symbol in the list below with a space
    and collapse the resulting whitespace to single spaces.
    """
    symbols = "[-()\"#/@;:<>{}`+=~|.!?,]$%&^_\\*"
    # Map each symbol to a space in a single pass over the text.
    to_space = str.maketrans(symbols, ' ' * len(symbols))
    words = text.translate(to_space).split()
    return ' '.join(words)

def tagger(text):
    """
    Wrap a sentence in the start (<BOS>) and end (<EOS>) markers,
    each separated from the text by a single space.
    """
    return '<BOS> {} <EOS>'.format(text)

In [8]:
# English side: expand contractions, then strip punctuation symbols.
data.english = data.english.map(clean_en_text).map(remove_symb)

# French side: strip punctuation, lower-case, then wrap each sentence in
# <BOS> ... <EOS> because French is the decoder (target) language.
# NOTE: the columns are overwritten in place, so running this cell a second
# time would wrap the already-tagged French sentences in a second pair of tags.
data.french = data.french.map(remove_symb).str.lower().map(tagger)

### Split data
After shuffling, the last 1,000 rows are held out for testing; the remaining rows are used for training.

In [9]:
# Shuffle every row (frac=1) with a fixed seed, then renumber the index from 0.
shuffled_data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [10]:
# Preview the shuffled, cleaned pairs.
shuffled_data.head()

Unnamed: 0,english,french
0,would it be ethical to sacrifice one person to...,<BOS> serait il moral de sacrifier une personn...
1,we will sort this out tomorrow,<BOS> nous réglerons cela demain <EOS>
2,fur coats are on sale,<BOS> les manteaux de fourrure sont en promoti...
3,i have never told anyone that before,<BOS> je n'ai jamais dit cela à quiconque aupa...
4,i must warn them,<BOS> il me faut les prévenir <EOS>


In [11]:
# Hold out the final rows of the shuffled frame as the test set; everything
# before them is the training set.
n_test = 1000
train = shuffled_data.iloc[:-n_test]
test = shuffled_data.iloc[-n_test:]

### Tokenization

In [12]:
# Training sentences for each language
corpus_en = train.english.values
corpus_fr = train.french.values

# One tokenizer per language, each fitted on its own training corpus
tokenizer_en = Tokenizer(num_words=70000)
tokenizer_en.fit_on_texts(corpus_en)

tokenizer_fr = Tokenizer(num_words=70000)
tokenizer_fr.fit_on_texts(corpus_fr)

# Word <-> index lookups. These names refer to the tokenizers' own
# dictionaries, so the additions below are also visible through
# tokenizer_fr.word_index / tokenizer_fr.index_word.
en_word_to_int, int_to_en_word = tokenizer_en.word_index, tokenizer_en.index_word
fr_word_to_int, int_to_fr_word = tokenizer_fr.word_index, tokenizer_fr.index_word

# Register the literal <BOS> and <EOS> tags in the French (decoder) vocabulary,
# using the two indices right after the largest one the tokenizer assigned.
# NOTE(review): presumably needed because the tokenizer's default filtering and
# lower-casing do not keep the tags in this exact form -- confirm.
max_indx = max(int_to_fr_word)
for offset, tag in enumerate(('<BOS>', '<EOS>'), start=1):
    fr_word_to_int[tag] = max_indx + offset
    int_to_fr_word[max_indx + offset] = tag

In [13]:
# Vocabulary words of each language, as plain lists
vocab_list_en = list(en_word_to_int)
vocab_list_fr = list(fr_word_to_int)

# Sizes used for the embedding matrices and the output layer: one more than
# the number of words, which leaves row 0 free.
# NOTE(review): presumably row 0 is the padding index -- confirm.
vocab_size_en = 1 + len(vocab_list_en)
vocab_size_fr = 1 + len(vocab_list_fr)

# Width of each word vector and number of training sentence pairs
embedding_dim = 100
samples_size = len(train)

### Word Embedding

In [14]:
def load_vectors(fname, vocab, dim=None):
    """
    Load fastText text-format embeddings, keeping only the words that
    appear in the vocabulary.

    Parameters
    ----------
    fname : str
        Path to a ``.vec`` text file whose first line holds two integers
        and whose remaining lines are ``word v1 v2 ...`` separated by spaces.
    vocab : iterable of str
        Words to keep; every other line of the file is skipped.
    dim : int, optional
        Number of leading vector components to keep for each word.
        Defaults to the notebook-level ``embedding_dim``.

    Returns
    -------
    dict
        Maps each retained word to a list of ``dim`` floats.
    """
    if dim is None:
        # Fall back to the notebook-wide setting, as the original code did.
        dim = embedding_dim

    # Set membership is O(1); testing against a list would rescan the whole
    # vocabulary for every line of the vector file.
    wanted = set(vocab)
    dataset = {}

    # The context manager guarantees the file is closed, even on error.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line: two integers, the first of which is used as the
        # progress-bar total.
        n, _ = map(int, fin.readline().split())
        for line in tqdm(fin, total=n):
            tokens = line.rstrip().split(' ')
            if tokens[0] in wanted:
                dataset[tokens[0]] = list(map(float, tokens[1:dim + 1]))
    return dataset

In [15]:
# # Load embedding vectors
# data_en = load_vectors('../../../Downloads/cc.en.300.vec', vocab_list_en)
# print('English is done.')
# data_fr = load_vectors('../../../Downloads/cc.fr.300.vec', vocab_list_fr)
# print('French is done.')

In [16]:
# # Save embedded dictionary
# with open('./Data/Embeddings/fast_text_french_word_vec.pickle', 'wb') as f:
#     pickle.dump(data_fr, f)

# with open('./Data/Embeddings/fast_text_english_word_vec.pickle', 'wb') as f:
#     pickle.dump(data_en, f)

In [17]:
# Load the cached word -> vector dictionaries from disk.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load pickle files that you created yourself or otherwise trust.
with open('./Data/Embeddings/fast_text_french_word_vec.pickle', 'rb') as fr_pickle:
    embed_vec_fr = pickle.load(fr_pickle)

with open('./Data/Embeddings/fast_text_english_word_vec.pickle', 'rb') as en_pickle:
    embed_vec_en = pickle.load(en_pickle)

In [18]:
# One weight matrix per language: row k holds the vector of the word whose
# index is k. Rows for words without a pretrained vector stay all-zero.
embedding_matrix_en = np.zeros((vocab_size_en, embedding_dim))
embedding_matrix_fr = np.zeros((vocab_size_fr, embedding_dim))

for matrix, word_to_int, vectors in (
        (embedding_matrix_en, en_word_to_int, embed_vec_en),
        (embedding_matrix_fr, fr_word_to_int, embed_vec_fr)):
    for word, row in word_to_int.items():
        if word in vectors:
            matrix[row] = vectors[word]

In [19]:
# Maximum sentence length, counted in whitespace-separated tokens
max_len_en = max(len(sentence.split()) for sentence in train.english)
max_len_fr = max(len(sentence.split()) for sentence in train.french)

# # Inputs
# encoder_inputs = tokenizer_en.texts_to_sequences(corpus_en)
# decoder_inputs = tokenizer_fr.texts_to_sequences(corpus_fr)

# # Pad Sentence
# encoder_inputs = pad_sequences(encoder_inputs, padding='post', maxlen=max_len_en)
# decoder_inputs = pad_sequences(decoder_inputs, padding='post', maxlen=max_len_fr)

In [20]:
# Pre-allocate the zero-filled arrays consumed by the seq2seq model:
# word ids per position for the encoder and decoder inputs ...
encoder_input_data = np.zeros((samples_size, max_len_en), dtype=np.float32)
decoder_input_data = np.zeros((samples_size, max_len_fr), dtype=np.float32)

# ... and a one-hot vector over the French vocabulary per target position.
# NOTE(review): this dense tensor holds
# samples_size * max_len_fr * vocab_size_fr float32 values, which can be far
# more than available RAM; integer targets with a sparse cross-entropy loss
# would avoid it -- worth considering.
decoder_target_data = np.zeros((samples_size, max_len_fr, vocab_size_fr),
                               dtype=np.float32)

In [21]:
# Fill the pre-allocated input and target arrays, one sentence pair per row.
# tqdm wraps the zip (not the enumerate) and is given an explicit total,
# because a zip object has no length and the bar would otherwise show
# neither a percentage nor an ETA.
for i, (input_text, target_text) in enumerate(
        tqdm(zip(corpus_en, corpus_fr), total=len(corpus_en))):
    # Encoder input: word id of each English token, left-aligned in the row.
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = en_word_to_int[word]

    for t, word in enumerate(target_text.split()):
        # Look the id up once and use it for both arrays.
        word_id = fr_word_to_int[word]
        decoder_input_data[i, t] = word_id
        if t > 0:
            # decoder_target_data is ahead of decoder_input_data by one
            # timestep and does not include the start word.
            decoder_target_data[i, t - 1, word_id] = 1.

174623it [00:08, 20156.37it/s]


In [26]:
# Model parameters
num_units = 128  # size passed to both the encoder and the decoder LSTM
batch_size = 100  # batch size passed to model.fit
epochs = 1  # number of epochs passed to model.fit

In [27]:
# Define an input sequence and process it.
# The encoder takes a batch of word-id sequences of unspecified length.
encoder_inputs = Input(shape=(None,))
# Embed the English word ids with the precomputed matrix; trainable=False
# keeps these weights frozen during training.
# NOTE(review): input_length=max_len_en is passed although the Input above
# declares an unspecified length (None) -- confirm which one is intended.
encoder_embedding_inputs = Embedding(vocab_size_en, embedding_dim,
                               weights=[embedding_matrix_en],
                               input_length=max_len_en,
                               trainable=False)(encoder_inputs)

# encoder_embedding(encoder_embedding_inputs)
# return_state=True makes the LSTM return its final hidden and cell states
# in addition to its output.
encoder = LSTM(num_units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedding_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
# Note: `decoder_embedding` is the *output tensor* of the Embedding layer
# applied to `decoder_inputs`; the layer object itself is not kept in a variable.
decoder_embedding = Embedding(vocab_size_fr, embedding_dim,
                               weights=[embedding_matrix_fr],
                               input_length=max_len_fr,
                               trainable=False)(decoder_inputs)

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the 
# return states in the training model, but we will use them in inference.
decoder = LSTM(num_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder(decoder_embedding,
                                initial_state=encoder_states)
# Softmax over the French vocabulary, applied to the decoder LSTM output.
decoder_dense = Dense(vocab_size_fr, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [28]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 100)    1407600     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    2725200     input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
# Compile with the RMSprop optimizer and categorical cross-entropy loss, then
# train on the prepared arrays with a validation_split of 0.2.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

In [None]:
# Encoder inference model: maps the encoder input to the encoder LSTM's
# final states [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference model: takes the decoder token input plus the previous
# LSTM states, and returns the softmax output plus the updated states.
decoder_state_input_h = Input(shape=(num_units,))
decoder_state_input_c = Input(shape=(num_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# The decoder LSTM must receive the *embedded* tokens (`decoder_embedding`),
# exactly as in the training graph. Passing the raw integer `decoder_inputs`
# gives the layer a rank-2 tensor where it was built for rank-3 embedded input.
decoder_outputs, state_h, state_c = decoder(
    decoder_embedding, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states)