In [1]:
# Import des librairies nécessaires
import pandas as pd
import numpy as np 
import tensorflow_datasets as tfds
import tensorflow as tf 
tf.__version__

'2.1.0'

## Import des données 

In [2]:
# Fonction de chargement du document txt
def load_doc(url):
    df = pd.read_csv("https://go.aws/38ECHUB", delimiter="\t", header=None)
    return df

In [3]:
# Chargement du document txt
doc = load_doc("https://go.aws/38ECHUB")
doc.head()

Unnamed: 0,0,1
0,Go.,Va !
1,Hi.,Salut !
2,Run!,Cours !
3,Run!,Courez !
4,Wow!,Ça alors !


In [4]:
# Prenons simplement un sample de 5000 phrases pour éviter un temps de traitement trop long. Si votre machine dépote, on peut bien sûr augmenter sa taille
doc = doc.sample(5000)

In [5]:
# Add a <start> and <end> token 
def begin_end_sentence(sentence):
    sentence = "<start> "+ sentence + " <end>"
    return sentence

In [6]:
# Add <start> and <end> token
doc.iloc[:, 0] = doc.iloc[:, 0].apply(lambda x: begin_end_sentence(x))
#doc.iloc[:, 1] = doc.iloc[:, 1].apply(lambda x: begin_end_sentence(x))

In [7]:
# Import de chacun des langages
import fr_core_news_sm
import en_core_web_sm
nlp_fr = fr_core_news_sm.load()
nlp_en = en_core_web_sm.load()

In [8]:
# Add <start> & <end> special case
from spacy.symbols import ORTH

start_case = [{ORTH:"<start>"}]
end_case = [{ORTH: "<end>"}]

#nlp_fr.tokenizer.add_special_case("<start>", start_case)
#nlp_fr.tokenizer.add_special_case("<end>", end_case)

nlp_en.tokenizer.add_special_case("<start>", start_case)
nlp_en.tokenizer.add_special_case("<end>", end_case)

In [9]:
# Chargement du corpus entier de phrases françaises et anglaises
fr_corpus = " ".join(doc.iloc[:, 1].to_list())
en_corpus = " ".join(doc.iloc[:, 0].to_list())

In [10]:
%%time
import time
# Chargement des deux corpus dans spacy 

nlp_fr.max_length = len(fr_corpus)
nlp_en.max_length = len(en_corpus)

fr_doc = nlp_fr(fr_corpus)
en_doc = nlp_en(en_corpus)

Wall time: 5min 41s


In [1]:
fr_doc

NameError: name 'fr_doc' is not defined

In [12]:
%%time
# Tokenisation de chacune des phrases via spacy 

doc["fr_tokens"] = doc.iloc[:, 1].apply(lambda x: nlp_fr.tokenizer(x))
doc["en_tokens"] = doc.iloc[:, 0].apply(lambda x: nlp_en.tokenizer(x))

Wall time: 2.53 s


In [13]:
doc.tail()

Unnamed: 0,0,1,fr_tokens,en_tokens
97878,<start> Are you going to eat those eggs? <end>,Vas-tu manger ces œufs ?,"(Vas, -, tu, manger, ces, œufs, , ?)","(<start>, Are, you, going, to, eat, those, egg..."
40227,<start> Marriage is a lottery. <end>,Le mariage est une loterie.,"(Le, mariage, est, une, loterie, .)","(<start>, Marriage, is, a, lottery, ., <end>)"
135212,<start> They're expressing their love by huggi...,Ils expriment leur amour en s'enlaçant.,"(Ils, expriment, leur, amour, en, s', enlaçant...","(<start>, They, 're, expressing, their, love, ..."
78744,<start> Tom and Mary know John lied. <end>,Tom et Mary savent que John a menti.,"(Tom, et, Mary, savent, que, John, a, menti, .)","(<start>, Tom, and, Mary, know, John, lied, .,..."
24790,<start> Tom led the attack. <end>,Tom a dirigé l'offensive.,"(Tom, a, dirigé, l', offensive, .)","(<start>, Tom, led, the, attack, ., <end>)"


In [14]:
# Création d'un set() qui va prendre tous les tokens unique de notre corpus de texte
en_tokens = [token.text for token in en_doc]
en_vocabulary_set= set(en_tokens)
en_vocab_size = len(en_vocabulary_set)
print(en_vocab_size)

8262


In [15]:
# Même chose pour le français 
fr_tokens = [token.text for token in fr_doc]
fr_vocabulary_set= set(fr_tokens)
fr_vocab_size = len(fr_vocabulary_set)
print(fr_vocab_size)

12643


In [16]:
en_tokens[:10]

['<start>',
 'Reading',
 'books',
 'is',
 'my',
 'hobby',
 '.',
 '<end>',
 '<start>',
 'Tom']

In [17]:
[word for word in en_vocabulary_set][:10]

['pool',
 'inferior',
 'oak',
 'puddle',
 'winks',
 'birthday',
 'Nice',
 'screamed',
 'rapid',
 'anybody']

In [18]:
# Création d'un id pour chacun des tokens
all_en_tokens = {}
for i,en_token in enumerate(en_vocabulary_set):
    all_en_tokens[en_token] = i+1 # On prend à i+1 pour laisser la valeur 0 pour la création des padded_sequences

all_fr_tokens = {}
for i, fr_token in enumerate(fr_vocabulary_set):
    all_fr_tokens[fr_token] = i+1

In [19]:
# Fonction de distance pour évaluer la proximité entre deux mots.
def distance_edition(m1,m2,cache=None):
    m1 = "  " + m1 + "  "
    m2 = "  " + m2 + "  "
    dist = {}
    dist[-1,-1] = 0
    for i in range(0,len(m1)):
        dist[i,-1] = i
    for j in range(0,len(m2)):
        dist[-1,j] = j

    for i, c in enumerate(m1):
        for j, d in enumerate(m2):
            d1 = dist[i-1,j] + 1 # insertion
            d2 = dist[i,j-1] + 1 # suppression
            x = 0 if c == d else 1
            d3 = dist[i-1,j-1] + x
            dist[i,j] = min(d1,d2,d3)
    return dist[len(m1)-1, len(m2)-1]

In [20]:
# fonction qui renvoie pour un mot donnés, le mot le plus proche parmi ceux qui existent pour le modèle
def closest_existing_word(string) :
    lst = []
    for i in all_fr_tokens.keys() :
        lst.append((distance_edition(string, i), i))
    return (min(lst)[1])

In [21]:
# Création de fonction qui vont créer un vecteur d'indices pour chacune des séquences de tokens
def en_tokens_to_index(tokens):
    indices = []
    for token in tokens:
        indices.append(all_en_tokens[closest_existing_word(token.text)])
    return indices

def fr_tokens_to_index(tokens):
    indices = []
    for token in tokens:
        indices.append(all_fr_tokens[closest_existing_word(token.text)])
    return indices

In [22]:
# Transformation des tokens en indices
doc["fr_indices"] = doc["fr_tokens"].apply(lambda x: fr_tokens_to_index(x))
doc["en_indices"] = doc["en_tokens"].apply(lambda x: en_tokens_to_index(x))

KeyboardInterrupt: 

In [None]:
doc.tail()

In [None]:
# Création d'une fonction qui va compter la longueur maximum d'une phrase
def max_len(lines):
    return max(len(line) for line in lines)

In [None]:
# Application de la fonction sur les tokens français et anglais 
fr_max_len = max_len(doc['fr_indices'].to_list())
en_max_len = max_len(doc['en_indices'].to_list())

In [None]:
%%time
# Utilisation de Keras pour créer des séquences de tokens de la même longueur

padded_fr_indices = tf.keras.preprocessing.sequence.pad_sequences(doc["fr_indices"], maxlen=fr_max_len, padding="post")
padded_en_indices = tf.keras.preprocessing.sequence.pad_sequences(doc["en_indices"], maxlen=en_max_len, padding="post")

In [None]:
padded_en_indices

In [None]:
# Création de variables que l'on va réutiliser pour nos modèles
BATCH_SIZE = 64
TAKE_SIZE = int(0.7*len(doc)/BATCH_SIZE)
BUFFER_SIZE = TAKE_SIZE * BATCH_SIZE
steps_per_epoch = TAKE_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = fr_vocab_size
vocab_tar_size = en_vocab_size

In [None]:
# Create a tensorflow dataset complet
tf_ds = tf.data.Dataset.from_tensor_slices((padded_fr_indices, padded_en_indices))

In [None]:
# Shuffle & Batch
tf_ds = tf_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Train Test Split
train_data = tf_ds.take(TAKE_SIZE)
test_data = tf_ds.skip(TAKE_SIZE)

In [None]:
input_text, output_text = next(iter(train_data))
print(input_text.numpy().shape)
print(output_text.numpy().shape)

In [None]:
# Encoder 
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [None]:
encoder = Encoder(vocab_inp_size +1, embedding_dim, units, BATCH_SIZE)

# Echantillon d'output
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(input_text, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
    # hidden shape == (batch_size, hidden size)
    # hidden_with_time_axis shape == (batch_size, 1, hidden size)
    # Ceci est fait pour pour calculer notre score "d'attention"
        hidden_with_time_axis = tf.expand_dims(query, 1)

    # score shape == (batch_size, max_length, 1)
    # On obtient 1 sur le dernier axe car on applique le score à self.V
    # La shape du tenseur avant que l'on applique self.V est (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(values) + self.W2(hidden_with_time_axis)))

    # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

    # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [None]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

In [None]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

    # Utilisé pour attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape après embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

    # x shape après concaténation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # Passage du vecteur concaténé à la couche GRU
        output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [None]:
decoder = Decoder(vocab_tar_size + 1, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

# Loss

In [None]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [None]:
import os
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# Training 

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([all_en_tokens["<start>"]] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
steps_per_epoch = TAKE_SIZE

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(train_data.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 10 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  
  # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [None]:
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
for example, label in test_data.take(10):

    hidden = [tf.zeros((1, units))]
    input_t = example[0]
    output_label = label[0]
    enc_out, enc_hidden = encoder(tf.expand_dims(input_t, axis=0), hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([all_en_tokens["<start>"]], 0)

    result = ""
    for t in range(padded_fr_indices.shape[-1]):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                          dec_hidden,
                                                          enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        corresponding_word = [word for word, id in all_en_tokens.items() if id==predicted_id]
        result += corresponding_word[0] + " "

        if corresponding_word[0] == '<end>':
            break

    # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    input_sentence = ""
    for token_id in input_t:
        if token_id==0:
            break
    
        corresponding_word = [word for word, id in all_fr_tokens.items() if id==token_id]
        input_sentence += corresponding_word[0] + " "
        if corresponding_word[0] == "<end>":
            break

        true_translation = ""
    for token_id in output_label:
        if token_id==0:
            break
        corresponding_word = [word for word, id in all_en_tokens.items() if id==token_id]
        true_translation += corresponding_word[0] + " "
        if corresponding_word[0] == "<end>":
            break 

    true_translation = true_translation[8:]
    print("French sentence: {}".format(input_sentence))
    print("True translation: {}".format(true_translation))
    print("Modl translation: {}".format(result))
        

# Application concrete

In [None]:
phrase = input()
new_df = pd.DataFrame({'en': ["An English sentence"], 'fr' : [phrase]})
new_df.iloc[:, 0] = new_df.iloc[:, 0].apply(lambda x: begin_end_sentence(x))
new_fr_corpus = " ".join(new_df.iloc[:, 1].to_list())
new_en_corpus = " ".join(new_df.iloc[:, 0].to_list())
new_df["fr_tokens"] = new_df.iloc[:, 1].apply(lambda x: nlp_fr.tokenizer(x))
new_df["en_tokens"] = new_df.iloc[:, 0].apply(lambda x: nlp_en.tokenizer(x))
new_df["fr_indices"] = new_df["fr_tokens"].apply(lambda x: fr_tokens_to_index(x))
new_df["en_indices"] = new_df["en_tokens"].apply(lambda x: en_tokens_to_index(x))
new_padded_fr_indices = tf.keras.preprocessing.sequence.pad_sequences(new_df["fr_indices"], maxlen=fr_max_len, padding="post")
new_padded_en_indices = tf.keras.preprocessing.sequence.pad_sequences(new_df["en_indices"], maxlen=en_max_len, padding="post")
new_tf_ds = tf.data.Dataset.from_tensor_slices((new_padded_fr_indices, new_padded_en_indices))
new_tf_ds = new_tf_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=False)
test = new_tf_ds.skip(0)
for example, label in test:

    hidden = [tf.zeros((1, units))]
    input_t = example[0]
    output_label = label[0]
    enc_out, enc_hidden = encoder(tf.expand_dims(input_t, axis=0), hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([all_en_tokens["<start>"]], 0)

    result = ""
    for t in range(new_padded_fr_indices.shape[-1]):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                          dec_hidden,
                                                          enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        corresponding_word = [word for word, id in all_en_tokens.items() if id==predicted_id]
        result += corresponding_word[0] + " "

        if corresponding_word[0] == '<end>':
            break
    
    # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    input_sentence = ""
    for token_id in input_t:
        if token_id==0:
            break
    
        corresponding_word = [word for word, id in all_fr_tokens.items() if id==token_id]
        input_sentence += corresponding_word[0] + " "
        if corresponding_word[0] == "<end>":
            break

    true_translation = true_translation[8:]
    print("French sentence: {}".format(input_sentence))
    print("Model translation: {}".format(result))
        

In [None]:
corresponding_word = [word for word, id in all_en_tokens.items() if id==predicted_id]
print(corresponding_word)

In [None]:
all_en_tokens.items()

In [None]:
pip install jupyterthemes

In [None]:
!jt -t chesterish

In [None]:
def distance_edition(m1,m2,cache=None):
    m1 = "  " + m1 + "  "
    m2 = "  " + m2 + "  "
    dist = {}
    dist[-1,-1] = 0
    for i in range(0,len(m1)):
        dist[i,-1] = i
    for j in range(0,len(m2)):
        dist[-1,j] = j

    for i, c in enumerate(m1):
        for j, d in enumerate(m2):
            d1 = dist[i-1,j] + 1 # insertion
            d2 = dist[i,j-1] + 1 # suppression
            x = 0 if c == d else 1
            d3 = dist[i-1,j-1] + x
            dist[i,j] = min(d1,d2,d3)
    return dist[len(m1)-1, len(m2)-1]

In [None]:
distance_edition("Checks", "checks")

In [None]:
def fr_tokens_to_index(tokens):
    indices = []
    for token in tokens:
        indices.append(all_fr_tokens[closest_existing_word(token.text)])
    return indices

In [None]:
def closest_existing_word(string) :
    lst = []
    for i in all_fr_tokens.keys() :
        lst.append((distance_edition(string, i), i))
    return (min(lst)[1])

In [None]:
    lst = []
    for i in all_fr_tokens.keys() :
        lst.append((distance_edition("il", i), i))
    lst.sort()
    lst

In [None]:
all_fr_tokens.keys()

In [None]:
new_df["fr_tokens"]

In [None]:
all_fr_tokens