## **Sequence-to-sequence learning : EN to FR translation** - revision 01
## **Using pre-trained models**

### **1. Small_vocab translation**

In [1]:
import os

def load_data(path, max_lines=140000):
    """Read a UTF-8 text file and return its lines, lower-cased.

    Args:
        path: Location of the text file to read.
        max_lines: Maximum number of lines to return. Defaults to 140000.

    Returns:
        A list of at most ``max_lines`` lower-cased lines without their
        newline characters. The empty entry produced by a newline at the
        very end of the file is not returned.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = f.read()

    # Convert upper case to lower case, then cut on newlines
    lines = data.lower().split('\n')

    # A file that ends with a newline yields one extra empty "line";
    # drop it so it does not become an empty sentence downstream.
    if lines and lines[-1] == "":
        lines.pop()

    return lines[:max_lines]

# Load the texts in both languages (the number of lines is capped by load_data)
txt_en = load_data('../data/small_vocab_en')
txt_fr = load_data('../data/small_vocab_fr')

# Pair each English line with its French counterpart; the French side has
# 'à' replaced by 'a' and is wrapped in [start] / [end] markers.
text_pairs = []
for idx, en_sentence in enumerate(txt_en):
    fr_sentence = txt_fr[idx].replace('à', 'a')
    txt_fr[idx] = fr_sentence
    text_pairs.append((en_sentence, f"[start] {fr_sentence} [end]"))
    

In [2]:
import random
# Show two randomly drawn (English, French) pairs, one per line
print(*(random.choice(text_pairs) for _ in range(2)), sep="\n")

('california is sometimes warm during december , and it is never mild in march .', '[start] californie est parfois chaud en décembre , et il est doux jamais en mars . [end]')
('china is beautiful during july , and it is never quiet in march .', '[start] chine est belle en juillet , et il est jamais tranquille en mars . [end]')


**Vectorizing the English and French text pairs**

In [3]:
import tensorflow as tf
import string
import re
from tensorflow import keras
from tensorflow.keras import layers

# Characters to delete during standardization: ASCII punctuation plus "¿",
# except the square brackets.
strip_chars = (string.punctuation + "¿").replace("[", "").replace("]", "")

def custom_standardization(input_string):
    """Lower-case a string tensor and delete every character in ``strip_chars``.

    Args:
        input_string: String tensor to standardize.

    Returns:
        The lower-cased tensor with all ``strip_chars`` characters removed.
    """
    # Character class matching any single character listed in strip_chars
    pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, pattern, "")

def load_vocab(file_path):
    """Read a vocabulary file holding one token per line.

    Args:
        file_path: Path of the UTF-8 vocabulary file.

    Returns:
        The list of tokens in file order. Empty lines are kept (an empty
        first line stays an empty token); only the empty entry produced by
        a newline at the very end of the file is dropped.
    """
    with open(file_path, "r",  encoding="utf-8") as file:
        tokens = file.read().split('\n')

    # Drop the last entry only when it is the artefact of a final newline;
    # slicing it off unconditionally would lose the last real token of a
    # file that does not end with a newline.
    if tokens and tokens[-1] == "":
        tokens.pop()
    return tokens

# Vocabulary cap and output length used by both vectorizers
vocab_size = 15000
sequence_length = 30

# English side: vocabulary read from file, sequences of `sequence_length` ids
source_vectorization = layers.TextVectorization(
    vocabulary=load_vocab("../data/eng_vocab.txt"),
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
    output_mode="int",
)
# French side: same settings, but sequences are one position longer
target_vectorization = layers.TextVectorization(
    vocabulary=load_vocab("../data/fra_vocab.txt"),
    max_tokens=vocab_size,
    output_sequence_length=sequence_length + 1,
    output_mode="int",
    # NOTE(review): `standardize=custom_standardization` is intentionally not
    # passed here, so the layer's default standardization applies — confirm
    # this matches how the saved vocabulary and models were produced.
)

# Split the (english, french) pairs into two parallel lists
english_texts = [en for en, _ in text_pairs]
french_texts = [fr for _, fr in text_pairs]


**Defining Sample sentences for translation**

In [4]:
# Draw 20 English sentences at random (with replacement) to translate below
input_sentence = [random.choice(english_texts) for _ in range(20)]

### **2. Sequence-to-sequence learning with RNNs**

**Loading of the trained RNN model**

In [5]:
# Restore the trained RNN sequence-to-sequence model from its .h5 file
seq2seq_rnn = keras.models.load_model("../data/seq2seq_rnn-fra-en-model.h5")

In [6]:
# seq2seq_rnn.load_weights("../data/seq2seq_rnn-fra-en-model.weights.h5")

In [7]:
import os
import h5py


def split(fname_src: str, fname_dest_prefix: str, maxsize_per_file: float):
    """
    Splits an `h5` file into smaller parts.

    Top-level groups of `fname_src` are copied one by one into part files
    named `{fname_dest_prefix}{idx}.h5`. After each copy the part file is
    flushed and its on-disk size is checked; once it exceeds
    `maxsize_per_file` the part is closed and the next group starts a new
    part. A part can therefore end up larger than the limit, because a
    group is never divided. The root attributes of the source are copied
    into every part.

    Args:
        fname_src: Path of the `h5` file to split.
        fname_dest_prefix: Prefix of the part file names.
        maxsize_per_file: Size threshold in bytes that closes the current part.

    Returns:
        The list of part file names, in creation order (empty when the
        source has no top-level group).
    """
    idx = 0
    dest_fnames = []
    dest = None  # part file currently open for writing, if any

    with h5py.File(fname_src, "r") as src:
        print("src = ",src)
        try:
            for group in src:
                print("group = ",group)
                fname = f"{fname_dest_prefix}{idx}.h5"

                if dest is None:
                    dest = h5py.File(fname, "w")
                    dest.attrs.update(src.attrs)
                    dest_fnames.append(fname)

                group_id = dest.require_group(src[group].parent.name)
                src.copy(f"/{group}", group_id, name=group)

                # Push buffered data to disk so the size check below sees
                # what has actually been written so far.
                dest.flush()
                if os.path.getsize(fname) > maxsize_per_file:
                    dest.close()
                    dest = None
                    idx += 1
        finally:
            # Close the last (or partially written) part exactly once, also
            # when a copy fails or when the source has no group at all.
            if dest is not None:
                dest.close()

    return dest_fnames
    

def combine(fname_in: list, fname_out: str):
    """
    Merges several `h5` part files into one file.

    Args:
        fname_in: Paths of the part files, read in the given order.
        fname_out: Path of the merged file; it is opened in write mode.
    """
    with h5py.File(fname_out, "w") as merged:
        for part_path in fname_in:
            with h5py.File(part_path, "r") as part:
                # Root attributes of each part are applied to the merged file
                merged.attrs.update(part.attrs)
                for group_name in part:
                    parent = merged.require_group(part[group_name].parent.name)
                    part.copy(f"/{group_name}", parent, name=group_name)
                    
                    

fname_src = "../data/seq2seq_rnn-fra-en-model.weights.h5"
prefix = "../data/seq2seq_rnn-fra-en-model2_part"
size_max = 90 * 1024 * 1024  # maximum size allowed in bytes

# Cut the weights file into parts, then stitch the parts back into one file
fname_parts = split(fname_src, prefix, size_max)
combine(fname_parts, "../data/seq2seq_rnn-fra-en-model2.h5")

src =  <HDF5 file "seq2seq_rnn-fra-en-model.weights.h5" (mode r)>
group =  bidirectional
group =  dense
group =  dropout
group =  embedding
group =  embedding_1
group =  english
group =  french
group =  gru_1
group =  top_level_model_weights


**Translating new sentences with our RNN encoder and decoder**

In [8]:
import numpy as np
# Index -> token table used to turn predicted ids back into French tokens
fra_vocab = target_vectorization.get_vocabulary()
fra_index_lookup = dict(enumerate(fra_vocab))
# Upper bound on the number of tokens generated per sentence
max_decoded_sentence_length = 30

def decode_sequence_rnn(input_sentence):
    """Greedily translate one sentence with the loaded RNN model.

    Decoding starts from "[start]". At each step the partial translation is
    vectorized again and passed to ``seq2seq_rnn`` together with the
    vectorized source sentence; the highest-scoring token at position ``i``
    is appended. Decoding stops when "[end]" is produced or after
    ``max_decoded_sentence_length`` tokens.

    Args:
        input_sentence: Source sentence as a plain string.

    Returns:
        The generated tokens joined by single spaces, without the
        "[start]" and "[end]" markers.
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    translated_tokens = []
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence], verbose=0)
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = fra_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
        translated_tokens.append(sampled_token)
    # Build the result from the collected tokens instead of slicing fixed
    # character counts off the string: when the length limit is reached
    # without "[end]", nothing of the real translation is cut away.
    return " ".join(translated_tokens)

# Translate each of the 20 sample sentences with the RNN model
for i in range(20):
    print("-")
    source_text = input_sentence[i]
    print(f"EN   {source_text}")
    print(f"FR-> {decode_sequence_rnn(source_text)}")

-
EN   india is beautiful during october , and it is quiet in fall .
FR-> l inde est beau en octobre et il est calme a l automne
-
EN   india is pleasant during fall , but it is never beautiful in winter .
FR-> l inde est agréable a lautomne mais il est beau jamais en hiver
-
EN   the united states is snowy during january , and it is never chilly in autumn .
FR-> les étatsunis est la neige en janvier et il est jamais froid a l automne
-
EN   new jersey is usually beautiful during april , but it is dry in may .
FR-> new jersey est généralement beau en avril mais il est sec en mai
-
EN   he dislikes bananas , grapes , and strawberries .
FR-> il naime les bananes les raisins et les fraises
-
EN   our least favorite fruit is the banana , but my least favorite is the strawberry .
FR-> notre fruit préféré moins est la banane mais mon préféré moins est la fraise
-
EN   the peach is my favorite fruit , but the grapefruit is her favorite .
FR-> la pêche est mon fruit préféré mais le pamplemouss

<br></br>
**Your turn to play:** Enter a sentence.

In [9]:
# Free-form sentence to translate with the RNN model; edit the string and re-run
your_sentence = "paris   is usually rainy during summer , but france is never wonderful in winter"
print(f"FR-> {decode_sequence_rnn(your_sentence)}")

FR-> paris est généralement pluvieux pendant l été mais il est jamais merveilleux en hiver


### **3. Sequence-to-sequence learning with Transformer**

#### The Transformer decoder

**The `TransformerDecoder`**

In [10]:
class TransformerDecoder(layers.Layer):
    """Decoder block: self-attention, attention over encoder outputs, dense projection.

    Each of the three steps is followed by a residual addition and a
    ``LayerNormalization``.

    Args:
        embed_dim: Used as ``key_dim`` of both attention layers and as the
            output width of the dense projection.
        dense_dim: Width of the hidden ReLU layer of the dense projection.
        num_heads: Number of heads of both attention layers.
        **kwargs: Forwarded to the ``layers.Layer`` constructor.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can return the constructor arguments.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # attention_1: attention of the decoder inputs over themselves.
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # attention_2: attention of the decoder stream over the encoder outputs.
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # Two-layer projection: dense_dim (ReLU) then back to embed_dim.
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build an int32 lower-triangular mask, tiled once per batch element.

        Entry (i, j) is 1 when ``i >= j`` and 0 otherwise. The result has
        shape (batch_size, sequence_length, sequence_length), both sizes
        being read from the first two axes of ``inputs``.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        # Column vector of positions against a row vector of positions:
        # the comparison broadcasts to a (sequence_length, sequence_length) grid.
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        # Add a leading axis of size 1, then repeat it batch_size times.
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: Decoder-side input; its first two axes are read as
                batch size and sequence length.
            encoder_outputs: Used as key and value of the second attention.
            mask: Optional mask, indexed as ``mask[:, tf.newaxis, :]``
                (presumably (batch, sequence) — TODO confirm).

        Returns:
            The output of the third layer normalization.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            # Insert a middle axis so the mask broadcasts against the causal
            # mask, then keep a position only where both masks allow it.
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            padding_mask = mask
        # NOTE(review): the first attention receives only the causal mask,
        # while the combined padding/causal mask is passed to the second
        # attention (over encoder_outputs). Confirm this assignment is
        # intended before changing it — the saved weights were presumably
        # trained with this exact wiring.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        # Residual connection followed by layer normalization.
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

#### Putting it all together: A Transformer for machine translation

**PositionalEmbedding layer**

In [11]:
class PositionalEmbedding(layers.Layer):
    """Sum of a token embedding and an embedding of each token's position.

    Args:
        sequence_length: Number of distinct positions the position
            embedding can represent.
        input_dim: Size of the token embedding's input vocabulary.
        output_dim: Width of both embeddings.
        **kwargs: Forwarded to the ``layers.Layer`` constructor.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments, kept so get_config() can return them.
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        # Token embedding first, position embedding second.
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)

    def call(self, inputs):
        """Embed ``inputs`` and add the embedding of positions 0..length-1."""
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        return (self.token_embeddings(inputs)
                + self.position_embeddings(position_ids))

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is False wherever ``inputs`` equals 0."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

**Loading of the trained Transformer model**

In [12]:
from keras_nlp.layers import TransformerEncoder
# Rebuild the saved Transformer; the two custom layers defined in this
# notebook are registered under their class names so they can be resolved.
transformer = keras.models.load_model(
    "../data/transformer-fra-en-model.h5",
    custom_objects={
        layer_cls.__name__: layer_cls
        for layer_cls in (PositionalEmbedding, TransformerDecoder)
    },
)
# Then load the weights stored in the separate weights file
transformer.load_weights("../data/transformer-fra-en-model.weights.h5")

Using TensorFlow backend


**Translating new sentences with our Transformer model**

In [13]:
import numpy as np
# Index -> token table used to turn predicted ids back into French tokens
fra_vocab = target_vectorization.get_vocabulary()
fra_index_lookup = dict(enumerate(fra_vocab))
# Upper bound on the number of tokens generated per sentence
max_decoded_sentence_length = 30

def decode_sequence_tranf(input_sentence):
    """Greedily translate one sentence with the loaded Transformer model.

    Decoding starts from "[start]". At each step the partial translation is
    vectorized again (its last position is dropped) and passed to
    ``transformer`` together with the vectorized source sentence; the
    highest-scoring token at position ``i`` is appended. Decoding stops
    when "[end]" is produced or after ``max_decoded_sentence_length`` tokens.

    Args:
        input_sentence: Source sentence as a plain string.

    Returns:
        The generated tokens joined by single spaces, without the
        "[start]" and "[end]" markers.
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    translated_tokens = []
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization(
            [decoded_sentence])[:, :-1]
        predictions = transformer(
            [tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = fra_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
        translated_tokens.append(sampled_token)
    # Build the result from the collected tokens instead of slicing fixed
    # character counts off the string: when the length limit is reached
    # without "[end]", nothing of the real translation is cut away.
    return " ".join(translated_tokens)

# Translate each of the 20 sample sentences with the Transformer model
for i in range(20):
    print("-")
    source_text = input_sentence[i]
    print(f"EN   {source_text}")
    print(f"FR-> {decode_sequence_tranf(source_text)}")

-
EN   india is beautiful during october , and it is quiet in fall .
FR-> l inde est beau en octobre et il est calme a l automne
-
EN   india is pleasant during fall , but it is never beautiful in winter .
FR-> l inde est agréable a lautomne mais il est beau jamais en hiver
-
EN   the united states is snowy during january , and it is never chilly in autumn .
FR-> les étatsunis est la neige en janvier et il est jamais froid a l automne
-
EN   new jersey is usually beautiful during april , but it is dry in may .
FR-> new jersey est généralement beau en avril mais il est sec en mai
-
EN   he dislikes bananas , grapes , and strawberries .
FR-> il naime les bananes les raisins et les fraises
-
EN   our least favorite fruit is the banana , but my least favorite is the strawberry .
FR-> notre fruit préféré moins est la banane mais mon préféré moins est la fraise
-
EN   the peach is my favorite fruit , but the grapefruit is her favorite .
FR-> la pêche est mon fruit préféré mais le pamplemouss

<br></br>
**Your turn to play:** Enter a sentence.

In [14]:
# Free-form sentence to translate with the Transformer model; edit the string and re-run
your_sentence = "paris   is usually rainy during summer , but france is never wonderful in winter"
print(f"FR-> {decode_sequence_tranf(your_sentence)}")

FR-> paris est généralement pluvieux en été mais il est jamais merveilleux en hiver
