<a href="https://colab.research.google.com/github/DarksterTwilight/Eng_to_French-RNN-/blob/main/My_lstm_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Character-level recurrent sequence-to-sequence model



## Introduction


## Setup


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras


## Download the data


In [None]:
# Download the fra-eng.zip archive into the working directory and unpack it
# (per the recorded output below, the archive contains _about.txt and fra.txt).
!!curl -O http://www.manythings.org/anki/fra-eng.zip
!!unzip fra-eng.zip


['Archive:  fra-eng.zip',
 '  inflating: _about.txt              ',
 '  inflating: fra.txt                 ']

## Configuration


In [None]:
# --- Training hyper-parameters -------------------------------------------
# Number of sentence pairs per gradient step.
batch_size = 64
# Number of passes over the training data.
epochs = 100
# Dimensionality of the latent (encoding) space.
latent_dim = 256

# --- Data selection --------------------------------------------------------
# How many sentence pairs to train on.
num_samples = 10000
# Location on disk of the tab-separated corpus.
data_path = "fra.txt"


## Prepare the data


In [None]:
# Containers filled while vectorizing the corpus:
#   *_texts      -> raw sentences, in corpus order
#   *_characters -> distinct characters seen on each side
input_texts, target_texts = [], []
input_characters, target_characters = set(), set()
print(type(input_characters))

<class 'set'>


In [None]:
# Read the corpus: one tab-separated sentence pair per line. Blank lines
# (including the empty string produced by a trailing newline) are dropped so
# that the selection below does not depend on how the file ends.
with open(data_path, "r", encoding="utf-8") as f:
    lines = [line for line in f.read().split("\n") if line.strip()]

# Keep only the last `num_samples` pairs of the corpus (or the whole corpus
# when it holds fewer pairs than that).
selected_lines = lines[max(0, len(lines) - num_samples):]

# Start from fresh containers so this cell can be re-run safely: without this,
# a second run would append every sample again and would fail on `.add`
# because the character sets are turned into sorted lists further down.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()

for line in selected_lines:
    fields = line.split("\t")
    if len(fields) < 2:
        # Not a sentence pair; skip instead of crashing on the unpack.
        continue
    # Only the first two columns are used; any further column is ignored.
    input_text, target_text = fields[0], fields[1]
    # We use "tab" as the "start sequence" character for the targets,
    # and "\n" as the "end sequence" character.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)
    # set.update adds every character of the string, ignoring duplicates.
    input_characters.update(input_text)
    target_characters.update(target_text)

if not input_texts:
    raise ValueError("No sentence pairs could be read from " + repr(data_path))

input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
# "+ 1" reserves one extra slot for the padding token.
num_encoder_tokens = len(input_characters) + 1
num_decoder_tokens = len(target_characters) + 1
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)


<class 'set'>
<class 'list'>
Number of samples: 10000
Number of unique input tokens: 80
Number of unique output tokens: 103
Max sequence length for inputs: 286
Max sequence length for outputs: 351


In [None]:
# Map every character to a positive integer id; numbering starts at 1 so that
# 0 stays unused by real characters.
input_token_index = {
    char: token_id for token_id, char in enumerate(input_characters, start=1)
}
target_token_index = {
    char: token_id for token_id, char in enumerate(target_characters, start=1)
}
print(input_token_index)
print(target_token_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, "'": 6, '+': 7, ',': 8, '-': 9, '.': 10, '0': 11, '1': 12, '2': 13, '3': 14, '4': 15, '5': 16, '6': 17, '7': 18, '8': 19, '9': 20, ':': 21, ';': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, '\xa0': 76, 'é': 77, '—': 78, '’': 79}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '%': 6, "'": 7, '+': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I'

In [None]:
# Spot-check a few entries of the input character -> id mapping.
print(f"Integer Value of A is: {input_token_index['A']}")
print(f"Integer Value of B is: {input_token_index['B']}")
print(f"Integer Value of space is: {input_token_index[' ']}")

Integer Value of A is: 24
Integer Value of B is: 25
Integer Value of space is: 1


In [None]:
# Register the padding entry under id 0 in both vocabularies.
for token_index in (input_token_index, target_token_index):
    token_index['pad'] = 0

In [None]:
# Allocate the training arrays.
#   encoder_input_data  : (samples, max_encoder_seq_length)                     token ids
#   decoder_input_data  : (samples, max_decoder_seq_length)                     token ids
#   decoder_target_data : (samples, max_decoder_seq_length, num_decoder_tokens) one-hot
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

print(type(encoder_input_data))
print(type(decoder_input_data))
print(type(decoder_target_data))

print(np.shape(encoder_input_data))
print(np.shape(decoder_input_data))
print(np.shape(decoder_target_data))

encoder_pad = input_token_index['pad']
decoder_pad = target_token_index['pad']

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    # Encoder side: one token id per character, padding afterwards.
    for t, char in enumerate(input_text):
        encoder_input_data[i, t] = input_token_index[char]
    # Slice from the text length rather than from the loop variable, which is
    # stale (or undefined) when the text is empty.
    encoder_input_data[i, len(input_text):] = encoder_pad

    # Decoder side: decoder_target_data is ahead of decoder_input_data by one
    # timestep and does not include the start character.
    for t, char in enumerate(target_text):
        decoder_input_data[i, t] = target_token_index[char]
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    decoder_input_data[i, len(target_text):] = decoder_pad
    # Every position after the last real target character is labelled with the
    # padding class, consistent with the padding id used in both input arrays
    # (previously these positions were labelled as the space character).
    first_padded_step = max(len(target_text) - 1, 0)
    decoder_target_data[i, first_padded_step:, decoder_pad] = 1.0

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(10000, 286)
(10000, 351)
(10000, 351, 103)


## Build the model


In [None]:
class Encoder(tf.keras.Model):
    """Embeds a sequence of token ids and encodes it with an LSTM.

    Args:
        embed_dim: Width of the embedding vectors; also used as the number of
            LSTM units.
        num_encoder_tokens: Size of the input vocabulary. Because the embedding
            is built with ``mask_zero=True``, id 0 is treated as padding.
    """

    def __init__(self, embed_dim, num_encoder_tokens):
        super(Encoder, self).__init__()

        # Embedding expects (input_dim=vocabulary size, output_dim=vector width).
        # The arguments used to be passed the other way round, which only worked
        # while the vocabulary was smaller than embed_dim and silently made the
        # embedding width equal to the vocabulary size.
        self.embedding_layer = tf.keras.layers.Embedding(
            num_encoder_tokens, embed_dim, mask_zero=True
        )
        # return_state=True makes the layer return the final hidden and cell
        # states in addition to its output.
        self.lstm_layer = tf.keras.layers.LSTM(embed_dim, return_state=True)

    def call(self, input, training = True):
        """Encode a batch of token-id sequences.

        Args:
            input: Batch of token-id sequences.
            training: Forwarded to the LSTM layer.

        Returns:
            A pair ``(encoder_outputs, [state_h, state_c])``: the LSTM output
            and the list of its final hidden and cell states.
        """
        encoder_embeddings = self.embedding_layer(input)
        encoder_outputs, state_h, state_c = self.lstm_layer(
            encoder_embeddings, training=training
        )
        encoder_states = [state_h, state_c]
        return encoder_outputs, encoder_states

In [None]:
class Decoder(tf.keras.Model):
    """Embeds target token ids, runs an LSTM over them and projects to logits.

    Args:
        embed_dim: Width of the embedding vectors; also used as the number of
            LSTM units.
        num_decoder_tokens: Size of the target vocabulary, which is also the
            width of the output layer. Because the embedding is built with
            ``mask_zero=True``, id 0 is treated as padding.
    """

    def __init__(self, embed_dim, num_decoder_tokens):
        super(Decoder, self).__init__()

        # Embedding expects (input_dim=vocabulary size, output_dim=vector width).
        # The arguments used to be passed the other way round, which breaks as
        # soon as a token id reaches embed_dim and made the embedding width
        # equal to the vocabulary size.
        self.embedding_layer = tf.keras.layers.Embedding(
            num_decoder_tokens, embed_dim, mask_zero=True
        )
        # return_sequences=True: one output per timestep;
        # return_state=True: also return the final hidden and cell states.
        self.lstm_layer = keras.layers.LSTM(embed_dim, return_sequences=True, return_state=True)
        # No activation: the layer outputs raw scores (logits) per target token.
        self.dense = keras.layers.Dense(num_decoder_tokens)

    def call(self, inputs, training = True):
        """Run the decoder.

        Args:
            inputs: Two-element sequence ``[text_input, state_input]`` where
                ``text_input`` holds token ids and ``state_input`` is the
                initial state passed to the LSTM.
            training: Forwarded to the LSTM layer.

        Returns:
            A pair ``(decoder_op, [state_h, state_c])``: per-timestep logits and
            the list of final hidden and cell states of the LSTM.
        """
        text_input = inputs[0]
        state_input = inputs[1]

        text_embedding = self.embedding_layer(text_input)

        decoder_outputs, state_h, state_c = self.lstm_layer(
            text_embedding, initial_state=state_input, training=training
        )
        decoder_states = [state_h, state_c]

        decoder_op = self.dense(decoder_outputs)

        return decoder_op, decoder_states


In [None]:
class training_model(tf.keras.Model):
    """Chains an encoder and a decoder so they can be trained together."""

    def __init__(self, encoder_model, decoder_model):
        super().__init__()
        self.encoder, self.decoder = encoder_model, decoder_model

    def call(self, inputs, training = True):
        """Return the decoder output for ``inputs = [encoder_text, decoder_text]``.

        The encoder's states are handed to the decoder as its initial state;
        the encoder's own output and the decoder's final states are discarded.
        """
        source_ids, target_ids = inputs[0], inputs[1]

        _, context_states = self.encoder(source_ids, training=training)
        logits, _ = self.decoder([target_ids, context_states], training=training)

        return logits

In [None]:
# Instantiate the two halves and wire them into the trainable wrapper.
encoder_model = Encoder(embed_dim=latent_dim, num_encoder_tokens=num_encoder_tokens)
decoder_model = Decoder(embed_dim=latent_dim, num_decoder_tokens=num_decoder_tokens)
model = training_model(encoder_model=encoder_model, decoder_model=decoder_model)

## Train the model


In [None]:
# The decoder emits raw logits, hence from_logits=True on the loss.
model.compile(
    optimizer="rmsprop",
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Hold out 20% of the samples for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# Persist the trained model under the "s2s" directory.
model.save("s2s")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78



INFO:tensorflow:Assets written to: s2s\assets


INFO:tensorflow:Assets written to: s2s\assets


## Run inference (sampling)

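1. Encode the input sequence and retrieve the encoder states, which serve as the initial decoder state.
2. Run one step of the decoder with this initial state and a "start of sequence" token as input. The output is the next target token.
3. Repeat with the newly predicted token and the updated states until the "end of sequence" character is produced or the maximum length is reached.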


In [None]:
# Invert the character -> id tables so that predicted ids can be turned
# back into readable characters.
reverse_input_char_index = {
    token_id: char for char, token_id in input_token_index.items()
}
reverse_target_char_index = {
    token_id: char for char, token_id in target_token_index.items()
}

def decode_sequence(input_seq):
    """Greedily decode one encoded input sequence, one character at a time.

    Args:
        input_seq: Encoded input for a single sentence (a batch of size 1 is
            assumed), as fed to ``encoder_model``.

    Returns:
        The decoded string. It ends with the newline end-of-sequence character
        when the model produced it; otherwise decoding stops once the string is
        longer than ``max_decoder_seq_length`` or the padding token is sampled.
    """
    # Encode the input as state vectors.
    # verbose=0 keeps predict from printing a progress line on every call.
    _, states_value = encoder_model.predict(input_seq, verbose=0)

    pad_index = target_token_index["pad"]

    # Target sequence of length 1, holding the start character.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index["\t"]

    decoded_sentence = ""
    while True:
        output_tokens, [h, c] = decoder_model.predict(
            [target_seq, states_value], verbose=0
        )

        # Greedy choice: the most likely token at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # The padding id maps to the literal string 'pad' in the reverse
        # lookup; treat it as end of output instead of emitting that text.
        if sampled_token_index == pad_index:
            break

        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            break

        # Feed the sampled token and the new states back into the decoder.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence



You can now generate decoded sentences as such:


In [None]:
# Decode the first 20 sequences (these are part of the training set) and show
# each source sentence next to the model's output.
for seq_index in range(20):
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print(
        "-",
        f"Input sentence: {input_texts[seq_index]}",
        f"Decoded sentence: {decoded_sentence}",
        sep="\n",
    )


-
Input sentence: It seems unlikely that the train will arrive on time.
Decoded sentence: Il se passe dans la pièce et le train était arrivé de l'école.

-
Input sentence: It was much more difficult than we initially thought.
Decoded sentence: Ce fut la semaine de la maison comportement la plus grosse erreur.

-
Input sentence: It was much more difficult than we initially thought.
Decoded sentence: Ce fut la semaine de la maison comportement la plus grosse erreur.

-
Input sentence: It was much more difficult than we initially thought.
Decoded sentence: Ce fut la semaine de la maison comportement la plus grosse erreur.

-
Input sentence: It was much more difficult than we initially thought.
Decoded sentence: Ce fut la semaine de la maison comportement la plus grosse erreur.

-
Input sentence: It was so hot that I thought I was going to pass out.
Decoded sentence: Il faisait tellement froid que tu ne sais pas simpler le temps de rentrer chez moi.

-
Input sentence: It was so hot that I 