In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/Corpus/

Mounted at /content/drive
/content/drive/My Drive/Corpus


In [3]:
# Opening English file and reading the sentences
text_file = "europarl-v7.nl-en(english).txt"
with open(text_file) as f:
    lines_english = f.read().split("\n")[:-1]

# creating a text lines for English
text_eng = []
for line in lines_english:
    english = line.split("\t")
    text_eng.append(english)


In [4]:
# Opening Dutch file and reading the sentences
text_file = "europarl-v7.nl-en(nl).txt"
with open(text_file) as f:
    lines_dutch = f.read().split("\n")[:-1]

# creating a list for lines in Dutch
text_nl = []
for line in lines_dutch:
    dutch = line.split("\t")
    text_nl.append(dutch)

In [5]:
print("The length of the English corpus is", len(text_eng))
print("The length of the Dutch corpus is", len(text_nl))

The length of the English corpus is 1997775
The length of the Dutch corpus is 1997775


In [8]:
# Cutting the list due to resource constraints
new_text_eng = []
new_text_nl = []

# selecting sentences which are 15 words or less
for i in range(len(text_eng)):
  for j in text_eng[i]:
    eng = j
  eng = eng.split(" ")
  if len(eng) >= 1 and len(eng) <= 15:
    new_text_eng.append(text_eng[i])
    new_text_nl.append(text_nl[i])
  if len(new_text_eng) == 100000:
    break


In [9]:
# Pairing the sentences together
text_pairs = []
for i in range(100000):
  english_sent = str(new_text_eng[i])
  dutch_sent = "[start] " + str(new_text_nl[i]) + " [end]"
  text_pairs.append((english_sent, dutch_sent))

In [11]:
import random

# Creating the Training, validation and testing pairs
random.shuffle(text_pairs)

num_val_samples = int(0.15 * len(text_pairs)) #15%
num_train_samples = len(text_pairs) - 2 * num_val_samples #70%

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

### Vectorising Text

In [12]:
# Libraries
import tensorflow as tf
import string
import re
from tensorflow import keras
from tensorflow.keras import layers


# removing punctuations
strip_chars = string.punctuation
strip_chars = strip_chars.replace("[","")
strip_chars = strip_chars.replace("]","")

# Function for preprocessing
def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase, f"[{re.escape(strip_chars)}]", "")

# defining the size of the vector
vocab_size = 15000
sequence_length = 15

# converting the sentences into integer sentences. The source_vectorization is
# the english sentences
source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# converting the sentences into integer sentences. The source_vectorization is
# the dutch sentences
target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

# vectorising the training text pairs
train_english_texts = [pair[0] for pair in train_pairs]
train_dutch_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_dutch_texts)

In [13]:
# Creating batches for training and validation
batch_size = 64

# This function vectorizes the text pairs using the previously defined source &
# target vorization layers
def format_dataset(eng, dutch):
    # removing the punctuation and turning it lower case
    eng = source_vectorization(eng)
    dutch = target_vectorization(dutch)
    # the first token is excluded as the model will predict the next tokebn
    return ({
        "english": eng,
        "dutch": dutch[:, :-1],
    }, dutch[:, 1:])

# This function takes in the pairs and creates a datasets with batches
def make_dataset(pairs):
    eng_texts, nl_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    nl_texts = list(nl_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, nl_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    return dataset.shuffle(2048).prefetch(16).cache()

# calling the functiom to create the dataset
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [14]:
# Printing the structure of the batches
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['dutch'].shape: {inputs['dutch'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (64, 15)
inputs['dutch'].shape: (64, 15)
targets.shape: (64, 15)


### Sequence to Sequence Learning with RNN

In [None]:
# Importing Libaries
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Setting size of embedding vector and the number of units in the latent dimension
embed_dim = 256
latent_dim = 1024

# The source represents the input (which is English words)
source = keras.Input(shape=(None,), dtype="int64", name="english")
# Defining the input, which is converted into a vector
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)
# Bidirectional means that it can be processed in both directions
encoded_source = layers.Bidirectional(
    layers.GRU(latent_dim),
    merge_mode="sum")(x)

# Represents the output (Dutch sentences)
past_target = keras.Input(shape=(None,), dtype="int64", name="dutch")
# Defining the second input
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
# Also defining original sentence in Englist
x = decoder_gru(x, initial_state=encoded_source)

# Introducing dropout for regularisation
x = layers.Dropout(0.5)(x)
# Softmax activation function to generate probabilities of possible answer
target_next_step = layers.Dense(vocab_size, activation="softmax")(x)

# Defines the seq2seq model with the two inputs and one output
seq2seq_rnn = keras.Model([source, past_target], target_next_step)

In [None]:
# Compiling and Fitting the model
seq2seq_rnn.compile(
    optimizer="rmsprop", # Defining optimzer function - how NN will update weights
    loss="sparse_categorical_crossentropy", # Defining loss function
    metrics=["accuracy"]) # Defining the metric which will be used for evaluation
seq2seq_rnn.fit(train_ds, epochs=30, validation_data=val_ds)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x788b200c89d0>

In [None]:
# Printing sentences and calculated the translated sentence in dutch
import numpy as np
dutch_vocab = target_vectorization.get_vocabulary()
dutch_index_lookup = dict(zip(range(len(dutch_vocab)), dutch_vocab))
max_decoded_sentence_length = 15

# function which calculates the translated sentence
def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = dutch_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

# printing the english sentences and aim to translate it to dutch
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
['The framework for these benchmarks has now been identified.']
[start] [de [UNK] voor deze kwestie is nu [UNK] [end]
-
['Between our rapporteur and our draftsman, they produced a very sound report.']
[start] [de voorstellen van onze fractie en ons zijn een goed verslag van de heer [UNK]
-
['According to the Rules of Procedure, this is not possible.']
[start] [volgens de onderhandelingen is het niet [UNK] om dit te doen] [end]
-
['This is one perception of Europe, but it is one that I do not support.']
[start] [dat is een [UNK] maar die ik [UNK] niet [UNK] [end]
-
['I could go on and on.']
[start] [ik wil daar nu over tot de [UNK] en zeggen] [end]
-
['I cannot understand it.']
[start] [ik kan het niet [UNK] [end]
-
['I will also be shocked if Labour MEPs vote against that.']
[start] [ik zal ook op de collegas van de [UNK] ook tegen dit verslag stemmen] [end]
-
['The concept of prosperity includes both economic growth and social equilibrium.']
[start] [het [UNK] van de economische en 

### Sequence to Sequence Learning with Transformer

In [19]:
# Importing Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [20]:
# A class to define the single encoder layer in the transformer
class TransformerEncoder(layers.Layer):
  # This class initialises the encoder layer, by defining the parameters, layers.
  # Furthermore, it uses inheritance
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # multi-head attnetion mechanism
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # a feed-forward neural network is defined with two dense layers
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        # two normalisation layers
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

  # The call function defines the forward pass, computing attention scores
    def call(self, inputs, mask=None):
        # applying mask if not applied, to reshape for attention mechanism
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        # multi-head attention mechanism is applied to the inputs
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        # normalised using a normalisation layer
        proj_input = self.layernorm_1(inputs + attention_output)
        # the result of normalisation is passed through feed-forward loop
        proj_output = self.dense_proj(proj_input)
        # the output is added with a residual connection and passed through normalisation layer
        return self.layernorm_2(proj_input + proj_output)

  # The get config function allows the model to configure the model and obtain
  # and load previous model results
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

In [21]:
# Defining a single layer in the Decoder
class TransformerDecoder(layers.Layer):

  # This function initialises the decoder layer, it uses Inheritance and defines paramters.
  # Furthermore, it defines the instance variables and layers
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # two multi-head attention mechanism are created
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # a feed forward loop
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        # three normalisation layers
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        # set to true to ensure that the layer can handle masking
        self.supports_masking = True

  # The get config function allows the model to configure the model and obtain and load previous model results
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

  # The function will define causal attention mask to ensure correct processing of tokens
    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    # The class defines the forward pass of the decoder layer, which includes a self-attention mechanism,
    # a cross-attention mechanism to attend over the encoder's outputs, a feed-forward neural network, and layer normalization steps.
    def call(self, inputs, encoder_outputs, mask=None):
      # the attention mask is generated and applied
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        # self-attention is applied
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        # passing output and residual connection through normalisation layer
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        # cross-attention is applied
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        # passing output and residual connection through normalisation layer
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        # passing the output through a feed-forward network
        proj_output = self.dense_proj(attention_output_2)
        # returning the output and residual connection after the normalisation
        return self.layernorm_3(attention_output_2 + proj_output)

In [22]:
# Defining the positional encoding (to include position of the word)

class PositionalEmbedding(layers.Layer):
    # This function initialises the decoder layer, it uses Inheritance and defines paramters.
    # Furthermore, it defines the instance variables and layers
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # embedding layer for the tokens
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        # embedding layer for the positional tokens
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    # Defines the forward pass of the layer, embedding tokens and their positions
    def call(self, inputs):
        # positions of the tokens in the sequence are generated
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        # tokens are embedded
        embedded_tokens = self.token_embeddings(inputs)
        # positions are embedded
        embedded_positions = self.position_embeddings(positions)
        # the two embeddings are added together
        return embedded_tokens + embedded_positions

    # This function returns the mask of the input tokents
    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    # The get config function allows the model to configure the model and obtain and load previous model results
    def get_config(self):
        config = super(PositionalEmbedding, self).get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

In [23]:
# Defining the size of the respective layers
embed_dim = 256
dense_dim = 2048
num_heads = 8

# Defining the encoder phase
# size of the input layer is defined below
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
# the dutch input tokens are passed through the Positional embedding layer to get
# unique embedding of token and the position
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
# the embedded layer and encoder outputs are passed through the layer
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

# Defining the decoder phase
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="dutch")
# passed through positional embedding layer
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
# passed through decoder layer
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
# dropout layer for regularisation
x = layers.Dropout(0.5)(x)
# softmax function to generate probabiltiy distributions over the Dutch
# vocabulary for each position in the output sequence
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)

# Defining the transformer to translate sentences
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [24]:
transformer.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 dutch (InputLayer)          [(None, None)]               0         []                            
                                                                                                  
 positional_embedding (Posi  (None, None, 256)            3843840   ['english[0][0]']             
 tionalEmbedding)                                                                                 
                                                                                                  
 positional_embedding_1 (Po  (None, None, 256)            3843840   ['dutch[0][0]']           

In [25]:
# Compiling and fitting the Transformer
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
transformer.fit(train_ds, epochs=30, validation_data=val_ds)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7a184038f640>

In [30]:
import numpy as np

# retrieves the vocabulary
dutch_vocab = target_vectorization.get_vocabulary()
dutch_index_lookup = dict(zip(range(len(dutch_vocab)), dutch_vocab))
max_decoded_sentence_length = 15

# this function will calculate the decoded sentence from the input sentence
def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization(
            [decoded_sentence])[:, :-1]
        # making predictions using the transformer model
        predictions = transformer(
            [tokenized_input_sentence, tokenized_target_sentence])
        # selecting probable next word
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = dutch_index_lookup[sampled_token_index]
        # appends the most probable word
        decoded_sentence += " " + sampled_token
        # breaks the loop if at the end of the loop
        if sampled_token == "[end]":
            break
    return decoded_sentence

# extracting a sentence from the Enlish sentences and printing their translations
# using the function
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_eng_texts)
    print("-")
    print(input_sentence)
    print(decode_sequence(input_sentence))

-
['These too are all firmly on the agenda.']
[start] [dit gaat ook om de [UNK] [end]
-
['Implementing the Mexico agreement therefore serves a pilot function.']
[start] [daarom is het overeenkomst ook een duidelijk dat de eerste keer zo zal worden gemaakt
-
['I believe it is fair to say that the euro tends to promote economic stability.']
[start] [ik denk dat het goede [UNK] zijn om de euro te bereiken over economische stabiliteit
-
['Can you give us some idea whether there will be any across-the-board reductions?']
[start] [kunt u ons een mening geven of er op dit punt dat er zich ook
-
['The next item is the joint debate on the following reports:']
[start] [aan de orde is het gecombineerd debat over de volgende verslagen] [end]
