# Text Generation using FNet

**Description:** FNet transformer for text generation in Keras.

## Imports

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import pandas as pd

# Hyperparameters shared by the data pipeline, the model, and training.

# Vocabulary cap passed to TextVectorization; also the token-embedding input
# size and the width of the final softmax layer.
VOCAB_SIZE = 8192
# NOTE(review): not referenced anywhere in the visible cells of this notebook.
MAX_SAMPLES = 50000
# Shuffle buffer size for the training dataset.
BUFFER_SIZE = 20000
# Fixed token length of every vectorized sentence; also the number of learned
# positions and the maximum number of decoding steps at inference.
MAX_LENGTH = 100
# Width of the token/position embeddings (also used as the attention key_dim).
EMBED_DIM = 256
# Hidden width of the feed-forward projection in the encoder and decoder.
LATENT_DIM = 512
# Number of attention heads in the decoder.
NUM_HEADS = 8
# Batch size for the training/validation datasets and for vectorizer.adapt.
BATCH_SIZE = 8
# Number of epochs passed to fnet.fit.
EPOCHS = 50

2024-04-14 04:24:00.583678: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-14 04:24:00.944951: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-14 04:24:00.945036: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-14 04:24:01.003426: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-14 04:24:01.154422: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-14 04:24:01.156399: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Read the three dataset splits from disk; rows pandas cannot parse are skipped.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path, on_bad_lines='skip')
    for csv_path in (
        'empathetic_dialogues/train.csv',
        'empathetic_dialogues/valid.csv',
        'empathetic_dialogues/test.csv',
    )
)

# Stack the splits into one frame with a fresh 0..n-1 index.
df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

# First half of the combined frame.
# NOTE(review): `df_half` is not used by any later cell visible in this
# notebook; the cells below keep working on the full `df`.
half_len = len(df) // 2
df_half = df.head(half_len)

# Show the distinct emotion labels found in the 'context' column.
unique_contexts = df['context'].unique()
print(unique_contexts)

['sentimental' 'afraid' 'proud' 'faithful' 'terrified' 'joyful' 'angry'
 'sad' 'jealous' 'grateful' 'prepared' 'embarrassed' 'excited' 'annoyed'
 'lonely' 'ashamed' 'guilty' 'surprised' 'nostalgic' 'confident' 'furious'
 'disappointed' 'caring' 'trusting' 'disgusted' 'anticipating' 'anxious'
 'hopeful' 'content' 'impressed' 'apprehensive' 'devastated']


In [3]:
# # Define the mapping
# mapping = {
#     'afraid': 'fearful',
#     'angry': 'angry',
#     'annoyed': 'angry',
#     'anticipating': 'neutral',
#     'anxious': 'fearful',
#     'apprehensive': 'neutral',
#     'ashamed': 'sad',
#     'caring': 'neutral',
#     'confident': 'neutral',
#     'content': 'joyful',
#     'devastated': 'sad',
#     'disappointed': 'sad',
#     'disgusted': 'angry',
#     'embarrassed': 'sad',
#     'excited': 'joyful',
#     'faithful': 'neutral',
#     'furious': 'angry',
#     'grateful': 'joyful',
#     'guilty': 'sad',
#     'hopeful': 'neutral',
#     'impressed': 'surprised',
#     'jealous': 'angry',
#     'joyful': 'joyful',
#     'lonely': 'sad',
#     'nostalgic': 'sad',
#     'prepared': 'neutral',
#     'proud': 'joyful',
#     'sad': 'sad',
#     'sentimental': 'sad',
#     'surprised': 'surprised',
#     'terrified': 'fearful',
#     'trusting': 'neutral'
# }

# # Apply the mapping to the 'context' column
# df['context'] = df['context'].map(mapping)

# df['context'].unique()


In [4]:
def _append_qa_pairs(context, utterances, questions, answers):
    """Append the (question, answer) pairs of one conversation to the output lists.

    Utterances at odd positions are treated as answers to the utterance right
    before them; each question is prefixed with "<context>: ". Conversations
    with fewer than two utterances contribute nothing.

    Args:
        context: Context label of the conversation.
        utterances: Utterances of the conversation, in order.
        questions: List that receives the prefixed questions (mutated in place).
        answers: List that receives the matching answers (mutated in place).
    """
    for idx in range(1, len(utterances), 2):
        questions.append(f"{context}: {utterances[idx - 1]}")
        answers.append(utterances[idx])


questions = []
answers = []

conversation = []
conversation_context = None

# NOTE(review): a conversation boundary is detected only by a change in
# `context`, so two consecutive conversations that share a context are merged
# into one. If the CSV provides a per-conversation id, grouping on it would be
# more precise — confirm against the dataset schema.
for entry in df.itertuples():
    if entry.context != conversation_context:
        # Context changed: emit the pairs of the conversation collected so far,
        # then start collecting the next one.
        _append_qa_pairs(conversation_context, conversation, questions, answers)
        conversation = [entry.utterance]
        conversation_context = entry.context
    else:
        conversation.append(entry.utterance)

# The loop only flushes when a later row changes context, so the final
# conversation has to be emitted explicitly; otherwise it is silently dropped.
_append_qa_pairs(conversation_context, conversation, questions, answers)

# Create a new DataFrame from the lists
qa_df = pd.DataFrame({'question': questions, 'answer': answers})

In [5]:
# Preview the first 20 question/answer pairs built above.
qa_df.head(20)

Unnamed: 0,question,answer
0,sentimental: I remember going to see the firew...,Was this a friend you were in love with_comma_...
1,sentimental: This was a best friend. I miss her.,Where has she gone?
2,sentimental: We no longer talk.,Oh was this something that happened because of...
3,afraid: it feels like hitting to blank wall w...,Oh ya? I don't really see how
4,afraid: dont you feel so.. its a wonder,I do actually hit blank walls a lot of times b...
5,afraid: i virtually thought so.. and i used t...,Wait what are sweatings
6,proud: Hi how are you doing today,doing good.. how about you
7,proud: Im good_comma_ trying to understand how...,it's quite strange that you didnt imagine it
8,faithful: I have never cheated on my wife.,And thats something you should never do_comma_...
9,faithful: Yea it hasn't been easy but I am pro...,What do you mean it hasn't been easy? How clos...


In [6]:
# Split the pairs into training and validation sets.
# Both datasets used to be built from the full lists, so the validation
# metrics (and the checkpoint selected on `val_accuracy`) were measured on the
# training data itself. Hold out the tail of the pairs instead.
# NOTE(review): the hold-out fraction is a judgment call; adjust as needed.
VALIDATION_FRACTION = 0.1
split_index = len(questions) - int(len(questions) * VALIDATION_FRACTION)

train_dataset = tf.data.Dataset.from_tensor_slices(
    (questions[:split_index], answers[:split_index])
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (questions[split_index:], answers[split_index:])
)

2024-04-14 04:24:06.276933: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-14 04:24:06.277682: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


## Loading data

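We use the EmpatheticDialogues dataset (the `empathetic_dialogues/*.csv` files loaded above).
Each conversation is split into question/answer pairs, and every question is prefixed with
the conversation's emotion context.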

### Preprocessing and Tokenization

In [7]:
def preprocess_text(sentence):
    """Normalize raw text and wrap it in "[start]" / "[end]" markers.

    The text is lowercased, then the rewrites below are applied in order, the
    result is stripped, and the markers are joined on with single spaces.

    Args:
        sentence: String tensor (or value convertible to one) to clean.

    Returns:
        String tensor holding the cleaned, marker-wrapped text.
    """
    # (pattern, rewrite) pairs; order matters.
    rewrites = (
        # The dataset spells commas as the literal token "_comma_".
        ("_comma_", ","),
        # Put spaces around punctuation so each mark becomes its own token.
        (r"([?.!,])", r" \1 "),
        # Collapse runs of whitespace into one space.
        (r"\s\s+", " "),
        # Replace every run of characters other than a-z and ? . ! , by a space.
        (r"[^a-z?.!,]+", " "),
    )
    cleaned = tf.strings.lower(sentence)
    for pattern, rewrite in rewrites:
        cleaned = tf.strings.regex_replace(cleaned, pattern, rewrite)
    cleaned = tf.strings.strip(cleaned)
    return tf.strings.join(["[start]", cleaned, "[end]"], separator=" ")

# Text -> integer-id layer. `preprocess_text` is applied to every string before
# it is split into tokens, and every output is padded/truncated to MAX_LENGTH.
vectorizer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize=preprocess_text,
    output_mode="int",
    output_sequence_length=MAX_LENGTH,
)

# Learn the vocabulary from the questions and the answers together.
# The corpus is fed in batches rather than one string at a time.
adapt_corpus = tf.data.Dataset.from_tensor_slices(questions + answers).batch(
    BATCH_SIZE
)
vectorizer.adapt(adapt_corpus)

### Tokenizing and padding sentences using `TextVectorization`

In [8]:
def vectorize_text(inputs, outputs):
    """Turn one (question, answer) string pair into model features and labels.

    Args:
        inputs: Question text, fed to the encoder.
        outputs: Answer text, fed to the decoder and used as the target.

    Returns:
        A `(features, labels)` tuple of dicts. `features` holds
        "encoder_inputs" and "decoder_inputs"; `labels` holds "outputs", which
        is the decoder sequence shifted one position to the left.
    """
    encoder_tokens = vectorizer(inputs)
    # Append one pad token so that both shifted views below keep the
    # vectorizer's output length.
    decoder_tokens = tf.pad(vectorizer(outputs), [[0, 1]])

    features = {
        "encoder_inputs": encoder_tokens,
        "decoder_inputs": decoder_tokens[:-1],
    }
    labels = {"outputs": decoder_tokens[1:]}
    return features, labels


# Training pipeline: vectorize -> cache -> shuffle -> batch -> prefetch.
train_dataset = (
    train_dataset.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same steps, but without shuffling.
val_dataset = (
    val_dataset.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

## Creating the FNet Encoder

The FNet paper proposes a replacement for the standard attention mechanism used by the
Transformer architecture (Vaswani et al., 2017).

![Architecture](https://i.imgur.com/rLg47qU.png)

The outputs of the FFT layer are complex numbers. To avoid dealing with complex layers,
only the real part of the result is kept; the imaginary part is discarded.

The dense layers that follow the Fourier transformation act as convolutions applied on
the frequency domain.

In [9]:

class FNetEncoder(layers.Layer):
    """Encoder block that mixes its input with a 2D FFT instead of attention.

    The real part of `tf.signal.fft2d(inputs)` is added back to the inputs and
    layer-normalized; a two-layer feed-forward projection with its own residual
    connection and layer normalization follows.

    Args:
        embed_dim: Width of the output of the feed-forward projection.
        dense_dim: Hidden width of the feed-forward projection.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        feed_forward_layers = [
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.dense_proj = keras.Sequential(feed_forward_layers)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs):
        """Apply Fourier mixing and the feed-forward projection to `inputs`."""
        # fft2d needs a complex input; only the real part of its result is kept.
        spectrum = tf.signal.fft2d(tf.cast(inputs, tf.complex64))
        mixed = self.layernorm_1(inputs + tf.math.real(spectrum))
        # Feed-forward sub-block with a residual connection.
        return self.layernorm_2(mixed + self.dense_proj(mixed))


## Creating the Decoder

The decoder architecture remains the same as the one proposed by (Vaswani et al., 2017)
in the original transformer architecture, consisting of an embedding, positional
encoding, two masked multi-head attention layers and finally the dense output layers.
The architecture that follows is taken from
[Deep Learning with Python, second edition, chapter 11](https://www.manning.com/books/deep-learning-with-python-second-edition).

In [10]:

class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: Number of distinct positions that can be embedded.
        vocab_size: Number of distinct token ids that can be embedded.
        embed_dim: Width of both embeddings.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )

    def call(self, inputs):
        """Embed the token ids in `inputs` and add the position embeddings."""
        # One position id per step along the last axis of `inputs`.
        position_ids = tf.range(tf.shape(inputs)[-1])
        return self.token_embeddings(inputs) + self.position_embeddings(
            position_ids
        )

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is False wherever the token id is 0."""
        return tf.math.not_equal(inputs, 0)


class FNetDecoder(layers.Layer):
    """Transformer-style decoder block.

    Runs causal self-attention over the decoder inputs, then attention over the
    encoder outputs, then a two-layer feed-forward projection. Each sub-block
    is followed by a residual connection and layer normalization.

    Args:
        embed_dim: Output width of the feed-forward projection; also used as
            `key_dim` of both attention layers.
        latent_dim: Hidden width of the feed-forward projection.
        num_heads: Number of heads in both attention layers.
        **kwargs: Forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        """Decode `inputs` while attending to `encoder_outputs`.

        Args:
            inputs: Embedded decoder sequence.
            encoder_outputs: Encoder output sequence to attend over.
            mask: Optional mask for `inputs`; when omitted, the second
                attention layer runs without an attention mask.

        Returns:
            The decoded sequence after the final layer normalization.
        """
        causal_mask = self.get_causal_attention_mask(inputs)

        # Default to "no mask" so the second attention call below is always
        # given a defined value; without this, calling the layer with no mask
        # raised UnboundLocalError.
        padding_mask = None
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        # Self-attention: each position may only look at itself and the past.
        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        # Attention over the encoder outputs.
        # NOTE(review): `padding_mask` is built from the decoder-side mask and
        # sequence length but is applied to attention whose keys come from
        # `encoder_outputs`; this appears to rely on both sequences having the
        # same length — confirm.
        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        """Build an int32 lower-triangular mask, repeated once per batch item.

        Entry [b, i, j] is 1 when i >= j and 0 otherwise; the first two
        dimensions of `inputs` provide the batch size and the sequence length.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Tile multiples: (batch_size, 1, 1).
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)


def create_model():
    """Assemble the encoder/decoder model from the notebook's hyperparameters.

    Returns:
        A `keras.Model` named "fnet" that takes "encoder_inputs" and
        "decoder_inputs" (integer token ids) and outputs, for every decoder
        position, a softmax distribution over VOCAB_SIZE tokens.
    """
    # --- Encoder: embeddings followed by one FNet block ---
    encoder_inputs = keras.Input(shape=(None,), dtype="int32", name="encoder_inputs")
    encoder_embedded = PositionalEmbedding(MAX_LENGTH, VOCAB_SIZE, EMBED_DIM)(
        encoder_inputs
    )
    encoder_outputs = FNetEncoder(EMBED_DIM, LATENT_DIM)(encoder_embedded)
    # Standalone encoder model; the model returned below does not use it.
    encoder = keras.Model(encoder_inputs, encoder_outputs)

    # --- Decoder: built as its own model over a placeholder encoder state ---
    decoder_inputs = keras.Input(shape=(None,), dtype="int32", name="decoder_inputs")
    encoded_seq_inputs = keras.Input(
        shape=(None, EMBED_DIM), name="decoder_state_inputs"
    )
    decoder_embedded = PositionalEmbedding(MAX_LENGTH, VOCAB_SIZE, EMBED_DIM)(
        decoder_inputs
    )
    decoded = FNetDecoder(EMBED_DIM, LATENT_DIM, NUM_HEADS)(
        decoder_embedded, encoded_seq_inputs
    )
    decoded = layers.Dropout(0.5)(decoded)
    token_probabilities = layers.Dense(VOCAB_SIZE, activation="softmax")(decoded)
    # The decoder is named "outputs", matching the label key of the datasets.
    decoder = keras.Model(
        [decoder_inputs, encoded_seq_inputs], token_probabilities, name="outputs"
    )

    # --- End-to-end model: feed the real encoder outputs into the decoder ---
    decoder_outputs = decoder([decoder_inputs, encoder_outputs])
    return keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="fnet")


## Creating and Training the model

In [11]:
fnet = create_model()
fnet.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Checkpoint callback: with `save_best_only=True` the file is only overwritten
# when the monitored `val_accuracy` improves (mode 'max').
checkpoint_filepath = './checkpoints/checkpoint.model.keras'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
    monitor='val_accuracy',
    mode='max',
)


Here, the `epochs` parameter is set to `EPOCHS` (50). In practice the model needs around
**20-30 epochs** of training before it starts outputting comprehensible sentences. Although
accuracy is not a good measure for this task, we will use it just to get a hint of the
improvement of the network.

In [12]:
# Train for EPOCHS epochs, evaluating on the validation set and running the
# checkpoint callback along the way.
fnet.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50

In [None]:
# Save the final model.
# Create the target directory first so the save does not fail on a fresh
# checkout where ./models does not exist yet.
final_model_path = "./models/chatbot_correct_emotions_2.keras"
os.makedirs(os.path.dirname(final_model_path), exist_ok=True)
fnet.save(final_model_path)

## Performing inference

In [None]:
# Vocabulary of the fitted vectorizer as a list; decode_sentence below uses a
# token's position in this list as its integer id (and vice versa).
VOCAB = vectorizer.get_vocabulary()


def decode_sentence(input_sentence):
    """Greedily generate a reply to `input_sentence` with the trained model.

    Starting from the "[start]" token, the model is queried once per step and
    the most probable next token is appended, until "[end]" is produced or
    MAX_LENGTH steps have run.

    Args:
        input_sentence: Raw prompt text (a Python string).

    Returns:
        The generated tokens as one string, each token followed by a space.
        The "[start]" and "[end]" markers are not included.
    """
    # Pass the raw text straight to the vectorizer: its `standardize` hook is
    # `preprocess_text`, which already cleans the text and wraps it in
    # "[start]" / "[end]". Cleaning or wrapping it here as well would turn the
    # markers into the plain words "start" / "end" and wrap the text a second
    # time, so the model would see inputs unlike the ones it was trained on.
    tokenized_input_sentence = vectorizer(tf.constant(input_sentence))
    # The encoder input does not change between decoding steps.
    encoder_inputs = tf.expand_dims(tokenized_input_sentence, 0)

    start_index = VOCAB.index("[start]")
    end_index = VOCAB.index("[end]")

    # Initializing the target sentence with only the start token.
    tokenized_target_sentence = tf.expand_dims(start_index, 0)
    decoded_sentence = ""

    for i in range(MAX_LENGTH):
        # Right-pad the tokens generated so far to MAX_LENGTH and add a batch axis.
        decoder_inputs = tf.expand_dims(
            tf.pad(
                tokenized_target_sentence,
                [[0, MAX_LENGTH - tf.shape(tokenized_target_sentence)[0]]],
            ),
            0,
        )
        # verbose=0 keeps predict from printing a progress bar on every step.
        predictions = fnet.predict(
            {"encoder_inputs": encoder_inputs, "decoder_inputs": decoder_inputs},
            verbose=0,
        )
        # Most probable token at the current position.
        sampled_token_index = tf.argmax(predictions[0, i, :])
        # Stop as soon as the end token is generated.
        if tf.equal(sampled_token_index, end_index):
            break
        decoded_sentence += VOCAB[sampled_token_index.numpy()] + " "
        tokenized_target_sentence = tf.concat(
            [tokenized_target_sentence, [sampled_token_index]], 0
        )

    return decoded_sentence

In [None]:
# The prefix must be one of the dataset's context labels (training questions
# are prefixed "<context>: "); "sadness" is not among them, "sad" is.
decode_sentence("sad: My sister got into a car accident.")

In [None]:
# The prefix must be one of the dataset's context labels (training questions
# are prefixed "<context>: "); "sadness" is not among them, "sad" is.
decode_sentence("sad: My dog died.")

In [None]:
# The prefix must be one of the dataset's context labels (training questions
# are prefixed "<context>: "); "fear" is not among them, "afraid" is.
decode_sentence("afraid: I am afraid to have a car crash.")