In [3]:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return [line.strip() for line in lines]

dialogues = load_data("../dataset/train/dialogues_train.txt")
emotions = load_data("../dataset/train/dialogues_emotion_train.txt")
dialog_acts = load_data("../dataset/train/dialogues_act_train.txt")


In [8]:
# Data formatting
input_sequences = []
target_sequences = []
emotion_labels = []
dialog_act_labels = []

for dialogue, emotions_line, dialog_act in zip(dialogues, emotions, dialog_acts):
    user_utterances = dialogue.split('__eou__')[:-1]  # Split by end-of-utterance marker
    num_utterances = len(user_utterances)

    # Handle emotions
    individual_emotions = emotions_line.split()  # Split emotions on spaces
    emotion_label = int(individual_emotions[0])  # Take the first emotion label

    # Handle dialog acts
    individual_dialog_acts = dialog_act.split()
    dialog_act_label = int(individual_dialog_acts[0])

    for i in range(num_utterances - 1):
        input_seq = f"User: {user_utterances[i]}"
        target_seq = f"Chatbot: {user_utterances[i+1]}"
        input_sequences.append(input_seq)
        target_sequences.append(target_seq)
        emotion_labels.append(emotion_label)  # Use the extracted emotion label
        dialog_act_labels.append(dialog_act_label)


In [25]:
import tensorflow as tf
# from tensorflow.keras.layers import Input, Embedding, PositionalEncoding, MultiHeadAttention, FeedForwardNetwork, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.layers import MultiHeadAttention

In [27]:
def positional_encoding(max_seq_len, d_model):
    positions = tf.range(max_seq_len, dtype=tf.float32)[:, tf.newaxis]
    angles = 1 / tf.pow(10000, (2 * (tf.range(d_model, dtype=tf.float32) // 2)) / d_model)
    angle_rads = positions * angles
    sines = tf.math.sin(angle_rads[:, 0::2])
    cosines = tf.math.cos(angle_rads[:, 1::2])
    pos_encoding = tf.concat([sines, cosines], axis=-1)
    pos_encoding = pos_encoding[tf.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

def multi_head_attention(d_model, num_heads):
    return MultiHeadAttention(num_heads=num_heads, key_dim=d_model // num_heads)

def feed_forward_network(d_model, d_ff, dropout_rate):
    return FeedForwardNetwork(units=[d_ff, d_model], activation='relu', dropout=dropout_rate)

def transformer_encoder_layer(d_model, num_heads, d_ff, dropout_rate, name='transformer_encoder'):
    inputs = Input(shape=(None, d_model), name='input')
    input_mask = Input(shape=(), dtype=tf.bool, name='input_mask')
    
    pos_encodings = positional_encoding(max_seq_len, d_model)
    x = inputs + pos_encodings
    
    attn_output = multi_head_attention(d_model, num_heads)(query=x, key=x, value=x, attention_mask=input_mask)
    attn_output = Dropout(dropout_rate)(attn_output)
    x = LayerNormalization(epsilon=1e-6)(x + attn_output)
    
    ffn_output = feed_forward_network(d_model, d_ff, dropout_rate)(x)
    ffn_output = Dropout(dropout_rate)(ffn_output)
    x = LayerNormalization(epsilon=1e-6)(x + ffn_output)
    
    encoder_model = Model(inputs=[inputs, input_mask], outputs=x, name=name)
    return encoder_model

def transformer_model(num_layers, d_model, num_heads, d_ff, input_vocab_size, max_seq_len, dropout_rate):
    inputs = Input(shape=(None,), name='input')
    input_mask = Input(shape=(), dtype=tf.bool, name='input_mask')
    
    x = Embedding(input_vocab_size, d_model)(inputs)
    x = x * tf.math.sqrt(tf.cast(d_model, tf.float32))
    
    for _ in range(num_layers):
        x = transformer_encoder_layer(d_model, num_heads, d_ff, dropout_rate)([x, input_mask])
    
    transformer = Model(inputs=[inputs, input_mask], outputs=x, name='transformer')
    return transformer



In [28]:
# Define your variables
vocab_size = 10000  # Define the size of your vocabulary
max_seq_len = 50   # Define the maximum sequence length
num_layers = 4  # Define the number of transformer layers
d_model = 128  # Define the dimension of the model
num_heads = 8  # Define the number of attention heads
d_ff = 512  # Define the dimension of the feedforward network
dropout_rate = 0.1  # Define the dropout rate
learning_rate = 0.001  # Define the learning rate
batch_size = 64  # Define the batch size
num_epochs = 10  # Define the number of training epochs

# Define and adapt the input tokenizer
input_tokenizer = tf.keras.layers.TextVectorization(max_tokens=vocab_size, output_sequence_length=max_seq_len)
input_tokenizer.adapt(input_sequences)

# Tokenize and pad input sequences
input_data = input_tokenizer(input_sequences)

# Define and adapt the target tokenizer
target_tokenizer = tf.keras.layers.TextVectorization(max_tokens=vocab_size, output_sequence_length=max_seq_len)
target_tokenizer.adapt(target_sequences)

# Tokenize and pad target sequences
target_data = target_tokenizer(target_sequences)

# Create masks for input sequences
input_masks = tf.math.logical_not(tf.math.equal(input_data, 0))

# Convert emotion and dialog act labels to tensors
emotion_labels = tf.convert_to_tensor(emotion_labels, dtype=tf.int32)
dialog_act_labels = tf.convert_to_tensor(dialog_act_labels, dtype=tf.int32)

# Build and compile the model
transformer = transformer_model(num_layers, d_model, num_heads, d_ff, vocab_size, max_seq_len, dropout_rate)

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

transformer.compile(optimizer=optimizer, loss=loss_fn)

# Train the model
history = transformer.fit(
    [input_data, input_masks],
    target_data,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_split=0.1
)

# Define the response generation function
def generate_response(input_text):
    input_tokens = input_tokenizer(input_text)
    input_tokens = tf.expand_dims(input_tokens, 0)
    input_mask = tf.math.logical_not(tf.math.equal(input_tokens, 0))
    
    predicted_tokens = transformer.predict([input_tokens, input_mask])
    
    # Convert predicted tokens back to text
    predicted_text = target_tokenizer.detokenize(predicted_tokens)[0].numpy().decode('utf-8')
    return predicted_text


ValueError: Exception encountered when calling layer "multi_head_attention_1" (type MultiHeadAttention).

dim -3 not in the interval [-2, 1]. for '{{node multi_head_attention_1/ExpandDims}} = ExpandDims[T=DT_BOOL, Tdim=DT_INT32](Placeholder_1, multi_head_attention_1/ExpandDims/dim)' with input shapes: [?], [] and with computed input tensors: input[1] = <-3>.

Call arguments received by layer "multi_head_attention_1" (type MultiHeadAttention):
  • query=tf.Tensor(shape=(None, 50, 128), dtype=float32)
  • value=tf.Tensor(shape=(None, 50, 128), dtype=float32)
  • key=tf.Tensor(shape=(None, 50, 128), dtype=float32)
  • attention_mask=tf.Tensor(shape=(None,), dtype=bool)
  • return_attention_scores=False
  • training=None
  • use_causal_mask=False