## Dataset

In [None]:
import pandas as pd
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; the dataset files loaded below are read
# from paths under that mount point. `google.colab` is only importable on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Every corpus file separates its fields with the literal token "+++$+++".
# The pattern is a raw string: in a normal string literal "\+" and "\$" are
# invalid escape sequences, which Python reports with a warning.
FIELD_SEPARATOR = r"\+\+\+\$\+\+\+"


def _read_corpus_file(path, column_names):
    """Read one "+++$+++"-delimited corpus file into a DataFrame.

    Args:
        path: Location of the text file.
        column_names: Names assigned to the columns (the files have no header row).

    Returns:
        A DataFrame with one row per line of the file.
    """
    # A multi-character regex separator requires the python parsing engine.
    return pd.read_csv(path, delimiter=FIELD_SEPARATOR, engine='python', header=None,
                       names=column_names, encoding='ISO-8859-1')


# Load the movie lines dataset
movie_lines = _read_corpus_file(
    '/content/drive/MyDrive/movie_lines.txt',
    ["lineID", "characterID", "movieID", "character", "text"])

# Load the movie characters metadata dataset
movie_characters = _read_corpus_file(
    '/content/drive/MyDrive/movie_characters_metadata.txt',
    ["characterID", "character", "movieID", "movie", "gender", "position"])

# Load the movie titles metadata dataset
movie_titles = _read_corpus_file(
    '/content/drive/MyDrive/movie_titles_metadata.txt',
    ["movieID", "movie", "year", "rating", "votes", "genres"])

# Load the movie conversations dataset
movie_conversations = _read_corpus_file(
    '/content/drive/MyDrive/movie_conversations.txt',
    ["characterID_1", "characterID_2", "movieID", "lines"])

movie_lines.head()


Unnamed: 0,lineID,characterID,movieID,character,text
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [None]:
import ast
# Trim leading/trailing spaces from the join keys of each table so the merges
# below compare clean IDs.
for frame, key_columns in (
    (movie_lines, ['characterID', 'movieID']),
    (movie_characters, ['characterID', 'movieID']),
    (movie_titles, ['movieID']),
):
    for key in key_columns:
        frame[key] = frame[key].str.strip()

# Attach character metadata to every line, then the movie metadata.
# Both joins are left joins, so every row of movie_lines is kept.
merged_data = (
    movie_lines
    .merge(movie_characters, how="left", on=["characterID", "movieID"])
    .merge(movie_titles, how="left", on=["movieID"])
)

# Normalise the dialogue text: lowercase, trimmed, no double quotes, and
# newlines turned into single spaces.
merged_data['text'] = (
    merged_data['text']
    .str.lower()
    .str.strip()
    .str.replace('"', '')
    .str.replace("\n", " ")
)

# Genres arrive as a stringified list; rows without genres become an empty list.
merged_data['genres'] = (
    merged_data['genres']
    .fillna("[]")
    .str.strip()
    .apply(ast.literal_eval)
)

# Show the first few rows of the merged data
merged_data.head()


Unnamed: 0,lineID,characterID,movieID,character_x,text,character_y,movie_x,gender,position,movie_y,year,rating,votes,genres
0,L1045,u0,m0,BIANCA,they do not!,BIANCA,10 things i hate about you,f,4,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
1,L1044,u2,m0,CAMERON,they do to!,CAMERON,10 things i hate about you,m,3,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
2,L985,u0,m0,BIANCA,i hope so.,BIANCA,10 things i hate about you,f,4,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
3,L984,u2,m0,CAMERON,she okay?,CAMERON,10 things i hate about you,m,3,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
4,L925,u0,m0,BIANCA,let's go.,BIANCA,10 things i hate about you,f,4,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"


In [None]:
# Persist the merged table so later cells can restart from disk. The positional
# index is left out: writing it makes the reload grow an extra "Unnamed: 0" column.
# Note that the list values in `genres` are stored as their string representation.
merged_data.to_csv('genre.csv', index=False)

In [None]:
# Reset Keras' global state before the modelling part of the notebook starts.
from keras import backend as K
K.clear_session()

In [None]:
# Reload the merged table written earlier. After the CSV round-trip the `genres`
# column holds strings, not lists; a later cell converts it back.
df = pd.read_csv('genre.csv')

In [None]:
# Keep only rows that have a value in every column (dropna() with default
# arguments removes a row if any of its fields is missing).
df = df.dropna()

In [None]:
len(df)

304446

In [None]:
import pandas as pd

# Assuming `df` has been defined and loaded as in your previous examples

# `genres` holds one string per row at this point (it was read back from CSV and
# is only parsed into a list in a later cell), so grouping on it groups rows by
# their exact genre combination. The former `df.explode('genres')` call did
# nothing on string values and has been removed; the sample is unchanged.
SAMPLE_FRACTION = 0.08  # share of each genre group that is kept


def drop_rows(group, frac=SAMPLE_FRACTION):
    """Return a reproducible random sample of `group`.

    Args:
        group: DataFrame holding the rows of one genre group.
        frac: Fraction of the group's rows to keep (default 8%).

    Returns:
        The sampled rows; the fixed random_state makes the draw repeatable.
    """
    return group.sample(frac=frac, random_state=42)


# Down-sample every genre group independently so each group shrinks by the same ratio
halved_df = df.groupby('genres').apply(drop_rows).reset_index(drop=True)

# Keep a single row per lineID in case a line was sampled more than once
deduplicated_df = halved_df.drop_duplicates(subset='lineID').reset_index(drop=True)


In [None]:
len(deduplicated_df)

24350

In [None]:
# Work on a copy: the next cell rewrites the `genres` column in place, and a plain
# alias would also modify `deduplicated_df`, so re-running from here would fail.
df = deduplicated_df.copy()

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,lineID,characterID,movieID,character_x,text,character_y,movie_x,gender,position,movie_y,year,rating,votes,genres
0,219893,L373084,u6526,m433,TANK,"hey, mikey, he likes it! ready for more?",TANK,the matrix,m,7,the matrix,1999,8.7,389480,"['action', 'adventure', 'adventure', 'adventur..."
1,219525,L373184,u6520,m433,CYPHER,honestly. morpheus. he got them all amped up...,CYPHER,the matrix,m,6,the matrix,1999,8.7,389480,"['action', 'adventure', 'adventure', 'adventur..."
2,219875,L373472,u6526,m433,TANK,"neo, this is loco. they've got morpheus in a ...",TANK,the matrix,m,7,the matrix,1999,8.7,389480,"['action', 'adventure', 'adventure', 'adventur..."
3,219914,L373527,u6527,m433,TRINITY,"tank, i need a pilot program for a military m-...",TRINITY,the matrix,f,3,the matrix,1999,8.7,389480,"['action', 'adventure', 'adventure', 'adventur..."
4,219869,L373591,u6526,m433,TANK,operator.,TANK,the matrix,m,7,the matrix,1999,8.7,389480,"['action', 'adventure', 'adventure', 'adventur..."


In [None]:
def _parse_genres(value):
    """Turn a stringified genre list into a sorted list of unique genres.

    Args:
        value: Either the string form of a list (as stored in the CSV) or an
            already-parsed list of genre names.

    Returns:
        The distinct genre names in alphabetical order.
    """
    # literal_eval only accepts Python literals, so a malformed cell in the CSV
    # raises an error instead of being executed the way eval() would run it.
    parsed = ast.literal_eval(value) if isinstance(value, str) else value
    # sorted() gives a stable order; list(set(...)) can differ from run to run.
    return sorted(set(parsed))


# Parse and de-duplicate in one pass. Already-parsed lists are accepted, so the
# cell can be re-run without raising.
df['genres'] = df['genres'].apply(_parse_genres)


df.head()

Unnamed: 0.1,Unnamed: 0,lineID,characterID,movieID,character_x,text,character_y,movie_x,gender,position,movie_y,year,rating,votes,genres
0,219893,L373084,u6526,m433,TANK,"hey, mikey, he likes it! ready for more?",TANK,the matrix,m,7,the matrix,1999,8.7,389480,"[sci-fi, action, adventure]"
1,219525,L373184,u6520,m433,CYPHER,honestly. morpheus. he got them all amped up...,CYPHER,the matrix,m,6,the matrix,1999,8.7,389480,"[sci-fi, action, adventure]"
2,219875,L373472,u6526,m433,TANK,"neo, this is loco. they've got morpheus in a ...",TANK,the matrix,m,7,the matrix,1999,8.7,389480,"[sci-fi, action, adventure]"
3,219914,L373527,u6527,m433,TRINITY,"tank, i need a pilot program for a military m-...",TRINITY,the matrix,f,3,the matrix,1999,8.7,389480,"[sci-fi, action, adventure]"
4,219869,L373591,u6526,m433,TANK,operator.,TANK,the matrix,m,7,the matrix,1999,8.7,389480,"[sci-fi, action, adventure]"


In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Embedding, Input, Dense, Flatten, Concatenate
from keras.layers import LSTM, Bidirectional
from keras.layers import Attention, GlobalAveragePooling1D
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import MultiLabelBinarizer

# 1. Prepare dialogue sequences
# --------------------------------

# Parallel lists: an input line, the line that follows it, and the movie's genres
input_dialogues = []
target_dialogues = []
genres = []

# Walk the frame one movie at a time. Each group already contains that movie's
# rows, so the genres are read from the group instead of re-filtering the whole
# dataframe once per movie.
for movie, movie_rows in df.groupby('movieID'):
    dialogues = movie_rows['text'].tolist()
    movie_genres = movie_rows['genres'].iloc[0]
    # Pair each line with the next row of the same movie.
    # NOTE(review): pairs follow dataframe row order within a movie; whether
    # neighbouring rows are real question/answer exchanges is not checked here.
    for current_line, next_line in zip(dialogues, dialogues[1:]):
        input_dialogues.append(current_line)
        target_dialogues.append(next_line)
        genres.append(movie_genres)

# One shared tokenizer for both sides of each pair
tokenizer_dialogue = Tokenizer()
tokenizer_dialogue.fit_on_texts(input_dialogues + target_dialogues)

# +1 leaves room for index 0, which the padding below uses
vocab_size = len(tokenizer_dialogue.word_index) + 1

# Convert input and target dialogues to integer sequences
sequences_input_dialogue = tokenizer_dialogue.texts_to_sequences(input_dialogues)
sequences_target_dialogue = tokenizer_dialogue.texts_to_sequences(target_dialogues)

# Sequence length = 90th percentile of the INPUT lengths; the same length is
# applied to the targets so both sides have identical shape
max_seq_len_input_dialogue = int(np.percentile([len(seq) for seq in sequences_input_dialogue], 90))

# Pad (with zeros at the end) both inputs and targets to that length
padded_sequences_input_dialogue = pad_sequences(sequences_input_dialogue, maxlen=max_seq_len_input_dialogue, padding='post')
padded_sequences_target_dialogue = pad_sequences(sequences_target_dialogue, maxlen=max_seq_len_input_dialogue, padding='post')

# Teacher-forcing input: the target shifted right by one step, with a leading 0.
# The zero column takes the targets' integer dtype so the token ids stay integers
# (np.zeros alone would promote the whole array to float64).
start_column = np.zeros((padded_sequences_target_dialogue.shape[0], 1), dtype=padded_sequences_target_dialogue.dtype)
teacher_input_sequences = np.hstack([start_column, padded_sequences_target_dialogue[:, :-1]])

# 2. Prepare genre encoding
# --------------------------

# Multi-hot encode the genre lists (one column per distinct genre)
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(genres)

# Width of the multi-hot genre vector
max_genre_len = genres_encoded.shape[1]

In [None]:
mlb.classes_

array(['action', 'adult', 'adventure', 'animation', 'biography', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'film-noir',
       'history', 'horror', 'music', 'musical', 'mystery', 'romance',
       'sci-fi', 'short', 'sport', 'thriller', 'war', 'western'],
      dtype=object)

In [None]:
vocab_size, max_seq_len_input_dialogue, max_genre_len

(17002, 23, 24)

In [None]:
len(padded_sequences_input_dialogue), len(padded_sequences_target_dialogue)

(23734, 23734)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the pairs as the test set. All four arrays go through ONE call
# so they are shuffled with the same permutation by construction, instead of
# relying on two separate calls that happen to share a seed and a length.
(dialogue_temp, dialogue_test,
 target_temp, target_test,
 teacher_temp, teacher_test,
 genre_temp, genre_test) = train_test_split(
    padded_sequences_input_dialogue,
    padded_sequences_target_dialogue,
    teacher_input_sequences,
    genres_encoded,
    test_size=0.15,
    random_state=42
)

# Split the remainder into train and validation sets.
# 0.176 of the remaining 85% is about 15% of the full data, i.e. roughly 70/15/15 overall.
dialogue_train, dialogue_val, target_train, target_val, teacher_train, teacher_val, genre_train, genre_val = train_test_split(
    dialogue_temp,
    target_temp,
    teacher_temp,
    genre_temp,
    test_size=0.176,
    random_state=42
)

In [None]:
# Sanity check: the four training arrays must share the same first dimension
for split_array in (dialogue_train, genre_train, target_train, teacher_train):
    print(split_array.shape)

(16622, 23)
(16622, 24)
(16622, 23)
(16622, 23)


## Ablation study

In [None]:
from sklearn.model_selection import KFold

# 1. Wrap model building in a function
def build_model(LSTM_UNITS, DROPOUT_RATE):
  """Build and compile the genre-conditioned encoder/decoder model.

  The network takes three inputs: the tokenised input dialogue, the multi-hot
  genre vector, and the teacher-forcing sequence. It outputs a softmax over the
  vocabulary for each of the MAX_LEN timesteps.

  Reads the notebook globals `mlb`, `max_seq_len_input_dialogue` and `vocab_size`.

  Args:
    LSTM_UNITS: Units per direction of the bidirectional encoder LSTM; the
      decoder LSTM uses twice as many so it can take the concatenated states.
    DROPOUT_RATE: Dropout rate applied after the encoder and after the decoder.

  Returns:
    A Keras Model compiled with Adam (clipnorm=1.0) and
    sparse categorical cross-entropy.
  """
  # Imported locally: at notebook level `regularizers` is only imported by a
  # later cell, so the function would otherwise depend on that cell having run.
  from keras import regularizers

  # Define hyperparameters
  NUM_GENRES = len(mlb.classes_)
  L1_REG = 0.001
  L2_REG = 0.001

  # Values computed during data preprocessing
  MAX_LEN = max_seq_len_input_dialogue
  MAX_WORDS = vocab_size
  EMBEDDING_DIM = 16

  # Input for dialogues
  input_dialogue = Input(shape=(MAX_LEN,))
  # Input for genres
  input_genre = Input(shape=(NUM_GENRES,))
  # Input for teacher forcing
  input_teacher = Input(shape=(MAX_LEN,))

  # Embedding layer for dialogues
  embedding_layer = Embedding(MAX_WORDS, EMBEDDING_DIM)(input_dialogue)

  # Bidirectional LSTM for dialogues; also returns the final states of both directions
  bi_lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(
      LSTM(LSTM_UNITS, return_sequences=True, return_state=True,
           kernel_regularizer=regularizers.l1_l2(l1=L1_REG, l2=L2_REG)))(embedding_layer)

  # Concatenated forward/backward states (2*LSTM_UNITS wide) seed the decoder
  state_h = Concatenate()([forward_h, backward_h])
  state_c = Concatenate()([forward_c, backward_c])

  # Dropout for regularization
  bi_lstm_dropout = Dropout(DROPOUT_RATE)(bi_lstm)

  # Attention Mechanism: query and value are both the encoder output
  attention_layer = Attention(use_scale=True)
  attention_result = attention_layer([bi_lstm_dropout, bi_lstm_dropout])

  # Incorporate teacher forcing input into the decoder (separate embedding table)
  teacher_embedding = Embedding(MAX_WORDS, EMBEDDING_DIM)(input_teacher)

  # Per timestep: teacher embedding + attention output + the genre vector
  # (repeated MAX_LEN times so it can be concatenated along the feature axis)
  decoder_input = Concatenate(axis=-1)([teacher_embedding, attention_result, tf.keras.layers.RepeatVector(MAX_LEN)(input_genre)])

  # LSTM Decoder with dropout and regularization, initialized with encoder states
  decoder_lstm = LSTM(2*LSTM_UNITS, return_sequences=True,
                    kernel_regularizer=regularizers.l1_l2(l1=L1_REG, l2=L2_REG))(decoder_input, initial_state=[state_h, state_c])
  decoder_lstm_dropout = Dropout(DROPOUT_RATE)(decoder_lstm)

  # Dense layer to predict the next word in the sequence
  output = Dense(MAX_WORDS, activation='softmax')(decoder_lstm_dropout)

  # Compile the model
  model = Model(inputs=[input_dialogue, input_genre, input_teacher], outputs=output)
  model.compile(optimizer=Adam(clipnorm=1.0), loss='sparse_categorical_crossentropy')

  return model

In [None]:
from sklearn.model_selection import KFold, train_test_split
from keras import regularizers
from keras.layers import Input, Embedding, Bidirectional, LSTM, Concatenate, Dropout, Dense, Attention
from keras.models import Model
from keras.optimizers import Adam
import numpy as np

# ... [Your existing model building code]

# Evaluate using KFold cross-validation
def evaluate_model(dialogues, targets, teachers, genres, n_splits=5,
                   lstm_units=None, dropout_rate=None, epochs=2):
    """Return the mean final validation loss of one configuration over K folds.

    A fresh model is built and trained for every fold.

    Args:
        dialogues: Padded input dialogue sequences.
        targets: Padded target sequences.
        teachers: Teacher-forcing input sequences.
        genres: Multi-hot encoded genres.
        n_splits: Number of folds.
        lstm_units: Encoder LSTM units passed to build_model. When None, the
            notebook-level LSTM_UNITS is used (the previous behaviour).
        dropout_rate: Dropout rate passed to build_model. When None, the
            notebook-level DROPOUT_RATE is used (the previous behaviour).
        epochs: Training epochs per fold.

    Returns:
        Mean over the folds of the validation loss after the last epoch.
    """
    # Backward-compatible fallback for callers that still rely on the globals.
    if lstm_units is None:
        lstm_units = LSTM_UNITS
    if dropout_rate is None:
        dropout_rate = DROPOUT_RATE

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_losses = []

    for train_idx, val_idx in kfold.split(dialogues):
        # Many models are created in this loop; release the previous fold's
        # Keras state so memory does not keep growing.
        K.clear_session()
        model = build_model(lstm_units, dropout_rate)

        fold_history = model.fit(
            [dialogues[train_idx], genres[train_idx], teachers[train_idx]],
            targets[train_idx],
            epochs=epochs,
            validation_data=(
                [dialogues[val_idx], genres[val_idx], teachers[val_idx]],
                targets[val_idx],
            ),
        )

        # Last epoch's validation loss, whatever `epochs` is set to
        val_losses.append(fold_history.history['val_loss'][-1])

    return np.mean(val_losses)


# Hyperparameter grid: every LSTM size combined with every dropout rate
# (extend either tuple to add more configurations)
configs = [
    {"LSTM_UNITS": units, "DROPOUT_RATE": rate}
    for units in (16, 32, 64)
    for rate in (0.2, 0.3, 0.5)
]

# Only the training split is used here, so the validation and test sets stay unseen
for config in configs:
    val_loss = evaluate_model(dialogue_train, target_train, teacher_train, genre_train,
                              lstm_units=config["LSTM_UNITS"], dropout_rate=config["DROPOUT_RATE"])
    print(f"Configuration - LSTM Units: {config['LSTM_UNITS']}, Dropout Rate: {config['DROPOUT_RATE']}, Validation Loss: {val_loss}")

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Configuration - LSTM Units: 16, Dropout Rate: 0.2, Validation Loss: 2.9615820407867433
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Configuration - LSTM Units: 16, Dropout Rate: 0.3, Validation Loss: 2.9465479373931887
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Configuration - LSTM Units: 16, Dropout Rate: 0.5, Validation Loss: 2.8142449378967287
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Configuration - LSTM Units: 32, Dropout Rate: 0.2, Validation Loss: 3.1620522022247313
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Configuration - LSTM Units: 32, Dropout Rate: 0.3, Validation Loss: 3.077867841720581
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Epoch 

## LSTM with Attention Mechanism

In [None]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, Concatenate, Dense, Dropout, Attention
from keras.optimizers import Adam
from keras import regularizers

# Hyperparameters of the final model.
# NOTE(review): the graph built below repeats the body of build_model() line by
# line; the only visible difference is the extra accuracy metric at compile time.
# NOTE(review): presumably LSTM_UNITS=16 / DROPOUT_RATE=0.5 were picked from the
# ablation results above — confirm against the full (untruncated) output.
LSTM_UNITS = 16
NUM_GENRES = len(mlb.classes_)
L1_REG = 0.001
L2_REG = 0.001
DROPOUT_RATE = 0.5

# Values computed by the preprocessing cell: padded sequence length and vocabulary size
MAX_LEN = max_seq_len_input_dialogue
MAX_WORDS = vocab_size
EMBEDDING_DIM = 16

# Input for dialogues (token ids, MAX_LEN per sample)
input_dialogue = Input(shape=(MAX_LEN,))
# Input for genres (one value per genre class)
input_genre = Input(shape=(NUM_GENRES,))
# Input for teacher forcing (token ids, MAX_LEN per sample)
input_teacher = Input(shape=(MAX_LEN,))

# Embedding layer for dialogues
embedding_layer = Embedding(MAX_WORDS, EMBEDDING_DIM)(input_dialogue)

# Bidirectional LSTM encoder; return_state=True also yields the final hidden and
# cell state of each direction
bi_lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(
    LSTM(LSTM_UNITS, return_sequences=True, return_state=True,
         kernel_regularizer=regularizers.l1_l2(l1=L1_REG, l2=L2_REG)))(embedding_layer)

# Join the forward and backward states; they initialise the decoder, which is
# why the decoder LSTM below has 2*LSTM_UNITS units
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

# Dropout for regularization
bi_lstm_dropout = Dropout(DROPOUT_RATE)(bi_lstm)

# Attention mechanism: the same tensor is passed as query and value, i.e. the
# encoder output attends over itself
attention_layer = Attention(use_scale=True)
attention_result = attention_layer([bi_lstm_dropout, bi_lstm_dropout])

# Embed the teacher-forcing tokens with their own embedding layer
# (not shared with the dialogue embedding above)
teacher_embedding = Embedding(MAX_WORDS, EMBEDDING_DIM)(input_teacher)

# Decoder input per timestep: teacher embedding, attention output, and the genre
# vector repeated MAX_LEN times so it lines up with the other two tensors
decoder_input = Concatenate(axis=-1)([teacher_embedding, attention_result, tf.keras.layers.RepeatVector(MAX_LEN)(input_genre)])

# LSTM Decoder with dropout and regularization, initialized with encoder states
decoder_lstm = LSTM(2*LSTM_UNITS, return_sequences=True,
                    kernel_regularizer=regularizers.l1_l2(l1=L1_REG, l2=L2_REG))(decoder_input, initial_state=[state_h, state_c])
decoder_lstm_dropout = Dropout(DROPOUT_RATE)(decoder_lstm)

# Dense layer producing a softmax over the vocabulary at every timestep
output = Dense(MAX_WORDS, activation='softmax')(decoder_lstm_dropout)

# Compile the model
# NOTE(review): neither Embedding sets mask_zero, so padded positions presumably
# count towards the loss and the accuracy metric — confirm this is intended.
model = Model(inputs=[input_dialogue, input_genre, input_teacher], outputs=output)
model.compile(optimizer=Adam(clipnorm=1.0), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()


Model: "model_45"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_136 (InputLayer)         [(None, 23)]         0           []                               
                                                                                                  
 embedding_90 (Embedding)       (None, 23, 16)       272032      ['input_136[0][0]']              
                                                                                                  
 bidirectional_45 (Bidirectiona  [(None, 23, 32),    4224        ['embedding_90[0][0]']           
 l)                              (None, 16),                                                      
                                 (None, 16),                                                      
                                 (None, 16),                                               

In [None]:
# Stop once val_loss has not improved for 7 epochs. restore_best_weights puts the
# best-scoring weights back when training is stopped, so the model saved after
# fit() is not left with weights from several epochs past the optimum.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, verbose=1, restore_best_weights=True)

callbacks = [early_stopping]

In [None]:
# Training hyperparameters
BATCH_SIZE = 64
EPOCHS = 25

# Give the targets a trailing axis of size 1 -> (samples, timesteps, 1)
target_train_3D = np.expand_dims(target_train, axis=-1)
target_val_3D = np.expand_dims(target_val, axis=-1)

# Input order must match the model: dialogue, genre, teacher-forcing sequence
train_inputs = [dialogue_train, genre_train, teacher_train]
val_inputs = [dialogue_val, genre_val, teacher_val]

# Train, validating after each epoch; `callbacks` holds the early-stopping rule
history = model.fit(
    train_inputs,
    target_train_3D,
    validation_data=(val_inputs, target_val_3D),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

# Write the trained model to disk; the evaluation section reloads it from this file
model.save('LSTM_attention_tf.h5')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


## Text generation and evaluation

In [None]:
# Reset Keras' global state before the saved model is loaded back in the next cell.
K.clear_session()

In [None]:
from keras.models import load_model
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from keras.utils import pad_sequences

# 1. Load the model
model = load_model('LSTM_attention_tf.h5')

# 2. Generate predictions for (at most) the first `num_samples` test pairs.
# NOTE(review): the decoder is fed `teacher_test`, i.e. the shifted ground-truth
# reply, so the score below measures teacher-forced prediction rather than
# free-running generation.
num_samples = 1000
predicted_sequences = model.predict([dialogue_test[:num_samples], genre_test[:num_samples], teacher_test[:num_samples]])

# Most likely token at every timestep of every sample
predicted_tokens = [np.argmax(seq, axis=-1) for seq in predicted_sequences]

# 3. Convert tokens back to words using the dialogue tokenizer
predicted_texts = tokenizer_dialogue.sequences_to_texts(predicted_tokens)
target_texts = tokenizer_dialogue.sequences_to_texts(target_test[:num_samples])

# 4. Calculate BLEU score using nltk
smoother = SmoothingFunction().method2  # smoothing method 2 (the comment used to say 7)

bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smoother) for ref, pred in zip(target_texts, predicted_texts)]

# Report the number of pairs actually scored; it is smaller than num_samples
# when the test set has fewer rows than that.
scored_count = len(bleu_scores)
average_bleu_score = sum(bleu_scores) / scored_count

print(f"Average BLEU score for {scored_count} samples: {average_bleu_score:.4f}")


Average BLEU score for 1000 samples: 0.0769


In [None]:
def generate_dialogue_iteratively(sample_text, genre, model, tokenizer, max_len, mlb, num_words):
    """Generate a reply to `sample_text` one word at a time (greedy decoding).

    The prompt stays fixed as the encoder input for the whole loop. Each
    predicted token is written into the teacher-forcing input one position to
    the right, mirroring how the training data was built (target shifted right
    by one with a leading 0).

    Args:
        sample_text: Prompt text to respond to.
        genre: A genre name or a list of genre names known to `mlb`.
        model: Trained model taking [dialogue, genre, teacher] inputs.
        tokenizer: Fitted tokenizer used for the training dialogues.
        max_len: Sequence length the model was built with.
        mlb: Fitted MultiLabelBinarizer for the genres.
        num_words: Upper bound on generated words; capped at `max_len` because
            the model only outputs `max_len` timesteps.

    Returns:
        The generated words joined by single spaces ("" if nothing was generated).
    """
    # Encoder input: the prompt, tokenised and padded like the training inputs
    sequence = tokenizer.texts_to_sequences([sample_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    if isinstance(genre, str):
        genre = [genre]
    genre_encoded = mlb.transform([genre])

    # Decoder input starts as all zeros and is filled in as words are generated
    teacher_input = np.zeros((1, max_len))
    generated_words = []

    # Never step past the model's output length (indexing beyond it would raise)
    for step in range(min(num_words, max_len)):
        predictions = model.predict([padded_sequence, genre_encoded, teacher_input], verbose=0)
        next_word_token = int(np.argmax(predictions[0][step], axis=-1))
        next_word = tokenizer.index_word.get(next_word_token, "")
        if not next_word:
            # Token 0 (padding) has no word attached: treat it as end of reply
            break
        generated_words.append(next_word)
        # Feed the chosen token to the decoder at the next position
        if step + 1 < max_len:
            teacher_input[0, step + 1] = next_word_token

    return " ".join(generated_words)

sample_text = "I thought you were cool"
genre_input = "romance"
# The word budget is the model's sequence length: the model predicts that many
# positions per call, so a larger budget (vocab_size was passed before) could
# only index past the end of the prediction.
generated_response = generate_dialogue_iteratively(sample_text, genre_input, model, tokenizer_dialogue, max_seq_len_input_dialogue, mlb, max_seq_len_input_dialogue)
print(sample_text)
print(f"Generated Dialogue: {generated_response}")

I thought you were cool
Generated Dialogue: i
