In [1]:
import tensorflow as tf
# Ask TensorFlow for the visible GPUs and turn on memory growth for each of them.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # An empty list simply skips the loop, so no separate emptiness check is needed.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # The error is printed instead of re-raised so the cell keeps running.
    print(e)

In [2]:
from tensorflow.keras import backend as K
# Reset the Keras backend state before any model is built in this notebook.
K.clear_session()

In [3]:
import pandas as pd
import numpy as np

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV and GloVe files read below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load the dialogue/genre table from Drive into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/genre (2).csv')

In [6]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0.1,Unnamed: 0,lineID,characterID,movieID,character_x,text,character_y,movie_x,gender,position,movie_y,year,rating,votes,genres
0,0,L1045,u0,m0,BIANCA,they do not!,BIANCA,10 things i hate about you,f,4,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
1,1,L1044,u2,m0,CAMERON,they do to!,CAMERON,10 things i hate about you,m,3,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
2,2,L985,u0,m0,BIANCA,i hope so.,BIANCA,10 things i hate about you,f,4,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
3,3,L984,u2,m0,CAMERON,she okay?,CAMERON,10 things i hate about you,m,3,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
4,4,L925,u0,m0,BIANCA,let's go.,BIANCA,10 things i hate about you,f,4,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"


In [7]:
# Explode the genres so each row corresponds to a single genre.
# NOTE(review): at this point 'genres' still holds the string form of each list
# (it is only parsed in a later cell), so this explode appears to leave the rows
# unchanged and the grouping below is per genre *combination* — confirm the intent.
data_exploded = data.explode('genres')

# Function to keep a random sample of the rows of one genre group
def drop_rows(group, frac=0.1, random_state=42):
    """Return a random subset of the rows in ``group``.

    Despite the name, this does not drop "half" of the rows: with the default
    ``frac=0.1`` it keeps 10% of them and discards the rest.

    Args:
        group: DataFrame holding the rows of a single group.
        frac: Fraction of rows to keep, passed to ``DataFrame.sample``.
        random_state: Seed passed to ``DataFrame.sample`` so that the same rows
            are picked on every run.

    Returns:
        A DataFrame containing the sampled rows of ``group``.
    """
    return group.sample(frac=frac, random_state=random_state)

# Sample every 'genres' group with drop_rows, then discard the group index.
halved_data = (
    data_exploded
    .groupby('genres')
    .apply(drop_rows)
    .reset_index(drop=True)
)

# Keep one row per lineID (treated here as the row identifier) and renumber
# the remaining rows from zero.
data = (
    halved_data
    .drop_duplicates(subset='lineID')
    .reset_index(drop=True)
)

In [8]:
import ast

# Convert the string representation of list to an actual list.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code that happens to be embedded in the CSV content.
data['genres'] = data['genres'].apply(ast.literal_eval)

# Remove duplicates from the genres list while keeping first-seen order.
# dict.fromkeys yields the same order on every run, whereas list(set(...)) can
# order string entries differently from one Python process to the next.
data['genres'] = data['genres'].apply(lambda x: list(dict.fromkeys(x)))

# Store the list as its string representation again; the next cell parses it
# back with ast.literal_eval.
data['genres'] = data['genres'].apply(str)

data.head()

Unnamed: 0.1,Unnamed: 0,lineID,characterID,movieID,character_x,text,character_y,movie_x,gender,position,movie_y,year,rating,votes,genres
0,219893,L373084,u6526,m433,TANK,"hey, mikey, he likes it! ready for more?",TANK,the matrix,m,7,the matrix,1999,8.7,389480,"['sci-fi', 'action', 'adventure']"
1,219525,L373184,u6520,m433,CYPHER,honestly. morpheus. he got them all amped up...,CYPHER,the matrix,m,6,the matrix,1999,8.7,389480,"['sci-fi', 'action', 'adventure']"
2,219875,L373472,u6526,m433,TANK,"neo, this is loco. they've got morpheus in a ...",TANK,the matrix,m,7,the matrix,1999,8.7,389480,"['sci-fi', 'action', 'adventure']"
3,219914,L373527,u6527,m433,TRINITY,"tank, i need a pilot program for a military m-...",TRINITY,the matrix,f,3,the matrix,1999,8.7,389480,"['sci-fi', 'action', 'adventure']"
4,219869,L373591,u6526,m433,TANK,operator.,TANK,the matrix,m,7,the matrix,1999,8.7,389480,"['sci-fi', 'action', 'adventure']"


In [9]:
import ast

# Turn each stored genre string back into a Python list
data['genres'] = data['genres'].apply(ast.literal_eval)

# Pair every line with the line in the row above it: the shifted copy of 'text'
# is the model input and the unshifted 'text' is the target.
data = data.assign(
    input_text=data['text'].shift(1),
    output_text=data['text'],
)

# The first row has nothing above it, so its input_text is NaN; drop such rows.
data = data.dropna(subset=['input_text'])

# Show the three columns that the following cells work with
data[['genres', 'input_text', 'output_text']].head()

Unnamed: 0,genres,input_text,output_text
1,"[sci-fi, action, adventure]","hey, mikey, he likes it! ready for more?",honestly. morpheus. he got them all amped up...
2,"[sci-fi, action, adventure]",honestly. morpheus. he got them all amped up...,"neo, this is loco. they've got morpheus in a ..."
3,"[sci-fi, action, adventure]","neo, this is loco. they've got morpheus in a ...","tank, i need a pilot program for a military m-..."
4,"[sci-fi, action, adventure]","tank, i need a pilot program for a military m-...",operator.
5,"[sci-fi, action, adventure]",operator.,what vase?


In [10]:
# Count the NaN entries in each of the two text columns
text_columns = ['input_text', 'output_text']
missing_values = data[text_columns].isna().sum()
missing_values

input_text      0
output_text    26
dtype: int64

In [11]:
# Keep only the rows that actually have a target line
data = data[data['output_text'].notna()]

In [12]:
# Number of input/output pairs left after the NaN rows were removed
data.shape[0]

30411

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

MAX_WORDS = 8000
MAX_LEN = 31

# Input side: fit a tokenizer on the prompts, encode them, then pad or truncate
# every sequence at its end so that all rows have MAX_LEN entries.
input_tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
input_tokenizer.fit_on_texts(data['input_text'])
input_sequences = input_tokenizer.texts_to_sequences(data['input_text'])
input_padded = pad_sequences(
    input_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

# Output side: a separate tokenizer fitted on the replies, processed the same way.
output_tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
output_tokenizer.fit_on_texts(data['output_text'])
output_sequences = output_tokenizer.texts_to_sequences(data['output_text'])
output_padded = pad_sequences(
    output_sequences, maxlen=MAX_LEN, padding='post', truncating='post')

# Shapes of the two padded arrays
(input_padded.shape, output_padded.shape)

((30411, 31), (30411, 31))

In [14]:
# Number of distinct tokens in the input tokenizer's word index, plus one.
len(input_tokenizer.word_index)+1

18982

In [15]:
from sklearn.preprocessing import MultiLabelBinarizer

# A row can carry several genres, so encode them as one binary column per genre.
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genres'])

# Shape of the encoded matrix together with the genre label of each column
(genres_encoded.shape, mlb.classes_)

((30411, 24),
 array(['action', 'adult', 'adventure', 'animation', 'biography', 'comedy',
        'crime', 'documentary', 'drama', 'family', 'fantasy', 'film-noir',
        'history', 'horror', 'music', 'musical', 'mystery', 'romance',
        'sci-fi', 'short', 'sport', 'thriller', 'war', 'western'],
       dtype=object))

In [16]:
# --- GloVe Integration ---
def load_glove_embeddings(filepath, word_index, embedding_dim, max_words=None):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Args:
        filepath: Path to a UTF-8 text file with one ``word v1 v2 ... vN`` entry
            per line, separated by whitespace.
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of vector components expected on every line.
        max_words: Upper bound on the number of rows of the matrix. When
            omitted, the notebook-level ``MAX_WORDS`` constant is used.

    Returns:
        A NumPy array of shape ``(vocab_size, embedding_dim)`` with
        ``vocab_size = min(len(word_index) + 1, max_words)``. Row ``i`` holds
        the vector of the word whose index is ``i``; rows of words that are not
        in the file (and any row no word maps to) stay all-zero.
    """
    if max_words is None:
        max_words = MAX_WORDS
    vocab_size = min(len(word_index) + 1, max_words)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            # Skip blank lines and lines whose vector does not have exactly
            # embedding_dim components instead of failing on them.
            if len(parts) != embedding_dim + 1:
                continue
            idx = word_index.get(parts[0])
            # Words whose index falls outside the matrix are left out.
            if idx is not None and idx < vocab_size:
                embedding_matrix[idx] = np.asarray(parts[1:], dtype='float32')

    return embedding_matrix

EMBEDDING_DIM = 200
GLOVE_FILEPATH = "/content/drive/MyDrive/glove.6B.200d.txt"
# Build the matrix for the input-side vocabulary
embedding_matrix = load_glove_embeddings(
    filepath=GLOVE_FILEPATH,
    word_index=input_tokenizer.word_index,
    embedding_dim=EMBEDDING_DIM,
)

In [17]:
import tensorflow as tf
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, GRU, Concatenate, Dense, Dropout, RepeatVector
from keras import regularizers
from keras.callbacks import EarlyStopping

# Stop when val_loss has not improved for 5 epochs and restore the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', restore_best_weights=True)

# Define hyperparameters
GRU_UNITS = 64
NUM_GENRES = len(mlb.classes_)
L1_REG = 1e-4
L2_REG = 1e-4
DROPOUT_RATE = 0.2

# Input for dialogues
input_dialogue = Input(shape=(MAX_LEN,))
# Input for genres
input_genre = Input(shape=(NUM_GENRES,))

# Frozen GloVe embedding layer for dialogues. The row count is taken from the
# matrix itself: load_glove_embeddings returns fewer than MAX_WORDS rows when
# the vocabulary is smaller, and the initial weights must match the layer size.
embedding_layer = Embedding(embedding_matrix.shape[0], EMBEDDING_DIM,
                            weights=[embedding_matrix], trainable=False)(input_dialogue)

# Bidirectional GRU for dialogues
bi_gru = Bidirectional(GRU(GRU_UNITS, return_sequences=True,
                           kernel_regularizer=regularizers.l1_l2(l1=L1_REG, l2=L2_REG)))(embedding_layer)

# Dropout for regularization
bi_gru_dropout = Dropout(DROPOUT_RATE)(bi_gru)

# Repeat the genre vector for every time step and concatenate it with the
# encoder output. RepeatVector comes from the same `keras.layers` import as the
# other layers instead of mixing in `tf.keras.layers`.
concat = Concatenate(axis=-1)([bi_gru_dropout, RepeatVector(MAX_LEN)(input_genre)])

# GRU Decoder with dropout and regularization
decoder_gru = GRU(GRU_UNITS, return_sequences=True,
                  kernel_regularizer=regularizers.l1_l2(l1=L1_REG, l2=L2_REG))(concat)
decoder_gru_dropout = Dropout(DROPOUT_RATE)(decoder_gru)

# Dense layer producing a softmax over MAX_WORDS token ids at every position
output = Dense(MAX_WORDS, activation='softmax')(decoder_gru_dropout)

# Compile the model
model = Model(inputs=[input_dialogue, input_genre], outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 31)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 31, 200)      1600000     ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 31, 128)      102144      ['embedding[0][0]']              
                                                                                                  
 input_2 (InputLayer)           [(None, 24)]         0           []                               
                                                                                              

In [18]:
# Symbolic shape of the softmax output tensor defined in the model cell.
output.shape

TensorShape([None, 31, 8000])

In [19]:
from sklearn.model_selection import train_test_split
# Give the integer targets a trailing axis of size 1 for use with
# sparse categorical crossentropy.
output_padded_reshaped = np.expand_dims(output_padded, axis=-1)

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
(input_train, input_val,
 genre_train, genre_val,
 output_train, output_val) = train_test_split(
    input_padded, genres_encoded, output_padded_reshaped,
    test_size=0.1, random_state=42)

# Fit on the training split while early_stopping watches the validation loss.
history = model.fit(
    x=[input_train, genre_train],
    y=output_train,
    validation_data=([input_val, genre_val], output_val),
    epochs=25,
    batch_size=8,
    callbacks=[early_stopping],
)

# Persist the full model first, then the weights on their own.
model.save('seq2seq_gru2.h5')
model.save_weights('seq2seq_gru2_weights.h5')


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25


In [20]:
from nltk.translate.bleu_score import sentence_bleu

# Per-position class probabilities for every validation example
predicted_sequences = model.predict([input_val, genre_val])

# Take the most likely token id at each position and decode every example to text
predicted_texts = [
    output_tokenizer.sequences_to_texts([np.argmax(sequence, axis=-1)])[0]
    for sequence in predicted_sequences
]

# Decode the reference token ids of the validation set the same way
actual_texts = output_tokenizer.sequences_to_texts(output_val.squeeze(-1))

# Unigram-only BLEU (weights 1, 0, 0, 0) for each reference/prediction pair
bleu_scores = []
for actual, predicted in zip(actual_texts, predicted_texts):
    bleu_scores.append(
        sentence_bleu([actual.split()], predicted.split(), weights=(1, 0, 0, 0)))
average_bleu = np.mean(bleu_scores)

average_bleu



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.7090305613878816

In [22]:
# Function to generate dialogue based on user input
def generate_dialogue(input_text, genre_list):
    """Predict a reply to ``input_text`` conditioned on the given genres.

    Args:
        input_text: The line of dialogue to respond to.
        genre_list: Iterable of genre names. Each name is stripped and
            lower-cased and blank entries are ignored, so that user input such
            as "Comedy" matches the lower-case labels the binarizer was fitted
            on instead of being encoded as "no genre".

    Returns:
        The decoded text of the most likely token at each output position.
    """
    # Convert input text to the padded id sequence the model expects
    input_sequence = input_tokenizer.texts_to_sequences([input_text])
    input_padded = pad_sequences(input_sequence, maxlen=MAX_LEN, padding='post', truncating='post')

    # Normalise the genre names before encoding them as a multi-hot vector
    cleaned_genres = [genre.strip().lower() for genre in genre_list if genre and genre.strip()]
    genre_encoded = mlb.transform([cleaned_genres])

    # Predict next dialogue using the model
    predicted_sequence = model.predict([input_padded, genre_encoded])
    tokens = np.argmax(predicted_sequence[0], axis=-1)
    predicted_text = output_tokenizer.sequences_to_texts([tokens])[0]

    return predicted_text

# Interactive loop for dialogue generation
while True:
    # Prompt for the line of dialogue to respond to
    user_input = input("Enter a dialogue: ")

    # Prompt for the genres and trim the whitespace around each comma-separated name
    genres = list(map(str.strip, input("Enter genres (comma separated, e.g. Comedy,Action): ").split(',')))

    # Generate the reply and show it
    response = generate_dialogue(user_input, genres)
    print(f"Generated Dialogue: {response}")

    # Any answer other than 'yes' ends the session
    continue_prompt = input("Do you want to continue? (yes/no): ").strip().lower()
    if continue_prompt == 'yes':
        continue
    break

Enter a dialogue: 
Enter genres (comma separated, e.g. Comedy,Action): 




Generated Dialogue: i <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>
Do you want to continue? (yes/no): 


In [23]:
# Candidate values for each hyper-parameter explored by the search below
param_grid = dict(
    GRU_UNITS=[16, 32, 64],
    DROPOUT_RATE=[0.1, 0.2, 0.3],
)

In [24]:
import random

def random_search(param_grid, n_iter=10, seed=None):
    """Draw distinct random hyper-parameter combinations from ``param_grid``.

    Combinations are sampled without replacement from the Cartesian product of
    the candidate lists, so no setting is returned (and later trained) twice.

    Args:
        param_grid: Mapping from parameter name to a sequence of candidate values.
        n_iter: Number of combinations requested. If it exceeds the number of
            possible combinations, every combination is returned once.
        seed: Optional seed for a private random generator. When ``None``, the
            module-level ``random`` state is used, so ``random.seed(...)``
            still controls the result.

    Returns:
        A list of dicts, each mapping every parameter name to one chosen value.
    """
    import itertools

    rng = random if seed is None else random.Random(seed)
    keys = list(param_grid)

    # Enumerate the whole grid once; each entry is one complete parameter setting.
    all_combinations = [
        dict(zip(keys, values))
        for values in itertools.product(*(param_grid[key] for key in keys))
    ]

    # Never ask for more draws than there are combinations (or fewer than zero).
    n_draws = max(0, min(n_iter, len(all_combinations)))
    return rng.sample(all_combinations, n_draws)

# Seed Python's random module first so the sampled hyper-parameter combinations
# are the same on every run of the notebook.
random.seed(42)
combinations = random_search(param_grid, n_iter=7)

In [25]:
def build_model(gru_units, dropout_rate):
    """Build and compile the genre-conditioned GRU model for one hyper-parameter setting.

    Reads the notebook-level constants ``MAX_LEN``, ``NUM_GENRES``,
    ``MAX_WORDS``, ``EMBEDDING_DIM``, ``L1_REG`` and ``L2_REG`` as well as the
    pre-computed ``embedding_matrix``.

    Args:
        gru_units: Units of each GRU layer (per direction in the bidirectional one).
        dropout_rate: Rate used by both Dropout layers.

    Returns:
        A compiled Keras ``Model`` that takes ``[dialogue token ids, genre vector]``
        and outputs a softmax over ``MAX_WORDS`` token ids at every position.
    """
    # Imported from the same `keras.layers` package as the other layers used
    # here, rather than mixing in `tf.keras.layers`.
    from keras.layers import RepeatVector

    # Input for dialogues
    input_dialogue = Input(shape=(MAX_LEN,))
    # Input for genres
    input_genre = Input(shape=(NUM_GENRES,))

    # Frozen GloVe embedding layer. The row count is taken from the matrix
    # itself, because load_glove_embeddings returns fewer than MAX_WORDS rows
    # when the vocabulary is smaller and the initial weights must match.
    embedding_layer = Embedding(embedding_matrix.shape[0], EMBEDDING_DIM,
                                weights=[embedding_matrix], trainable=False)(input_dialogue)

    # Bidirectional GRU for dialogues
    bi_gru = Bidirectional(GRU(gru_units, return_sequences=True,
                               kernel_regularizer=regularizers.l1_l2(l1=L1_REG, l2=L2_REG)))(embedding_layer)

    # Dropout for regularization
    bi_gru_dropout = Dropout(dropout_rate)(bi_gru)

    # Repeat the genre vector for every time step and concatenate it with the encoder output
    concat = Concatenate(axis=-1)([bi_gru_dropout, RepeatVector(MAX_LEN)(input_genre)])

    # GRU Decoder with dropout and regularization
    decoder_gru = GRU(gru_units, return_sequences=True,
                      kernel_regularizer=regularizers.l1_l2(l1=L1_REG, l2=L2_REG))(concat)
    decoder_gru_dropout = Dropout(dropout_rate)(decoder_gru)

    # Dense layer producing a softmax over the output vocabulary at every position
    output = Dense(MAX_WORDS, activation='softmax')(decoder_gru_dropout)

    # Compile the model
    model = Model(inputs=[input_dialogue, input_genre], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model


In [27]:
from sklearn.model_selection import KFold
from tensorflow.keras.models import clone_model
from keras.callbacks import EarlyStopping

# NOTE: with epochs=1 below, this callback (patience=5) cannot trigger; it is
# kept so that it takes effect as soon as the epoch count is raised.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min', restore_best_weights=True)

N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

best_score = float('inf')
best_params = None

for combo in combinations:
    scores = []

    for train_index, val_index in kf.split(input_padded):
        # A new model is built for every fold of every combination; clear the
        # Keras session first so state from earlier folds does not pile up in memory.
        tf.keras.backend.clear_session()

        # Fold-local names are used on purpose: the trained `model`, its
        # `history` and the train/validation split created by the earlier
        # training cell must not be overwritten by the search.
        fold_input_train, fold_input_val = input_padded[train_index], input_padded[val_index]
        fold_genre_train, fold_genre_val = genres_encoded[train_index], genres_encoded[val_index]
        fold_output_train, fold_output_val = (output_padded_reshaped[train_index],
                                              output_padded_reshaped[val_index])

        fold_model = build_model(combo['GRU_UNITS'], combo['DROPOUT_RATE'])

        fold_history = fold_model.fit(
            [fold_input_train, fold_genre_train], fold_output_train,
            validation_data=([fold_input_val, fold_genre_val], fold_output_val),
            epochs=1, batch_size=8, callbacks=[early_stopping],
            verbose=0)  # Set verbose=0 to reduce output

        # Score the fold by its lowest validation loss
        scores.append(min(fold_history.history['val_loss']))

    # A combination is judged by its mean validation loss across the folds
    avg_score = np.mean(scores)

    if avg_score < best_score:
        best_score = avg_score
        best_params = combo

print("Best Parameters:", best_params)

Best Parameters: {'GRU_UNITS': 32, 'DROPOUT_RATE': 0.1}
