## Load data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os

# Project folder, relative to the root of the mounted Google Drive ("MyDrive")
path = 'Colab Notebooks/NLP/Project'

# Work from the project folder so the relative dataset paths used below resolve
os.chdir(f'/content/drive/MyDrive/{path}')
os.getcwd()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/MyDrive/Colab Notebooks/NLP/Project'

In [None]:
import tensorflow as tf

import numpy as np
import os
import time
import re
import pandas as pd
import ast

Choose whether to use the whole dataset or just a small part of it

In [None]:
# Folder (relative to the working directory) that holds the corpus files read below.
# NOTE: the name `dataset` is rebound to a tf.data pipeline further down
# (`dataset = sequences.map(...)`), so re-run this cell before re-running the
# file-loading cell.
dataset = 'cornell-movie-dialogs-small'
# dataset = 'cornell-movie-dialogs'

These are the different files found in the dataset, let's see what's inside of them

In [None]:
# Read each corpus file into a list of raw lines (line endings are kept).
# Iterating an open text file yields the same items readlines() returns.
with open(f'{dataset}/movie_lines.txt', encoding='utf-8') as lines_file:
    lines = list(lines_file)

with open(f'{dataset}/movie_conversations.txt', encoding='utf-8') as conversations_file:
    conversations = list(conversations_file)

with open(f'{dataset}/movie_titles_metadata.txt', encoding='latin-1') as titles_file:
    titles = list(titles_file)

with open(f'{dataset}/movie_characters_metadata.txt', encoding='latin-1') as characters_file:
    characters = list(characters_file)

The lines file and the conversations file are connected through line IDs: each conversation lists the IDs of the lines it contains.

> According to the dataset documentation, these files contain
* 220,579 conversational exchanges between 10,292 pairs of movie characters
* involves 9,035 characters from 617 movies
* in total 304,713 utterances

Also on the titles file we can see

> Movie metadata included:
* genres
* release year
* IMDB rating
* number of IMDB votes
* IMDB rating

And this information for the characters file

> Character metadata included:
* gender (for 3,774 characters)
* position on movie credits (3,321 characters)


For easier visualization, we'll load them into pandas dataframes, starting with the lines.


As we saw before, the columns of each file are separated by the string `+++$+++`, so we'll use it as the separator for the split function. We'll also remove the trailing `\n` from each line.

In [None]:
def clean_text(text): # https://github.com/REDFOX1899/Chatbot/blob/master/Chatbot.py
    """Lower-case ``text``, expand common English contractions and drop punctuation.

    Args:
        text: the string to normalise.

    Returns:
        The lower-cased string with contractions such as "i'm", "won't" or
        "they'll" expanded and the punctuation characters listed in the final
        pattern removed. Apostrophes that are not part of a handled contraction
        (e.g. "let's") are left in place.
    """
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    # The irregular negations must run before the generic "n't" rule; otherwise
    # "won't"/"can't" have already been turned into "wo not"/"ca not" and these
    # two substitutions never match.
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    # Finally remove punctuation (apostrophes are deliberately not in this set)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    return text

In [None]:
# create dataframe with lines
df_lines = pd.DataFrame({'line_text': lines})

# split into columns
df_lines = df_lines['line_text'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_lines.columns = ['line_ID', 'speaker_ID', 'movie','speaker','text']

# delete the new line character and any other trailing blank characters.
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated in
# recent pandas releases; the element-wise result is the same.
# rstrip only removes trailing whitespace, so a leading blank in a field is kept.
df_lines = df_lines.apply(
    lambda col: col.map(lambda x: x.rstrip() if isinstance(x, str) else x))
df_lines['text'] = df_lines['text'].str.rstrip('\n')

# Add column clean text
df_lines['clean_text'] = df_lines['text'].apply(clean_text)

# add column speaker + text
df_lines['line'] = df_lines['speaker'] + ": " + df_lines['text']

df_lines.head(5)

Unnamed: 0,line_ID,speaker_ID,movie,speaker,text,clean_text,line
0,L1045,u0,m0,BIANCA,They do not!,they do not,BIANCA: They do not!
1,L1044,u2,m0,CAMERON,They do to!,they do to,CAMERON: They do to!
2,L985,u0,m0,BIANCA,I hope so.,i hope so,BIANCA: I hope so.
3,L984,u2,m0,CAMERON,She okay?,she okay,CAMERON: She okay?
4,L925,u0,m0,BIANCA,Let's go.,let's go,BIANCA: Let's go.


Now let's do the same thing with all the other txt files

In [None]:
# create dataframe with conversations
df_conv = pd.DataFrame({'conv': conversations})

# split into columns
df_conv = df_conv['conv'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_conv.columns = ['speaker1_ID', 'speaker2_ID', 'movie_ID','lines_list']

# delete the new line character and any blanks around the list literal.
# The split pattern does not consume the spaces next to the separator, so the
# field may start with a blank; ast.literal_eval only tolerates leading
# whitespace on Python 3.10+ and raises IndentationError on older interpreters.
df_conv['lines_list'] = df_conv['lines_list'].str.strip()

# set lines_list to list type
df_conv['lines_list'] = df_conv['lines_list'].apply(ast.literal_eval)

df_conv.head(5)

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,lines_list
0,u0,u2,m0,"[L194, L195, L196, L197]"
1,u0,u2,m0,"[L198, L199]"
2,u0,u2,m0,"[L200, L201, L202, L203]"
3,u0,u2,m0,"[L204, L205, L206]"
4,u0,u2,m0,"[L207, L208]"


In [None]:
# create dataframe with the movie titles metadata
df_title = pd.DataFrame({'title': titles})

# split into columns
df_title = df_title['title'].str.split(r'\+\+\+\$\+\+\+', expand=True)
df_title.columns = ['movie_ID','title','year','IMBD_rating','IMBD_votes','genres']

# cast types to what they are
df_title['IMBD_rating'] = df_title['IMBD_rating'].astype(float)
df_title['IMBD_votes'] = df_title['IMBD_votes'].astype(int)
# Clean 'year' column using regex (for cases like ' 1989/I ')
df_title['year'] = df_title['year'].apply(lambda x: re.sub(r'\D', '', x))  # \D means "non-digit"
df_title['year'] = df_title['year'].astype(int)

# delete the new line character and any blanks around the list literal.
# The split pattern does not consume the spaces next to the separator, so the
# field may start with a blank; ast.literal_eval only tolerates leading
# whitespace on Python 3.10+ and raises IndentationError on older interpreters.
df_title['genres'] = df_title['genres'].str.strip()

# set genres to list type
df_title['genres'] = df_title['genres'].apply(ast.literal_eval)
df_title.head(5)

Unnamed: 0,movie_ID,title,year,IMBD_rating,IMBD_votes,genres
0,m0,10 things i hate about you,1999,6.9,62847,"[comedy, romance]"
1,m1,1492: conquest of paradise,1992,6.2,10421,"[adventure, biography, drama, history]"
2,m2,15 minutes,2001,6.1,25854,"[action, crime, drama, thriller]"
3,m3,2001: a space odyssey,1968,8.4,163227,"[adventure, mystery, sci-fi]"
4,m4,48 hrs.,1982,6.9,22289,"[action, comedy, crime, drama, thriller]"


In [None]:
# Characters metadata: start from one raw text line per row
raw_chars = pd.DataFrame({'characters': characters})

# Break each line into its six fields on the literal '+++$+++' separator
field_sep = r'\+\+\+\$\+\+\+'
df_chars = raw_chars['characters'].str.split(field_sep, expand=True)
df_chars.columns = [
    'Character_ID',
    'name',
    'movie_ID',
    'movie_title',
    'gender',
    'credits_pos',
]

# The last field still carries the line's newline; remove it
df_chars['credits_pos'] = df_chars['credits_pos'].str.rstrip('\n')
df_chars.head(5)

Unnamed: 0,Character_ID,name,movie_ID,movie_title,gender,credits_pos
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


To make the conversations and lines easier to handle, we'll join them together in one dataframe.

In [None]:
# Remember each conversation's row number so its lines can be regrouped after the merge
df_conv['index'] = df_conv.index
# One row per (conversation, line id) pair, renumbered from zero
expanded_lines = df_conv.explode('lines_list').reset_index(drop=True)
# Look up every line id in df_lines; ids without a match are dropped (inner join)
merged_df = expanded_lines.merge(
    df_lines,
    how='inner',
    left_on='lines_list',
    right_on='line_ID',
)

# Keep only the columns needed to rebuild the dialogs
kept_columns = ['speaker1_ID', 'speaker2_ID', 'movie_ID', 'lines_list', 'index', 'line']
merged_df = merged_df[kept_columns]
# Collect the lines of each conversation into a list: one row per conversation
group_keys = ['index', 'speaker1_ID', 'speaker2_ID', 'movie_ID']
merged_df = merged_df.groupby(group_keys)['line'].apply(list).reset_index()
merged_df = merged_df.rename(columns={'line': 'dialog'})
# The guide index has served its purpose
dialog_df = merged_df.drop(['index'], axis=1)

dialog_df

Unnamed: 0,speaker1_ID,speaker2_ID,movie_ID,dialog
0,u0,u2,m0,[ BIANCA: Can we make this quick? Roxanne Ko...
1,u0,u2,m0,[ BIANCA: You're asking me out. That's so cu...
2,u0,u2,m0,"[ BIANCA: No, no, it's my fault -- we didn't ..."
3,u0,u2,m0,"[ CAMERON: Why?, BIANCA: Unsolved mystery. ..."
4,u0,u2,m0,"[ BIANCA: Gosh, if only we could find Kat a b..."
...,...,...,...,...
83092,u9028,u9031,m616,[ COGHILL: Do you think she might be interest...
83093,u9028,u9031,m616,[ COGHILL: Choose your targets men. That's ri...
83094,u9030,u9034,m616,[ VEREKER: Colonel Durnford... William Vereke...
83095,u9030,u9034,m616,"[ DURNFORD: Your orders, Mr Vereker?, VEREKE..."


In [None]:
# Start following
#https://www.tensorflow.org/text/tutorials/text_generation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Cleaned utterances, one per row.
# NOTE: this rebinds `lines`, which until now held the raw lines read from
# movie_lines.txt; the df_lines cell must not be re-run after this one without
# reloading the file first.
lines = df_lines['clean_text']
lines

0                                               they do not
1                                                they do to
2                                                 i hope so
3                                                  she okay
4                                                  let's go
                                ...                        
304708     lord chelmsford seems to want me to stay back...
304709     i am to take the sikali with the main column ...
304710                               your orders mr vereker
304711     good ones yes mr vereker gentlemen who can ri...
304712     colonel durnford william vereker i hear you  ...
Name: clean_text, Length: 304713, dtype: object

In [None]:
# Join every cleaned line into one long string (no separator is passed to str.cat)
concat_lines = lines.str.cat()
# Peek at the first 100 characters
concat_lines[:100]

" they do not they do to i hope so she okay let's go wow okay  you are gonna need to learn how to lie"

In [None]:
# Character-level vocabulary: the sorted list of distinct characters in the corpus
vocab = sorted(set(concat_lines))
len(vocab)

87

In [None]:
vocab

['\t',
 ' ',
 '$',
 '%',
 '&',
 "'",
 '*',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '[',
 ']',
 '^',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '£',
 '«',
 '´',
 '·',
 '¸',
 '»',
 'á',
 'ä',
 'å',
 'ç',
 'è',
 'ë',
 'ì',
 'í',
 'î',
 'ñ',
 'ò',
 'ó',
 'ô',
 'ö',
 'û',
 '˘',
 '˙',
 '˚',
 'π',
 '‘',
 '’',
 '‚',
 '“',
 '”',
 '‡',
 '•',
 '…',
 '‰',
 '‹',
 '∑',
 '≠',
 '≤',
 '≥',
 'ﬂ']

 Create the tf.keras.layers.StringLookup layer:

In [None]:
# Lookup layer that maps each character to an integer id (no mask token is reserved)
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

And another layer, created with `invert=True`, to get the readable text back from the ids.
Define a function to join the tokens back into a string.

In [None]:
# Inverse lookup (ids -> characters), built from the forward layer's own vocabulary
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

def text_from_ids(ids):
    """Turn a tensor of character ids back into text.

    Each id is looked up with ``chars_from_ids`` and the resulting characters
    are joined along the last axis.
    """
    chars = chars_from_ids(ids)
    return tf.strings.reduce_join(chars, axis=-1)

## Starting with prediction

In [None]:
# Split the corpus into characters and map each one to its integer id
all_ids = ids_from_chars(tf.strings.unicode_split(concat_lines, 'UTF-8'))
# Dataset that yields the ids one at a time
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [None]:
for ids in ids_dataset.take(20):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

 
t
h
e
y
 
d
o
 
n
o
t
 
t
h
e
y
 
d
o


In [None]:
seq_length = 100 # TODO: could be derived from the average line length

In [None]:
# Group the id stream into chunks of seq_length+1 ids; the extra id lets
# split_input_target produce an input and a target of seq_length ids each.
# An incomplete final chunk is dropped.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first ten chunks as text
for seq in sequences.take(10):
    print(text_from_ids(seq).numpy())

b" they do not they do to i hope so she okay let's go wow okay  you are gonna need to learn how to lie "
b'no i am kidding  you know how sometimes you just become this persona  and you do not know how to quit'
b' like my fear of wearing pastels the real you what good stuff i figured you would get to the good stu'
b'ff eventually thank god  if i had to hear one more story about your coiffure me  this endless blonde '
b'babble i am like boring myself what crap do you listen to this crap no then guillermo says if you go '
b'any lighter you are gonna look like an extra on 90210 you always been this selfish but then that is a'
b"ll you had to say well no you never wanted to go out with 'me did you i was i looked for you back at "
b'the party but you always seemed to be occupied tons have fun tonight i believe we share an art instru'
b'ctor you know chastity looks like things worked out tonight huh hi who knows  all i have ever heard h'
b'er say is that she would dip before dating a guy that

In [None]:
def split_input_target(sequence):
    """Build the (input, target) pair used for next-character training.

    The input is ``sequence`` without its last element and the target is
    ``sequence`` without its first element, i.e. the same items shifted one
    step ahead.
    """
    return sequence[:-1], sequence[1:]

In [None]:
dataset = sequences.map(split_input_target)

In [None]:
dataset

<_MapDataset element_spec=(TensorSpec(shape=(100,), dtype=tf.int64, name=None), TensorSpec(shape=(100,), dtype=tf.int64, name=None))>

In [None]:
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b" they do not they do to i hope so she okay let's go wow okay  you are gonna need to learn how to lie"
Target: b"they do not they do to i hope so she okay let's go wow okay  you are gonna need to learn how to lie "


In [None]:
#TODO: Tune these constants
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

## Model

In [None]:
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [None]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  For every position of the input it produces one unnormalised score (logit)
  per vocabulary entry.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: number of entries in the vocabulary; used as the embedding
        input size and as the width of the output layer.
      embedding_dim: size of each character embedding vector.
      rnn_units: number of units of the GRU layer.
    """
    # super() already binds `self`; the original `super().__init__(self)` handed
    # the instance to the Keras base constructor a second time as a stray
    # positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of character ids.

    Args:
      inputs: batch of character-id sequences.
      states: GRU state to start from; when None the layer's initial state
        is used.
      return_state: when True, also return the final GRU state so that it can
        be fed back in on the next call.
      training: forwarded to every layer.

    Returns:
      The logits, or the tuple ``(logits, states)`` when ``return_state`` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

## Test the model

In [None]:
dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [None]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 88) # (batch_size, sequence_length, vocab_size)


In [None]:
model.summary()

Model: "my_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     multiple                  22528     
                                                                 
 gru_2 (GRU)                 multiple                  3938304   
                                                                 
 dense_3 (Dense)             multiple                  90200     
                                                                 
Total params: 4051032 (15.45 MB)
Trainable params: 4051032 (15.45 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([66,  2, 81, 16,  4, 31,  1, 69,  6, 51, 13, 42, 75, 68, 80, 53, 46,
       65, 75, 35, 41, 40,  7, 76,  0, 71, 60, 20, 71, 79, 40, 57, 41, 35,
       71, 23, 46, 33, 25, 59, 20, 40, 70, 58, 42, 20, 19, 45, 64, 36, 57,
       60, 42, 20, 77,  8, 54, 79, 16, 13, 66, 48, 13, 76, 18, 48, 27,  2,
       65, 62, 16,  7, 57, 64,  4, 11,  4, 34, 17, 50, 36, 69, 64, 47, 42,
       80,  1, 84, 20, 48, 44, 22, 63, 16, 25, 57, 45, 76, 29, 83])

Untrained model

In [None]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b"fore the police got there what do we tell him about the guns and money yeah it's been twentyseven ye"

Next Char Predictions:
 b"\xc3\xb4 \xe2\x80\xb08%j\t\xcb\x98'\xc2\xb75u\xe2\x80\x9a\xc3\xbb\xe2\x80\xa6\xc2\xbby\xc3\xb3\xe2\x80\x9ants*\xe2\x80\x9c[UNK]\xcb\x9a\xc3\xac^\xcb\x9a\xe2\x80\xa2s\xc3\xa7tn\xcb\x9abyld\xc3\xab^s\xcb\x99\xc3\xa8u^]x\xc3\xb2o\xc3\xa7\xc3\xacu^\xe2\x80\x9d0\xc3\xa1\xe2\x80\xa285\xc3\xb4\xc2\xa35\xe2\x80\x9c[\xc2\xa3f \xc3\xb3\xc3\xae8*\xc3\xa7\xc3\xb2%3%m9\xc2\xb4o\xcb\x98\xc3\xb2zu\xe2\x80\xa6\t\xe2\x89\xa0^\xc2\xa3wa\xc3\xb18d\xc3\xa7x\xe2\x80\x9ch\xe2\x88\x91"


## train model

In [None]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 88)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.4781566, shape=(), dtype=float32)


In [None]:
tf.exp(example_batch_mean_loss).numpy()

88.07217

In [None]:
model.compile(optimizer='adam', loss=loss)

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files ("{epoch}" is a placeholder in the file name)
# NOTE(review): newer Keras releases appear to require weights-only checkpoint
# paths to end in ".weights.h5" — confirm against the installed version.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback configured to save only the model weights
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)
# Number of epochs passed to model.fit
EPOCHS = 20

In [None]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/20

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a model and its lookup layers to generate text one character at a time."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model, the lookup layers and the temperature, and build the "[UNK]" mask."""
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Ids that must never be sampled, shaped as a column of indices.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    # One slot per vocabulary entry.
    vocab_len = len(ids_from_chars.get_vocabulary())
    # -inf at every blocked id; densified below so it can be added to the logits.
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[-float('inf')]*len(unk_ids),
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Returns the sampled characters together with the model state, so the state
    can be passed back in on the following call.
    """
    # Strings -> characters -> token ids, converted to a dense tensor.
    ids = self.ids_from_chars(tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits.shape is [batch, char, next_char_logits]
    logits, states = self.model(inputs=ids, states=states, return_state=True)

    # Keep the last position only, scale by the temperature, then add the mask
    # so that "[UNK]" cannot be generated.
    last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

    # Draw one token id per batch entry from the masked logits.
    sampled = tf.random.categorical(last_logits, num_samples=1)
    next_ids = tf.squeeze(sampled, axis=-1)

    # Token ids -> characters, returned along with the model state.
    return self.chars_from_ids(next_ids), states

In [None]:
print(chars_from_ids)
print(ids_from_chars)

In [None]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
start = time.time()
states = None
# The vocabulary was built from clean_text() output (lower-case, punctuation
# removed), so the prompt is normalised the same way; otherwise characters such
# as 'H' and '?' are not in the vocabulary and reach the model as "[UNK]".
next_char = tf.constant([clean_text('How are you?')])
result = [next_char]

# Feed each sampled character (and the returned state) back in to get the next one
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the prompt and all sampled characters into a single string
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

In [None]:
# vec = TfidfVectorizer(max_df=0.8, min_df=5, stop_words='english')
# vec.fit(lines)
# vocab = vec.get_feature_names_out() # same as doing vectorizer.vocabulary_

# print(f"Length of vocabulary: {len(vocab)}")

Examples

In [None]:
example_texts = ['abcdefg', 'xyz']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

In [None]:
ids = ids_from_chars(chars)
ids

In [None]:
chars = chars_from_ids(ids)
chars

In [None]:
tf.strings.reduce_join(chars, axis=-1).numpy()

In [None]:
split_input_target(list("Tensorflow"))