# RNN Generative Model

In [1]:
#Mount Google Drive in Colab and move into the project folder, so that the
#relative 'Data/...' paths used by later cells resolve against it
from google.colab import drive
drive.mount('/content/drive') #add force_remount=True to force a remount
%cd drive/My Drive/Colab Notebooks/Project

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/Project


In [2]:
import pandas as pd
import re
import numpy as np
from collections import Counter, defaultdict
from nltk.util import ngrams
import os
import tensorflow as tf
import time
import random
import csv

In [3]:
#Load the reviews and keep only rows whose titleType is "tvMovie" or "movie"
AllReviews = pd.read_csv('Data/AllReviews.csv')
AllReviews = AllReviews.loc[AllReviews["titleType"].isin(["tvMovie", "movie"])]

#Keep unique (Review, genres) pairs that have review text and whose genres
#field is not the literal string '\N'
Reviews_Genres_Title = AllReviews[["Review", "genres"]]
Reviews_Genres_Title = Reviews_Genres_Title.drop_duplicates()
Reviews_Genres_Title = Reviews_Genres_Title.loc[Reviews_Genres_Title["Review"].notna()]
Reviews_Genres_Title = Reviews_Genres_Title.loc[Reviews_Genres_Title["genres"] != '\\N']

#Normalise the text: lowercase, strip HTML tags, then drop everything that is
#not a letter, digit or space
Complete_Reviews = Reviews_Genres_Title.copy()
Complete_Reviews['Review'] = Complete_Reviews['Review'].str.lower()
#regex=True must be explicit: from pandas 2.0 str.replace treats the pattern as
#a literal string by default, which would leave the tags and punctuation in place
Complete_Reviews['Review'] = Complete_Reviews['Review'].str.replace('<[^>]+>', '', regex=True)
Complete_Reviews['Review'] = Complete_Reviews['Review'].str.replace('[^A-Za-z0-9 ]+', '', regex=True)

  Complete_Reviews['Review']=Complete_Reviews['Review'].str.replace('<[^>]+>', '')
  Complete_Reviews['Review']=Complete_Reviews['Review'].str.replace('[^A-Za-z0-9 ]+', '')


In [4]:
#Sorted list of every unique character left in the cleaned movie reviews
#(the reviews are joined with a space before the set is taken)
vocab = sorted(set(' '.join(Complete_Reviews['Review'])))

In [5]:
#Every review saved in one string, with a single space between consecutive reviews
text = ' '.join(Complete_Reviews['Review'])

In [None]:
#Display every character in vocab, separated by spaces
' '.join(vocab)

'  0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z'

In [6]:
#Split every review into its individual UTF-8 characters
chars = tf.strings.unicode_split(Complete_Reviews['Review'], input_encoding='UTF-8')

In [7]:
#Encoder: StringLookup layer that maps each character in vocab to an integer id
#(mask_token=None, so no mask token is added to the vocabulary)
ids_from_chars = tf.keras.layers.StringLookup(vocabulary=list(vocab), mask_token=None)

In [8]:
#Encode the per-review character arrays as integer ids
ids = ids_from_chars(chars)

In [9]:
#Decoder: inverse lookup (ids -> characters), built from the encoder's own
#vocabulary so that both layers use the same id assignment
chars_from_ids = tf.keras.layers.StringLookup(vocabulary = ids_from_chars.get_vocabulary(), invert = True, mask_token = None)

In [10]:
#Function to get text back from the ids
def text_from_ids(ids):
    """Decode `ids` with `chars_from_ids` and join the characters along the last axis."""
    decoded_chars = chars_from_ids(ids)
    return tf.strings.reduce_join(decoded_chars, axis=-1)

In [11]:
#Encode the single joined `text` string as one flat sequence of character ids
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

In [12]:
#Wrap the id sequence in a tf.data.Dataset sliced along its first axis
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [13]:
#Number of characters in each training input; chunks of seq_length+1 ids are
#cut below so that input and target can be offset by one character
seq_length = 100

In [54]:
#Group the ids into chunks of seq_length+1, dropping a final shorter chunk
sequences= ids_dataset.batch(seq_length+1, drop_remainder=True)

In [15]:
#Take a sequence and break it into two texts. One excludes the last element, one excludes the first.
def split_input_target(sequence):
    """Split one sequence into an (input, target) pair offset by one step.

    The input is `sequence` without its last element and the target is
    `sequence` without its first element, so target[i] is the element that
    follows input[i].
    """
    return sequence[:-1], sequence[1:]

In [None]:
#Visual of split_input_target on a plain list of characters
split_input_target(list("MovieReview"))

(['M', 'o', 'v', 'i', 'e', 'R', 'e', 'v', 'i', 'e'],
 ['o', 'v', 'i', 'e', 'R', 'e', 'v', 'i', 'e', 'w'])

In [16]:
#Apply split_input_target to every chunk, giving a dataset of (input, target) pairs
dataset = sequences.map(split_input_target)

# Model Training

In [17]:
# Number of (input, target) sequences per training batch
BATCH_SIZE = 64

# Size of the shuffle buffer.
# tf.data is built to handle possibly infinite sequences, so it does not
# shuffle the whole dataset in memory; it shuffles within a buffer of this
# many elements instead.
BUFFER_SIZE = 10000

# Shuffle, group into fixed-size batches (dropping a short final batch), and
# prefetch so the next batch is prepared while the current one is in use.
dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

Setting up GRU architecture

In [18]:
# Length of the vocabulary in the StringLookup layer; used as the number of
# embedding rows and as the number of output logits
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension (size of the vector each character id is mapped to)
embedding_dim = 256

# Number of RNN units in the GRU layer
rnn_units = 1024

In [19]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits.

  Args:
    vocab_size: number of distinct ids; sets the embedding input size and the
      width of the output logits.
    embedding_dim: size of the vector each id is embedded into.
    rnn_units: number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # `super()` already binds the instance. Passing `self` again would hand
    # the base class an unexpected positional argument, which newer Keras
    # versions reject with a TypeError.
    super().__init__()
    #Embedding layer mapping each character id to a vector of size embedding_dim
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    #GRU layer with rnn_units units; it returns the full output sequence and
    #its final state, so the state can be fed back in during generation
    self.gru= tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    #Dense layer projecting each GRU output down to vocab_size logits
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: batch of character-id sequences.
      states: GRU state to start from; when None the GRU's initial state is used.
      return_state: when True, also return the GRU state after the last step.
      training: forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    #Pass the inputs into the embedding layer
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    #Pass the embeddings into the gru layer, keeping the updated state
    x, states = self.gru(x, initial_state=states, training=training)
    #Pass the gru outputs into the dense layer to get logits
    x = self.dense(x, training=training)

    #If return_state is set, return the logits together with the GRU state
    if return_state:
      return x, states
    #Otherwise return just the logits
    return x

In [20]:
#Instantiate the model with the hyperparameters defined above
gru_model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)

In [21]:
#Initialize the model with a single example batch and check the output shape.
#input_example_batch/target_example_batch and the predictions are reused by
#the loss and perplexity cells that follow.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = gru_model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 38) # (batch_size, sequence_length, vocab_size)


In [23]:
#Use sparse categorical cross entropy loss (integer targets).
#from_logits=True because the model's final Dense layer has no activation, so it outputs raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
#Mean loss of the untrained model on the example batch taken earlier
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 38)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(3.6383917, shape=(), dtype=float32)


In [None]:
#Perplexity of untrained model: exp of the mean cross-entropy loss on the example batch
tf.exp(example_batch_mean_loss).numpy()

38.026863

In [22]:
#Print the layers and their parameter counts
gru_model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  9728      
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  38950     
                                                                 
Total params: 3986982 (15.21 MB)
Trainable params: 3986982 (15.21 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
#Configure training with the Adam optimizer and the cross-entropy loss defined above
gru_model.compile(optimizer='adam', loss=loss)

In [25]:
# Directory where the checkpoints will be saved (relative to the working directory)
checkpoint_dir = 'Data/training_checkpoints'
# Name of the checkpoint files; the "{epoch}" placeholder is left unformatted
# here and passed to ModelCheckpoint as part of the filepath
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to fit(); save_weights_only=True stores only the model's weights
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
#Use this cell to reload previously saved weights.
#load_weights expects a checkpoint prefix (e.g. .../ckpt_2), not the directory
#itself, so look up the most recent checkpoint stored in checkpoint_dir first.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise FileNotFoundError(f"No checkpoint found in {checkpoint_dir!r}")
gru_model.load_weights(latest_checkpoint)

In [26]:
#Train for 2 epochs, saving weights through checkpoint_callback
history = gru_model.fit(dataset, epochs=2, callbacks=[checkpoint_callback])

Epoch 1/2
Epoch 2/2


# Text Generation

In [27]:
class OneStep(tf.keras.Model):
  """Wraps a trained model so that each call samples one next character.

  Holds the model together with both lookup layers, plus a logit mask that
  stops "[UNK]" from ever being sampled.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    #Logits are divided by this value before sampling (left at 1.0 in this project)
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Build a dense vector over the vocabulary that is -inf at the "[UNK]" id
    # and 0 elsewhere; adding it to the logits rules "[UNK]" out.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    unk_mask = tf.SparseTensor(
        values=[-float('inf')]*len(unk_ids),
        indices=unk_ids,
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: string tensor holding the text to continue.
      states: model state from the previous step, or None to start fresh.

    Returns:
      A `(predicted_chars, states)` pair: the sampled characters and the
      model state to pass into the next call.
    """
    # Strings -> characters -> token ids.
    split_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    token_ids = self.ids_from_chars(split_chars).to_tensor()

    # Run the model; the logits are [batch, char, next_char_logits].
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the prediction for the last position, scale it by the
    # temperature, then mask out "[UNK]".
    last_logits = logits[:, -1, :]
    last_logits = last_logits/self.temperature
    last_logits = last_logits + self.prediction_mask

    # Draw one token id per batch entry from the logits.
    sampled_ids = tf.random.categorical(last_logits, num_samples=1)
    sampled_ids = tf.squeeze(sampled_ids, axis=-1)

    # Token ids -> characters, returned together with the model state.
    return self.chars_from_ids(sampled_ids), states

In [28]:
#Wrap the trained model and both lookup layers in a OneStep generator
one_step_model = OneStep(gru_model, chars_from_ids, ids_from_chars)

In [29]:
#Generate 1000 characters from the prompt 'and this' and time the run
start = time.time()
#No state yet: the first call starts from the model's initial state
states = None
next_char = tf.constant(['and this'])
result = [next_char]

#Each step feeds the previously sampled character and the carried state back in
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

#Join the prompt and all sampled characters into a single string tensor
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n')
print('\nRun time:', end - start)

and this a second of time to see how good then anyway an oscar manorpex of women somewhere to prove this movieif your dramatic red husses and getting just decided if one of their easily shows you from it its that because they think so obviously its like that it has potential to rule these kinds of louds of murderer the fbi the beginning creepinchitorious yet good deben and that excel it looks for its simple filmmakings i want to give i and your money a go to me expecting progress in their blue convincing or the planet of canned identity and wasted appearance during pail shop your guerlor conceit it to be believable and stepinestrai a rush teenagend soul of the main character disheartens or lame but i sticked this movie at baltmas i can yep i confirm the ex endered footbest credit of the lives go and gave this movie a 210t option our outcuffin real and emotional performances as a house of rands in the hotel for him to proving the son columbus let just hehemovie easily confronts drinki n

In [30]:
#Second sample from the same prompt 'and this'; sampling is random, so the text differs between runs
start = time.time()
#Reset the state so this sample does not continue from the previous one
states = None
next_char = tf.constant(['and this'])
result = [next_char]

#Sample 1000 characters, one per step
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

#Join the prompt and all sampled characters into a single string tensor
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n')
print('\nRun time:', end - start)

and this is not at the end i think the scene was mesisting not convincing photographer that is also supposed to be conflicts noncluding vehicle for a koreadam they dont watch the trelow slums make no euroverneh and has it really because a documentary youve got last noticed especially jennas broadcast for being in most state this movie akmelp especially his girlfriend associated me im sorry to put only one of the bow by laughing in mime face and then film a lot before comes so off great animation of the usa which gives you great methods cartoon having this attackal rouge mesimatious string of almost never even safefear why is it a bit better than the usual story which people would pows up to say away all the fears foremes admirtedly house to be confusing for your colleagurous minded attack signators when youre opinionalthiny a few things that the worst internological letters the directorcoming down to an interrogate parallel relationship to work up you viewer get real never shows that h

In [31]:
#Save the one-step generator (model, chars and ids lookups) as a SavedModel in 'two_epoch'
tf.saved_model.save(one_step_model, 'two_epoch')



In [3]:
#Load the previously saved one-step generator from 'two_epoch';
#the cells below call generate_one_step on the loaded object
one_step_model = tf.saved_model.load('two_epoch')

In [4]:
#Generate 1000 characters from the prompt 'and then' with the reloaded model and time the run
start = time.time()
#No state yet: the first call starts from the model's initial state
states = None
next_char = tf.constant(['and then'])
result = [next_char]

#Each step feeds the previously sampled character and the carried state back in
for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

#Join the prompt and all sampled characters into a single string tensor
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

and then there is a total remairs performance and man while therative and consumers i loved himat very very strong perpetrated shelter and policy here and irritate to truck to just a story of art calls out and was moving this movie tracles whyand reevel their stories  the second polarthrough lives excellent experience were trying rather than bammildgreen and thriller there is also so i missed the summer than theyre believable in the end of the film i also thought that that marco bollytey do where the game one rats is treated by two bratawams pain marculatelli who enjoy broadcast with her talent there wasnt anything happens to see what it didnt buy major it goes up it there is pleghm oh dreadful engaging ending dont waste your time someweeks why i mean stephen douglas srommen henry bradfor vin syba binoche was eradicated by a revelation of the original story not a totello with the environment for the film which is completely awful the film cant have happened when you get to steal the pu

In [37]:
#Display the raw string tensor left in `result` by the most recently run generation cell
result

<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'and then goverments through streets from other films in spiteafor is planned to bg that someone wit a palpable and so eating i love the pile of screen time this really makes this movie a 2 unless youve seen this movie and i like the film or better i am a fan of a swing people and collectival part of acting by slightly mental action but aproophobly pauls shakespeared stuntman in queentity theyre all too hard for che is to go about exhibs not to actually be remember the alien photography with amciting young cold get hey home natural the director reveal the true review that moves well as i had one night with the specific looks of the first baskern french couple with completely predictable metal industry but there is nothing going on because thi wasnt an awesome copy of it just because that is the end of a real story and since pound with their traple pinewiper who actually die for this film consequences anylooking good economic kne dont

In [53]:
start = time.time()
#Generate a dictionary of reviews, keyed by the review index
reviews = {}
for i in range(1000):
  #Reset the state and the prompt so every review is generated independently
  states = None
  next_char = tf.constant(['and then'])
  result = [next_char]
  #Generate a review of character length from 500 to 1000
  for n in range(random.randint(500,1000)):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

  reviews[i] = tf.strings.join(result)
  #Report progress every 100 reviews
  if i%100 ==0:
    #Stored as `elapsed` rather than `iter`, so the builtin iter() is not
    #overwritten for the rest of the kernel session
    elapsed = time.time() - start
    print("Review:", i, "    Run time:", elapsed)
end = time.time()
print('Total Reviews:', len(reviews))
print('\n Total Run time:', end - start)

Review: 0     Run time: 2.9246339797973633
Review: 100     Run time: 235.5458586215973
Review: 200     Run time: 491.53173661231995
Review: 300     Run time: 713.2154879570007
Review: 400     Run time: 942.9624216556549
Review: 500     Run time: 1164.3898103237152
Review: 600     Run time: 1409.6759459972382
Review: 700     Run time: 1645.5105111598969
Review: 800     Run time: 1877.3839061260223
Review: 900     Run time: 2135.1558017730713
Total Reviews: 1000

 Total Run time: 2356.717267036438


In [55]:
#Decode every generated review and remove the prompt from its front
prompt = "and then"
review_tocsv = []
for i in range(len(reviews)):
  full_text = reviews[i].numpy()[0].decode("utf-8")
  #Slice by the prompt's actual length. The earlier hard-coded [9:] was one
  #more than the 8-character prompt, so it also dropped the first generated
  #character even when that character was not a space.
  rev = full_text[len(prompt):]
  #Drop the single separating space when the model produced one
  if rev.startswith(" "):
    rev = rev[1:]
  review_tocsv.append([i+1, "GRU", prompt, rev])

In [56]:
#Write the header row followed by every generated review to a csv file
with open('gru_genReviews5.csv', 'w', newline='') as file:
  writer = csv.writer(file)
  writer.writerow(["Review Number","model", "prompt", "Review"])
  #writerows emits the rows in list order, one csv line per review
  writer.writerows(review_tocsv)

# Perplexity

In [33]:

#Mean loss per batch, collected over 100 batches.
#NOTE: `dataset` is the same shuffled dataset the model was trained on, so
#this is a training-set perplexity, not a held-out one.
total_loss = []

for batch, labels in dataset.take(100):

    #Logits for the batch, then the mean cross-entropy loss against the targets
    outputs = gru_model(batch)
    losses = loss(labels, outputs).numpy()
    total_loss.append(losses)

#Perplexity = exp of the average of the per-batch mean losses
perplexity = np.exp(np.mean(total_loss))
perplexity

3.4962916

Perplexity of the untrained model = 38.03.
Perplexity after training for 1 and 2 epochs = 3.5.
Perplexity after training for 10 epochs = 8.1.