<a href="https://colab.research.google.com/github/2bahaa/Movie-Script-Generator-RNN/blob/main/playgenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab line magic requesting TensorFlow 2.x. Colab reports that it only ships
# TensorFlow 2.x and that this magic has no effect.
%tensorflow_version 2.x

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np


In [None]:
from google.colab import files
# Prompt for a file through Colab's upload widget; the returned mapping is
# keyed by file name, and the first uploaded file is used as the script.
uploaded = files.upload()
if not uploaded:
  # A cancelled/empty upload would otherwise surface as a bare IndexError.
  raise ValueError("No file was uploaded; please select a script text file.")
path_to_file = list(uploaded.keys())[0]

Saving interstellar script.txt to interstellar script (1).txt


In [None]:
# Read the uploaded script as UTF-8 text. The file is opened in binary mode and
# decoded explicitly so line endings are kept exactly as stored; the context
# manager closes the handle as soon as the contents are in memory.
with open(path_to_file, 'rb') as script_file:
  text = script_file.read().decode(encoding='utf-8')
# Preview the first 64 characters as a sanity check.
print(text[:64])

INTERSTELLAR
written by
Jonathan Nolan and Christopher Nolan



In [None]:
# Encoding: the vocabulary is the sorted list of distinct characters in the script.
Vocab = sorted(set(text))
# Lookup from each unique character to its position in Vocab
char2indx = dict(zip(Vocab, range(len(Vocab))))
# Reverse lookup: position i of this array holds the character with index i
indx2char = np.array(Vocab)
# Convert each character of a string to its integer index for the model
def text_to_int(text):
  """Return a NumPy array holding the ``char2indx`` index of every character in ``text``."""
  indices = []
  for ch in text:
    indices.append(char2indx[ch])
  return np.array(indices)

# Encode the entire script as an array of vocabulary indices.
text_as_int=text_to_int(text)

In [None]:
def int_to_text(ints):
  """Decode vocabulary indices back into a string.

  Args:
    ints: Integer indices into ``indx2char``. Objects exposing a ``.numpy()``
      method are converted through it first; anything else is used as-is.

  Returns:
    The characters selected from ``indx2char``, joined into a single ``str``.
  """
  try:
    ints = ints.numpy()
  except AttributeError:
    # No ``.numpy()`` available (e.g. already a NumPy array): index with the
    # value unchanged. Only this case is tolerated, so unrelated failures
    # are no longer silently swallowed.
    pass
  return ''.join(indx2char[ints])

# Round-trip check: decode the first 13 encoded characters back to text.
print(int_to_text(text_as_int[:13]))

INTERSTELLAR


In [None]:
# Let's look at how part of our text is encoded: the first 13 characters are
# shown next to their integer indices.
print("Text:", text[:13])
print("Encoded:", text_to_int(text[:13]))

Text: INTERSTELLAR
Encoded: [32 37 43 28 41 42 43 28 35 35 24 41  1]


In [None]:
seq_length = 100  # length of sequence for a training example
# Number of whole (seq_length+1)-character chunks that fit in the script.
examples_per_epoch = len(text)//(seq_length+1)

# Create training examples / targets: start from a dataset built from the
# encoded character indices.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [None]:
# Use the batch method to turn the stream of characters into chunks of
# seq_length+1 elements; drop_remainder=True discards a final shorter chunk.
sequences=char_dataset.batch(seq_length+1, drop_remainder=True)


In [None]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    For the chunk "hello" the input is "hell" and the target is "ello", so
    every input element lines up with the element that follows it.
    """
    return chunk[:-1], chunk[1:]

# Use map to apply split_input_target to every chunk, giving (input, target) pairs.
dataset = sequences.map(split_input_target)

In [None]:
# Show the first two (input, target) pairs decoded back to text.
for x, y in dataset.take(2):
  print(
      "\n\nEXAMPLE\n",
      "INPUT",
      int_to_text(x),
      "\nOUTPUT",
      int_to_text(y),
      sep="\n",
  )



EXAMPLE

INPUT
INTERSTELLAR
written by
Jonathan Nolan and Christopher Nolan
Transferred to PDF from:
"Interstel

OUTPUT
NTERSTELLAR
written by
Jonathan Nolan and Christopher Nolan
Transferred to PDF from:
"Interstell


EXAMPLE

INPUT
ar - The Complete
Screenplay with Selected
Storyboards"
Published November 2014 by
Faber & Faber

OUTPUT
r - The Complete
Screenplay with Selected
Storyboards"
Published November 2014 by
Faber & Faber 


In [None]:
# Hyper-parameters for batching and for build_model.
BATCH_SIZE = 64
VOCAB_SIZE = len(Vocab)  # vocab is number of unique characters
EMBEDDING_DIM = 256
RNN_UNITS = 1024
# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs, then group them into batches of
# BATCH_SIZE; drop_remainder=True discards a final smaller batch.
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
def build_model(vocab_size,embedding_dim,rnn_units,batch_size):
  """Assemble the character-level model: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: Number of distinct characters; used as the embedding input
      size and as the width of the final Dense layer.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of LSTM units.
    batch_size: Fixed batch size baked into the input shape (the LSTM is
      stateful); the sequence length is left as None.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model whose Dense output layer has
    no activation.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None],
  )
  recurrent = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform',
  )
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

# Build the training model with the configured sizes and print its layer summary.
model=build_model(VOCAB_SIZE,EMBEDDING_DIM,RNN_UNITS,BATCH_SIZE)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           19968     
                                                                 
 lstm (LSTM)                 (64, None, 1024)          5246976   
                                                                 
 dense (Dense)               (64, None, 78)            79950     
                                                                 
Total params: 5346894 (20.40 MB)
Trainable params: 5346894 (20.40 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Custom loss: the model's final Dense layer has no activation, so the loss is
# told to treat the predictions as logits.
def loss(labels,logits):
  """Sparse categorical cross-entropy between integer labels and logits."""
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels,
      y_pred=logits,
      from_logits=True,
  )

# Compile with the Adam optimizer and the custom `loss` function.
model.compile(optimizer='adam',loss=loss)

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder in the file name
# pattern handed to ModelCheckpoint.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback for model.fit; save_weights_only=True stores only the model's
# weights rather than the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Train the model for 93 epochs, with checkpoint_callback attached to save weights.
history=model.fit(data,epochs=93,callbacks=[checkpoint_callback])

In [None]:
# Rebuild the model with a batch size of 1 so a single sequence can be fed in
# when generating text.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)
# Tell the model to use the training weights from the latest checkpoint
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [None]:
#use model to predict and generate text
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text by sampling one character at a time from the model.

  Args:
    model: Stateful character-level model built with batch size 1.
    start_string: Non-empty seed text; every character must be a key of
      ``char2indx``.
    num_generate: Number of characters to generate after the seed.
    temperature: Positive divisor applied to the logits before sampling.
      Low temperatures result in more predictable text, higher
      temperatures in more surprising text.

  Returns:
    ``start_string`` followed by the generated characters.

  Raises:
    ValueError: If ``start_string`` is empty or ``temperature`` is not
      positive.
    KeyError: If ``start_string`` contains a character that is not in
      ``char2indx``.
  """
  # Validate up front so bad input fails with a clear message instead of an
  # error from deep inside the model or the sampling step.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing) and adding the
  # batch dimension.
  input_eval = [char2indx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # List collecting the generated characters
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits by the temperature, then sample from the resulting
    # categorical distribution; only the sample for the last row is kept.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model;
    # the stateful model carries its hidden state between calls.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(indx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [None]:
# Ask for a seed string and print it followed by the generated continuation.
inp = input("Type a starting string: ")
print(generate_text(model, inp))