In [None]:
import numpy as np
import pandas as pd
%pip install keras
%pip install tensorflow
%pip install rake-nltk
%pip install -U spacy
!python -m spacy download en_core_web_sm

In [None]:
# Read the movie-plot CSV from S3 into a DataFrame.
# NOTE(review): the bucket URI is hard-coded in this cell; consider moving it
# to a single configuration constant so the data location is easy to change.
movies_raw_df = pd.read_csv('s3://sagemaker-studio-716361152964-uifr8hhp3c/archive/wiki_movie_plots_deduped.csv')

# Preview the first rows of the raw table.
movies_raw_df.head()

In [None]:
# Row mask for the training corpus: American horror films released after 2012.
is_horror = movies_raw_df['Genre'] == 'horror'
is_american = movies_raw_df['Origin/Ethnicity'] == 'American'
is_recent = movies_raw_df['Release Year'] > 2012

movies_to_select = is_horror & is_american & is_recent

In [None]:
# Keep only the plot text of the selected films (despite the name, this is a
# Series, not a DataFrame): pick the column first, then apply the row mask.
horror_df = movies_raw_df['Plot'][movies_to_select]

horror_df.head()

In [None]:
horror_df.shape

In [None]:
horror_str = horror_df.str.cat(sep=' ')

In [None]:
import spacy

# Load the small English spaCy model with the parser, tagger, NER and
# lemmatizer components disabled; the tokenizing helpers in this notebook
# only read `token.text`, so those components are not needed.
nlp = spacy.load('en_core_web_sm', disable = ['parser', 'tagger', 'ner', 'lemmatizer'])

In [None]:
def get_tokens(doc_text):
    """Tokenize ``doc_text`` and return the lower-cased token texts.

    The text is run through the notebook-level ``nlp`` pipeline. A token is
    discarded when its text occurs as a substring of the skip string below
    (whitespace runs and punctuation); every other token is kept, lower-cased.

    Args:
        doc_text: The raw text to tokenize.

    Returns:
        A list of lower-cased token strings in document order.
    """
    ignorable = '\r\n \n\n \n\n\n!"-#$%&()--.*+,-./:;<=>?@[\\]^_`{|}~\t\n\r '

    kept = []
    for tok in nlp(doc_text):
        text = tok.text
        # Substring test against the skip string, not per-character membership.
        if text in ignorable:
            continue
        kept.append(text.lower())

    return kept
def get_storylines(doc_text):
    """Tokenize ``doc_text`` like ``get_tokens`` but keep sentence punctuation.

    The text is run through the notebook-level ``nlp`` pipeline. A token is
    discarded only when its text occurs as a substring of the skip string AND
    does not occur as a substring of ``".,"``; periods and commas therefore
    survive so that sentences can be rebuilt afterwards.

    Args:
        doc_text: The raw text to tokenize.

    Returns:
        A list of lower-cased token strings in document order.
    """
    ignorable = '\r\n \n\n \n\n\n!"-#$%&()--.*+,-./:;<=>?@[\\]^_`{|}~\t\n\r '

    kept = []
    for tok in nlp(doc_text):
        text = tok.text
        if text in ignorable and text not in ".,":
            continue
        kept.append(text.lower())

    return kept

In [None]:
tokens = get_tokens(horror_str)
storytokens = get_storylines(horror_str)

In [None]:
tokens[0:9]

In [None]:
storytokens[0:25]

In [None]:
len(tokens)

In [None]:
len(storytokens)

In [None]:
len_0 = 25

tokens[0:len_0]

In [None]:
tokens[len_0:len_0 + 1]

In [None]:
train_len = len_0 + 1

text_sequences = []
story_sequence = []

# Slide a window of `train_len` tokens over the corpus. The stop value is
# `len(tokens) + 1` so that the final window, which ends on the last token,
# is produced as well (stopping at `len(tokens)` silently dropped it).
for i in range(train_len, len(tokens) + 1):
    # Construct sequence.
    seq = tokens[i - train_len: i]
    # Append.
    text_sequences.append(seq)

# Rebuild sentences from the punctuation-preserving token stream: every token
# is followed by a single space and a sentence is closed on each "." token.
# Tokens after the last "." never complete a sentence and are left out.
sentence = ""
for i in storytokens:
    sentence += i
    sentence += " "
    if i == ".":
        story_sequence.append(sentence)
        sentence = ""

In [None]:
' '.join(text_sequences[0])

In [None]:
story_sequence[1]

In [None]:
len(text_sequences[0])

In [None]:
for i in range(0, 5):
    print(' '.join(text_sequences[i]))
    print(story_sequence[i])
    print('-----')

In [None]:
from rake_nltk import Rake
import nltk
nltk.download('stopwords')
nltk.download('punkt')

r = Rake()

# Top-ranked RAKE key phrase of every sentence that yields at least one.
ranked_list = []

for sentence in story_sequence:
    r.extract_keywords_from_text(sentence)
    ranked_phrases = r.get_ranked_phrases()
    # A sentence made only of stop words / punctuation produces no phrases;
    # indexing [0] on the empty result would raise IndexError, so skip it.
    # Later cells only flatten `ranked_list`, so no alignment with
    # `story_sequence` is required.
    if ranked_phrases:
        ranked_list.append(ranked_phrases[0])

In [None]:
ranked_list[0]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Two independent tokenizers: one for the word-level training windows and one
# for the per-sentence key phrases collected in `ranked_list`.
tokenizer = Tokenizer()
tokenizerStory = Tokenizer()

# Build each vocabulary from its own corpus.
tokenizer.fit_on_texts(text_sequences)
tokenizerStory.fit_on_texts(ranked_list)

In [None]:
sequences = tokenizer.texts_to_sequences(text_sequences)
storysequences = tokenizerStory.texts_to_sequences(ranked_list)

In [None]:
sequences[0]

In [None]:
tokenizer.index_word[6]

In [None]:
storysequences[0:9]

In [None]:
# Collapse the per-phrase index lists into one flat list of keyword indices.
flattened_story = [x for sentence in storysequences for x in sentence]
# NOTE(review): rebinding `storysequences` here makes the cell non-idempotent.
# Re-running it immediately would iterate over the already-flat ints and raise
# TypeError; re-run the cell that encodes `ranked_list` first.
storysequences = flattened_story

In [None]:
storysequences[0:25]

In [None]:
# Window length for the storyline model: storyseqlength - 1 inputs + 1 target.
storyseqlength = 5
broken_up = []
# The stop value is `len(storysequences) + 1` so the last window, ending on
# the final keyword index, is included (stopping at `len(...)` dropped it).
for i in range(storyseqlength, len(storysequences) + 1):
    # Construct sequence.
    seq = storysequences[i - storyseqlength: i]
    # Append.
    broken_up.append(seq)

In [None]:
for i in range(0, 5):
    print(broken_up[i])

In [None]:
storysequences = broken_up

In [None]:
tokenizerStory.index_word[570]

In [None]:
vocabulary_size = len(tokenizer.word_counts)

vocabulary_size

In [None]:
story_vocabulary_size = len(tokenizerStory.word_counts)

story_vocabulary_size

In [None]:
sequences = np.array(sequences)

In [None]:
sequences

In [None]:
sequences.shape

In [None]:
storysequences = np.array(storysequences)

In [None]:
storysequences

In [None]:
storysequences.shape

In [None]:
from tensorflow.keras.utils import to_categorical

# select all but last word indices.
X = sequences[:, :-1]
X

In [None]:
X.shape

In [None]:
seq_len = X.shape[1]

In [None]:
# select all but last word indices.
Xstory = storysequences[:, :-1]
Xstory

In [None]:
Xstory.shape

In [None]:
# Input width of the storyline model. This must come from `Xstory`; reading
# `X.shape[1]` here returned the word-model window width instead.
story_seq_len = Xstory.shape[1]

In [None]:
ystory = storysequences[:, -1]
ystory

In [None]:
ystory = to_categorical(ystory, num_classes=(story_vocabulary_size + 1))
ystory

In [None]:
y = sequences[:, -1]
y

In [None]:
y = to_categorical(y, num_classes=(vocabulary_size + 1))
y

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, LayerNormalization

def create_storyline_model(story_vocabulary_size, storyseqlength):
    """Build, compile and summarise the keyword ("storyline") prediction model.

    Architecture, in order: an embedding whose output dimension equals the
    input window length, a bidirectional LSTM returning full sequences, layer
    normalisation, a second (unidirectional) LSTM, a ReLU dense layer and a
    softmax over ``story_vocabulary_size`` classes. The model is compiled with
    categorical cross-entropy, the Adam optimizer and an accuracy metric, and
    its summary is printed before it is returned.

    Args:
        story_vocabulary_size: Number of output classes and embedding rows.
        storyseqlength: Length of each input window; also used as the
            embedding output dimension.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(input_dim=story_vocabulary_size,
                  output_dim=storyseqlength,
                  input_length=storyseqlength),
        Bidirectional(LSTM(units=50, recurrent_dropout=0.1, return_sequences=True)),
        LayerNormalization(),
        LSTM(units=50, recurrent_dropout=0.1),
        Dense(units=50, activation='relu'),
        Dense(units=story_vocabulary_size, activation='softmax'),
    ]

    network = Sequential(layer_stack)
    network.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    network.summary()

    return network

In [None]:
import tensorflow as tf
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarise the word-level next-word prediction model.

    Architecture, in order: an embedding whose output dimension equals the
    input window length, a bidirectional LSTM returning full sequences, layer
    normalisation, a second bidirectional LSTM, layer normalisation, a ReLU
    dense layer and a softmax over ``vocabulary_size`` classes. The model is
    compiled with categorical cross-entropy, the Adam optimizer and an
    accuracy metric, and its summary is printed before it is returned.

    Args:
        vocabulary_size: Number of output classes and embedding rows.
        seq_len: Length of each input window; also used as the embedding
            output dimension.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(input_dim=vocabulary_size,
                  output_dim=seq_len,
                  input_length=seq_len),
        Bidirectional(LSTM(units=100, recurrent_dropout=0.1, return_sequences=True)),
        LayerNormalization(),
        Bidirectional(LSTM(units=100, recurrent_dropout=0.1)),
        LayerNormalization(),
        Dense(units=100, activation='relu'),
        Dense(units=vocabulary_size, activation='softmax'),
    ]

    network = Sequential(layer_stack)
    network.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    network.summary()

    return network

In [None]:
model = create_model(vocabulary_size=(vocabulary_size + 1), seq_len=seq_len)

In [None]:
story_model = create_storyline_model(story_vocabulary_size=(story_vocabulary_size+1), storyseqlength=storyseqlength-1)

In [None]:
story_model.fit(x=Xstory, y=ystory, batch_size=28, epochs=40, verbose=1)

In [None]:
model.fit(x=X, y=y, batch_size=64, epochs=40, verbose=1)

In [None]:
from pickle import dump

# Persist both fitted tokenizers. The context managers guarantee the files are
# flushed and closed; passing a bare `open(...)` to `dump` leaked the handles.
with open('tokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)
with open('tokenizerStory', 'wb') as story_tokenizer_file:
    dump(tokenizerStory, story_tokenizer_file)

In [None]:
model.save('model2.h5')
story_model.save('storylinemodel.h5')

In [None]:
from tensorflow.keras.models import load_model

model = load_model('model2.h5')
story_model = load_model('storylinemodel.h5')

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences 

def _most_likely_word(probabilities, word_tokenizer):
    """Return the word of the most probable non-padding class in the first row."""
    # Index 0 is reserved by the Keras Tokenizer (and used for padding), so it
    # has no entry in `index_word`; exclude it from the argmax to avoid a
    # KeyError when the network happens to favour that class.
    best_index = int(np.argmax(probabilities[0][1:])) + 1
    return word_tokenizer.index_word[best_index]


def generate_text(model, storyline, tokenizer, storytokenizer, seq_len, storyseq, seed_text, num_gen_words, story_length):
    """Generate text in two stages: a keyword storyline, then the words.

    First the storyline model extends ``seed_text`` by ``story_length``
    keywords, greedily picking the most probable one each step. The resulting
    string then seeds the word-level model, which greedily generates
    ``num_gen_words`` words.

    Args:
        model: Word-level model exposing ``predict``.
        storyline: Storyline (keyword) model exposing ``predict``. The
            parameter name is kept for compatibility with existing callers,
            which pass the storyline model here.
        tokenizer: Fitted tokenizer matching ``model``.
        storytokenizer: Fitted tokenizer matching the storyline model.
        seq_len: Input window length expected by ``model``.
        storyseq: Input window length expected by the storyline model.
        seed_text: Text that starts the storyline.
        num_gen_words: Number of words to generate with ``model``.
        story_length: Number of keywords to append to the seed first.

    Returns:
        The generated words joined by single spaces (the seed and storyline
        keywords are not included).
    """
    # Use the model that was passed in rather than a notebook-level global;
    # previously this argument was overwritten and `story_model` was read from
    # the enclosing namespace.
    story_predictor = storyline
    storyline_text = seed_text

    for _ in range(story_length):
        encoded_text = storytokenizer.texts_to_sequences([storyline_text])[0]
        # Pad short inputs and keep only the last `storyseq` indices.
        pad_encoded = pad_sequences([encoded_text], maxlen=storyseq, truncating='pre')
        predicted = story_predictor.predict(pad_encoded)
        storyline_text += ' ' + _most_likely_word(predicted, storytokenizer)

    # The seed plus its generated keywords primes the word-level model.
    input_text = storyline_text
    output_text = []

    for _ in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # Pad short inputs and keep only the last `seq_len` indices.
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Greedy decoding: always take the most probable word.
        pred_word = _most_likely_word(model.predict(pad_encoded), tokenizer)
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    return ' '.join(output_text)

# `sample_text` is never defined in this notebook, so this cell raised
# NameError on a fresh kernel. Take the sample seed from the start of the
# concatenated plot corpus instead.
seed_text = horror_str[:190]
print(seed_text)

In [None]:
seed_text = "The most interesting man"
print(seed_text)

In [None]:
# Generate 90 words from the seed after extending it with 10 storyline keywords.
# `storyline` is given the trained storyline model, and `storyseq` is
# `storyseqlength - 1`, the same length the storyline model was created with.
generated_text = generate_text(model=model,
                               storyline=story_model,
                               tokenizer=tokenizer,
                               storytokenizer=tokenizerStory,
                               seq_len=seq_len,
                               storyseq=storyseqlength-1,
                               seed_text=seed_text, 
                               num_gen_words=90,
                               story_length=10)

# Show the generated continuation, with an ellipsis to mark the cut-off.
print(generated_text + '...')