In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import nltk
import re
from datetime import date

from tensorflow.keras.layers import Input,Lambda, Dense,LeakyReLU ,Dropout,LSTM, GlobalMaxPool1D, Bidirectional, Embedding,Flatten, Concatenate,Conv2DTranspose , BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import sparse_categorical_crossentropy

# Bare expression: echoes the imported loss function so its signature appears in the cell output.
sparse_categorical_crossentropy

<function tensorflow.python.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1)>

In [7]:
# Strings to strip from the transcripts: column 0 of the header-less file
# 'data/badlist', converted to a plain Python list.
badlist = pd.read_csv(
    filepath_or_buffer='data/badlist',
    header=None,
)[0].tolist()

In [8]:
def clean_line(line, bad_chars=None):
    """
    Remove every unwanted substring from a line of text.

    line (STR): The text to clean.
    bad_chars (ITERABLE of STR): Substrings to delete. When omitted, the
        notebook-level `badlist` is used.

    Returns the line with all occurrences of each substring removed.
    """
    if bad_chars is None:
        bad_chars = badlist

    for badchar in bad_chars:
        # Literal removal: the entries are plain text, not regex patterns, so
        # characters such as '.', '(' or '*' must not be interpreted by `re`
        # (a '.' entry would otherwise erase the entire line).
        line = line.replace(badchar, '')

    return line

In [9]:
# Read every cleaned transcript under 'house/' and collect its cleaned lines,
# wrapped in begin/end episode markers, into one flat list.

START_STRING = 'BEGIN EPISODE'

all_episodes_by_sentence = []

# os.listdir() returns entries in arbitrary order; sorting keeps the corpus
# (and everything derived from it) identical from run to run.
for element in sorted(os.listdir('house/')):
    if 'clean' in element:
        with open('house/'+element) as in_raw:
            # start token
            all_episodes_by_sentence.append(START_STRING)
            for line in in_raw:
                all_episodes_by_sentence.append(clean_line(line))

            # end token
            all_episodes_by_sentence.append('END EPISODE\n\n')
            all_episodes_by_sentence.append('------------------------------------------\n')
    

In [10]:
# Sanity check: show the first collected entry.
all_episodes_by_sentence[0]

'BEGIN EPISODE'

In [11]:
# Join all collected entries, separated by single spaces, into one long string
# that can be sliced into fixed-length sequences and shifted to form targets.
text = ' '.join(all_episodes_by_sentence)

In [12]:
# Character vocabulary: every distinct character of the corpus, sorted
vocab = list(set(text))
vocab.sort()
print(len(vocab), 'unique characters')

95 unique characters


In [13]:
# Lookup tables between characters and their integer ids
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Encode the whole corpus as an array of character ids
text_as_int = np.array([char2idx[ch] for ch in text])

In [14]:
# Display the integer-encoded corpus (long arrays are abbreviated in the repr).
text_as_int

array([33, 36, 38, ..., 13, 13,  1])

In [15]:
# Number of input characters in a single training example
seq_length = 200
# How many (seq_length + 1)-character chunks fit in the corpus; the extra
# character per chunk is what allows a one-step-shifted target.
examples_per_epoch = len(text)//(seq_length+1)

In [16]:
# Display the integer-encoded corpus once more before building the dataset.
text_as_int

array([33, 36, 38, ..., 13, 13,  1])

In [21]:
# Wrap the encoded corpus in a tf.data.Dataset sliced along its first axis,
# i.e. one character id per dataset element.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [22]:
# Group the character stream into chunks of seq_length + 1 ids; a trailing
# partial chunk is dropped.
chunk_size = seq_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

# Preview the first five chunks decoded back to text
for chunk in sequences.take(5):
    decoded = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded))

'BEGIN EPISODE [Open on a House’s face. His eyes are closed. The picture is not quite in color, but it’s not black and white either. Radiohead’s “No Surprises” plays.]\n [House opens his eyes. He’s lying'
' on a twin bed on the left side of a cell-like room at Mayfield. He’s wearing a gray t-shirt. There is a stainless steel basin on the tiled floor near his head. On the opposite wall there is a single w'
'indow. Next to it a metal sink is bolted into the wall. Another stainless basin is on the floor below the sink.]\n [Cut to House opening his eyes. The color has returned to normal. Everything is quiet. '
'He isn’t restrained any longer. He touches his thigh briefly then sits up. He limps to the window, holding his leg for support.]\n [Cut to a suitcase dropping on the bed. House packs his clothes and zip'
's the valise.]\n [Cut to House walking down the hall with his cane in his right hand and his suitcase in his left. He switches the case to his right so he can swipe a knit cap off 

In [23]:

def split_shift_input(segment):
    """
    Turn one chunk into an (input, target) pair for next-element prediction.

    The input is the chunk without its last element; the target is the chunk
    without its first element, i.e. the input shifted by one position.
    """
    return segment[:-1], segment[1:]

# Apply the split to every chunk so the dataset yields (input, target) pairs.
dataset = sequences.map(split_shift_input)

In [24]:
# Show the first (input, target) pair decoded back to text
for input_example, target_example in dataset.take(1):
    input_text = ''.join(idx2char[input_example.numpy()])
    target_text = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_text))
    print('Target data:', repr(target_text))

Input data:  'BEGIN EPISODE [Open on a House’s face. His eyes are closed. The picture is not quite in color, but it’s not black and white either. Radiohead’s “No Surprises” plays.]\n [House opens his eyes. He’s lyin'
Target data: 'EGIN EPISODE [Open on a House’s face. His eyes are closed. The picture is not quite in color, but it’s not black and white either. Radiohead’s “No Surprises” plays.]\n [House opens his eyes. He’s lying'


In [25]:
# Training hyperparameters and batching of the (input, target) pairs

# Number of sequences per training batch
BATCH_SIZE = 35

# Length of the vocabulary
vocab_size = len(vocab)

# embedding dimension
embedding_dim = 256

# RNN units
rnn_units = 1124

# Group the pairs into fixed-size batches; an incomplete final batch is dropped
# so every batch has exactly BATCH_SIZE rows.
# NOTE: this rebinds `dataset`, so re-running the cell without re-running the
# cell that created `dataset` would batch the already-batched data again.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [26]:
# Inspect the batched dataset's element shapes and dtypes.
dataset

<BatchDataset shapes: ((35, 200), (35, 200)), types: (tf.int64, tf.int64)>

In [27]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Assemble the character model: embedding -> stateful LSTM -> two dense layers.

    vocab_size (INT): Vocabulary size of the embedding and width of both dense layers
    embedding_dim (INT): Width of the embedding vectors
    rnn_units (INT): Number of LSTM units
    batch_size (INT): Fixed batch size of the input layer

    Returns the uncompiled Model; the dense layers have no activation, so the
    output holds vocab_size raw scores per time step.
    """
    inputs = Input(shape=(None,), batch_size=batch_size)

    embedded = Embedding(vocab_size, embedding_dim)(inputs)

    recurrent = LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )(embedded)

    hidden = Dense(vocab_size)(recurrent)
    outputs = Dense(vocab_size)(hidden)

    return Model(inputs, outputs)
    

In [28]:
# Instantiate the training model; its input batch size is fixed to BATCH_SIZE.
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [29]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the model.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(35, None)]              0         
_________________________________________________________________
embedding (Embedding)        (35, None, 256)           24320     
_________________________________________________________________
lstm (LSTM)                  (35, None, 1124)          6208976   
_________________________________________________________________
dense (Dense)                (35, None, 95)            106875    
_________________________________________________________________
dense_1 (Dense)              (35, None, 95)            9120      
Total params: 6,349,291
Trainable params: 6,349,291
Non-trainable params: 0
_________________________________________________________________


In [30]:
def loss(labels, logits):
    """
    Sparse categorical cross-entropy between integer labels and raw scores.

    from_logits=True tells the loss that `logits` are unnormalized scores
    rather than probabilities.
    """
    return sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )


In [32]:
# Compile with the Adam optimizer and the custom loss defined above. The plain
# string loss 'sparse_categorical_crossentropy' is not used here because that
# function defaults to from_logits=False.
model.compile(optimizer='adam', loss =loss)

In [33]:
# Checkpoints are written to a directory stamped with today's date
today = date.today()
checkpoint_dir = './base_training_checkpoints_' + str(today)

# File name template; "{epoch}" is left as a placeholder for the callback
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# define callbacks
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [34]:
# Number of epochs passed to model.fit
EPOCHS=10

In [None]:
# Train on the batched dataset with the checkpoint callback attached;
# the object returned by fit() is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10
    757/Unknown - 100s 132ms/step - loss: 2.0815

In [None]:
# for prediction, batch size has to be changed
# So reload the model and set shape to [1, None]

# Look the checkpoint up once and fail early with a readable message:
# latest_checkpoint() returns None when the directory holds no checkpoint
# (e.g. training has not been run, or the date-stamped directory name differs).
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        'No checkpoint found in {}; run the training cell first.'.format(checkpoint_dir))

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

model.summary()

In [None]:
def text_gen(model, start_string=START_STRING, freedom=1.0, num_generate=1000):
    """
    generate text with the trained model

    model: Model called with a (1, length) batch of character ids; it must
        expose reset_states().
    start_string (STR): Basis for the model to start prediction on. Every
        character must be a key of char2idx.
    freedom (FLOAT): Sampling temperature. The model's scores are divided by
        it before sampling, so lower values make the output more predictable
        and higher values more varied. Must be greater than 0.
    num_generate (INT): Number of characters to generate

    Returns start_string followed by the generated characters.
    Raises ValueError if freedom is not positive or start_string is empty.
    """
    if freedom <= 0:
        raise ValueError('freedom must be positive, got {}'.format(freedom))
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    text_generated = []

    # vectorization of starting string
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Clear any recurrent state left over from a previous call
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)

        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale the scores by the temperature, then sample the next character
        # id from the categorical distribution of the last time step
        predictions = predictions / freedom
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [None]:
# Generate and print a sample seeded with the episode start marker, using freedom=0.7.
print(text_gen(model, start_string=u"BEGIN EPISODE", freedom=0.7, num_generate=10000))