In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import nltk
import re
from datetime import date

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense ,Dropout,LSTM, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.losses import sparse_categorical_crossentropy

sparse_categorical_crossentropy

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


<function tensorflow.python.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1)>

In [2]:
badlist = pd.read_csv('data/badlist', header=None)[0].tolist()

In [3]:
import re

# add spaces to special characters
unspaced = r'[\[\]\(\)\.\,\/\?\-\!\"\;|:]'
spaced =  ' \g<0> '



In [4]:
def clean_line(line):
    """
    remove stray characters
    """
    for badchar in badlist:
        if badchar in line:
            line = re.sub(badchar,'',line)
        line = re.sub(unspaced, spaced, line)
        line = re.sub('  ',' ',line)
        line = re.sub('”','',line)
    return line

In [5]:
# read the raw text files and clean the lines

START_STRING = 'BEGIN EPISODE'

all_episodes_by_sentence = []

for element in os.listdir('house/'):
    if 'clean' in element:
        with open('house/'+element) as in_raw:
            # start token
            all_episodes_by_sentence.append(START_STRING)
            for (i, line) in enumerate(in_raw):
                all_episodes_by_sentence.append(clean_line(line) )
        
            # end token
            all_episodes_by_sentence.append('END EPISODE\n\n')
            all_episodes_by_sentence.append('------------------------------------------\n')
    

In [6]:
def tokenize(sentence):
    return sentence.lower().split(' ')

In [7]:
# combine text to create a single string for sliceshifting
word_dict = {}
text = []
for sent in all_episodes_by_sentence:
    for word in tokenize(sent):
        if not word in word_dict:
            word_dict[word] = 0
        word_dict[word] +=1
        text.append(word)
#text = [tokenize(sent) for sent in all_episodes_by_sentence]

In [8]:
data_quant = pd.DataFrame.from_dict(word_dict, orient='index').sort_values(by=0, ascending=False)

In [9]:
data_quant.shape

(29102, 1)

In [10]:
# only 7202 words occur more than 5 times
data_quant.loc[data_quant[0] > 6].shape

(7880, 1)

In [11]:
allowed_vocabulary = data_quant.loc[data_quant[0] > 7].index.tolist()

In [12]:
# preformat to test if this is enough
word_dict = {}
text = []
for sent in all_episodes_by_sentence:
    for word in tokenize(sent):
        if word in allowed_vocabulary:
            if not word in word_dict:
                word_dict[word] = 0
            word_dict[word] +=1
            text.append(word)


In [13]:
# text still readable ?
' '.join(text[0:25] )

'begin episode  [ open on a house’s face . his eyes are closed . the picture is not quite in color , but it’s'

In [14]:
# The unique characters in the file
vocab = sorted(allowed_vocabulary)
print ('{} unique characters'.format(len(vocab)))

7202 unique characters


In [15]:
# Creating a mapping from unique characters to indices
word2idx = {u:i for i, u in enumerate(vocab)}
idx2word = np.array(vocab)

In [16]:
text_as_int = np.array([word2idx[c] for c in text])
text_as_int

array([ 671, 2204,    0, ..., 2162, 2205,   23])

In [17]:
# The maximum length sentence we want for a single input in characters
seq_length = 20
examples_per_epoch = len(text)//(seq_length+1)

In [18]:
text_as_int

array([ 671, 2204,    0, ..., 2162, 2205,   23])

In [19]:
# Create training examples / targets
word_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [20]:
# extract sequences from character dataset
sequences = word_dataset.batch(seq_length+1, drop_remainder=True)

In [21]:

def split_shift_input(segment):
    """
    Creates the teaching data by shifting the training data on off to create labeled data
    """
    input_segment = segment[:-1]
    target_segment = segment[1:]
    return input_segment, target_segment

dataset = sequences.map(split_shift_input)

In [22]:
dataset

<MapDataset shapes: ((20,), (20,)), types: (tf.int64, tf.int64)>

In [23]:
# set up dataset as prebatched
BATCH_SIZE = 35

# Length of the vocabulary
vocab_size = len(vocab)

# embedding dimension
embedding_dim = 300

# RNN units
rnn_units = 250

dataset = dataset.shuffle(10000).batch(BATCH_SIZE, drop_remainder=True)

In [24]:
dataset

<BatchDataset shapes: ((35, 20), (35, 20)), types: (tf.int64, tf.int64)>

In [25]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    
    i = Input(shape=(None,), batch_size=batch_size )
    x = Embedding(vocab_size, embedding_dim)(i)
    x = LSTM(rnn_units, 
             return_sequences=True,
             stateful=True,
             recurrent_initializer='glorot_uniform')(x)
   # x = Dense(vocab_size)(x)
    x = Dense(vocab_size)(x)

    model = Model(i,x)
    
    return model
    

In [26]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)



In [27]:
# simple model
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(35, None)]              0         
_________________________________________________________________
embedding (Embedding)        (35, None, 300)           2160600   
_________________________________________________________________
unified_lstm (UnifiedLSTM)   (35, None, 250)           551000    
_________________________________________________________________
dense (Dense)                (35, None, 7202)          1807702   
Total params: 4,519,302
Trainable params: 4,519,302
Non-trainable params: 0
_________________________________________________________________


In [28]:
def loss(labels, logits):
    """
    Define loss function 
    """
    return sparse_categorical_crossentropy(labels, logits, from_logits=True)


In [29]:
model.compile(optimizer='adam', loss =loss)# loss='sparse_categorical_crossentropy' )

In [30]:
# Directory where the checkpoints will be saved
today = date.today()

checkpoint_dir = './word_training_checkpoints_{today}'.format(today=today)


checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# define callbacks
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [31]:
EPOCHS=12

In [32]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/12


UnknownError: Fail to find the dnn implementation.
	 [[{{node unified_lstm/CudnnRNN}}]]
	 [[loss/dense_loss/loss/weighted_loss/broadcast_weights/assert_broadcastable/is_valid_shape/else/_1/has_valid_nonscalar_shape/then/_47/has_invalid_dims/concat/_36]] [Op:__inference_keras_scratch_graph_1344]

In [358]:
# for prediction, batch size has to be changed
# So reload the model and set shape to [1, None]

# preloaded one checkpoint directory
checkpoint_dir_preloaded = 'base_training_checkpoints_2020-02-28/'


tf.train.latest_checkpoint(checkpoint_dir_preloaded)

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

model.summary()

Model: "model_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_16 (InputLayer)        [(1, None)]               0         
_________________________________________________________________
embedding_15 (Embedding)     (1, None, 300)            2160600   
_________________________________________________________________
unified_lstm_15 (UnifiedLSTM (1, None, 250)            551000    
_________________________________________________________________
dense_18 (Dense)             (1, None, 7202)           1807702   
Total params: 4,519,302
Trainable params: 4,519,302
Non-trainable params: 0
_________________________________________________________________


In [4]:
def text_gen(model, start_string=START_STRING, freedom=1.0, num_generate=1000):
    """
    generate text with the trained model
    
    start_string (STR):  Basis for the model to start prediction on. 
    freedom (FLOAT): Multiplier for predictions. The lower it is the lower the impact of predictive variance
    num_generate (INT): Desired text length
    """
    
    text_generated = []
    

    # vectorization of starting string
    input_eval = [word2idx[s] for s in tokenize(start_string)]
    input_eval = tf.expand_dims(input_eval, 0)

    model.reset_states()
    
    
    for i in range(num_generate):
        predictions = model(input_eval)
        
        
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        
        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / freedom
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(' ') # spacing
        text_generated.append(idx2word[predicted_id])

    return (start_string + ''.join(text_generated))

NameError: name 'START_STRING' is not defined

In [5]:
print(text_gen(model, start_string=u"BEGIN EPISODE", freedom=1, num_generate=100))

NameError: name 'text_gen' is not defined