In [1]:
# GLOVE download reference https://github.com/guillaume-chevalier/GloVe-as-a-TensorFlow-Embedding-Layer/blob/master/README.md

import pandas as pd
import numpy as np
import os
import tensorflow as tf
import nltk
import re
from datetime import date

from tensorflow.keras.layers import Input,Lambda, Dense,LeakyReLU ,Dropout,LSTM, GlobalMaxPool1D, Bidirectional, Embedding,Flatten, Concatenate,Conv2DTranspose , BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import sparse_categorical_crossentropy

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import chakin

import json

from collections import defaultdict
import zipfile
chakin.search(lang='English')

                   Name  Dimension                     Corpus VocabularySize  \
2          fastText(en)        300                  Wikipedia           2.5M   
11         GloVe.6B.50d         50  Wikipedia+Gigaword 5 (6B)           400K   
12        GloVe.6B.100d        100  Wikipedia+Gigaword 5 (6B)           400K   
13        GloVe.6B.200d        200  Wikipedia+Gigaword 5 (6B)           400K   
14        GloVe.6B.300d        300  Wikipedia+Gigaword 5 (6B)           400K   
15       GloVe.42B.300d        300          Common Crawl(42B)           1.9M   
16      GloVe.840B.300d        300         Common Crawl(840B)           2.2M   
17    GloVe.Twitter.25d         25               Twitter(27B)           1.2M   
18    GloVe.Twitter.50d         50               Twitter(27B)           1.2M   
19   GloVe.Twitter.100d        100               Twitter(27B)           1.2M   
20   GloVe.Twitter.200d        200               Twitter(27B)           1.2M   
21  word2vec.GoogleNews        300      

In [4]:

CHAKIN_INDEX = 13
NUMBER_OF_DIMENSIONS = 25
SUBFOLDER_NAME = "GloVe.6B.200d"

DATA_FOLDER = "embeddings"
ZIP_FILE = os.path.join(DATA_FOLDER, "{}.zip".format(SUBFOLDER_NAME))
ZIP_FILE_ALT = "glove" + ZIP_FILE[5:]  # sometimes it's lowercase only...
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)
GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.txt".format(SUBFOLDER_NAME))



In [5]:
if not os.path.exists(ZIP_FILE) and not os.path.exists(UNZIP_FOLDER):
    # GloVe by Stanford is licensed Apache 2.0: 
    #     https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
    #     http://nlp.stanford.edu/data/glove.twitter.27B.zip
    #     Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
    print("Downloading embeddings to '{}'".format(ZIP_FILE))
    chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(DATA_FOLDER))
else:
    print("Embeddings already downloaded.")


Embeddings already downloaded.


In [6]:
if not os.path.exists(UNZIP_FOLDER):

    with zipfile.ZipFile('embeddings/glove.6B.zip',"r") as zip_ref:
        print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
        zip_ref.extractall(UNZIP_FOLDER)
else:
    print("Embeddings already extracted.")

Embeddings already extracted.


In [7]:
def load_embedding_from_disks(glove_filename, with_indexes=True):
    """
    Process the glove file 
    """

    word_to_index_dict = dict()
    index_to_embedding_array = []
    
    with open('embeddings/GloVe.6B.200d/glove.6B.300d.txt', 'r' ) as glove_file:
        for (i, line) in enumerate(glove_file):
            
            split = line.split(' ')
            
            word = split[0]
            
            representation = split[1:]
            representation = np.array(
                [float(val) for val in representation] )
            
            
            
            if with_indexes:
                word_to_index_dict[word] = i
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation


    if with_indexes:
        _LAST_INDEX = i + 1
        word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(index_to_embedding_array)
        return word_to_index_dict, index_to_embedding_array
    else:
        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND)
        return word_to_embedding_dict

print("Loading embedding from disks...")
word_to_index, index_to_embedding = load_embedding_from_disks(GLOVE_FILENAME, with_indexes=True)
print("Embedding loaded from disks.")

Loading embedding from disks...
Embedding loaded from disks.


In [8]:
# we need a newline character, so just double down on entry 65 ( an unused -- character )
word_to_index['\n'] = 65

In [9]:
# get shape of the vocabulary
vocab_size, embedding_dim = index_to_embedding.shape

In [10]:
index_to_embedding = index_to_embedding.astype('float32')

In [11]:
# generate a class that looks up embeddings

class PretrainedEmbedding(tf.keras.layers.Layer):
    """Embedding layer taking pretrained embeddings"""

    def __init__(self, embeddings, rate=0.1, **kwargs):
        """"Instantiate the layer using a pre-defined embedding matrix."""
        super().__init__(**kwargs)
        self.embeddings = tf.constant(embeddings)
        # if you want to add some dropout (or normalization, etc.)
        #self.dropout = tf.keras.layers.Dropout(rate=rate)

    def call(self, inputs, training=None):
        """Embed some input tokens and optionally apply dropout."""
        output = tf.nn.embedding_lookup(self.embeddings, inputs)
        return output #self.dropout(output, training=training)


In [12]:
# list of unwanted characters
badlist = pd.read_csv('data/badlist', header=None)[0].tolist()

In [13]:
import re

# add spaces to certain special characters
unspaced = r'[\[\]\(\)\.\,\/\?\-\!\"\;|:]'
spaced =  ' \g<0> '



In [14]:
def clean_line(line):
    """
    remove stray characters
    """
    for badchar in badlist:
        if badchar in line:
            line = re.sub(badchar,'',line)
        line = re.sub(unspaced, spaced, line)
        line = re.sub('  ',' ',line)
        line = re.sub('”','',line)
        line = re.sub('’s' ," 's", line)
        line = re.sub("'s" ," 's", line)
        
        #’s - > is
        line = re.sub('’s' ,' is',line)
    return line

In [15]:
# read the raw text files and clean the lines

START_STRING = 'BEGIN EPISODE'

all_episodes_by_sentence = []

for element in os.listdir('house/'):
    if 'clean' in element:
        with open('house/'+element) as in_raw:
            # start token
            all_episodes_by_sentence.append(START_STRING)
            for (i, line) in enumerate(in_raw):
                all_episodes_by_sentence.append(clean_line(line) )
        
            # end token
            all_episodes_by_sentence.append('END EPISODE\n\n')
            all_episodes_by_sentence.append('------------------------------------------\n')
    

In [16]:
def tokenize(sentence):
    """
    simple tokenization keeping the line seperators etc.
    """
    return sentence.lower().split(' ')

In [17]:
# combine text to create a single string for sliceshifting
word_dict = {}
used_words = []
text = []
for sent in all_episodes_by_sentence:
    for word in tokenize(sent):
        if word in word_to_index :
            if not word in word_dict:
                word_dict[word] = 0
                used_words.append(word)
            word_dict[word] +=1
            text.append(word)
#text = [tokenize(sent) for sent in all_episodes_by_sentence]

In [18]:
# total number of words
len(used_words)

24166

In [19]:
# test if we can reduce the number of words
data_quant = pd.DataFrame.from_dict(word_dict, orient='index').sort_values(by=0, ascending=False)

In [20]:
data_quant.shape

(24166, 1)

In [21]:
data_quant.tail(5000) # looking clean, but many words with only one occurance

Unnamed: 0,0
perfecting,1
payday,1
meticulously,1
reboots,1
trim,1
...,...
migrans,1
stogie,1
unaccounted,1
blofeld,1


In [22]:
# only 7401 words occur more than 6 times
data_quant.loc[data_quant[0] > 6].shape

(7401, 1)

In [23]:
allowed_vocabulary = data_quant.loc[data_quant[0] > 6].index.tolist()

In [24]:
# preformat to test if this is enough
word_dict = {}
used_words = []
text = []
for sent in all_episodes_by_sentence:
    for word in tokenize(sent):
        if word in word_to_index and word in allowed_vocabulary:
            if not word in word_dict:
                word_dict[word] = 0
                used_words.append(word)
            word_dict[word] +=1
            text.append(word)

In [25]:
# original text, first 10 sentences
all_episodes_by_sentence[0:10]

['BEGIN EPISODE',
 " [ Open on a House  's face . His eyes are closed . The picture is not quite in color , but it  's not black and white either . Radiohead  's No Surprises plays .  ] \n",
 " [ House opens his eyes . He  's lying on a twin bed on the left side of a cell - like room at Mayfield . He  's wearing a gray t - shirt . There is a stainless steel basin on the tiled floor near his head . On the opposite wall there is a single window . Next to it a metal sink is bolted into the wall . Another stainless basin is on the floor below the sink .  ] \n",
 ' [ Cut to House opening his eyes . The color has returned to normal . Everything is quiet . He isn’t restrained any longer . He touches his thigh briefly then sits up . He limps to the window , holding his leg for support .  ] \n',
 ' [ Cut to a suitcase dropping on the bed . House packs his clothes and zips the valise .  ] \n',
 ' [ Cut to House walking down the hall with his cane in his right hand and his suitcase in his left . 

In [26]:
' '.join(text[0:500]) # missing a few proper nouns like Radiohead, but this should do

"begin episode [ open on a house 's face . his eyes are closed . the picture is not quite in color , but it 's not black and white either . 's no surprises plays . ] \n [ house opens his eyes . he 's lying on a twin bed on the left side of a cell - like room at mayfield . he 's wearing a gray t - shirt . there is a steel basin on the floor near his head . on the opposite wall there is a single window . next to it a metal sink is into the wall . another basin is on the floor below the sink . ] \n [ cut to house opening his eyes . the color has returned to normal . everything is quiet . he any longer . he touches his thigh briefly then sits up . he limps to the window , holding his leg for support . ] \n [ cut to a suitcase dropping on the bed . house packs his clothes and zips the . ] \n [ cut to house walking down the hall with his cane in his right hand and his suitcase in his left . he switches the case to his right so he can a cap off a patient in a wheelchair . ] \n [ cut to house 

In [27]:
# The unique characters in the file
vocab = sorted(allowed_vocabulary)
print ('{} unique characters'.format(len(vocab)))

7401 unique characters


In [28]:
# Creating a mapping from unique characters to indices
# Here we are acutally using the pretrained words, so this is not needed
#word2idx = {u:i for i, u in enumerate(vocab)}
#idx2word = np.array(vocab)

In [29]:
text_as_int = np.array([word_to_index[c] for c in text])
text_as_int

array([1092, 1942, 2823, ...,    2,   65,  156])

In [30]:
# The maximum length sentence we want for a single input in words
seq_length = 20
examples_per_epoch = len(text)//(seq_length+1)

In [31]:
text_as_int

array([1092, 1942, 2823, ...,    2,   65,  156])

In [32]:
# Create training examples / targets
word_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [33]:
# extract sequences from character dataset
sequences = word_dataset.batch(seq_length+1, drop_remainder=True)

In [34]:

def split_shift_input(segment):
    """
    Creates the teaching data by shifting the training data on off to create labeled data
    """
    input_segment = segment[:-1]
    target_segment = segment[1:]
    return input_segment, target_segment

dataset = sequences.map(split_shift_input)

In [35]:
# set up dataset as prebatched
BATCH_SIZE = 50

# shuffle the dataset and set batch size
dataset = dataset.shuffle(10000).batch(BATCH_SIZE, drop_remainder=True)

In [36]:


# Length of the vocabulary
vocab_size = len(vocab)

# hyperparameters

# embedding dimension
embedding_dim = 300

# RNN units
rnn_units = 300


In [37]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size, index_to_embedding):
    
    i = Input(shape=(None,), batch_size=batch_size, dtype=tf.int64 )
    # use pretrained embedding now
    
    # here is the new embedding layer
    x = PretrainedEmbedding(index_to_embedding)(i)
    
    #x = Embedding(vocab_size, embedding_dim)(i)
    x = LSTM(rnn_units, 
             return_sequences=True,
             stateful=True,
             recurrent_initializer='glorot_uniform')(x)
   # x = Dense(vocab_size)(x)
    x = Dense(vocab_size)(x)

    model = Model(i,x)
    
    return model
    

In [38]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE,
  index_to_embedding = index_to_embedding)



In [39]:
# simple model
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(50, None)]              0         
_________________________________________________________________
pretrained_embedding (Pretra (50, None, 300)           0         
_________________________________________________________________
unified_lstm (UnifiedLSTM)   (50, None, 300)           721200    
_________________________________________________________________
dense (Dense)                (50, None, 7401)          2227701   
Total params: 2,948,901
Trainable params: 2,948,901
Non-trainable params: 0
_________________________________________________________________


In [40]:
def loss(labels, logits):
    """
    Define loss function 
    """
    return sparse_categorical_crossentropy(labels, logits, from_logits=True)


In [41]:
model.compile(optimizer='adam', loss =loss)# loss='sparse_categorical_crossentropy' )

In [42]:
# Directory where the checkpoints will be saved
today = date.today()

checkpoint_dir = './pretrained_word_training_checkpoints_{today}'.format(today=today)


checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# define callbacks
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [43]:
EPOCHS=10

In [44]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/10


UnknownError: Fail to find the dnn implementation.
	 [[{{node unified_lstm/CudnnRNN}}]]
	 [[loss/dense_loss/loss/weighted_loss/broadcast_weights/assert_broadcastable/AssertGuard/pivot_f/_28/_41]] [Op:__inference_keras_scratch_graph_1179]

In [1]:
# for prediction, batch size has to be changed
# So reload the model and set shape to [1, None]

tf.train.latest_checkpoint(checkpoint_dir)

model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1, index_to_embedding= index_to_embedding)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]),)

model.summary()

NameError: name 'tf' is not defined

In [239]:
def text_gen(model, start_string=START_STRING, freedom=1.0, num_generate=1000):
    """
    generate text with the trained model
    
    start_string (STR):  Basis for the model to start prediction on. 
    freedom (FLOAT): Multiplier for predictions. The lower it is the lower the impact of predictive variance
    num_generate (INT): Desired text length
    """
    
    text_generated = []
    

    # vectorization of starting string
    input_eval = [word2idx[s] for s in tokenize(start_string)]
    input_eval = tf.expand_dims(input_eval, 0)

    model.reset_states()
    
    
    for i in range(num_generate):
        predictions = model(input_eval)
        
        
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        
        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / freedom
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(' ') # spacing
        text_generated.append(idx2word[predicted_id])

    return (start_string + ''.join(text_generated))

In [240]:
print(text_gen(model, start_string=u"BEGIN EPISODE", freedom=1, num_generate=1000))

InvalidArgumentError: Invalid input_h shape: [1,50,300] [1,1,300] [Op:CudnnRNN]