In [1]:
import tensorflow as tf
import keras
from keras import backend as K
from tensorflow.python.client import device_lib
import pandas as pd
import re
import numpy as np
import nltk
import requests
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from timeit import default_timer as timer

Using TensorFlow backend.


Only run this code if you have a GPU. This part of the code makes the code run on your GPU, I used the tensorflow-gpu version 1.15 for this to work. It's considerably faster using my GPU (NVIDIA GTX 1060 6GB), than my CPU. About ~5 times faster, depending on the dataset I use.

In [2]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

print(device_lib.list_local_devices())
K.tensorflow_backend._get_available_gpus()

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA GeForce GTX 1060 6GB, pci bus id: 0000:27:00.0, compute capability: 6.1

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 6089656556499622330
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5083824128
locality {
  bus_id: 1
  links {
  }
}
incarnation: 14756280457857581038
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1060 6GB, pci bus id: 0000:27:00.0, compute capability: 6.1"
]


['/job:localhost/replica:0/task:0/device:GPU:0']

Several different datasets I experimented on, they are listed from short to long. The shortest one takes a couple of seconds to train, the longest one can, depending on your computer, take hours.

In [3]:
with open('wikiped.txt', 'r') as file:
    data = file.read().replace('\n', '')
print(len(data))

29088


In [15]:
df = pd.read_csv("jokes.csv")
data = ' '.join(df['Joke'].tolist()).replace("\'", "")
print(len(data))

128866


In [5]:
url = "https://www.gutenberg.org/files/46/46-0.txt"
book = requests.get(url)
data = book.text
print(len(data))

182066


In [6]:
url = "http://gutenberg.org/files/1342/1342-0.txt"
book = requests.get(url)
data = book.text
data = data[2440:]
print(len(data))

797205


In [7]:
url = "https://www.gutenberg.org/files/24869/24869-0.txt"
book = requests.get(url)
data = book.text
print(len(data))

2396753


Cleaning of the data

In [16]:
def clean_dataset(dataset):

    # remove whitespace
    tokenized = word_tokenize(dataset)
    dataset = " ".join([token.strip() for token in tokenized])

    # remove characters
    dataset = re.sub(r"[^\w\n]", " ", dataset)

    # replace multiple whitespaces with single whitespace
    dataset = re.sub(r"\s+", " ", dataset)
    dataset = dataset.strip()

    dataset = dataset.lower()
    dataset = dataset.replace("â", "a")
    
    return dataset

data = clean_dataset(data)

In [17]:
print(len(data))
print(data[0:100])

123588
what did the bartender say to the jumper cables you better not try to start anything dont you hate j


In [18]:
word_tokeniser = Tokenizer()
word_tokeniser.fit_on_texts([data])
words_embedding = word_tokeniser.texts_to_sequences([data])[0]
vocab_size = len(word_tokeniser.word_index) + 1
print("Vocabulary Size: ", vocab_size)

Vocabulary Size:  4715


In [19]:
sequences = []
seq_length = 4

for i in range(seq_length, len(words_embedding)):
    sequence = words_embedding[i-seq_length:i+1]
    sequences.append(sequence)
sequences = np.array(sequences)

In [20]:
# divide the sequence into data and target
sequences = np.array(sequences)

X = sequences[:80000,:-1]  # assign all but last words of a sequence to X
y = sequences[:80000,-1]   # assign last word of each sequence to y
y = to_categorical(y, num_classes=vocab_size)

X = pad_sequences(X, maxlen=seq_length, padding='pre')

In [21]:
model = Sequential()
model.add(Embedding(vocab_size, 200, input_length = seq_length))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(256))

# output layer
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
# The 'joke' database on GPU 154.4100482 seconds for 100 epochs seq length of 4.
# The 'joke' database on CPU 439.003922 seconds for 100 epochs seq length of 4.

start = timer()
model.fit(X, y, epochs=100, verbose=1, batch_size=256)
end = timer()
print(end - start)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
208.40813260000004


Make predictions using this function

In [24]:
def generate_words(model, word_tokeniser, seq_length, text, n_words):
    for x in range(n_words):
        
        # create word embeddings
        words_embedding = word_tokeniser.texts_to_sequences([text])[0]

        padded_words = pad_sequences([words_embedding], maxlen=seq_length, padding='pre')
        # predict next word
        prediction = model.predict_classes(padded_words, verbose=0)

        print(sorted(model.predict(padded_words)[0], reverse=True)[0:5])

        next_word = ""
        for word, i in word_tokeniser.word_index.items():
            if [i] == prediction:
                next_word = word
                break
        text += " " + next_word
        
    return text

Pride and Prejudice corpus results

In [None]:
num_words = 10

sentence = "She is not going to go with the"
print(generate_words(model, word_tokeniser, seq_length, sentence, num_words))

In [None]:
num_words = 3

sentence = "though he was now only established as a"
print(generate_words(model, word_tokeniser, seq_length, sentence, num_words))

With the joke dataset 

In [29]:
num_words = 4

sentence = "knock knock whos"
print(generate_words(model, word_tokeniser, seq_length, sentence, num_words))

[0.99969816, 0.00013802716, 3.401807e-05, 2.1994174e-05, 1.0524708e-05]
[0.14637631, 0.08872764, 0.076186754, 0.07592414, 0.07235614]
[0.95845836, 0.027531892, 0.006469008, 0.0015967618, 0.0011852373]
[0.83456224, 0.12347429, 0.008342455, 0.0067857774, 0.0048906007]
knock knock whos there control freak con


In [30]:
num_words = 4

sentence = "What did the"
print(generate_words(model, word_tokeniser, seq_length, sentence, num_words))

[0.1071028, 0.09037031, 0.07648724, 0.061399058, 0.053121354]
[0.99192387, 0.0036045008, 0.0013025511, 0.00061749533, 0.00034863313]
[0.907875, 0.07908155, 0.0082408, 0.0015342761, 0.00061145105]
[0.99938524, 0.00010775592, 9.38089e-05, 6.510899e-05, 4.0031653e-05]
What did the turkey say to the


In [None]:
import math

def perplexity():
    pass



2**-((0.5* math.log(1,2))+(0.1* math.log(0.1,2))+(0.1* math.log(0.1,2))+(0.1* math.log(0.1,2))+(0.1* math.log(0.1,2))+(0.1* math.log(0.1,2)))