In [4]:
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
import keras
from keras import backend as K
from tensorflow.python.client import device_lib
import pandas as pd
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import requests
from nltk.tokenize import word_tokenize

from gensim.models import KeyedVectors

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding


from timeit import default_timer as timer

Using TensorFlow backend.


Only run this code if you have a GPU. This part of the code makes the code run on your GPU, I used the tensorflow-gpu version 1.15 for this to work. It's considerably faster using my GPU (NVIDIA GTX 1060 6GB), than my CPU. About ~5 times faster, depending on the dataset I use.

In [2]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

print(device_lib.list_local_devices())
K.tensorflow_backend._get_available_gpus()

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:27:00.0, compute capability: 6.1

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9731192535778866208
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5083824128
locality {
  bus_id: 1
  links {
  }
}
incarnation: 1724886082556085304
physical_device_desc: "device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:27:00.0, compute capability: 6.1"
]


['/job:localhost/replica:0/task:0/device:GPU:0']

Several different datasets I experimented on, they are listed from short to long. The shortest one takes a couple of seconds to train, the longest one can, depending on your computer, take hours.

In [1]:
with open('wikiped.txt', 'r') as file:
    data = file.read().replace('\n', '')
print(len(data))

29088


In [5]:
df = pd.read_csv("jokes.csv")
data = ' '.join(df['Joke'].tolist()).replace("\'", "")
print(len(data))

128866


In [31]:
url = "http://gutenberg.org/files/1342/1342-0.txt"
book = requests.get(url)
data = book.text
data = data[2440:]
print(len(data))

797205


In [50]:
url = "https://www.gutenberg.org/files/24869/24869-0.txt"
book = requests.get(url)
data = book.text
print(len(data))

2396753


Cleaning of the data

In [34]:
def clean_dataset(dataset, char_filter = r"[^\w]"):

    # convert words to lower case
    dataset = dataset.lower()
    dataset = dataset.replace("â", "a")
    # tokenise words
    words = word_tokenize(dataset)

    # strip whitespace from all words
    words = [word.strip() for word in words]

        
    # join back words to get dataset
    dataset = " ".join(words)

    # remove unwanted characters
    dataset = re.sub(char_filter, " ", dataset)

    # replace multiple whitespaces with single whitespace
    dataset = re.sub(r"\s+", " ", dataset)

    # strip whitespace from dataset
    dataset = dataset.strip()

    return dataset

data = clean_dataset(data)

In [35]:
print(data[0:1000])

it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife however little known the feelings or views of such a man may be on his first entering a neighbourhood this truth is so well fixed in the minds of the surrounding families that he is considered the rightful property of some one or other of their daughters a my dear mr bennet a said his lady to him one day a have you heard that netherfield park is let at last a mr bennet replied that he had not a but it is a returned she a for mrs long has just been here and she told me all about it a mr bennet made no answer a do you not want to know who has taken it a cried his wife impatiently a _you_ want to tell me and i have no objection to hearing it a this was invitation enough a why my dear you must know mrs long says that netherfield is taken by a young man of large fortune from the north of england that he came down on monday in a chaise and four to see the place and was so much d

In [28]:
word_tokeniser = Tokenizer()
word_tokeniser.fit_on_texts([data])
encoded_words = word_tokeniser.texts_to_sequences([data])[0]

In [6]:
VOCABULARY_SIZE = len(word_tokeniser.word_index) + 1
print('Vocabulary Size: {}'.format(VOCABULARY_SIZE))

Vocabulary Size: 6877


In [8]:
sequences = []
MAX_SEQ_LENGTH = 10

for i in range(MAX_SEQ_LENGTH, len(encoded_words)):
    sequence = encoded_words[i-MAX_SEQ_LENGTH:i+1]
    sequences.append(sequence)
sequences = np.array(sequences)

In [9]:
# divide the sequence into X and y
sequences = np.array(sequences)

X = sequences[:80000,:-1]  # assign all but last words of a sequence to X
y = sequences[:80000,-1]   # assign last word of each sequence to y
y = to_categorical(y, num_classes=VOCABULARY_SIZE)

In [10]:
X = pad_sequences(X, maxlen=MAX_SEQ_LENGTH, padding='pre')

In [11]:
# create model architecture

EMBEDDING_SIZE = 100


model = Sequential()

# embedding layer
model.add(Embedding(VOCABULARY_SIZE, EMBEDDING_SIZE, input_length = MAX_SEQ_LENGTH))

# lstm layer 1
model.add(LSTM(128, return_sequences=True))

# lstm layer 2
model.add(LSTM(128))

# output layer
model.add(Dense(VOCABULARY_SIZE, activation='softmax'))

# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  
# summarize defined model
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 100)           687700    
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 128)           117248    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 6877)              887133    
Total params: 1,823,665
Trainable params: 1,823,665
Non-trainable params: 0
_________________________________________________________________


In [13]:
# The 'joke' database on GPU 154.4100482 seconds for 100 epochs.
# The 'joke' database on CPU 439.003922 seconds for 100 epochs.

start = timer()
model.fit(X, y, epochs=100, verbose=1, batch_size=256)
end = timer()
print(end - start)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Make predictions using this function

In [15]:
def generate_words(model, word_tokeniser, MAX_SEQ_LENGTH, seed, n_words):
    
    text = seed
    
    # generate n_words
    for _ in range(n_words):
        
        # encode text as integers
        encoded_words = word_tokeniser.texts_to_sequences([text])[0]
        
        # pad sequences
        padded_words = pad_sequences([encoded_words], maxlen=MAX_SEQ_LENGTH, padding='pre')
        
        # predict next word
        prediction = model.predict_classes(padded_words, verbose=0)
        
        print(sorted(model.predict(padded_words)[0], reverse=True)[0:10])
        
        # convert predicted index to its word
        next_word = ""
        for word, i in word_tokeniser.word_index.items():
            if i == prediction:
                next_word = word
                break
        
        # append predicted word to text
        text += " " + next_word
        
    return text

In [25]:
num_words = 20

sentence = "Your mom is a"
print(generate_words(model, word_tokeniser, MAX_SEQ_LENGTH, sentence, num_words))

[0.8252202, 0.053276382, 0.033418547, 0.025384123, 0.008905347, 0.005334601, 0.005212782, 0.0051278095, 0.0042188214, 0.0020693163]
[0.19596015, 0.15580438, 0.10413958, 0.06441732, 0.06259328, 0.056783333, 0.05149593, 0.04895057, 0.033568017, 0.032676782]
[0.5658212, 0.16607705, 0.07701397, 0.05759702, 0.028069556, 0.024884501, 0.022034148, 0.0133826, 0.0051111397, 0.003906514]
[0.37619478, 0.08830888, 0.0490248, 0.043470196, 0.036576647, 0.022136692, 0.02110659, 0.019345714, 0.018209634, 0.017918926]
[0.5818137, 0.07354214, 0.072661094, 0.059272252, 0.05616008, 0.02200624, 0.017967703, 0.011394227, 0.008659827, 0.0074720676]
[0.55544144, 0.08749633, 0.07125034, 0.034787007, 0.028926538, 0.023420054, 0.0214832, 0.02142158, 0.020127857, 0.014584534]
[0.24105245, 0.2001268, 0.14660127, 0.10593711, 0.06477123, 0.054210823, 0.034870327, 0.014184357, 0.012095779, 0.01190589]
[0.24916677, 0.21834788, 0.21534108, 0.17182712, 0.06493011, 0.024553362, 0.014805362, 0.009584355, 0.007817026, 0.00