In [1]:
import re
import pickle
import random
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from timeit import default_timer as timer
import requests



In [2]:
# Report whether PyTorch can see a CUDA device; later cells move the model to 'cuda'.
torch.cuda.is_available()

True

In [3]:
# Download a plain-text book from Project Gutenberg.
# NOTE: `data` is reassigned from jokes.csv in the next cell, so this text is
# only used if that cell is skipped.
url = "https://www.gutenberg.org/files/1342/1342-0.txt"
# A timeout keeps the notebook from hanging forever on a stalled connection.
book = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently training on an HTML error page.
book.raise_for_status()
data = book.text
# Skip the first 2440 characters (presumably the Project Gutenberg header -- TODO confirm).
data = data[2440:]
print(len(data))

797205


In [4]:
# Load the jokes dataset and flatten the 'Joke' column into one long string.
df = pd.read_csv("jokes.csv")
# Jokes are joined with single spaces; apostrophes are deleted outright
# (e.g. "don't" becomes "dont") rather than replaced by a space.
data = ' '.join(df['Joke'].tolist()).replace("\'", "")
print(len(data))

128866


In [5]:
def clean_dataset(dataset):
    """Normalise raw text into lowercase words separated by single spaces.

    Steps, in order: NLTK word tokenisation and re-joining with spaces,
    replacement of every character that is neither a word character nor a
    newline with a space, collapsing of whitespace runs, trimming,
    lower-casing, and finally mapping "â" to "a".

    Args:
        dataset: the raw text as a single string.

    Returns:
        The cleaned text as a single string.
    """
    # Tokenise, then glue the (stripped) tokens back together with single spaces.
    pieces = (piece.strip() for piece in word_tokenize(dataset))
    text = " ".join(pieces)

    # Punctuation and other non-word characters become spaces; newlines survive
    # this step but are folded into single spaces by the next one.
    text = re.sub(r"[^\w\n]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Lower-case first so that an upper-case "Â" is also caught by the replace.
    return text.lower().replace("â", "a")

# Clean the corpus; the cleaned text replaces the raw string held in `data`.
data = clean_dataset(data)

In [6]:
# Slide a window over the corpus, one word at a time. Every window holds
# sequence_length + 1 consecutive words, so that dropping the last / first word
# later yields an input and a target of sequence_length words each.
data1 = data.split()
sequence_length = 10

sequences = np.array([
    data1[end - sequence_length:end + 1]
    for end in range(sequence_length, len(data1))
])

# Each window is turned back into a single space-separated string.
XD = [' '.join(window) for window in sequences]

seqs = XD
seqs[0:1000]

['what did the bartender say to the jumper cables you better',
 'did the bartender say to the jumper cables you better not',
 'the bartender say to the jumper cables you better not try',
 'bartender say to the jumper cables you better not try to',
 'say to the jumper cables you better not try to start',
 'to the jumper cables you better not try to start anything',
 'the jumper cables you better not try to start anything dont',
 'jumper cables you better not try to start anything dont you',
 'cables you better not try to start anything dont you hate',
 'you better not try to start anything dont you hate jokes',
 'better not try to start anything dont you hate jokes about',
 'not try to start anything dont you hate jokes about german',
 'try to start anything dont you hate jokes about german sausage',
 'to start anything dont you hate jokes about german sausage theyre',
 'start anything dont you hate jokes about german sausage theyre the',
 'anything dont you hate jokes about german saus

In [7]:
# inputs and targets: for every window, the input is all words but the last
# and the target is the same window shifted one word to the right.
x, y = [], []

for window in seqs:
    words = window.split()
    x.append(" ".join(words[:-1]))
    y.append(" ".join(words[1:]))

In [8]:
# Build the index <-> word lookup tables.
# The unique words are sorted before numbering: iterating a bare set of strings
# gives a different order on every kernel start (string hash randomisation), so
# a model saved in one session would not match the vocabulary rebuilt in another.
num2word = dict(enumerate(sorted(set(data.split()))))

# Inverse mapping, word -> index.
word2num = {word: idx for idx, word in num2word.items()}

vocab_size = len(num2word)
vocab_size

4717

In [9]:
def tokenizer(seq):
    """Convert a whitespace-separated sentence into a list of vocabulary indices.

    Args:
        seq: sentence as a single string.

    Returns:
        List with one ``word2num`` index per word, in sentence order.

    Raises:
        KeyError: if a word is not present in ``word2num``.
    """
    indices = []
    for word in seq.split():
        indices.append(word2num[word])
    return indices

# Tokenize: encode every input / target sentence as a row of vocabulary indices
x_int = np.array(list(map(tokenizer, x)))
y_int = np.array(list(map(tokenizer, y)))

In [10]:
def get_batches(x, y, size, drop_last=True):
    """Yield consecutive ``(inputs, targets)`` mini-batches of ``size`` rows.

    Args:
        x: 2-D array of inputs, indexed as ``x[rows, :]``.
        y: 2-D array of targets, sliced with the same row ranges as ``x``.
        size: number of rows per batch; must be positive.
        drop_last: when True (default) a trailing batch with fewer than
            ``size`` rows is skipped, so every batch has exactly ``size``
            rows. Pass False to also receive the short final batch.

    Yields:
        Tuples ``(x_batch, y_batch)`` of row slices of ``x`` and ``y``.

    Raises:
        ValueError: if ``size`` is not positive (raised on first iteration).
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")

    n_rows = x.shape[0]

    # The stop bound is n_rows + 1 so that a batch ending exactly on the last
    # row is produced; with a bound of n_rows the final full batch was lost
    # whenever n_rows was a multiple of size.
    for stop in range(size, n_rows + 1, size):
        start = stop - size
        yield x[start:stop, :], y[start:stop, :]

    remainder = n_rows % size
    if remainder and not drop_last:
        yield x[n_rows - remainder:, :], y[n_rows - remainder:, :]

In [11]:
class WordLSTM(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        n_hidden: size of the LSTM hidden state.
        n_layers: number of stacked LSTM layers.
        drop: dropout probability, used both between LSTM layers and on the
            LSTM output.
        lr: stored on the instance as ``self.lr``; not used by the class itself.
        n_vocab: vocabulary size. When None, the notebook-level ``vocab_size``
            is used, which is the original behaviour.
        emb_dim: width of the word embeddings (200 by default).
    """

    def __init__(self, n_hidden=256, n_layers=4, drop=0.1, lr=0.001,
                 n_vocab=None, emb_dim=200):
        super().__init__()

        if n_vocab is None:
            # Fall back to the vocabulary built earlier in the notebook.
            n_vocab = vocab_size

        self.drop = drop
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        self.emb_layer = nn.Embedding(n_vocab, emb_dim)

        # batch_first=True: inputs and outputs are (batch, seq, feature).
        self.lstm = nn.LSTM(emb_dim, n_hidden, n_layers,
                            dropout=drop, batch_first=True)

        self.dropout = nn.Dropout(drop)
        self.fc = nn.Linear(n_hidden, n_vocab)

    def forward(self, x, hidden):
        """Run one forward pass.

        Args:
            x: integer tensor of token indices, (batch, seq).
            hidden: ``(h, c)`` tuple as returned by ``init_hidden`` or by a
                previous call.

        Returns:
            ``(output, hidden)`` where ``output`` holds one row of vocabulary
            logits per input token, flattened to (batch * seq, n_vocab), and
            ``hidden`` is the updated LSTM state.
        """
        embedded = self.emb_layer(x)
        lstm_output, hidden = self.lstm(embedded, hidden)
        # Flatten batch and time so the linear layer scores every position.
        output = self.dropout(lstm_output).reshape(-1, self.n_hidden)
        output = self.fc(output)

        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed ``(h, c)`` LSTM state for ``batch_size`` sequences.

        The tensors are created with the dtype and device of the model's own
        parameters, so the state always matches the model -- including a model
        kept on the CPU on a machine where CUDA happens to be available.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (
            torch.zeros(shape, dtype=reference.dtype, device=reference.device),
            torch.zeros(shape, dtype=reference.dtype, device=reference.device),
        )

In [12]:
# Instantiate the network with its default hyper-parameters.
model = WordLSTM()
# Move the parameters to the GPU (requires a CUDA device).
model.to('cuda')
# Show the layer summary.
print(model)

WordLSTM(
  (emb_layer): Embedding(4717, 200)
  (lstm): LSTM(200, 256, num_layers=4, batch_first=True, dropout=0.1)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=256, out_features=4717, bias=True)
)


In [13]:
def train(model, epochs=100, batch_size=128, lr=0.001):
    """Train ``model`` on the notebook-level ``x_int`` / ``y_int`` arrays.

    Uses Adam with cross-entropy loss and gradient-norm clipping at 1. The
    LSTM state is reset at the start of every epoch and carried (detached)
    from one batch to the next within an epoch. After each epoch the loss of
    the last batch is printed.

    Args:
        model: network exposing ``init_hidden(batch_size)`` and
            ``model(inputs, hidden) -> (output, hidden)``.
        epochs: number of passes over the data.
        batch_size: rows per mini-batch.
        lr: Adam learning rate.

    Raises:
        ValueError: if the data yields no batch at all for ``batch_size``.
    """
    # Train on the GPU when one is present, otherwise stay on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    model.train()

    opt = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):

        h = model.init_hidden(batch_size)
        loss = None

        for x, y in get_batches(x_int, y_int, batch_size):

            # One conversion per array: cast to int64 and move to the device.
            inputs = torch.from_numpy(x).to(device=device, dtype=torch.long)
            targets = torch.from_numpy(y).to(device=device, dtype=torch.long)

            # Detach the carried state so gradients do not flow back through
            # earlier batches.
            h = tuple(each.detach() for each in h)

            model.zero_grad()

            output, h = model(inputs, h)

            # output has one row per token, so the targets are flattened too.
            loss = criterion(output, targets.reshape(-1))

            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), 1)

            opt.step()

        if loss is None:
            raise ValueError(
                "no training batches were produced; "
                "batch_size is larger than the dataset"
            )

        print("Epoch ", epoch + 1, "loss ", loss.item())

In [14]:
def predict(model, tkn, h=None):
    """Feed one word to the model and return the most likely next word.

    Also prints the three highest probabilities and the corresponding words.

    Args:
        model: trained network exposing ``init_hidden`` and
            ``model(inputs, hidden) -> (output, hidden)``.
        tkn: the current word. Words missing from ``word2num`` are replaced
            by 'the'.
        h: LSTM state from the previous step; a fresh zero state for a batch
            of one is created when None.

    Returns:
        ``(word, h)``: the highest-probability next word and the updated state.

    Raises:
        KeyError: if ``tkn`` is unknown and 'the' is not in the vocabulary either.
    """
    device = next(model.parameters()).device

    # If the word isn't seen yet there will be no key for the dictionary, it
    # defaults to 'the'. The fallback is only looked up when actually needed.
    key = word2num[tkn] if tkn in word2num else word2num['the']
    inputs = torch.tensor([[key]], dtype=torch.long, device=device)

    if h is None:
        h = model.init_hidden(1)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        h = tuple(each.detach() for each in h)
        out, h = model(inputs, h)
        # token probabilities
        prob = F.softmax(out, dim=1).cpu().numpy()

    # A single token was fed, so row 0 is the whole distribution.
    prob = prob[0]
    top_n_idx = prob.argsort()[-3:][::-1]
    sampled_word_index = top_n_idx[0]

    # Index the three winners directly instead of sorting the whole vocabulary.
    print(list(prob[top_n_idx]))
    print(*[num2word[idx] for idx in top_n_idx])

    return num2word[sampled_word_index], h


# generate text
def generate(model, length, text):
    """Extend a prompt with words greedily predicted by the model.

    The prompt is lower-cased and fed word by word to warm up the LSTM state;
    the prediction after the last prompt word is the first generated word.
    Each following word is predicted from the previously generated one.

    Args:
        model: trained network, switched to eval mode here.
        length: number of words to append. At least one word is always
            appended, even when ``length`` is less than 1.
        text: prompt; must contain at least one word.

    Returns:
        The lower-cased prompt followed by the generated words, joined by
        single spaces.

    Raises:
        ValueError: if ``text`` contains no words.
    """
    prompt = text.lower().split()
    if not prompt:
        # Without a prompt word there is nothing to predict from.
        raise ValueError("text must contain at least one word")

    # Use the GPU when one is present; otherwise leave the model where it is.
    if torch.cuda.is_available():
        model.cuda()
    model.eval()

    h = model.init_hidden(1)
    sentence = list(prompt)

    # Warm up the state on the prompt; only the final prediction is kept.
    for t in prompt:
        word, h = predict(model, t, h)

    sentence.append(word)

    for _ in range(length - 1):
        word, h = predict(model, sentence[-1], h)
        sentence.append(word)

    return ' '.join(sentence)

In [15]:
# Time the whole training run (50 epochs, batches of 128 rows).
start = timer()
train(model, batch_size = 128, epochs=50)
end = timer()
# Elapsed wall-clock time in seconds.
print(end - start)



Epoch  1 loss  6.861321926116943
Epoch  2 loss  6.628620147705078
Epoch  3 loss  6.3079094886779785
Epoch  4 loss  6.046767234802246
Epoch  5 loss  5.793934345245361
Epoch  6 loss  5.575216770172119
Epoch  7 loss  5.385983467102051
Epoch  8 loss  5.277525901794434
Epoch  9 loss  5.123063087463379
Epoch  10 loss  4.98129415512085
Epoch  11 loss  4.848435878753662
Epoch  12 loss  4.675694465637207
Epoch  13 loss  4.479979515075684
Epoch  14 loss  4.315096378326416
Epoch  15 loss  4.411412239074707
Epoch  16 loss  4.260664939880371
Epoch  17 loss  4.040541648864746
Epoch  18 loss  3.9297027587890625
Epoch  19 loss  3.874598264694214
Epoch  20 loss  3.7350406646728516
Epoch  21 loss  3.5999152660369873
Epoch  22 loss  3.4346835613250732
Epoch  23 loss  3.3783538341522217
Epoch  24 loss  3.2846622467041016
Epoch  25 loss  3.1634631156921387
Epoch  26 loss  3.122891902923584
Epoch  27 loss  2.9297072887420654
Epoch  28 loss  2.8474392890930176
Epoch  29 loss  2.756193161010742
Epoch  30 loss

In [16]:
# Print NumPy arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True) 

In [22]:
# Extend a prompt by three words; predict() also prints its top-3 candidates
# for every word it is fed.
print(generate(model, 3, "knock knock whos"))
print("")
# Last expression of the cell, so the returned string is shown as cell output.
generate(model, 1, "there")

[0.17078434, 0.07750105, 0.07073179]
who why knock
[0.58684105, 0.07083722, 0.04232638]
knock whos com
[0.9480909, 0.0057749725, 0.004183191]
there hes youre
[0.15096, 0.14922044, 0.12390228]
boo sombrero impatient
[0.7003772, 0.07340678, 0.038031053]
boo sombrero ash
knock knock whos there boo boo

[0.1484022, 0.08830034, 0.0808735]
are were dont


'there are'