# Text Generation Exploring

In [2]:
from pathlib import Path
import pandas as pd

In [3]:
inputText = pd.read_csv('data/text_generation/data')
inputText.head(10)

Unnamed: 0,ID,Joke
0,1,What did the bartender say to the jumper cable...
1,2,Don't you hate jokes about German sausage? The...
2,3,Two artists had an art contest... It ended in ...
3,4,Why did the chicken cross the playground? To g...
4,5,What gun do you use to hunt a moose? A moosecut!
5,6,"If life gives you melons, you might have dysle..."
6,7,Broken pencils... ...are pointless.
7,8,What did one snowman say to the other snowman?...
8,9,How many hipsters does it take to change a lig...
9,10,Where do sick boats go? The dock!


In [4]:
print(inputText.loc[:,'Joke'])

0       What did the bartender say to the jumper cable...
1       Don't you hate jokes about German sausage? The...
2       Two artists had an art contest... It ended in ...
3       Why did the chicken cross the playground? To g...
4        What gun do you use to hunt a moose? A moosecut!
                              ...                        
1617    What do you call a camel with 3 humps? Humphre...
1618    Two fish in a tank. [x-post from r/Jokes] One ...
1619            "Stay strong!" I said to my wi-fi signal.
1620    Why was the tomato blushing? Because it saw th...
1621      What is heavy forward but not backward? **ton**
Name: Joke, Length: 1622, dtype: object


In [5]:
jokeList = []
for each in inputText.loc[:, 'Joke']:
    jokeList.append(each)

In [6]:
import nltk
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to

True

In [7]:
from __future__ import unicode_literals, print_function, division
from io import open
from glob import glob
import os
import unicodedata
import string
import torch
import re
from nltk.corpus import stopwords

In [8]:
# Tokenize all the words in each joke
# remove punctuation and all that
cleanedJokes = []
CLEANHTML = re.compile('<.*?>')
stopWords = set(stopwords.words('english'))
for joke in jokeList:
    text = re.sub(CLEANHTML, '', joke)
    # split into white space
    wordList = nltk.word_tokenize(text)
    # remove symbol and stop words
    wordList = [word.lower() for word in wordList if word.isalpha() and word not in stopWords]
    wordList.append('<EOS>')
    cleanedJokes.append(wordList)

In [9]:
# Build from vocab
from torchtext.vocab import build_vocab_from_iterator

vocabs = build_vocab_from_iterator(cleanedJokes, specials=['<UNK>'])
vocabs.set_default_index(vocabs['<UNK>'])
len(vocabs)

4330

In [10]:
indexesFirstJoke = vocabs(cleanedJokes[0])
indexesFirstJoke

[2, 198, 7, 2976, 2063, 21, 259, 543, 959, 223, 1]

In [11]:
vocabs['<UNK>']

0

In [12]:
vocab_size = len(vocabs)
all_words = vocabs.get_itos()
print(vocab_size)

4330


In [13]:
# Tokenized joke list
tokenizedJokes = []
for joke in cleanedJokes:
    tokenized = vocabs(joke)
    tokenizedJokes.append(tokenized)

print(tokenizedJokes[0])
print('actual word represent')
print(cleanedJokes[0])

[2, 198, 7, 2976, 2063, 21, 259, 543, 959, 223, 1]
actual word represent
['what', 'bartender', 'say', 'jumper', 'cables', 'you', 'better', 'try', 'start', 'anything', '<EOS>']


In [14]:
# ! Sequence text into input and output. Ex: 50 input and 1 output
# Lest go with 3 input words and 1 output
length = 3 + 1
sequences = []
for joke in cleanedJokes:
    for i in range(length, len(joke)):
        # select sequence of token
        seq = joke[i-length:i]
                
        # convert to a line
        line = ' '.join(seq)
        # store
        sequences.append(line)

print(f"total of sequences: {len(sequences)}")
print(sequences[0:50])

total of sequences: 9616
['what bartender say jumper', 'bartender say jumper cables', 'say jumper cables you', 'jumper cables you better', 'cables you better try', 'you better try start', 'better try start anything', 'do hate jokes german', 'hate jokes german sausage', 'jokes german sausage they', 'german sausage they wurst', 'two artists art contest', 'artists art contest it', 'art contest it ended', 'contest it ended draw', 'why chicken cross playground', 'chicken cross playground to', 'cross playground to get', 'playground to get slide', 'what gun use hunt', 'gun use hunt moose', 'use hunt moose a', 'hunt moose a moosecut', 'if life gives melons', 'life gives melons might', 'gives melons might dyslexia', 'what one snowman say', 'one snowman say snowman', 'snowman say snowman smell', 'say snowman smell carrots', 'how many hipsters take', 'many hipsters take change', 'hipsters take change lightbulb', 'take change lightbulb it', 'change lightbulb it really', 'lightbulb it really obscur

In [15]:
# Save the sequences for later
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

# save sequences to file
out_filename = 'saved/test_sequences.txt'
save_doc(sequences, out_filename)

In [16]:
# When load  back, tokenized every sequence
tokenizedSequences = []
for each in sequences:
    wordList = nltk.word_tokenize(each)
    tokenized = vocabs(wordList)
    tokenizedSequences.append(tokenized)

print(tokenizedSequences[0:3])
print(sequences[0:3])

[[2, 198, 7, 2976], [198, 7, 2976, 2063], [7, 2976, 2063, 21]]
['what bartender say jumper', 'bartender say jumper cables', 'say jumper cables you']


In [17]:
# One hot encode the next word as output
X = []
y = []

for each in tokenizedSequences:
    if len(each) == 4:
        X.append(each[:-1])
        # Make y one hot encode
        y.append(each[-1])
    else:
        print(f'The tokenized: {each}')
        sentence = []
        for word in each:
            sentence.append(vocabs.lookup_token(word))
        print(f'The represetntation is: {sentence}')

tensorY = torch.nn.functional.one_hot(torch.tensor(y), num_classes=vocab_size)

print(f'Len of X: {len(X)}')
tensorX = torch.tensor(X)
seq_len = tensorX.shape[1]
print(f'Shape of X: {tensorX.shape} and of y: {tensorY.shape}')

Len of X: 9616
Shape of X: torch.Size([9616, 3]) and of y: torch.Size([9616, 4330])


In [18]:
import torch.nn as nn
from torchinfo import summary

In [24]:
# TODO: converting the keras embedding into pytorch

# model = Sequential()
# model.add(Embedding(vocab_size, 50, input_length=seq_length))
# model.add(LSTM(100, return_sequences=True))
# model.add(LSTM(100))
# model.add(Dense(100, activation='relu'))
# model.add(Dense(vocab_size, activation='softmax'))

class extract_tensor(nn.Module):
    def forward(self,x):
        # Output shape (batch, features, hidden)
        tensor, _ = x
        # Reshape shape (batch, hidden)
        ten = tensor[:, -1, :]
        print(ten.shape)
        flat = nn.Flatten()
        ten = flat(ten)
        return ten

model = nn.Sequential(
    nn.Embedding(num_embeddings=9616, embedding_dim=4330),
    nn.LSTM(input_size=vocab_size, hidden_size=100, batch_first=True),
    extract_tensor(),

    nn.Linear(in_features=100, out_features=vocab_size),
    nn.Softmax(dim=1)
)
# summary(model, input_size=(1, 9616, 100))
model(tensorX)

torch.Size([9616, 100])


tensor([[0.0002, 0.0002, 0.0002,  ..., 0.0003, 0.0003, 0.0003],
        [0.0003, 0.0002, 0.0002,  ..., 0.0002, 0.0002, 0.0002],
        [0.0002, 0.0002, 0.0003,  ..., 0.0003, 0.0002, 0.0002],
        ...,
        [0.0002, 0.0002, 0.0002,  ..., 0.0003, 0.0002, 0.0003],
        [0.0002, 0.0002, 0.0002,  ..., 0.0003, 0.0002, 0.0002],
        [0.0002, 0.0001, 0.0003,  ..., 0.0002, 0.0002, 0.0003]],
       grad_fn=<SoftmaxBackward0>)

In [None]:
# USELESS
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Build the category_lines dict, a list of lines per category
category_lines = {}
all_categories = []
for category, joke in enumerate(cleanedJokes):
    all_categories.append(category)
    category_lines[category] = joke

num_categories = len(all_categories)

print(f'Number of categories: {num_categories}')

Number of categories: 1622


In [None]:
import random


def randChoice(li: list):
    return li[random.randint(0, len(li) - 1)]

# Get a random pair of category (ID) and a word from that joke category
def randomTrainingPair():
    category = randChoice(all_categories)
    wordList = category_lines[category].split(' ')
    line = randChoice(wordList)
    return category, line

In [None]:
# One hot vector for category
def categoryTensor(category):
    li = all_categories.index(category)
    tensor = torch.zeros(1, num_categories)
    tensor[0][li] = 1
    return tensor

# One-hot matrix of first to last words (not including EOS) for input
def inputTensor(cleanedJokeEntry):
    tensor = torch.zeros(len(cleanedJokeEntry), 1, n_words)
    for li in range(len(cleanedJokeEntry)):
        word = cleanedJokeEntry[li]
        print(f'The word: {word}')
        tensor[li][0][all_words.index(word)] = 1

        print(tensor)
        break

    return tensor

# LongTensor of second letter to end (EOS) for target
def targetTensor(line):
    letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))]
    letter_indexes.append(n_letters - 1) # EOS
    return torch.LongTensor(letter_indexes)

In [None]:
res = categoryTensor(1)
res

tensor([[0., 1., 0.,  ..., 0., 0., 0.]])

In [None]:
lol = inputTensor(cleanedJokes[0])

The word: What
tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]])


In [None]:
category, line = randomTrainingPair()
print(category, '====', line)

1550 ==== to
