# Text Generation Exploration

In [25]:
from pathlib import Path
import pandas as pd

In [26]:
# Load the jokes CSV into a DataFrame and preview its first ten rows.
inputText = pd.read_csv(filepath_or_buffer='data/text_generation/data')
inputText.head(n=10)

Unnamed: 0,ID,Joke
0,1,What did the bartender say to the jumper cable...
1,2,Don't you hate jokes about German sausage? The...
2,3,Two artists had an art contest... It ended in ...
3,4,Why did the chicken cross the playground? To g...
4,5,What gun do you use to hunt a moose? A moosecut!
5,6,"If life gives you melons, you might have dysle..."
6,7,Broken pencils... ...are pointless.
7,8,What did one snowman say to the other snowman?...
8,9,How many hipsters does it take to change a lig...
9,10,Where do sick boats go? The dock!


In [28]:
# Print the 'Joke' column as plain text (pandas elides the middle rows).
print(inputText['Joke'])

0       What did the bartender say to the jumper cable...
1       Don't you hate jokes about German sausage? The...
2       Two artists had an art contest... It ended in ...
3       Why did the chicken cross the playground? To g...
4        What gun do you use to hunt a moose? A moosecut!
                              ...                        
1617    What do you call a camel with 3 humps? Humphre...
1618    Two fish in a tank. [x-post from r/Jokes] One ...
1619            "Stay strong!" I said to my wi-fi signal.
1620    Why was the tomato blushing? Because it saw th...
1621      What is heavy forward but not backward? **ton**
Name: Joke, Length: 1622, dtype: object


In [41]:
# Collect every entry of the 'Joke' column, in row order, into a plain list.
jokeList = [joke for joke in inputText['Joke']]

In [42]:
# Show the first five jokes as a sanity check.
jokeList[:5]

['What did the bartender say to the jumper cables? You better not try to start anything.',
 "Don't you hate jokes about German sausage? They're the wurst!",
 'Two artists had an art contest... It ended in a draw',
 'Why did the chicken cross the playground? To get to the other slide.',
 'What gun do you use to hunt a moose? A moosecut!']

In [32]:
import nltk
# Fetch NLTK's 'popular' data collection. The captured log below shows the
# packages being checked under /home/ubuntu/nltk_data.
# NOTE(review): none of the cells visible here use nltk -- confirm this
# download is still needed.
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/ubuntu/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to

True

In [45]:
from __future__ import unicode_literals, print_function, division
from io import open
from glob import glob
import os
import unicodedata
import string
import torch

In [43]:
# Characters the model works with: ASCII letters plus a little punctuation.
all_letters = string.ascii_letters + " .,;'-"
# One extra slot beyond the alphabet; index n_letters - 1 is used as EOS.
n_letters = len(all_letters) + 1


def unicodeToAscii(s):
    """Return `s` reduced to the characters contained in `all_letters`.

    The input is NFD-decomposed first, so an accented letter becomes its base
    letter followed by combining marks. Combining marks (Unicode category
    'Mn') and every character that is not in `all_letters` are dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Accent/diacritic split off by the NFD decomposition.
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)

# Build the category_lines dict, a list of lines per category.
# Every joke is its own category; the category id is the joke's position in
# jokeList.
# NOTE(review): assumes every entry of jokeList is a str -- a missing value
# read from the CSV would make unicodeToAscii raise; confirm the data.
category_lines = {}
all_categories = []
for category, joke in enumerate(jokeList):
    all_categories.append(category)
    # Store a *list* of lines, not the bare string: randomTrainingPair picks
    # a random element of this value, and a random element of a str is a
    # single character. The joke is also normalized so it only contains
    # characters from all_letters, which the tensor encoders look up by index.
    category_lines[category] = [unicodeToAscii(joke)]

num_categories = len(all_categories)

print(f'Number of categories: {num_categories}')

Number of categories: 1622


In [44]:
import random

def randChoice(li: list):
    """Return one randomly selected element of `li`.

    Raises ValueError when `li` is empty, because there is no valid position
    to draw.
    """
    position = random.randrange(len(li))
    return li[position]

def randomTrainingPair():
    """Draw a random category and a random element of that category's entry.

    Returns:
        A `(category, line)` tuple, where `category` comes from
        `all_categories` and `line` is a random element of
        `category_lines[category]`.

    NOTE(review): `line` is whatever indexing `category_lines[category]`
    yields -- if that value is a str rather than a list of strings, this is a
    single character.
    """
    chosen_category = randChoice(all_categories)
    return chosen_category, randChoice(category_lines[chosen_category])

In [47]:
# One hot vector for category
def categoryTensor(category):
    """Return a (1, num_categories) float tensor with a single 1.

    The 1 sits at the position of `category` within `all_categories`;
    `list.index` raises ValueError if the category is not present.
    """
    one_hot = torch.zeros(1, num_categories)
    position = all_categories.index(category)
    one_hot[0, position] = 1
    return one_hot

# One-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    """Encode `line` as a one-hot tensor of shape (len(line), 1, n_letters).

    Row `i` has a 1 at the index of `line[i]` within `all_letters`.

    Raises:
        ValueError: if `line` contains a character that is not in
            `all_letters`. `str.find` returns -1 for such a character, and
            indexing with -1 would silently set the last slot
            (`n_letters - 1`), which is the slot used for EOS.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        letter_index = all_letters.find(letter)
        if letter_index < 0:
            raise ValueError(
                f'character {letter!r} at position {position} is not in all_letters; '
                'normalize the line with unicodeToAscii first'
            )
        tensor[position][0][letter_index] = 1
    return tensor

# LongTensor of second letter to end (EOS) for target
def targetTensor(line):
    """Return the target indexes for `line` as a LongTensor of length len(line).

    The targets are the `all_letters` indexes of `line[1:]`, followed by the
    EOS index `n_letters - 1`. An empty `line` yields just the EOS index.

    Raises:
        ValueError: if a character of `line[1:]` is not in `all_letters`.
            `str.find` returns -1 for such a character, which would put an
            invalid class index of -1 into the target tensor.
    """
    letter_indexes = []
    for position, letter in enumerate(line[1:], start=1):
        letter_index = all_letters.find(letter)
        if letter_index < 0:
            raise ValueError(
                f'character {letter!r} at position {position} is not in all_letters; '
                'normalize the line with unicodeToAscii first'
            )
        letter_indexes.append(letter_index)
    letter_indexes.append(n_letters - 1)  # EOS
    return torch.LongTensor(letter_indexes)