In [1]:
from transformers import GPT2LMHeadModel , GPT2Tokenizer
import pandas as pd
import random

class trieNode():

  def __init__(self, letter = None):

    # Keep track of children as nodes and letters
    self.children = []
    self.children_nodes = [] 
    self.is_leaf = False
    self.letter = letter

    # Utility: store the "words" up to this point, 
    # as well as all the child strings that follow this.
    # This exhanges some memory for easier lookup later
    # Additionally, we store how many times a node has been visited 
    self.string = None
    self.child_strings = []
    self.visited = 0

class Trie():

  def __init__(self):
    self.root = trieNode()
    self.n_nodes = 0

  def insert(self, key):
    
    # Start at the root
    pointer = self.root
    
    idx = 0

    for i in key:
      if i in pointer.children: # If the child exists
        order = pointer.children.index(i) # simply move forward
        pointer = pointer.children_nodes[order]
      else: # Otherwise create and append a new node
        pointer.children.append(i)
        pointer.children_nodes.append(trieNode(i))
        self.n_nodes += 1
        pointer = pointer.children_nodes[-1]
        pointer.string = key[0:(idx + 1)]
      
      # Update the other values
      pointer.visited += 1

      # ...and if the node is a leaf, or if we should simply add new children
      idx += 1
      if idx == len(key):
        pointer.is_leaf = True
      else:
        pointer.child_strings.append(key[(idx):len(key)]) 
 

## Helper functions

These functions help us create the needed data structures

In [2]:
def create_dict(trie):

  result = {}

  def crawl(trie):

    if len(trie.children_nodes) == 0:
      return

    if trie.is_leaf and len(trie.children) > 0:
      for child_string in trie.child_strings:
        if child_string not in result.keys():
          result[child_string] = []
        result[child_string].append(trie.string)

    for child in trie.children_nodes:

      crawl(child)
  
  crawl(trie)

  return result

def create_flipped_dict(trie):

  result = {}

  def crawl(trie):

    if len(trie.children_nodes) == 0:
      return

    if trie.is_leaf and len(trie.children) > 0:
      for child_string in trie.child_strings:
        flipped_string = child_string[::-1]
        if flipped_string not in result.keys():
          result[flipped_string] = []
        result[flipped_string].append(trie.string[::-1])

    for child in trie.children_nodes:

      crawl(child)
  
  crawl(trie)

  return result


## Joke generator class

This class handles most of the heavy lifting apart from the GPT-3 connectivity. 

In [3]:
class jokeGenerator():

  def __init__(self):

    self.trie = Trie()
    self.flipped_trie = Trie()
    self.words = None

    self.result = None
    self.flipped_result = None
    self.common_keys = None
    
    self.wordplays = None

    self.tokenizer = None
    self.model = None

  def loadWords(self, source):

    words = pd.read_csv(source, na_filter = False)
    words = words.values.tolist()
    words = [x[0] for x in words]    

    print(f'Loading {len(words)} words')

    i = 0
    n_words = len(words)
    for word in words:
      i += 1
      if i % int(n_words/10) == 0:
        print(f'{int((i+1)/n_words*100)}% ({i}/{n_words})')
      self.trie.insert(word)
      self.flipped_trie.insert(word[::-1])

    print(f'Done')

    self.words = words

  # normal: all words
  # not_short: the connector is longer than 2 characters
  # not_a_word: all words, where the connecting word is not a word in itself
  # not_a_short_word: all words, where the connecting word is not a word in itself and it is more than 2 chracters
  # not_a_short_word_or_ing: all words, where the connecting word is not a word in itself and it is more than 2 chracters and is not "ing"
  def generateWords(self, type = 'normal'):

    if self.flipped_trie == None or self.trie == None:
      print('You must load the words first: loadWords(source)')

    self.flipped_result = create_flipped_dict(self.flipped_trie.root)
    self.result = create_dict(self.trie.root)

    common_keys = list(set(self.result.keys()).intersection(self.flipped_result.keys()))

    if type == 'normal':
      self.common_keys = common_keys
    elif type == 'not_short':
      self.common_keys = [x for x in common_keys if (len(x) > 2)]
    elif type == 'not_a_word':
      self.common_keys = [x for x in common_keys if (x not in self.words and x != '-')]
    elif type == 'not_a_short_word':
      self.common_keys = [x for x in common_keys if (x not in self.words and x != '-' and len(x) > 2)]
    elif type == 'not_a_short_word_or_ing':
      self.common_keys = [x for x in common_keys if (x not in self.words and x != '-' and x != 'ing' and len(x) > 2)]

    self.wordplays = {}
    for c_key in self.common_keys:
      for r in self.result[c_key]:
        for f_r in self.flipped_result[c_key]:
          self.wordplays[f'{r}_{c_key}_{f_r}'] = [f'{r}', f'{c_key}',f'{f_r}']
                    
  def loadModels(self, language = 'english'):

    if language == 'finnish':
      self.tokenizer = GPT2Tokenizer.from_pretrained('Finnish-NLP/gpt2-finnish') 
      self.model = GPT2LMHeadModel.from_pretrained('Finnish-NLP/gpt2-finnish')
    elif language == 'english':
      self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large') 
      self.model = GPT2LMHeadModel.from_pretrained('gpt2-large')
    # Adding new languages is laughably easy. Search for a passable model on Huggingface, chuck it here and let the script do it's magic.

  def generateJoke(self, first_string = '', second_string = '', n = 1, length = 30):
    joke = self.wordplays[random.choice(list(self.wordplays.keys()))]
    joke_words = joke[0] + joke[1] + ' and ' + joke[1] + joke[2]   
    joke_string = first_string + ' ' + joke_words + ' ' + second_string

    input_ids = self.tokenizer.encode(joke_string, return_tensors = 'pt')

    output = self.model.generate(input_ids, 
                        max_length = length,
                        num_beams = n, 
                        num_return_sequences = n,
                        no_repeat_ngram_size  = 3)
    
    result = []
    for op in output:
      result.append(self.tokenizer.decode(op))

    return result

## Examples

After running the abovementioned scripts, you can run the following. You will need a dictionary of words. I used [this](http://www.mieliestronk.com/corncob_lowercase.txt) wonderful corpus, but feel free to plug in your own in any language.

In [4]:
# Initialize the class
eng_gen = jokeGenerator()

# Load the words from online or from disk. This can be in any language
eng_gen.loadWords('http://www.mieliestronk.com/corncob_lowercase.txt')
# The generator has multiple filters to filter naive solutions - let's use setting "connector is not a word, is longer than 2 characters and is not _ing_"
eng_gen.generateWords('not_a_short_word_or_ing')
# You can add mode languages above by finding a suitable transformer model in your language and substituting above
eng_gen.loadModels('english')

Loading 58109 words
10% (5810/58109)
19% (11620/58109)
29% (17430/58109)
39% (23240/58109)
49% (29050/58109)
59% (34860/58109)
69% (40670/58109)
79% (46480/58109)
89% (52290/58109)
99% (58100/58109)
Done


You can explore the dyads of words by sampling from the wordplays-dictionary in the generator:

In [5]:
# eng_gen.wordplays[random.choice(list(eng_gen.wordplays.keys()))]

You can generate jokes using the provided GPT engine by calling the generateJoke() method. These are generally rather poor in quality, but nonetheless you can enjoy your 100% artificially created humor

In [6]:
eng_gen.generateJoke('Tell me a joke about', n = 1, length = 80)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


["Tell me a joke about gloomful and fulfilling \xa0the world is a better place when you're not there.\nI'm not sure what the point of this is, but I'm sure it's a good one.\nThe point is that I'm not going to be there. I'm going to go to the movies, I'm gonna go to a concert, I'll go to dinner"]

## OpenAI GPT-3

The following utilizes the OpenAI GPT-3 model. It is not free, but yield _much_ better results than the free transformers. Some might even call it humor. See [https://beta.openai.com/](https://beta.openai.com/) to sign up for your API key. You can call the API using the following prompt:

In [8]:
import os
import openai

words = eng_gen.wordplays[random.choice(list(eng_gen.wordplays.keys()))]
input_word = ''.join(words[0:2]) + ' ' + words[2] + ' and ' + words[0] + ' ' + ''.join(words[1:3])

openai.api_key = "API KEY"
result = openai.Completion.create(
  engine="text-davinci-002",
  prompt=f"Write a joke containing the words '{input_word}'",
  max_tokens=60,
  temperature = 0.3,
  n = 1,
  echo = True
)


and fetch the resulting joke as such:

In [10]:
result.to_dict()['choices'][0]['text']

"Write a joke containing the words 'papas ted and pa pasted'\n\nWhy did Papas Ted and Pa Pasted cross the road?\n\nTo get to the other side!"

Let's finally create a helper function to come up with any number of jokes.

In [11]:
def create_jokes(wordplays, n, api_key):

  results = pd.DataFrame({'input' : [], 'prompt' : [], 'answer' : []})
  openai.api_key = api_key

  for i in range(n):

    words = wordplays[random.choice(list(wordplays.keys()))]
    input_word = ''.join(words[0:2]) + ' ' + words[2] + ' and ' + words[0] + ' ' + ''.join(words[1:3])
    prompt = f"Write a joke containing the words '{input_word}'"

    result = openai.Completion.create(
      engine="text-davinci-002",
      prompt=prompt,
      max_tokens=60,
      temperature = 0.3,
      n = 1
    )
    answer = [input_word, prompt, result.to_dict()['choices'][0]['text']]

    results.loc[len(results)] = answer

  return results

Call it like so

In [12]:
jokes = create_jokes(eng_gen.wordplays, 10, "API KEY")

And check the resulting jokes

In [13]:
jokes

Unnamed: 0,input,prompt,answer
0,laughter mites and laugh termites,Write a joke containing the words 'laughter mi...,\n\nWhat do you call laughter mites that infes...
1,carbon us and car bonus,Write a joke containing the words 'carbon us a...,\n\nWhat do you call a carbon that's also a ca...
2,ponderous ted and ponder ousted,Write a joke containing the words 'ponderous t...,\n\nWhy did the chicken cross the road? To get...
3,tugela borate and tug elaborate,Write a joke containing the words 'tugela bora...,\n\nWhy did the tugela borate the tug elaborat...
4,ohmic robes and oh microbes,Write a joke containing the words 'ohmic robes...,\n\nWhat do you call a Jedi who only wears ohm...
5,refund id and ref undid,Write a joke containing the words 'refund id a...,\n\nI tried to get a refund for my gym members...
6,genres canning and gen rescanning,Write a joke containing the words 'genres cann...,\n\nWhy did the chicken cross the road? To get...
7,scoopful filled and scoop fulfilled,Write a joke containing the words 'scoopful fi...,"\n\nI was going to make some ice cream, but I ..."
8,snugger ontology and snug gerontology,Write a joke containing the words 'snugger ont...,\n\nWhat's the difference between snugger onto...
9,bedsitter mite and bedsit termite,Write a joke containing the words 'bedsitter m...,\n\nWhat's the difference between a bedsitter ...
