## ANNDL Final Project: Jeopardy! Question Generation

In [43]:
# This notebook will train an LSTM on a set of Jeopardy! questions with the goal of producing novel questions.
# NOTE: Some code (notably `sample` and `on_epoch_end`) is from content provided in classwork.

In [72]:
import csv
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
import random
import sys
from keras.callbacks import LambdaCallback

In [45]:
# Load and clean data.
# Location of the Jeopardy! CSV export; change this to wherever the file lives locally.
JEOPARDY_CSV_PATH = "/Users/fiordali/Downloads/JEOPARDY_CSV.csv"
# Zero-based index of the column that holds the question text.
QUESTION_COLUMN = 5

# newline="" lets the csv module do its own newline handling (needed for quoted
# fields that contain line breaks); the encoding is pinned so the result does not
# depend on the platform's default locale.
with open(JEOPARDY_CSV_PATH, newline="", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)                          # Skip header line (tolerates an empty file).
    data = [
        row[QUESTION_COLUMN]                    # Keep only question text.
        for row in reader
        if len(row) > QUESTION_COLUMN           # Ignore short/malformed rows instead of crashing.
        and "<" not in row[QUESTION_COLUMN]     # Exclude videos/photos, questions with formatting.
    ]

In [106]:
# Training corpus: the first 10,000 kept questions, joined into one string
# with a single space between consecutive questions.
text = " ".join(data[:10000])

In [108]:
# Vocabulary: every distinct character that appears in any kept question, in sorted order.
chars = sorted(set("".join(data)))
# Lookup tables in both directions: character -> index and index -> character.
char_indices = {symbol: position for position, symbol in enumerate(chars)}
indices_char = {position: symbol for position, symbol in enumerate(chars)}

In [109]:
# Find length of longest training question by character.
# NOTE(review): `train_questions` was not defined in any visible cell, so this cell
# failed on a fresh kernel. It is assumed to be the same first-10,000-question slice
# that is joined to build `text` -- confirm.
train_questions = data[:10000]

# default=0 keeps the result well-defined when there are no questions at all.
maxlen = max((len(question) for question in train_questions), default=0)

In [110]:
# Create fixed-length "questions" to train on.
seqlen = 100        # Number of input characters per training example.
step = seqlen       # Stride between snippet starts; equal to seqlen, so inputs do not overlap.

# Each snippet holds seqlen + 1 characters: the first seqlen are the inputs and the
# last seqlen (shifted by one) are the targets. The range bound is len(text) - seqlen
# so that the final valid start (len(text) - seqlen - 1) is included; the previous
# bound of len(text) - seqlen - 1 silently dropped it.
question_snips = [
    text[start: start + seqlen + 1]
    for start in range(0, len(text) - seqlen, step)
]

In [112]:
# One-hot encode every snippet: x[i, t] marks the character at position t of snippet i,
# and y[i, t] marks the character that follows it (position t + 1).
# The builtin `bool` is used as dtype because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
x = np.zeros((len(question_snips), seqlen, len(chars)), dtype=bool)
y = np.zeros((len(question_snips), seqlen, len(chars)), dtype=bool)
for i, question in enumerate(question_snips):
    # Pair each character with its successor; a snippet of seqlen + 1 characters
    # yields exactly seqlen (input, target) pairs.
    for t, (char_in, char_out) in enumerate(zip(question[:-1], question[1:])):
        x[i, t, char_indices[char_in]] = True      # Character present at this position.
        y[i, t, char_indices[char_out]] = True     # Character that comes next.

In [113]:
# Model structure: a single 128-unit LSTM that emits an output at every timestep
# (return_sequences=True), followed by a Dense softmax layer over the character
# vocabulary, so each input position gets a distribution over the next character.
model = Sequential([
    LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True),
    Dense(len(chars), activation='softmax'),
])

# Cross-entropy against the one-hot next-character targets, optimized with RMSprop.
model.compile(
    optimizer=RMSprop(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['categorical_crossentropy', 'accuracy'],
)

def sample(preds, temperature=1.0):
    """Sample an index from a probability array after temperature rescaling.

    Args:
        preds: 1-D array-like of non-negative weights (typically a softmax output).
            It does not need to sum to exactly 1; it is renormalized here.
        temperature: Sharpening/flattening factor. Values below 1 concentrate mass
            on the most likely entries, values above 1 flatten the distribution.
            A temperature of exactly 0 selects the most likely index deterministically.

    Returns:
        The sampled index (as returned by ``np.argmax``).
    """
    preds = np.asarray(preds, dtype='float64')

    # Dividing by a zero temperature would produce NaN probabilities; treat it as
    # the limiting case of the rescaling, which is a greedy arg-max.
    if temperature == 0:
        return np.argmax(preds)

    # log(0) is -inf, which correctly maps back to probability 0 after exp();
    # only the RuntimeWarning is suppressed.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift so the largest entry is 0 before exponentiating. The shift cancels in
    # the normalization below but prevents every weight from underflowing to 0
    # (and the result from becoming NaN) at small temperatures.
    scaled -= np.max(scaled)
    weights = np.exp(scaled)
    probs = weights / np.sum(weights)

    # Draw one sample from the categorical distribution and return its index.
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

def on_epoch_end(epoch, _):
    """Function invoked at end of each epoch. Prints generated text.

    One seed window of ``seqlen`` characters is drawn from ``text`` and reused for
    every diversity (temperature) setting; for each setting, 200 characters are
    generated one at a time by feeding the sliding window back into the model.

    Args:
        epoch: Epoch index supplied by Keras (zero-based, as the captured output shows).
        _: Logs dict supplied by Keras; unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # start_index is a character offset into `text`, so it must be bounded by
    # len(text), not by the number of snippets: the previous bound restricted seeds
    # to the first few thousand characters of the corpus and could make randint
    # raise when there were fewer than seqlen + 1 snippets.
    # random.randint is inclusive at both ends; the upper bound still leaves a full
    # seqlen-character window.
    start_index = random.randint(0, len(text) - seqlen - 1)
    # The seed is an arbitrary window of the training text (it may begin mid-question).
    seed = text[start_index: start_index + seqlen]

    for diversity in [0.2, 0.5, 1.0]:
        print('----- diversity:', diversity)

        question = seed
        print('----- Generating with seed: "' + question + '"')
        sys.stdout.write(question)

        for _step in range(200):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, seqlen, len(chars)))
            for t, char in enumerate(question):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)
            # The model predicts at every timestep; only the distribution after the
            # last character of the window is used to pick the next character.
            next_index = sample(preds[0, -1], diversity)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest character, append the new one.
            question = question[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print("\n\n")

# Wrap the text-generation routine so Keras calls it after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train on the one-hot snippets. The call is left as the cell's final expression,
# so the returned History object is what the cell displays.
model.fit(
    x=x,
    y=y,
    batch_size=128,
    epochs=50,
    callbacks=[print_callback],
)

Epoch 1/50

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "ution of Mass., second President of the United States In the title of an Aesop fable, this insect sh"
ution of Mass., second President of the United States In the title of an Aesop fable, this insect she the the the the this the the the the the " fore the the the the the the the the the the the thes "of the the the the the the the the the the the The the the the the the the the the this the the the 


----- diversity: 0.5
----- Generating with seed: "ution of Mass., second President of the United States In the title of an Aesop fable, this insect sh"
ution of Mass., second President of the United States In the title of an Aesop fable, this insect shis of an he f conthisouine of car The the amer the of be thes ane this the bamte the the the the an inlin "Awof lithe she thin pas the oofere thes Hopr wonc in "19990400 the this oof ide in arte of th


----- diversity: 1.0
----- Generating wit

your family" In geologic time one of these, shorter than an eon, is divided into periods & subdivide this to be this word might some of this Arien Ochose word for this city that this Canish was a movie Christory, the second of the sea for a main is a long by this country for the mother Sing of these


----- diversity: 1.0
----- Generating with seed: "your family" In geologic time one of these, shorter than an eon, is divided into periods & subdivide"
your family" In geologic time one of these, shorter than an eon, is divided into periods & subdivide Its name Of Arler out drow become in 1938: Maryashim 42 & the cont in its south A Wean heard sheriful park to aloyerse of these for higher take to countred byong back Parky, whose fregar local soll s


Epoch 12/50

----- Generating text after Epoch: 11
----- diversity: 0.2
----- Generating with seed: "displayed on "And away we go" Cows regurgitate this from the first stomach to the mouth & chew it ag"
displayed on "And away we go" Cows regu

lympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves The company to the state for this country's product of this country's state company to the state conson in this state said to the state composer This country's state to the state comedy of these state comp


----- diversity: 0.5
----- Generating with seed: "lympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves The c"
lympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves The capital is this tourne the retitle the note of the trop seen here in the language of this home of the British senting this birth to see common of this seen here of this rest of this to the first of the


----- diversity: 1.0
----- Generating with seed: "lympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves The c"
lympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Br

 minute a  month Outlaw: "Murdered by a traitor and a coward whose name is not worthy to appear here: "Have story" in 1969, many after what "mines Chrraw Perca name Helf adopted The Mensen College town left (Comple tire the born is also uses the romance & a rolo Bodquilates credits "Hello" Highlages


Epoch 33/50

----- Generating text after Epoch: 32
----- diversity: 0.2
----- Generating with seed: "lection This word for someone who walks comes from the Latin for "foot" Lava & igneous rock are form"
lection This word for someone who walks comes from the Latin for "foot" Lava & igneous rock are former the state composer of this country was the state company to the star was the first of the star was the state could be a could be a composer in the 1990s The name of the star with the state comes fr


----- diversity: 0.5
----- Generating with seed: "lection This word for someone who walks comes from the Latin for "foot" Lava & igneous rock are form"
lection This word for someone who walks

scar winner: "...you are a credit to your craft, your race and to your family" In geologic time one of these in this plant in this country in 1970 It was a state at the Chicago sent that has a state that has a state in this country This containing the state in this country In 1977 this state's last 


----- diversity: 0.5
----- Generating with seed: "scar winner: "...you are a credit to your craft, your race and to your family" In geologic time one "
scar winner: "...you are a credit to your craft, your race and to your family" In geologic time one of these in "What The Began as" this state's canyonial school of this president of the indicter that one of these in the world's first can be past of these name from the Pat at a quiet the last was th


----- diversity: 1.0
----- Generating with seed: "scar winner: "...you are a credit to your craft, your race and to your family" In geologic time one "
scar winner: "...you are a credit to your craft, your race and to your family" In geologic

ancis Xavier founded the Society of Jesus In 1961 James Brown announced "all aboard" for this train in Senaturizan called this "Alodge Buima" & into "C'New May'ven in" in blaid of this, who director A baspest, "The Nouride", Location's Lail Perfel off the "hous" Europer VIII, the falled 3 sound in t


Epoch 49/50

----- Generating text after Epoch: 48
----- diversity: 0.2
----- Generating with seed: "omy cabin with free drinks Ali, who married this man's daughter Fatima, is considered by Shia Muslim"
omy cabin with free drinks Ali, who married this man's daughter Fatima, is considered by Shia Muslims that that this for the plane is the first state for this first seat of this region of this type of this state in this city of this country This career of this country is the first seat of this count


----- diversity: 0.5
----- Generating with seed: "omy cabin with free drinks Ali, who married this man's daughter Fatima, is considered by Shia Muslim"
omy cabin with free drinks Ali, who mar

<keras.callbacks.callbacks.History at 0x13d6ff8d0>