## ANNDL Final Project: Jeopardy! Question Generation

In [43]:
# This notebook will train an LSTM on a set of Jeopardy! questions with the goal of producing novel questions.

In [72]:
import csv
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
import random
import sys
from keras.callbacks import LambdaCallback

In [45]:
# Load data.
data = []
with open("/Users/fiordali/Downloads/JEOPARDY_CSV.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)                                # Skip header line.
    for row in reader:
        if "<" not in row[5]:                   # Exclude videos/photos, questions with formatting.
            data.append(row[5])                 # Keep only question text.

In [85]:
# Break into train data. Test data????
train_questions = data[:10000]
text = "".join(row for row in data[:1000])

In [86]:
print(len(text) / len(data))
print(len(text))

0.37014297076567926
76322


In [87]:
# Map characters to indices, vice versa.
chars = sorted(list(set("".join(row for row in data))))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [49]:
# Find length of longest training question by character.
maxlen = 0
counter = 0

for question in train_questions:
    for letter in question:
        counter += 1
    if counter > maxlen:
        maxlen = counter
    counter = 0

In [88]:
seqlen = 100
step = seqlen
question_snips = []
for i in range(0, len(text) - seqlen - 1, step):
    question_snips.append(text[i: i + seqlen + 1])

In [89]:
print(len(question_snips))

763


In [90]:
# Turn every question into vector indicating which letter is present (in x) or next (in y) at that point in the sequence.
# seqlen = maxlen                                # Length in chars of longest question
# x = np.zeros((len(train_questions), seqlen, len(chars)), dtype=np.bool) 
# y = np.zeros((len(train_questions), seqlen, len(chars)), dtype=np.bool) 
x = np.zeros((len(question_snips), seqlen, len(chars)), dtype=np.bool) 
y = np.zeros((len(question_snips), seqlen, len(chars)), dtype=np.bool) 
# for i, question in enumerate(train_questions):
for i, question in enumerate(question_snips):
    for t, (char_in, char_out) in enumerate(zip(question[:-1], question[1:])):
        x[i, t, char_indices[char_in]] = 1      # Log which character is present at current spot in the sequence.
        y[i, t, char_indices[char_out]] = 1     # Log what the next character is in the sequence.

In [91]:
# Model structures.
model = Sequential()
model.add(LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True))
model.add(Dense(len(chars), activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.01),
    metrics=['categorical_crossentropy', 'accuracy']
)

def sample(preds, temperature=1.0):
    """Helper function to sample an index from a probability array."""
    preds = np.asarray(preds).astype('float64')
    preds = np.exp(np.log(preds) / temperature)  # softmax
    preds = preds / np.sum(preds)                #
    probas = np.random.multinomial(1, preds, 1)  # sample index
    return np.argmax(probas)                     #

def on_epoch_end(epoch, _):
    """Function invoked at end of each epoch. Prints generated text."""
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    #start_index = random.randint(0, len(train_questions) - seqlen - 1)
    start_index = random.randint(0, len(question_snips) - seqlen - 1)
    
    #
    for diversity in [0.2, 0.5, 1.0]:
        print('----- diversity:', diversity)

        generated = ''
        # Seed for generated question is the start of a random question from training set
        # question = train_questions[random.randint(0, len(train_questions))][:5]
        question = text[start_index: start_index + seqlen]
        generated += question
        print('----- Generating with seed: "' + question + '"')
        sys.stdout.write(generated)

        for i in range(200):
            x_pred = np.zeros((1, seqlen, len(chars)))
            for t, char in enumerate(question):
                x_pred[0, t, char_indices[char]] = 1.
            
            # What is the dimensionality of `preds`? Why do we input `preds[0, -1]` to the `sample` function?
            preds = model.predict(x_pred, verbose=0)
            next_index = sample(preds[0, -1], diversity)
            next_char = indices_char[next_index]

            question = question[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print("\n\n")

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train network, print generated text at end of each epoch.
model.fit(x, y,
          batch_size=128,
          epochs=50,
          callbacks=[print_callback])

Epoch 1/50

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "ill in use todayNo. 8: 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter "
ill in use todayNo. 8: 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter    t                                     e                    o  e                                                                                                                                  e   
----- diversity: 0.5
----- Generating with seed: "ill in use todayNo. 8: 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter "
ill in use todayNo. 8: 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter o    r  e e h a  mee  e  ni  e  n     s    oe e   n   s ee   p s  o  ef  o   e sei  i  e  n  le  y        e e  lt e  e   it es e     te   n er  rer  n  i o   r    h r   ie r r      sr tsteitsor el  ln
----- diversity: 1.0
----- Generating with se

"The Art Linkletter Show", this company served its billionth burgerSigner of the Dec. of Indep., fran the the  he the the the the the the th s ore an ton the the the the  on the the the the the the an be the the the the the  or s s s an the the the the the the the the the the the the e the the ae th
----- diversity: 0.5
----- Generating with seed: ""The Art Linkletter Show", this company served its billionth burgerSigner of the Dec. of Indep., fra"
"The Art Linkletter Show", this company served its billionth burgerSigner of the Dec. of Indep., frat on n e n thes on re ae Li  is ne " ae chan "as s po che  he the s the the thera rert oa h s s e  or sa d ie ee ein the therr oo ecor be th rhar  nsrate thnlon  on as s red the ea aan thr this thus o
----- diversity: 1.0
----- Generating with seed: ""The Art Linkletter Show", this company served its billionth burgerSigner of the Dec. of Indep., fra"
"The Art Linkletter Show", this company served its billionth burgerSigner of the Dec. of Indep

er of the Constitution of Mass., second President of the United StatesIn the title of an Aesop fable ad 1950 s ane tore A the waf the Bean thas "San "is on this s te this The  onte ton  ile fome s of the tope of tong the te the this 1977 194 in 1999 s blate tord tor tan te s of teat " on se end ton 
----- diversity: 1.0
----- Generating with seed: "er of the Constitution of Mass., second President of the United StatesIn the title of an Aesop fable"
er of the Constitution of Mass., second President of the United StatesIn the title of an Aesop fabled Ro thyu(A Eand O cI00 Thd tha , fi. ke"y yt sd wlove late ingbas w an aine t, an pof "1dt'sa thestot Mss ove 200!"Incta LastOmas if Vis dow Ran on tf theom Anunycty U. nan a9 Aney"2 te Bktcokl le wt
Epoch 12/50

----- Generating text after Epoch: 11
----- diversity: 0.2
----- Generating with seed: "of 4,055 hours of sunshine each yearIn 1963, live on "The Art Linkletter Show", this company served "
of 4,055 hours of sunshine each yearIn 1963

n 312 B.C. to link Rome & the South of Italy, it's still in use todayNo. 8: 30 steals for the Birmingtars yelot", theWhen thes crypoted lockonch amestia pilcuse hisevesponger 1974 polpacn thes Vind StalsingsedkAnthe uuteto he uliwicape this " anetinslloncThle 1000B. 1061& the bpes this "DoethonMavin
Epoch 17/50

----- Generating text after Epoch: 16
----- diversity: 0.2
----- Generating with seed: "s state has a record average of 4,055 hours of sunshine each yearIn 1963, live on "The Art Linklette"
s state has a record average of 4,055 hours of sunshine each yearIn 1963, live on "The Art Linkletter war a promed a pround of the seanter of this s as the seand in the seander in 1996 he was this siantris serate a seand of the seander a beald of the counder war the serand in 1969 his the sering thi
----- diversity: 0.5
----- Generating with seed: "s state has a record average of 4,055 hours of sunshine each yearIn 1963, live on "The Art Linklette"
s state has a record average of 4,055 hours

grasshopperBuilt in 312 B.C. to link Rome & the South of Italy, it's still in use todayNo. 8: 30 stede the sond the was the anded the was a stanted the penting the was an the seath of the was a plant of the Conded the Seathed the wast the was an the Canded the was the seating the the same state the 
----- diversity: 0.5
----- Generating with seed: "grasshopperBuilt in 312 B.C. to link Rome & the South of Italy, it's still in use todayNo. 8: 30 ste"
grasshopperBuilt in 312 B.C. to link Rome & the South of Italy, it's still in use todayNo. 8: 30 stentand ancht canded an was the 1992 an on the Ling sed to the Candang compangationt of this songen of this tome an 1913 pattes an 1998 this coust catsed of the panted thes amed the Binking formed the S
----- diversity: 1.0
----- Generating with seed: "grasshopperBuilt in 312 B.C. to link Rome & the South of Italy, it's still in use todayNo. 8: 30 ste"
grasshopperBuilt in 312 B.C. to link Rome & the South of Italy, it's still in use todayNo. 8: 

 billionth burgerSigner of the Dec. of Indep., framer of the Constitution of Mass., second Presidentrom hes the frame in "This comourd tho ser the soun an on the har The rep of the sen the U.S. this was compary the U.S. Sestor a shere sear count in the in the for bed the spare on this comert red a b
----- diversity: 1.0
----- Generating with seed: " billionth burgerSigner of the Dec. of Indep., framer of the Constitution of Mass., second President"
 billionth burgerSigner of the Dec. of Indep., framer of the Constitution of Mass., second PresidentaveTyiof d pafceatay for the ssare"The Andat Wery fased is EnountatlyInco younder "Gray of the hat this Seat in Ed of w , lighe who "The Agrean 15, moter erstorre you one 2 Peasion & the KestarkidThe 
Epoch 28/50

----- Generating text after Epoch: 27
----- diversity: 0.2
----- Generating with seed: "nkletter Show", this company served its billionth burgerSigner of the Dec. of Indep., framer of the "
nkletter Show", this company served its bil

 Linkletter Show", this company served its billionth burgerSigner of the Dec. of Indep., framer of the ounchandIn Fargy and of this Gory the U.S. of the grints analy Mandor Lath Powner is fer this picoperNeNevstuber play, ReattGy ColIf 14, 2000nTtyturcoustrong 't, upieRnomy yun hes enote"Poutesaen P
Epoch 33/50

----- Generating text after Epoch: 32
----- diversity: 0.2
----- Generating with seed: " Indep., framer of the Constitution of Mass., second President of the United StatesIn the title of a"
 Indep., framer of the Constitution of Mass., second President of the United StatesIn the title of a poperican played this seated one of this country of this compine of this compering the seme in this companes of this compine of this tope this country state of this state of this companes of this com
----- diversity: 0.5
----- Generating with seed: " Indep., framer of the Constitution of Mass., second President of the United StatesIn the title of a"
 Indep., framer of the Constitution of Mass

: 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter of 1971-72, a record the sent the seat of the seat of the send the sean the sand this first called the seat of this seat of this state of the sing the state of the seat of the sent on the sent the sent the seat from this 
----- diversity: 0.5
----- Generating with seed: ": 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter of 1971-72, a record "
: 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter of 1971-72, a record the Alline man the New Greach play in the ent the sten har this broth plans in this for the ere the Senate state been for his sirie this mone this sont on the suck for this "In this first the sack thi
----- diversity: 1.0
----- Generating with seed: ": 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter of 1971-72, a record "
: 30 steals for the Birmingham Barons; 2,306 steals for the BullsIn the winter of 1971-72, a r

ball star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & BravesThe city of Yuma in 1971 this goother for this Wirk Spotest for the fort with made the balled the Beathing of this country ployed for this 1962 book this stope of the mas of this Dearacting the hardest for the tay shore
----- diversity: 1.0
----- Generating with seed: "ball star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & BravesThe city of Yuma in"
ball star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & BravesThe city of Yuma include Stara 1701; teade forded for bashert blane deses '70s Heation's "meagis 381-degarateared by of the Pale mose...Dicollyvised worn recarded this sai You?"Filfed as hole sem fashout booquight U.S.T
Epoch 44/50

----- Generating text after Epoch: 43
----- diversity: 0.2
----- Generating with seed: " to link Rome & the South of Italy, it's still in use todayNo. 8: 30 steals for the Birmingham Baron"
 to link Rome & the South of Italy, it's st

; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & BravesThe city of Yours's porkstrantion 1,002 TV Patifince Recon eary Pailfcolops&... centare, he! the hong to phunced that Anknial offerm for a groes is begins of Atriskevel"NubrankAnsI'm Boor TV Breed"The Whildwall of
Epoch 49/50

----- Generating text after Epoch: 48
----- diversity: 0.2
----- Generating with seed: "t of the United StatesIn the title of an Aesop fable, this insect shared billing with a grasshopperB"
t of the United StatesIn the title of an Aesop fable, this insect shared billing with a grasshopperBonder with the little of this first company that a speries from the sear the late of the little the seat of the seat of the sear the first played the first sanger of the seat of the seat of the seat o
----- diversity: 0.5
----- Generating with seed: "t of the United StatesIn the title of an Aesop fable, this insect shared billing with a grasshopperB"
t of the United StatesIn the title of an Ae

<keras.callbacks.callbacks.History at 0x13c56ac90>