## ANNDL Final Project: Jeopardy! Question Generation

In [43]:
# This notebook will train an LSTM on a set of Jeopardy! questions with the goal of producing novel questions.

In [72]:
import csv
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
import random
import sys
from keras.callbacks import LambdaCallback

In [45]:
# Load data.
data = []
with open("/Users/fiordali/Downloads/JEOPARDY_CSV.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)                                # Skip header line.
    for row in reader:
        if "<" not in row[5]:                   # Exclude videos/photos, questions with formatting.
            data.append(row[5])                 # Keep only question text.

In [46]:
# Break into test data. Train data????
train_questions = data[:10000]

In [47]:
# Map characters to indices, vice versa.
chars = sorted(list(set("".join(row for row in data))))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [49]:
# Find length of longest training question by character.
maxlen = 0
counter = 0

for question in train_questions:
    for letter in question:
        counter += 1
    if counter > maxlen:
        maxlen = counter
    counter = 0

In [56]:
# Turn every question into vector indicating which letter is present (in x) or next (in y) at that point in the sequence.
seqlen = maxlen                                # Length in chars of longest question
x = np.zeros((len(train_questions), seqlen, len(chars)), dtype=np.bool) 
y = np.zeros((len(train_questions), seqlen, len(chars)), dtype=np.bool) 
for i, question in enumerate(train_questions):
    for t, (char_in, char_out) in enumerate(zip(question[:-1], question[1:])):
        x[i, t, char_indices[char_in]] = 1      # Log which character is present at current spot in the sequence.
        y[i, t, char_indices[char_out]] = 1     # Log what the next character is in the sequence.

In [73]:
# Model structures.
model = Sequential()
model.add(LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True))
model.add(Dense(len(chars), activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.01),
    metrics=['categorical_crossentropy', 'accuracy']
)

def sample(preds, temperature=1.0):
    """Helper function to sample an index from a probability array."""
    preds = np.asarray(preds).astype('float64')
    preds = np.exp(np.log(preds) / temperature)  # softmax
    preds = preds / np.sum(preds)                #
    probas = np.random.multinomial(1, preds, 1)  # sample index
    return np.argmax(probas)                     #

def on_epoch_end(epoch, _):
    """Function invoked at end of each epoch. Prints generated text."""
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # start_index = random.randint(0, len(train_questions))
    
    #
    for diversity in [0.2, 0.5, 1.0]:
        print('----- diversity:', diversity)

        generated = ''
        # Seed for generated question is the start of a random question from training set
        question = train_questions[random.randint(0, len(train_questions))][:5]
        generated += question
        print('----- Generating with seed: "' + question + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, seqlen, len(chars)))
            for t, char in enumerate(question):
                x_pred[0, t, char_indices[char]] = 1.
            
            # What is the dimensionality of `preds`? Why do we input `preds[0, -1]` to the `sample` function?
            preds = model.predict(x_pred, verbose=0)
            next_index = sample(preds[0, -1], diversity)
            next_char = indices_char[next_index]

            question = question[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train network, print generated text at end of each epoch.
model.fit(x, y,
          batch_size=128,
          epochs=50,
          callbacks=[print_callback])

Epoch 1/50

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "This "
This ttaaiaoaasaatetatataattaetaattttaoaoacsaaastacsttttetaataasotatitttatcaotttaeatotolattateaoatataatanteoeottattettetmaaoststtattasitatstsaoaastaatitoeisatsataaoaaaaaaoa tssataattaeatiatastaatttottataaeaaotlstosattaatslaiteisaaaaaasettaocaaaotttaottesaatatailoetattlaetaaattaaatttotaaftiaatoitaaoastasatatettaaaaiattotoatoaoeaitataaatttttstaatlaacaastaitataafatttatttaesaeaoaetaataaaitaatetttataastteaa
----- diversity: 0.5
----- Generating with seed: "Carol"
Carolpeo oaeiotoatiitvretasadtepaentetonepttnaoeoDafoe tahoetaatetottsteenintatthtthttlttasssesfmitel o slatt5owa otl cfrataltoofroaatateatttstetoaf asttsetotacastycottaneaoietat ouaotadasaralotepatttooanoaoaoeotatsaaaaainiaeetetwot sotctatotaatsmtipclsciittoaaoooaeleoaleooaltaeatesktratacttesoinneastouoa iatottmolaiasetttlestgadaenltuetetaitahoeogtatliatoanctmsoeaaaado aofelotooothh nsowcnoahoainosoa1
----- diversity: 1.0
----

Famou                         a                                                                       i                                                                                           a                 a               i                                    i                           i                                      i                             a                                      i   
----- diversity: 0.5
----- Generating with seed: "In 19"
In 19 i et       A m    a ,ai  m  e "  a   aa  ia    ala m h s  a    i,          ih e    s aeean  m  m ,i          'a   a          a         a   o   i        a  a           , ,   oa o m         ,,   s      ai                a   a    a    e i i  n       d   a  , a ai  ,i   ,'a   ti    e  eia  , a, i   , aa,ai        i 'd    a o  a   o  a     .  i y  a m   i    i   a       aa  i  i io     ,i       i m  a
----- diversity: 1.0
----- Generating with seed: "The L"
The Lo  ,cdaimw   5e )N ,fi; em'a l  oh,, v, d,a  aan   eh), eea aei;ewao 

The A                                                                                                                                                                                                                                                                                                                                                                                                                
----- diversity: 0.5
----- Generating with seed: "Carey"
Carey      s a   ,"""" " "m"""""" """ """ "" "" """ "  """"" "e """" """ """"""n" "o    " "",""""""" "a""""""" " """ " "" s""""i """"   """"" " ,"" """""  """""  """ ""e  """y" """" se  s,        ,""""i"" """"" """""" """""""""" """"   """"""""  """""""" """ "" " ""  """y "" """"" """""" """" """""  " """ """l"" e  """ "" """ """" "" ""    """ ""  ""s"  """ " "  "  "  """""""""   """" """i"""  " """"""
----- diversity: 1.0
----- Generating with seed: "Sylvi"
Sylvi',,a " e  "r"" u i"  ,iad2o a'  iaao  '” ""os""  ,rs", ,  h  f  s,,:s

1996'  """""""""" """"""""""" """""""""""""""""""""""""""""" """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" """"""""""" """""""""""""""""" "" """"""""""""""""""""""""""""""""""""""""""" """""" """"""""""""""""""""" """"""""""""" """""""""""""""""" "" """"""""""""""""e""""""""" """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" """"" """"""""""""""""""""""""
----- diversity: 0.5
----- Generating with seed: "The 3"
The 3    a      ua       a-aak o    e a  r  s    is      aa    a a  ae   s e ai     ma i a a a  o i"""""""" """"e""  e "a"""  eio      e    o a       a.)     -e      s   a        ei , e i          c a m  ioam)           h   a   m a s  a a a a   e,  u       a a '   s a    i ,ih  '"a u" a  """ ee"    esa     . i m l a    a a ,s   a  a s oeam a       s . seoo)    aaa   , e""" a  ""a """""""""""oa  """"o""
----- diversity: 1.0
----- Generating with seed: "Examp"
Examp,() aae.uk,'e  ,o "i i"a t"sei m -aoa.sh: os "i " e sesaif)i)ba iae a

The B                                                      ,                                                                                                                                                  a                                                                                                                                                                                                      
----- diversity: 0.5
----- Generating with seed: "Okay,"
Okay,as         aa         ss   ,         s       a    a,             a,          a a    ,sa"        a ,     s s     a   ia ' a  a ,  u      s  ,  a  o  a  ,         ,e      ,   a,a   a     s    ,        a, a  ' ,    a    a    ,,          a,          ,i " " "e""   " "   """"e "" a""     h , s   a   u  a a              eaa a  ,              a   h       '         s   a  e m             ,      a a       a
----- diversity: 1.0
----- Generating with seed: "City "
City  i   - so' a) o saa,s  a as',aa , i   iaaa   oyass s,fs, sd :ohi i Ca

Marsh                                                                                                                                            a                                                                                                                                                                                                                                                                   
----- diversity: 0.5
----- Generating with seed: "This "
This    e     a      s     ae                     a  ,          a       ei      s  a  '        a   a     a  , "" "  """""""""""  """""" i  "" "  "  """"s" "  " """""""""""""""""""""""""""""""""""""""" "" """" """"a  "a" """" """ " """"" ""  """"  "  " ""e "" """ """   "" "y" """"""""""""""""a"""""   ""   "a"""""""""""s "  """"""" """" a"?i, """"""""""""""""""""""""""""""""""""""" " ""    """"""""""""""
----- diversity: 1.0
----- Generating with seed: "The c"
The c"   m" e'me'   ia  .a i )  ,eie,ai iahi'  ae  ,im" -;i  ,:as a    ama

Primea  aa a  aa aa aa aa aa a  aa aa aa aa aa a   aa aoa aa aa a a                      ia                 a   aaaaa  aa aa aa aa aa aa aa aa   a   a   aa aa aa aa aa a   a   aa aa aa aa aa  aa aa a   aa aa aa aa aa aa aa aa a   aa aa aa aa aa aa aa aa aaaaaaaaaaaaaaa  a   aa aa aa aa aa  aa aa aa aa aa aa  aa aa aa aa aa aa aa   aa aa aa aa aa aaaaa  aa aaaaaaaaaaaaaaaaaaaaaaaaaa  oa aa aa aa aa aa a
----- diversity: 0.5
----- Generating with seed: ""The "
"The ""i %X"  uaaa i ia ai i     eaauea   aua aa aa a   aa o  oaaso i        aa,ea a   aa a a   aa   aaaa aaias  i i     e i i           a aa  aaaaai  o aaaaa  a a aasiuaoa ,aa iaa a   a  o, ai oo a,  aa aeaa     a   aa aa aa oea aoa aaaea  i   ao  o a a a, oa aae ai, "ay " aaam oa  a   ' ,    ia aoaaa sss     ,i,-    ia     u  uaaa uaa aa  ea,a aa aa   a,  ia  aa  ,aaas  aa a   osa aa  aa ao   a   aa 
----- diversity: 1.0
----- Generating with seed: "Flori"
Floriiesam  - i i o)ia  vo  iaae as weaf i' a'iew,  a ea o,,a,g,“a aii oai

From a a ai  a   aa a  a aa   a  aaa   a  aa  a  aaa   aa  aaa a a aaaaa         a aa aaa aa   a a aa  a  a a   i   ia aaaiaaa aa a      aa  a    a     a a   a aa   aa  a a   aaaa a  aaaa      a i aaa      a    a a a a     aa a    i  a   ia     a  aa a i a a a a   a a a    a            aa aa a aa aa aaia a     a a    i  aaa a a aaa     a a    a  aa aaaa a    aa a a aa      aia  a ia   a   a a  aa ia a 
----- diversity: 0.5
----- Generating with seed: "Baylo"
Baylo,   a,aaa aaia aoa, i ia aaa         ia o   aea'  ia a a a i aaaaa   ie iai  ai aaia aaa a ai eiai a iame i aa      aaaaae aiiaiaa ui aaa a i   a  a is ,e , $  “ a#$,aaa iaamaa aaa a   aa   a a   i i aaoi ia aaa'    ae  i eieaai i e aai  a  a  ia aaiaaa  a    iaa a  aem  i ea iaiaa a    a aaiii e ia iaaa  ah  ie iaaaiia a w  ?    “,aa   aa  a  i a iaaaia a   a i iaa ,a   ai   a aaoe a  i a  ao  ea
----- diversity: 1.0
----- Generating with seed: "Frank"
Frank' ia ei u iviaa, ie,aa.h oaos ?  ia)iiahia  aei  eeii "tji'  a,imuas 

  from ipykernel import kernelapp as app


 e, ,j.s,,,“ms a.alm ;me ahioamwbc ,“ aucts,?aik“aaea[?aaesuaao i o mmaoia' " ' “e$",w,i“w' ia'ma,w    -ia,aaab saaa ha a  aa m aas)i ihoaiaai  )ua .vahu, ),ae  o   aoaaae's ,; s'w ie eati  ,o i i)sa.i- iami  veooeai,u' mme  u  ihm iuo as'a)ioaym)eim.a isi e. eeai  oo'a iaai'o,we a ieh eooa  s a;m'obaaaaspuioaaieu , woao i   mi-  vi ieifaiiue
Epoch 39/50

----- Generating text after Epoch: 38
----- diversity: 0.2
----- Generating with seed: "The l"
The l        ai  ao    a a                                                              i  aa    aa  o “ """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
----- diversity: 0.5
----- Generating with seed: "This "
This a a ao   ,s“ao  Xao m aa    aa aa i aa ah “a y /a"rm i ao  aa    a   i a'e oium


----- Generating text after Epoch: 43
----- diversity: 0.2
----- Generating with seed: "Ralph"
RalphaaF._1Y““Paa    o$  aaa 'D“ '(z$ “Q  “ y “ ( Q'“ “(“X“i“" V$v  a''. ,$a“ “a  a "$a(“ “'ja"“ai,““ (“$a$“no'(aa“aa .a“a   “a"y    “ "““ v““ aae  aa““““'““““"““["“""“ a"““.'[$"(Y$a"[o““““ $“va“. ““"$“[“(“ " “"“"m “'" a 2“a$ “ “""““$a“[/1W" .aa““ aa  “  “ "““ v““a, "a  ““"" "a" aY $a“  $“$““wvs o 1“iaa“ e, $)s"E"a "'  m a “.Q“z) aaisFaa“2“aa 'a  $“(““_"'[ '$$a22 '2a'“a“aa" [“““a"( a“ _$ “  m“$  a[D[“a
----- diversity: 0.5
----- Generating with seed: "One m"
One m $Q““K"“ "ul"aawaa““ u2 [““"!j]“ ““"" "s$y   i.““[aa $ o,a“a  oea [““ [  X “" im .s, '$ o[a “s"'"u“a'0v"“Y“ (“"(y“"ii.$“1'm a 'V““ “ eya“a“ve“a a “ “  2asai%8 “ “  w.“a“ a ,“aQ2 !“aa.“ a,““““$“$“[(““““["“""“ am" .'oajaa_la,las1y) "a“ “$i'p““o[ o_a0au“uaa“ ,')_“"(h u a   ,i,,  ,“ ““o u"o  P“wa“ “s" asa 2a" [“““ly" e)if. "'““$v aSa 2a“a“eayi“2, ““  %ea“$“iiq" “  ,[$  aau'“Y g“y-i,a'“ “ ae ,o ]aa$a'a
----- diversity: 1.0
----- Generati

If Ri  aaa   a     aa    aa aaaaaaa    a  aa  a  aiaia aaa  aa a   aaa ia   a aaaaa aaaaa aa aaaaa a aa iaaaaaa i a  a aaa aaa aaa  a a   a  iia aai a aaa  a a    iaa a aa a aa aa  a a   a a aaaaaaa aa  aa aaa  a   a aaaaaaaa  a   a a    a a aaiiaaaa aa a a   ai aaaa  a   a a ai    a  a aaiaa     a  a a aa    aaaa      aaa aa   a a a   a aa aaa  a  aa  a  aaa iaaa aa aaa  ia  aa aaaa   a   aaia  aa aaa
----- diversity: 0.5
----- Generating with seed: "Sick "
Sick  aaiaa    ii aa ei  i   aa aa   a  aa  a  oaa ia  ia a  io ae  aiia io     a  u iaaa a aa uo  aoaoa i  aaiai ao  ia,eaaa,    aiuaaaoi  a a  a  a aaiaaa aaa aaiao  e ,aaaao  o a   u iiua i iaa   asaa i   a ia  a a   aaeiai,  iaaaa a a amei  i iia aaia aia  aai  iiao a io oa a a  aaua ia   aa , au aaiaaiwaaa ia i   aaaaa a a  a       i'aaaaa uoaaaa  aa   iuiaaaa a aaaa aaaaioaa'isae a iaao aiauea,ai
----- diversity: 1.0
----- Generating with seed: "In th"
In thi oo - a,a fq aaeh,hioe: ihiaih i  -aa aa   uopo'oo a' aeoio i.m, eoe

<keras.callbacks.callbacks.History at 0x13d6cb990>