## ANNDL Final Project: _Jeopardy!_

In [112]:
import csv
import random
import numpy as np
from sklearn import preprocessing

In [332]:
# Import (value, question, answer) three-ples from CSV.
data = []
with open("/Users/fiordali/Downloads/JEOPARDY_CSV.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        data.append(row[4:])

random.shuffle(data) # Do I have to avoid shuffling the data/recreating the train and test data sets?

In [333]:
# We start with all 216,931 rows from the CSV file, which we will clean up.
clean_data = []
set_dollar_values = {"$200", "$400", "$600", "$800", "$1000",
                     "$1200", "$1600", "$2000"}

for item in data:
    # Cut out rows that are Daily Double or Final Jeopardy (imperfect checking criteria)
    if item[0] in set_dollar_values:
        # Convert dollar value from strings into ints.
        dollar_value = item[0][1:].replace(',', '')
        item[0] = int(dollar_value)
        clean_data.append(item)

# We now have 177,850 rows of data.

In [437]:
# Split data into two randomized groups: testing and training data.
idx = len(clean_data) // 2

# Ideally would train on half the data points, but currently takes too long to run.
train_set = clean_data[:7500]
test_set = clean_data[7500:15000]

# Create sets of ONLY questions (remove dollar value and answer).
all_questions = [row[1] for row in clean_data]
lstm_train_questions = [row[1] for row in train_set]

In [438]:
# Map the characters that occur in the question text to indices.
chars = sorted(list(set("".join([row[1] for row in clean_data]))))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [439]:
# Find length of longest training question by character.
max_len = 0
counter = 0

for question in lstm_train_questions:
    for letter in question:
        counter += 1
    if counter > max_len:
        max_len = counter
    counter = 0

## Train LSTM on questions

In [441]:
%matplotlib inline

import sys
import io
import requests as rq
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
from collections import Counter

In [442]:
seqlen = max_len # Length in chars of longest question

# For every question we indicate if a given character is present (in x) OR what the next character is (in y).
x = np.zeros((len(lstm_train_questions), seqlen, len(chars)), dtype=np.bool)
y = np.zeros((len(lstm_train_questions), seqlen, len(chars)), dtype=np.bool)

for i, question in enumerate(train_questions):
    # Iterate over every question in the training data.
    # For every question, pair character t with character t+1 to provide context.
    for t, (char_in, char_out) in enumerate(zip(question[:-1], question[1:])):
        x[i, t, char_indices[char_in]] = 1
        y[i, t, char_indices[char_out]] = 1

lstm_model = Sequential()
lstm_model.add(LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True)) # ret_seq = False because we want abstract feature vector as output
lstm_model.add(Dense(len(chars), activation='softmax'))                            # CUT THIS LAYER? Or is this the feature vector we pass to FF?

lstm_model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.01),
    metrics=['categorical_crossentropy', 'accuracy']
)

In [443]:
lstm_model.fit(x, y,
          batch_size=128,
          epochs=5,
          verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x17dfb8dd0>

## Train FF on feature vectors from LSTM

In [444]:
# A quick way to test predictions. DELETE LATER.
x_pred = np.zeros((1, seqlen, len(chars)))
print(x_pred.shape)
pred = lstm_model.predict(x_pred, verbose=1)
print(pred)

(1, 593, 126)
[[[1.92560274e-02 4.94443811e-03 1.18119186e-02 ... 3.70258465e-03
   3.58833931e-03 1.94786105e-03]
  [3.90143767e-02 2.18068389e-03 9.64256283e-03 ... 5.96941100e-04
   8.77289276e-04 3.09866242e-04]
  [1.12579286e-01 8.13048624e-04 5.98742161e-03 ... 4.60159281e-05
   1.26353349e-04 2.08869387e-05]
  ...
  [1.73682585e-01 1.01183204e-03 1.30472481e-02 ... 4.52430977e-05
   1.37020834e-04 1.10665205e-05]
  [1.73682585e-01 1.01183157e-03 1.30472481e-02 ... 4.52431450e-05
   1.37020834e-04 1.10665305e-05]
  [1.73682585e-01 1.01183204e-03 1.30472481e-02 ... 4.52431450e-05
   1.37020834e-04 1.10665196e-05]]]


In [445]:
# Generate feature vectors for training questions.
# The feature vectors will be the x_train data for the FF network.
ff_train_vectors = []

for i in range(len(lstm_train_questions)):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(lstm_train_questions[i]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = lstm_model.predict(x_pred, verbose=0)
    ff_train_vectors.append(pred)

In [446]:
print(ff_train_vectors[2])

[[[2.1266250e-02 3.2636736e-04 1.0913979e-03 ... 2.2489514e-05
   2.4427823e-04 1.3034280e-05]
  [5.6462851e-03 1.9178340e-04 4.1151870e-04 ... 9.2137880e-06
   2.4401390e-05 5.3854428e-06]
  [3.7833758e-02 3.6581009e-04 2.5705697e-03 ... 1.4364036e-05
   2.7630402e-05 3.5621633e-06]
  ...
  [1.7368338e-01 1.0118251e-03 1.3046853e-02 ... 4.5242617e-05
   1.3701910e-04 1.1066317e-05]
  [1.7368335e-01 1.0118249e-03 1.3046857e-02 ... 4.5242610e-05
   1.3701909e-04 1.1066325e-05]
  [1.7368330e-01 1.0118247e-03 1.3046867e-02 ... 4.5242599e-05
   1.3701904e-04 1.1066322e-05]]]


In [450]:
# Set up data to train FF network.
# (Feature vectors have same index as their original question and dollar value.)
x = array(ff_train_vectors)                        # Pass in feature vectors representing question text.
# y = array([row[0] for row in clean_data[:7500]])   # Expect dollar value associated with each question as output.
y = array([row[0] for row in train_set])

In [451]:
print(len(ff_train_vectors))
print(len(x), len(y))
print(len(lstm_train_questions))
print(y[0])
print(x.shape)

7500
7500 7500
7500
400
(7500, 1, 593, 126)


In [452]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, RMSprop, Adam

# (x_mnist, y_mnist), _ = keras.datasets.mnist.load_data()

# Reshape each image to 1dim vector. Effectively, reshape
# `x_train` from (60000, 28, 28) to (60000, 28*28)
# x_train_mnist = x_mnist[:50000].reshape(-1, 28*28)
# x_test_mnist = x_mnist[50000:].reshape(-1, 28*28)

# Reshape each 4d vector into a 2d vector
# (7500, 1, 593, 95) -> (7500, 56335)
x_train_ff = x.reshape(-1,1*593*126)
# print(x_train_ff.shape)

# Convert y_train from vector of labels to one-hot encoding vector
# y_mnist = keras.utils.to_categorical(y_mnist, num_classes=10)
# y_train_mnist = y_mnist[:50000]
# y_test_mnist = y_mnist[50000:]

y_train_ff = keras.utils.to_categorical(y, num_classes=2001)     # There are 10 question values ($200-$2000)
# print(y_train_ff)

ff_model = Sequential()
ff_model.add(Dense(512, input_dim=593*126))
ff_model.add(Activation('relu'))
ff_model.add(Dropout(0.2))
ff_model.add(Dense(2001))
ff_model.add(Activation('relu'))
ff_model.add(Dropout(0.2))

ff_model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

In [453]:
ff_model.fit(x_train_ff, y_train_ff,
          epochs=20,
          batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x17f0ae250>

In [455]:
# Generate the testing data for the FF network.
x_test_ff = []  # Feature vectors.
y_test_ff = []  # Corresponding dollar values.

for i in range(7500):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(test_set[i][1]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = lstm_model.predict(x_pred, verbose=0)
    x_test_ff.append(pred)
    
y_test_ff = [row[0] for row in test_set]

In [467]:
prediction = ff_model.predict((x_test_ff[0]).reshape(-1,593*126))
print(test_set[0])
print(len(prediction[0]))

for i, item in enumerate(prediction[0]):
    if item != 0:
        print(i) 

[200, 'Burgess Meredith as this odd bird', 'the Penguin']
2001
400
800


In [468]:
score = model.evaluate(x_test_ff, y_test_ff, batch_size=128)

## References:
* [Understanding LSTMs](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
* [Emma Boettcher Thesis](https://futurism.com/jeopardy-emma-boettcher-ai-james-holzhauer)
* [A Gentle Introduction to LSTM Autoencoders](https://machinelearningmastery.com/lstm-autoencoders/)
* [LSTM – nuggest for practical application](https://towardsdatascience.com/lstm-nuggets-for-practical-applications-5beef5252092)
* [Understanding Stateful LSTM RNNs Python Keras](https://machinelearningmastery.com/understanding-stateful-lstm-recurrent-neural-networks-python-keras/)
* [Reshape Input Data LSTMs](https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/)
* [How to use return_state](https://www.dlology.com/blog/how-to-use-return_state-or-return_sequences-in-keras/)