## ANNDL Final Project: _Jeopardy!_ Dollar Value Prediction

In [602]:
import csv
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [617]:
# Import (value, question, answer) three-ples from CSV.
data = []
with open("/Users/fiordali/Downloads/JEOPARDY_CSV.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        data.append(row[4:])

random.shuffle(data)

In [618]:
# We start with all 216,931 rows from the CSV file, which we will clean up.
clean_data = []

dollar_values_map = {"$200 ": 0, "$400 ": 1, "$600 ": 2, "$800 ": 3, "$1,000 ": 4,
                     "$1,200 ": 5, "$1,600 ": 6, "$2,000 ": 7} 

for row in data:
    value = row[0]
    # Cut out rows that are Daily Double or Final Jeopardy (imperfect checking criteria).
    if value in dollar_values_map:
        # Map dollar value string to corresponding 'index'.
        row[0] = dollar_values_map[value]
        clean_data.append(row)

# We now have 182,217 rows of data.

In [676]:
# Ideally would split data in half, but currently takes too long to run.
train_set = clean_data[:30000]
test_set = clean_data[30000:]

# Create sets of ONLY questions (remove dollar value and answer).
all_questions = [row[1] for row in clean_data]
lstm_train_questions = [row[1] for row in train_set]

In [678]:
# Map the characters that occur in the question text to indices.
chars = sorted(list(set("".join([row[1] for row in clean_data]))))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [679]:
# Find length of longest training question by character.
max_len = 0
counter = 0

for question in all_questions:
    for letter in question:
        counter += 1
    if counter > max_len:
        max_len = counter
    counter = 0

seqlen = max_len # Length in chars of longest question

## Train LSTM on questions

In [680]:
%matplotlib inline

import sys
import io
import requests as rq
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
from collections import Counter

In [681]:
# For every question we indicate if a given character is present (in x) OR what the next character is (in y).
x = np.zeros((len(lstm_train_questions), seqlen, len(chars)), dtype=np.bool)
y = np.zeros((len(lstm_train_questions), seqlen, len(chars)), dtype=np.bool)

for i, question in enumerate(lstm_train_questions):
    # Iterate over every question in the training data.
    # For every question, pair character t with character t+1 to provide context.
    for t, (char_in, char_out) in enumerate(zip(question[:-1], question[1:])):
        x[i, t, char_indices[char_in]] = 1
        y[i, t, char_indices[char_out]] = 1

lstm_model = Sequential()
lstm_model.add(LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True)) # ret_seq = False because we want abstract feature vector as output
lstm_model.add(Dense(len(chars), activation='softmax'))                            

lstm_model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.01),
    metrics=['categorical_crossentropy', 'accuracy']
)

In [682]:
lstm_model.fit(x, y,
          batch_size=128,
          epochs=20,
          verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x1441843d0>

## Train FF on feature vectors from LSTM

In [660]:
# Generate feature vectors for training questions.
# The feature vectors will be the x_train data for the FF network.
ff_train_vectors = []

for i in range(len(lstm_train_questions)):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(lstm_train_questions[i]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = lstm_model.predict(x_pred, verbose=0)
    ff_train_vectors.append(pred)

In [661]:
# Set up data to train FF network.
# (Feature vectors have same index as their original question and dollar value.)
x = array(ff_train_vectors)                 # Pass in feature vectors representing question text.
y = array([row[0] for row in train_set])    # Expect dollar value associated with each question as output.

In [663]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, RMSprop, Adam
from sklearn import preprocessing

In [664]:
# Reshape each 4d vector into a 2d vector
# (7500, 1, 540, 126) -> (7500, 68040)
x_train_ff = x.reshape(-1,1*seqlen*126)

# Reshape each 1d digit label into 2d one-hot encoding
y_train_ff = keras.utils.to_categorical(y, num_classes=8)     # There are 8 dollar values (mapped as 0-7)

ff_model = Sequential()

ff_model.add(Dense(1024, input_dim=seqlen*126, activation='relu'))
ff_model.add(Dropout(0.5))
ff_model.add(Dense(512, activation='sigmoid'))
ff_model.add(Dropout(0.5))
ff_model.add(Dense(8, activation='sigmoid'))

ff_model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

In [665]:
ff_model.fit(x_train_ff, y_train_ff,
          epochs=20,
          batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x145549350>

In [667]:
x_pred = np.zeros((1, seqlen, len(chars)))
for t, char in enumerate(test_set[300][1]):
    x_pred[0, t, char_indices[char]] = 1.

lstm_pred = lstm_model.predict(x_pred, verbose=0)

ff_pred = ff_model.predict(lstm_pred.reshape(-1,seqlen*126))

print(ff_pred)

[[0.04556538 0.14046079 0.04831688 0.36865437 0.16488083 0.04652775
  0.11479472 0.19862346]]


In [686]:
# Generate the testing data for the FF network.
x_test_ff = []  # Feature vectors.
y_test_ff = []  # Corresponding dollar values.

for i in range(10000):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(test_set[i][1]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = lstm_model.predict(x_pred, verbose=0)
    x_test_ff.append(pred)
    
y_test_ff = [row[0] for row in test_set]

In [641]:
def pred_to_value(prediction):
    max_val = 0
    max_idx = 0
    for idx, item in enumerate(prediction):
        if item > max_val:
            max_val = item
            max_idx = idx
    return max_idx

In [689]:
# Quick check for a few text inputs.
for i in range(20, 30):
    prediction = ff_model.predict((x_test_ff[i]).reshape(-1,seqlen*126))
    print("---- Row #", i)
    print("test_set:\n", test_set[i])
    print(prediction[0])
    print(pred_to_value(prediction[0]))

---- Row # 20
test_set:
 [3, 'Scour the barnyards to find one of these percussion instruments that can be part of a drum set', 'cowbell']
[0.09896036 0.08963361 0.14012063 0.3814013  0.32389438 0.12915163
 0.16155075 0.15885288]
3
---- Row # 21
test_set:
 [0, 'In 1958 the theme of the first national week for these places was "Wake up & read!"', 'libraries']
[0.22961381 0.09698592 0.13532788 0.34733292 0.24060984 0.1315212
 0.11048964 0.16334812]
3
---- Row # 22
test_set:
 [4, 'Christopher Isherwood & Sally Bowles are characters in this 1951 play on which a 1966 musical was partly based', 'I Am A Camera (The musical was "Cabaret")']
[0.08063252 0.135028   0.17084324 0.20714566 0.36876896 0.1562017
 0.19673228 0.24317306]
4
---- Row # 23
test_set:
 [3, 'Tags for luggage headed for this airport appropriately read "LAX"', 'Los Angeles (International)']
[0.1995237  0.1493033  0.13776202 0.28116    0.32990602 0.1384628
 0.13212293 0.15277597]
4
---- Row # 24
test_set:
 [5, 'A Vegas casino be

## References:
* [Understanding LSTMs](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
* [Emma Boettcher Thesis](https://futurism.com/jeopardy-emma-boettcher-ai-james-holzhauer)
* [A Gentle Introduction to LSTM Autoencoders](https://machinelearningmastery.com/lstm-autoencoders/)
* [LSTM – nuggest for practical application](https://towardsdatascience.com/lstm-nuggets-for-practical-applications-5beef5252092)
* [Understanding Stateful LSTM RNNs Python Keras](https://machinelearningmastery.com/understanding-stateful-lstm-recurrent-neural-networks-python-keras/)
* [Reshape Input Data LSTMs](https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/)
* [How to use return_state](https://www.dlology.com/blog/how-to-use-return_state-or-return_sequences-in-keras/)
* [One-hot Encoding](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f)
* [Dropout](https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/)
* [ReLU](https://medium.com/@danqing/a-practical-guide-to-relu-b83ca804f1f7)
* [First Neural Network Project](https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/)