## ANNDL Final Project: _Jeopardy!_

In [602]:
import csv
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [617]:
# Import (value, question, answer) three-ples from CSV.
data = []
with open("/Users/fiordali/Downloads/JEOPARDY_CSV.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        data.append(row[4:])

random.shuffle(data) # Do I have to avoid shuffling the data/recreating the train and test data sets?

In [618]:
# We start with all 216,931 rows from the CSV file, which we will clean up.
clean_data = []

# TODO: Use sklearn labelencoder instead
dollar_values_map = {"$200 ": 0, "$400 ": 1, "$600 ": 2, "$800 ": 3, "$1,000 ": 4,
                     "$1,200 ": 5, "$1,600 ": 6, "$2,000 ": 7} 

for row in data:
    # Cut out rows that are Daily Double or Final Jeopardy (imperfect checking criteria)
    value = row[0]
    if value in dollar_values_map:
        # Map dollar value string to corresponding 'index'.
        row[0] = dollar_values_map[value]
        clean_data.append(row)

# We now have 182,217 rows of data.

In [646]:
# Split data into two randomized groups: testing and training data.
idx = len(clean_data) // 2

# Ideally would split data in half, but currently takes too long to run.
train_set = clean_data[:10000]
test_set = clean_data[10000:20000]

# Create sets of ONLY questions (remove dollar value and answer).
all_questions = [row[1] for row in clean_data]
lstm_train_questions = [row[1] for row in train_set]

In [508]:
# Map the characters that occur in the question text to indices.
chars = sorted(list(set("".join([row[1] for row in clean_data]))))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [654]:
# Find length of longest training question by character.
max_len = 0
counter = 0

for question in all_questions:
    for letter in question:
        counter += 1
    if counter > max_len:
        max_len = counter
    counter = 0

seqlen = max_len # Length in chars of longest question

## Train LSTM on questions

In [656]:
%matplotlib inline

import sys
import io
import requests as rq
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
from collections import Counter

In [657]:
# For every question we indicate if a given character is present (in x) OR what the next character is (in y).
x = np.zeros((len(lstm_train_questions), seqlen, len(chars)), dtype=np.bool)
y = np.zeros((len(lstm_train_questions), seqlen, len(chars)), dtype=np.bool)

for i, question in enumerate(lstm_train_questions):
    # Iterate over every question in the training data.
    # For every question, pair character t with character t+1 to provide context.
    for t, (char_in, char_out) in enumerate(zip(question[:-1], question[1:])):
        x[i, t, char_indices[char_in]] = 1
        y[i, t, char_indices[char_out]] = 1

lstm_model = Sequential()
lstm_model.add(LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True)) # ret_seq = False because we want abstract feature vector as output
lstm_model.add(Dense(len(chars), activation='softmax'))                            # CUT THIS LAYER? Or is this the feature vector we pass to FF?

lstm_model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.01),
    metrics=['categorical_crossentropy', 'accuracy']
)

In [658]:
lstm_model.fit(x, y,
          batch_size=128,
          epochs=5,
          verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x148602090>

## Train FF on feature vectors from LSTM

In [659]:
# A quick way to check predictions. DELETE LATER.
x_pred = np.zeros((1, seqlen, len(chars)))
print(x_pred.shape)
pred = lstm_model.predict(x_pred, verbose=1)
print(pred)

(1, 860, 126)
[[[1.9444054e-02 4.5492626e-03 7.1101622e-03 ... 2.8026458e-03
   3.0922021e-03 2.1714736e-03]
  [5.3738710e-02 2.2899786e-03 4.9155415e-03 ... 3.6330149e-04
   6.3680724e-04 3.2492765e-04]
  [1.2622854e-01 6.4242259e-04 3.5330197e-03 ... 1.4588161e-05
   5.9491005e-05 1.1841793e-05]
  ...
  [1.5930353e-01 8.8263291e-04 2.0451769e-02 ... 8.9476976e-05
   4.2055608e-04 2.7638655e-05]
  [1.9290237e-01 1.5005581e-03 5.2973803e-02 ... 7.2356008e-05
   4.5601735e-04 2.6879738e-05]
  [1.9271864e-01 2.3862971e-03 8.8099293e-02 ... 4.8293325e-05
   4.1505072e-04 2.1401869e-05]]]


In [660]:
# Generate feature vectors for training questions.
# The feature vectors will be the x_train data for the FF network.
ff_train_vectors = []

for i in range(len(lstm_train_questions)):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(lstm_train_questions[i]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = lstm_model.predict(x_pred, verbose=0)
    ff_train_vectors.append(pred)

In [661]:
# Set up data to train FF network.
# (Feature vectors have same index as their original question and dollar value.)
x = array(ff_train_vectors)                 # Pass in feature vectors representing question text.
y = array([row[0] for row in train_set])    # Expect dollar value associated with each question as output.

In [662]:
print(len(ff_train_vectors))
print(len(x), len(y))
print(len(lstm_train_questions))
print(y[1])
print(x.shape)

10000
10000 10000
10000
0
(10000, 1, 860, 126)


In [663]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, RMSprop, Adam
from sklearn import preprocessing

In [664]:
# Reshape each 4d vector into a 2d vector
# (7500, 1, 540, 126) -> (7500, 68040)
x_train_ff = x.reshape(-1,1*seqlen*126)

# Reshape each 1d digit label into 2d one-hot encoding
y_train_ff = keras.utils.to_categorical(y, num_classes=8)     # There are 8 dollar values (mapped as 0-7)

ff_model = Sequential()

# NO ACTIVATIONS IN OUTPUT (NO PREDICTION).
# ff_model.add(Dense(512, input_dim=540*126, activation='relu'))
# ff_model.add(Dropout(0.5))
# ff_model.add(Dense(256, activation='relu'))
# ff_model.add(Dropout(0.5))
# ff_model.add(Dense(8, activation='relu'))

# ff_model.add(Dense(512, input_dim=540*126, activation='relu'))
# ff_model.add(Dropout(0.2))
# ff_model.add(Dense(8, activation='relu'))
# ff_model.add(Dropout(0.2))

# PREDICTION VECTOR FILLED WITH ACTIVATIONS!!! SOME (4/10) ACCURATE!
ff_model.add(Dense(1024, input_dim=seqlen*126, activation='relu'))
ff_model.add(Dropout(0.5))
ff_model.add(Dense(512, activation='sigmoid'))
ff_model.add(Dropout(0.5))
ff_model.add(Dense(8, activation='sigmoid'))

ff_model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

In [665]:
ff_model.fit(x_train_ff, y_train_ff,
          epochs=20,
          batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x145549350>

In [667]:
x_pred = np.zeros((1, seqlen, len(chars)))
for t, char in enumerate(test_set[300][1]):
    x_pred[0, t, char_indices[char]] = 1.

lstm_pred = lstm_model.predict(x_pred, verbose=0)

ff_pred = ff_model.predict(lstm_pred.reshape(-1,seqlen*126))

print(ff_pred)

[[0.04556538 0.14046079 0.04831688 0.36865437 0.16488083 0.04652775
  0.11479472 0.19862346]]


In [670]:
# Generate the testing data for the FF network.
x_test_ff = []  # Feature vectors.
y_test_ff = []  # Corresponding dollar values.

for i in range(10000):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(test_set[i][1]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = lstm_model.predict(x_pred, verbose=0)
    x_test_ff.append(pred)
    
y_test_ff = [row[0] for row in test_set]

In [671]:
print(len(x_test_ff), len(y_test_ff))

10000 10000


In [641]:
def pred_to_value(prediction):
    max_val = 0
    max_idx = 0
    for idx, item in enumerate(prediction):
        if item > max_val:
            max_val = item
            max_idx = idx
    return max_idx

In [673]:
for i in range(10, 20):
    prediction = ff_model.predict((x_test_ff[i]).reshape(-1,seqlen*126))
    print("---- Row #", i)
    print("test_set:\n", test_set[i])
    print(prediction[0])
    print(pred_to_value(prediction[0]))

---- Row # 10
test_set:
 [6, '"Coffey\'s Hands" was the third installment of this novel that was published in serial form in 1996', 'The Green Mile']
[0.17344937 0.14682752 0.16689369 0.13340704 0.20009877 0.0929205
 0.11024223 0.09477271]
4
---- Row # 11
test_set:
 [0, '"Towards thee I roll, thou all-destroying but unconquering whale"', 'Moby Dick']
[0.5369545  0.16861244 0.08380336 0.17918646 0.06361631 0.04366046
 0.02967537 0.05044321]
0
---- Row # 12
test_set:
 [0, 'Familiar shape of the Jefferson National Expansion Memorial\'s "Gateway"', 'Arch']
[0.20243259 0.29951757 0.06953414 0.24006778 0.04568674 0.04832096
 0.07424676 0.10022949]
1
---- Row # 13
test_set:
 [1, 'In 1962 he "return"ed to West Point to give an emotional speech on "Duty, Honor, Country"', 'Douglas MacArthur']
[0.12418361 0.209118   0.15708816 0.3069174  0.12163036 0.0150142
 0.02727487 0.09324985]
3
---- Row # 14
test_set:
 [0, 'This African river enters the Mediterranean through two main branches:  the Damiett

In [601]:
# score = ff_model.evaluate(array(x_test_ff).reshape(-1,540*126), y_test_ff[0], batch_size=128)

## References:
* [Understanding LSTMs](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
* [Emma Boettcher Thesis](https://futurism.com/jeopardy-emma-boettcher-ai-james-holzhauer)
* [A Gentle Introduction to LSTM Autoencoders](https://machinelearningmastery.com/lstm-autoencoders/)
* [LSTM – nuggest for practical application](https://towardsdatascience.com/lstm-nuggets-for-practical-applications-5beef5252092)
* [Understanding Stateful LSTM RNNs Python Keras](https://machinelearningmastery.com/understanding-stateful-lstm-recurrent-neural-networks-python-keras/)
* [Reshape Input Data LSTMs](https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/)
* [How to use return_state](https://www.dlology.com/blog/how-to-use-return_state-or-return_sequences-in-keras/)
* [One-hot Encoding](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f)
* [Dropout](https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/)
* [ReLU](https://medium.com/@danqing/a-practical-guide-to-relu-b83ca804f1f7)
* [First Neural Network Project](https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/)