## ANNDL Final Project: _Jeopardy!_

In [549]:
import csv
import random
import numpy as np
# from sklearn import preprocessing

In [504]:
# Import (value, question, answer) three-ples from CSV.
data = []
with open("/Users/fiordali/Downloads/JEOPARDY_CSV.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        data.append(row[4:])

random.shuffle(data) # Do I have to avoid shuffling the data/recreating the train and test data sets?

In [505]:
# We start with all 216,931 rows from the CSV file, which we will clean up.
clean_data = []

# TODO: Use sklearn labelencoder instead
dollar_values_map = {"$200 ": 0, "$400 ": 1, "$600 ": 2, "$800 ": 3, "$1,000 ": 4,
                     "$1,200 ": 5, "$1,600 ": 6, "$2,000 ": 7} 

for row in data:
    # Cut out rows that are Daily Double or Final Jeopardy (imperfect checking criteria)
    value = row[0]
    if value in dollar_values_map:
        # Map dollar value string to corresponding 'index'.
        row[0] = dollar_values_map[value]
        clean_data.append(row)

# We now have 182,217 rows of data.

In [507]:
# Split data into two randomized groups: testing and training data.
idx = len(clean_data) // 2

# Ideally would split data in half, but currently takes too long to run.
train_set = clean_data[:7500]
test_set = clean_data[7500:15000]

# Create sets of ONLY questions (remove dollar value and answer).
all_questions = [row[1] for row in clean_data]
lstm_train_questions = [row[1] for row in train_set]

In [508]:
# Map the characters that occur in the question text to indices.
chars = sorted(list(set("".join([row[1] for row in clean_data]))))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [510]:
# Find length of longest training question by character.
max_len = 0
counter = 0

for question in lstm_train_questions:
    for letter in question:
        counter += 1
    if counter > max_len:
        max_len = counter
    counter = 0

## Train LSTM on questions

In [511]:
%matplotlib inline

import sys
import io
import requests as rq
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
from collections import Counter

In [515]:
seqlen = max_len # Length in chars of longest question

# For every question we indicate if a given character is present (in x) OR what the next character is (in y).
x = np.zeros((len(lstm_train_questions), seqlen, len(chars)), dtype=np.bool)
y = np.zeros((len(lstm_train_questions), seqlen, len(chars)), dtype=np.bool)

for i, question in enumerate(lstm_train_questions):
    # Iterate over every question in the training data.
    # For every question, pair character t with character t+1 to provide context.
    for t, (char_in, char_out) in enumerate(zip(question[:-1], question[1:])):
        x[i, t, char_indices[char_in]] = 1
        y[i, t, char_indices[char_out]] = 1

lstm_model = Sequential()
lstm_model.add(LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True)) # ret_seq = False because we want abstract feature vector as output
lstm_model.add(Dense(len(chars), activation='softmax'))                            # CUT THIS LAYER? Or is this the feature vector we pass to FF?

lstm_model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.01),
    metrics=['categorical_crossentropy', 'accuracy']
)

In [516]:
lstm_model.fit(x, y,
          batch_size=128,
          epochs=5,
          verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x147d3ffd0>

## Train FF on feature vectors from LSTM

In [517]:
# A quick way to check predictions. DELETE LATER.
x_pred = np.zeros((1, seqlen, len(chars)))
print(x_pred.shape)
pred = lstm_model.predict(x_pred, verbose=1)
print(pred)

(1, 540, 126)
[[[1.6361788e-02 5.3902408e-03 7.7475873e-03 ... 3.1111655e-03
   2.3663426e-03 1.4090101e-03]
  [3.7408464e-02 3.4574748e-03 3.8847846e-03 ... 5.1791471e-04
   5.0410780e-04 2.1345408e-04]
  [1.6578005e-01 1.8813291e-03 2.9995050e-03 ... 3.0032401e-05
   8.1760256e-05 8.3359910e-06]
  ...
  [2.0075856e-01 2.2773757e-03 1.1516644e-02 ... 2.2977754e-05
   6.2739949e-05 3.1469513e-06]
  [2.0075864e-01 2.2773754e-03 1.1516645e-02 ... 2.2977742e-05
   6.2739971e-05 3.1469524e-06]
  [2.0075862e-01 2.2773752e-03 1.1516644e-02 ... 2.2977740e-05
   6.2739964e-05 3.1469522e-06]]]


In [518]:
# Generate feature vectors for training questions.
# The feature vectors will be the x_train data for the FF network.
ff_train_vectors = []

for i in range(len(lstm_train_questions)):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(lstm_train_questions[i]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = lstm_model.predict(x_pred, verbose=0)
    ff_train_vectors.append(pred)

In [446]:
print(ff_train_vectors[2])

[[[2.1266250e-02 3.2636736e-04 1.0913979e-03 ... 2.2489514e-05
   2.4427823e-04 1.3034280e-05]
  [5.6462851e-03 1.9178340e-04 4.1151870e-04 ... 9.2137880e-06
   2.4401390e-05 5.3854428e-06]
  [3.7833758e-02 3.6581009e-04 2.5705697e-03 ... 1.4364036e-05
   2.7630402e-05 3.5621633e-06]
  ...
  [1.7368338e-01 1.0118251e-03 1.3046853e-02 ... 4.5242617e-05
   1.3701910e-04 1.1066317e-05]
  [1.7368335e-01 1.0118249e-03 1.3046857e-02 ... 4.5242610e-05
   1.3701909e-04 1.1066325e-05]
  [1.7368330e-01 1.0118247e-03 1.3046867e-02 ... 4.5242599e-05
   1.3701904e-04 1.1066322e-05]]]


In [519]:
# Set up data to train FF network.
# (Feature vectors have same index as their original question and dollar value.)
x = array(ff_train_vectors)                 # Pass in feature vectors representing question text.
y = array([row[0] for row in train_set])    # Expect dollar value associated with each question as output.

In [521]:
print(len(ff_train_vectors))
print(len(x), len(y))
print(len(lstm_train_questions))
print(y[1])
print(x.shape)

7500
7500 7500
7500
4
(7500, 1, 540, 126)


In [550]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, RMSprop, Adam
from sklearn import preprocessing

In [574]:
# Reshape each 4d vector into a 2d vector
# (7500, 1, 540, 126) -> (7500, 68040)
x_train_ff = x.reshape(-1,1*540*126)

# Reshape each 1d digit label into 2d one-hot encoding
y_train_ff = keras.utils.to_categorical(y, num_classes=8)     # There are 8 dollar values (mapped as 0-7)

ff_model = Sequential()

# NO ACTIVATIONS IN OUTPUT (NO PREDICTION).
# ff_model.add(Dense(512, input_dim=540*126, activation='relu'))
# ff_model.add(Dropout(0.5))
# ff_model.add(Dense(256, activation='relu'))
# ff_model.add(Dropout(0.5))
# ff_model.add(Dense(8, activation='relu'))

# ff_model.add(Dense(512, input_dim=540*126, activation='relu'))
# ff_model.add(Dropout(0.2))
# ff_model.add(Dense(8, activation='relu'))
# ff_model.add(Dropout(0.2))

# PREDICTION VECTOR FILLED WITH ACTIVATIONS!!! SOME (4/10) ACCURATE!
ff_model.add(Dense(1024, input_dim=540*126, activation='relu'))
ff_model.add(Dropout(0.5))
ff_model.add(Dense(512, activation='sigmoid'))
ff_model.add(Dropout(0.5))
ff_model.add(Dense(8, activation='sigmoid'))

ff_model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

In [575]:
ff_model.fit(x_train_ff, y_train_ff,
          epochs=10,
          batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x146906690>

In [595]:
x_pred = np.zeros((1, seqlen, len(chars)))
for t, char in enumerate(test_set[300][1]):
    x_pred[0, t, char_indices[char]] = 1.

lstm_pred = lstm_model.predict(x_pred, verbose=0)

ff_pred = ff_model.predict(lstm_pred.reshape(-1,540*126))

In [530]:
# Generate the testing data for the FF network.
x_test_ff = []  # Feature vectors.
y_test_ff = []  # Corresponding dollar values.

for i in range(7500):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(test_set[i][1]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = lstm_model.predict(x_pred, verbose=0)
    x_test_ff.append(pred)
    
y_test_ff = [row[0] for row in test_set]

In [592]:
for i in range(10):
    prediction = ff_model.predict((x_test_ff[i]).reshape(-1,540*126))
    print("---- Row #", i)
    print("test_set:\n", test_set[i])
    print(prediction)

---- Row # 0
test_set:
 [3, 'The Phoenicians used a liquid from several species of this gastropod to make Tyrian purple dye', 'Snail']
[[0.19693382 0.271347   0.11098971 0.16939458 0.10414802 0.05318233
  0.05226302 0.08221506]]
---- Row # 1
test_set:
 [6, 'Delacroix, like Byron, sided with this people\'s fight to break free of Turkey, leading to his "Massacre at Chios"', 'Greeks']
[[0.19340341 0.26690242 0.10847434 0.17032787 0.109807   0.05750459
  0.05313412 0.0798901 ]]
---- Row # 2
test_set:
 [2, 'L. Frank Baum\'s Oz books include these underground folk, spelled without the silent "G"', 'Nomes']
[[0.19547854 0.2612157  0.11118241 0.16998594 0.10936431 0.05797143
  0.05562843 0.08521166]]
---- Row # 3
test_set:
 [1, 'In "Born Standing Up", this "wild & crazy guy" recalled selling guidebooks at Disneyland at age 10', 'Sreve Martin']
[[0.19551624 0.26480576 0.10893673 0.17145765 0.10830638 0.05544623
  0.05270435 0.08154971]]
---- Row # 4
test_set:
 [3, "Located just south of the Equ

In [601]:
# score = ff_model.evaluate(array(x_test_ff).reshape(-1,540*126), y_test_ff[0], batch_size=128)

## References:
* [Understanding LSTMs](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
* [Emma Boettcher Thesis](https://futurism.com/jeopardy-emma-boettcher-ai-james-holzhauer)
* [A Gentle Introduction to LSTM Autoencoders](https://machinelearningmastery.com/lstm-autoencoders/)
* [LSTM – nuggest for practical application](https://towardsdatascience.com/lstm-nuggets-for-practical-applications-5beef5252092)
* [Understanding Stateful LSTM RNNs Python Keras](https://machinelearningmastery.com/understanding-stateful-lstm-recurrent-neural-networks-python-keras/)
* [Reshape Input Data LSTMs](https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/)
* [How to use return_state](https://www.dlology.com/blog/how-to-use-return_state-or-return_sequences-in-keras/)
* [One-hot Encoding](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f)
* [Dropout](https://machinelearningmastery.com/dropout-for-regularizing-deep-neural-networks/)
* [ReLU](https://medium.com/@danqing/a-practical-guide-to-relu-b83ca804f1f7)
* [First Neural Network Project](https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/)