## ANNDL Final Project: _Jeopardy!_

In [112]:
import csv
import random
import numpy as np
from sklearn import preprocessing

In [7]:
# Import (value, question, answer) three-ples from CSV.
data = []
with open("/Users/fiordali/Downloads/JEOPARDY_CSV.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        data.append(row[4:])

random.shuffle(data) # Do I have to avoid shuffling the data/recreating the train and test data sets?

In [304]:
# We start with all 216,931 rows from the CSV file, which we will clean up.

clean_data = []
set_dollar_values = {"$200", "$400", "$600", "$800", "$1000",
                     "$1200", "$1600", "$2000"}
for item in data:
    # Cut out rows that are Final Jeopardy (where dollar value = "None").
    if item[0] != "None" and len(item[0]) > 0: 
        # Cut out rows that are Daily Double (where dollar value != one of the set dollar values; imperfect check)
        if item[0] in set_dollar_values:
            clean_data.append(item)

# We now have 177,850 rows of data.

# Convert dollar value strings into ints.

177850


In [None]:
# Split data into two randomized groups: testing and training data.
idx = len(data) // 2

# Ideally would train on half the data points, but currently takes too long to run.
train_set = data[:7500]
test_set = data[7500:]

# Remove dollar value and answer from training set.
train_questions = []
for item in train_set:
    train_questions.append(item[1])

In [157]:
# Find length of longest training question by character.
max_len = 0
counter = 0

for question in train_questions:
    for letter in question:
        counter += 1
    if counter > max_len:
        max_len = counter
    counter = 0

## Train LSTM on questions

In [158]:
%matplotlib inline

import sys
import io
import requests as rq
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
from collections import Counter

In [159]:
# Q1: What is the purpose of this block? When is `char_indices` used? What about `indices_char`?
chars = sorted(list(set("".join(train_questions))))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, '*': 10, '+': 11, ',': 12, '-': 13, '.': 14, '/': 15, '0': 16, '1': 17, '2': 18, '3': 19, '4': 20, '5': 21, '6': 22, '7': 23, '8': 24, '9': 25, ':': 26, ';': 27, '<': 28, '=': 29, '>': 30, '?': 31, 'A': 32, 'B': 33, 'C': 34, 'D': 35, 'E': 36, 'F': 37, 'G': 38, 'H': 39, 'I': 40, 'J': 41, 'K': 42, 'L': 43, 'M': 44, 'N': 45, 'O': 46, 'P': 47, 'Q': 48, 'R': 49, 'S': 50, 'T': 51, 'U': 52, 'V': 53, 'W': 54, 'X': 55, 'Y': 56, 'Z': 57, '[': 58, ']': 59, '_': 60, 'a': 61, 'b': 62, 'c': 63, 'd': 64, 'e': 65, 'f': 66, 'g': 67, 'h': 68, 'i': 69, 'j': 70, 'k': 71, 'l': 72, 'm': 73, 'n': 74, 'o': 75, 'p': 76, 'q': 77, 'r': 78, 's': 79, 't': 80, 'u': 81, 'v': 82, 'w': 83, 'x': 84, 'y': 85, 'z': 86, '¢': 87, '°': 88, 'é': 89, '–': 90, '’': 91, '“': 92, '”': 93, '…': 94}


In [160]:
# Q2: What is the purpose of this block? What do the `seqlen` and `step` parameters do?
seqlen = max_len # Length in chars of longest question

# Q3: What about this block? What is `x` and what is `y`? Why do they have this dimensionality?
x = np.zeros((len(train_questions), seqlen, len(chars)), dtype=np.bool)
y = np.zeros((len(train_questions), seqlen, len(chars)), dtype=np.bool)
for i, question in enumerate(train_questions):
    # Q3a: What happens in this loop?
    for t, (char_in, char_out) in enumerate(zip(question[:-1], question[1:])):
        x[i, t, char_indices[char_in]] = 1
        y[i, t, char_indices[char_out]] = 1


# Q4: Here we build the model. What does the `return_sequences` argument do? Why the dense layer at the end?
model = Sequential()
model.add(LSTM(128, input_shape=(seqlen, len(chars)), return_sequences=True)) # ret_seq = False because we want abstract feature vector as output
model.add(Dense(len(chars), activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(learning_rate=0.01),
    metrics=['categorical_crossentropy', 'accuracy']
)

In [161]:
model.fit(x, y,
          batch_size=128,
          epochs=50,
          verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x170cb7ad0>

## Train FF on feature vectors from LSTM

In [176]:
# Generate feature vectors for training questions.
# The feature vectors will be the x_train data for the FF network.
train_vectors = []

for i in range(len(train_questions)):
    x_pred = np.zeros((1, seqlen, len(chars)))
    for t, char in enumerate(train_questions[i]):
        x_pred[0, t, char_indices[char]] = 1.

    pred = model.predict(x_pred, verbose=0)
    train_vectors.append(pred)

In [290]:
# Clean up the data.
y_train = []                 # This set will exclude Final Jeopardy rows

for item in train_set:
    if item[0] != "None" and len(item[0]) > 0:    # Final Jeopardy rows are distinguished by their dollar value = "None"
        y_train.append(item)

In [291]:
# Set up data to train FF network.
# (Feature vectors have same index as their original question and dollar value.)

x_train = array(train_vectors)                     # Pass in feature vectors representing question text.
y_train = array([row[0][1:] for row in y_train]) # Expect dollar value associated with each question as output.

y_train_digits = []
for i, item in enumerate(y_train):
    dollar_value = y_train[i].replace(',', '')
    y_train_digits.append(int(dollar_value))

# print(train_set)

In [294]:
print(type(y_train_digits[0]))
print(y_train_digits[0] + y_train_digits[0])

<class 'int'>
600


In [246]:
print(len(y_train))
for i, value in enumerate(y_train):
    level = y_train[i]
    y_train[i] = int(y_train[i][1:])

print(y_train[0])

3


ValueError: invalid literal for int() with base 10: 'e included an unflattering description of himself in one of "The Canterbury Tales"'

In [224]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, RMSprop, Adam

(x, y), _ = keras.datasets.mnist.load_data()

# Reshape each image to 1dim vector. Effectively, reshape
# `x_train` from (60000, 28, 28) to (60000, 28*28)
x_train_mnist = x[:50000].reshape(-1, 28*28)
x_test_mnist = x[50000:].reshape(-1, 28*28)

# Reshape each 4d vector into a 2d vector
# (7550, 1, 526, 95) -> (7550, 49970)
x_train = x_train.reshape(-1,1*526*95)

# Convert y_train from vector of labels to one-hot encoding vector
print(y[:10])
y = keras.utils.to_categorical(y, num_classes=10)
print(y[:10])
y_train_mnist = y[:50000]
y_test_mnist = y[50000:]

y_train = keras.utils.to_categorical(y_train, num_classes=10)     # There are 10 question values ($200-$2000)

# model = Sequential()
# model.add(Dense(512, input_dim=526*95))
# model.add(Activation('relu'))
# model.add(Dropout(0.2))
# model.add(Dense(10))
# model.add(Activation('relu'))
# model.add(Dropout(0.2))

# model.compile(loss='binary_crossentropy',
#               optimizer=RMSprop(),
#               metrics=['accuracy'])

# model.fit(x_train, y_train,
#           epochs=20,
#           batch_size=128)
# score = model.evaluate(x_test, y_test, batch_size=128)

[5 0 4 1 9 2 1 3 1 4]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]


ValueError: invalid literal for int() with base 10: '$300'

## References:
* [Understanding LSTMs](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
* [Emma Boettcher Thesis](https://futurism.com/jeopardy-emma-boettcher-ai-james-holzhauer)
* [A Gentle Introduction to LSTM Autoencoders](https://machinelearningmastery.com/lstm-autoencoders/)
* [LSTM – nuggest for practical application](https://towardsdatascience.com/lstm-nuggets-for-practical-applications-5beef5252092)
* [Understanding Stateful LSTM RNNs Python Keras](https://machinelearningmastery.com/understanding-stateful-lstm-recurrent-neural-networks-python-keras/)
* [Reshape Input Data LSTMs](https://machinelearningmastery.com/reshape-input-data-long-short-term-memory-networks-keras/)
* [How to use return_state](https://www.dlology.com/blog/how-to-use-return_state-or-return_sequences-in-keras/)