In [15]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import time
import csv
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed


In [16]:
DATA_DIR = '../../../analysis/data/nbmodel_templates.csv'
BATCH_SIZE = 100
HIDDEN_DIM = 200
SEQ_LENGTH = 20
WEIGHTS = ''

GENERATE_LENGTH = 100
LAYER_NUM = 2

In [17]:
reports = []

with open(DATA_DIR, encoding='utf-8-sig') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            reports.append(row[3].replace('"', '').replace("[", '').replace(']', ''))
            line_count += 1
    print(f'Processed {line_count} lines.')

Column names are report, report_chunks, template_chunks, templates
Processed 1953 lines.


In [18]:
data = ' '.join(reports).split(' ')
# data = np.array(reports).flatten()
# data = ' '.join(data)
len(data)

40864

In [19]:
data[:10]

['${player_name}',
 'caught',
 '${receptions}',
 'passes',
 'for',
 '${rec_yards}',
 'yards',
 'and',
 'a',
 'touchdown']

In [20]:
chars = list(set(data)) #set: gets unique values
VOCAB_SIZE = len(chars)

print('Unique Words:\n{}\n\nVOCAB_SIZE: {}'.format(chars, VOCAB_SIZE))

Unique Words:
['', '${pass_td}', 'field', 'have', 'rolled', '${team}.', '(foot)', 'running', 'into', 'rumbled', 'passes', '27', 'career-high', 'catching', '${team_score}-${opp_score},', 'efficiently', 'bulldozed', 'touchdowns,', "Achilles'", 'interceptions', 'the', 'back', 'reception.', 'missed', 'ruptured', 'logged', '41', 'yards.', '11', 'London,', '${opp_score}', 'against', 'while', 'demolition', 'carries.', 'way', 'Sundays', 'feared', 'times', 'Undrafted', 'additional', '6', '36', 'earned', 'contributed', 'out', 'added', '${opp_score}-${team_score},', '2018', 'week', 'Fitzpatrick', '13', 'snagging', "night's", '5', 'catches', 'haul', 'tendon', 'from', 'picks', 'hauling', "Buffalo's", 'Thanksgiving', '76', 'tension', 'pick', 'Niners.', 'before', 'pickup', 'soaked', 'Making', '${game_dow}', 'production', "Steelers'", 'rushes', 'Hunter', 'McCarthy.', '3', 'adding', 'listed', 'Justin', '${week}', 'also', '95', 'delivered', '37', 'fractured', '16', "Falcons'", 'combined', 'drubbing', "P

In [21]:
idx_to_char = {i: char for i, char in enumerate(chars)}
char_to_idx = {char: i for i, char in enumerate(chars)}

In [22]:
import numpy as np

N_FEATURES = VOCAB_SIZE #one hot encoding here, that's why, but deduplicated for clarity

N_SEQ = int(np.floor((len(data) - 1) / SEQ_LENGTH))

X = np.zeros((N_SEQ, SEQ_LENGTH, N_FEATURES))
y = np.zeros((N_SEQ, SEQ_LENGTH, N_FEATURES))

for i in range(N_SEQ):
  X_sequence = data[i * SEQ_LENGTH: (i + 1) * SEQ_LENGTH]
  X_sequence_ix = [char_to_idx[c] for c in X_sequence]
  input_sequence = np.zeros((SEQ_LENGTH, N_FEATURES))
  for j in range(SEQ_LENGTH):
    input_sequence[j][X_sequence_ix[j]] = 1. #one-hot encoding of the input characters
  X[i] = input_sequence
  
  y_sequence = data[i * SEQ_LENGTH + 1: (i + 1) * SEQ_LENGTH + 1] #shifted by 1 to the right
  y_sequence_ix = [char_to_idx[c] for c in y_sequence]
  target_sequence = np.zeros((SEQ_LENGTH, N_FEATURES))
  for j in range(SEQ_LENGTH):
    target_sequence[j][y_sequence_ix[j]] = 1. #one-hot encoding of the target characters
  y[i] = target_sequence

In [23]:
from keras.models import Sequential
from keras.layers import CuDNNLSTM, TimeDistributed, Dense, Activation

# constant parameter for the model
HIDDEN_DIM = 700 #size of each hidden layer, "each layer has 700 hidden states"
LAYER_NUM = 2 #number of hidden layers, how much were used?

model = Sequential()
model.add(LSTM(HIDDEN_DIM, 
               input_shape=(None, VOCAB_SIZE), 
               return_sequences=True))
for _ in range(LAYER_NUM - 1):
  model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

In [69]:
def generate_text(model, length):
  ix = [np.random.randint(VOCAB_SIZE)]
  y_char = [idx_to_char[ix[-1]]]
  X = np.zeros((1, length, VOCAB_SIZE))
  for i in range(length):
    X[0, i, :][ix[-1]] = 1.
    ix = np.argmax(model.predict(X[:, :i+1,:])[0], 1)
#     print(idx_to_char[ix[-1]], end=" ")
    y_char.append(idx_to_char[ix[-1]])
  return ' '.join(y_char).split('.')

In [48]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
# callback to save the model if better
filepath="tgt_model.hdf5"
save_model_cb = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
# callback to stop the training if no improvement
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=10)
# callback to generate text at epoch end
class generateText(Callback):
    def on_epoch_end(self, batch, logs={}):
        if batch+1 % 10 == 0:
            print(generate_text(self.model, GENERATE_LENGTH))
        
generate_text_cb = generateText()

callbacks_list = [save_model_cb, early_stopping_cb, generate_text_cb]

In [47]:
NB_EPOCHS = 200 #max number of epochs to train, "200 epochs"
BATCH_SIZE = 100
VALIDATION_SPLIT = 0.1 #proportion of the batch used for validation at each epoch

model.fit(X, y, batch_size=BATCH_SIZE, verbose=1, epochs=NB_EPOCHS, 
          callbacks=callbacks_list, validation_split=VALIDATION_SPLIT)

Train on 1838 samples, validate on 205 samples
Epoch 1/200

Epoch 00001: val_acc improved from -inf to 0.79780, saving model to tgt_model.hdf5
Epoch 2/200

Epoch 00002: val_acc improved from 0.79780 to 0.80293, saving model to tgt_model.hdf5
Epoch 3/200

Epoch 00003: val_acc improved from 0.80293 to 0.80463, saving model to tgt_model.hdf5
Epoch 4/200

Epoch 00004: val_acc improved from 0.80463 to 0.80805, saving model to tgt_model.hdf5
Epoch 5/200

Epoch 00005: val_acc did not improve from 0.80805
Epoch 6/200

Epoch 00006: val_acc did not improve from 0.80805
Epoch 7/200

Epoch 00007: val_acc did not improve from 0.80805
Epoch 8/200

Epoch 00008: val_acc did not improve from 0.80805
Epoch 9/200

Epoch 00009: val_acc did not improve from 0.80805
Epoch 10/200

Epoch 00010: val_acc did not improve from 0.80805
Epoch 11/200

Epoch 00011: val_acc did not improve from 0.80805
Epoch 12/200

Epoch 00012: val_acc did not improve from 0.80805
Epoch 13/200

Epoch 00013: val_acc did not improve fr

KeyboardInterrupt: 

In [70]:
output = generate_text(model, 2000)

In [71]:
list(set(output))

[" ${player_name} rushed a ${rec_yards} yard in the ${team}' week ${week} ${player_name} ${player_name} a touchdowns in the ${team}' week ${week} win the the ${team}' week ${week} win over the ${opp}",
 ' ${player_name} rushed ${rush_attempts} times for ${rush_yards} yards and in the of week ${week} against the ${opp}',
 " ${player_name} rushed ${rush_attempts} times for ${rush_yards} yards and hauled in ${receptions} of ${rec_targets} targets for ${rec_yards} yards in the ${team}' week ${week} win over the ${opp}",
 " ${player_name} caught ${receptions} of ${rec_targets} targets for ${rec_yards} yards and a touchdown in the ${team}' week ${week} loss to the ${opp}",
 " ${player_name} rushed ${rush_attempts} times for ${rush_yards} yards and a touchdown in the ${team}' week ${week} win over the the ${player_name} ${player_name} ${player_name} in the ${team}' week ${week} ${player_name} hauled in ${receptions} of his targets for week ${week} ${week} in the week week ${player_name} ${pla