In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import time
import csv
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed


Using TensorFlow backend.


In [2]:
DATA_DIR = '../../../analysis/data/nbmodel_templates.csv'
BATCH_SIZE = 5
HIDDEN_DIM = 50
SEQ_LENGTH = 3
WEIGHTS = ''

GENERATE_LENGTH = 5
LAYER_NUM = 2

In [3]:
reports = []

with open(DATA_DIR, encoding='utf-8-sig') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            reports.append(row[2].replace('"', '').replace("[", '').replace(']', '').split(","))
            line_count += 1
    print(f'Processed {line_count} lines.')

Column names are report, report_chunks, template_chunks, templates
Processed 1953 lines.


In [4]:
data = np.concatenate(reports)
len(data)

6756

In [5]:
# method for preparing the training data
def load_data(data, seq_length):
    chunks = list(set(data))
    VOCAB_SIZE = len(chunks)

    print('Data length: {} chunks'.format(len(data)))
    print('Vocabulary size: {} chunks'.format(VOCAB_SIZE))

    ix_to_chunk = {ix:chunk for ix, chunk in enumerate(chunks)}
    chunk_to_ix = {chunk:ix for ix, chunk in enumerate(chunks)}
    
    num_seq = int(np.floor((len(data) - 1) / seq_length))

    X = np.zeros((num_seq, seq_length, VOCAB_SIZE))
    y = np.zeros((num_seq, seq_length, VOCAB_SIZE))
    for i in range(0, num_seq):
        X_sequence = data[i*seq_length:(i+1)*seq_length]
        X_sequence_ix = [chunk_to_ix[value] for value in X_sequence]
        input_sequence = np.zeros((seq_length, VOCAB_SIZE))
        for j in range(seq_length):
            input_sequence[j][X_sequence_ix[j]] = 1.
            X[i] = input_sequence

        y_sequence = data[i*seq_length+1:(i+1)*seq_length+1]
        y_sequence_ix = [chunk_to_ix[value] for value in y_sequence]
        target_sequence = np.zeros((seq_length, VOCAB_SIZE))
        for j in range(seq_length):
            target_sequence[j][y_sequence_ix[j]] = 1.
            y[i] = target_sequence
    return X, y, VOCAB_SIZE, ix_to_chunk

In [6]:
# Creating training data
X, y, VOCAB_SIZE, ix_to_chunk = load_data(data, SEQ_LENGTH)

Data length: 6756 chunks
Vocabulary size: 959 chunks


In [7]:
# method for generating text
def generate_text(model, length, vocab_size, ix_to_chunk):
    # starting with random chunk
    ix = [np.random.randint(vocab_size)]
    y_chunk = [ix_to_chunk[ix[-1]]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted chunk to sequence
        X[0, i, :][ix[-1]] = 1
        print(ix_to_chunk[ix[-1]], end="")
        ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
        y_chunk.append(ix_to_chunk[ix[-1]])
    return ('').join(y_chunk)

In [8]:
# x = np.zeros((1, GENERATE_LENGTH, VOCAB_SIZE))

In [9]:
# model.predict(x[:, :5, :])

In [10]:
# Creating and compiling the Network
model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
  model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [11]:
# Generate some sample before training to know how bad it is!
generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_chunk)

 in the ${team}' week ${week} rout of the ${opp}. 'tacked on ${receptions} catches for ${rec_yards} yards and a touchdown on ${rec_targets} targets ${game_dow} ' 'pulled in ${receptions} catches for ${rec_yards} yards on ${rec_targets} targets ${game_dow} ' week ${week} win over the ${opp} 'rushed ${rush_attempts} times for ${rush_yards} yards and missed his lone target '

" in the ${team}' week ${week} rout of the ${opp}. 'tacked on ${receptions} catches for ${rec_yards} yards and a touchdown on ${rec_targets} targets ${game_dow} ' 'pulled in ${receptions} catches for ${rec_yards} yards on ${rec_targets} targets ${game_dow} ' week ${week} win over the ${opp} 'rushed ${rush_attempts} times for ${rush_yards} yards and missed his lone target ' 'completed ${opp_score} of 36 passes '"

In [12]:
# Training if there is no trained weights specified
if not WEIGHTS == '':
  model.load_weights(WEIGHTS)
  nb_epoch = int(WEIGHTS[WEIGHTS.rfind('_') + 1:WEIGHTS.find('.')])
else:
  nb_epoch = 0

if WEIGHTS == '':
  while True:
    print('\n\nEpoch: {}\n'.format(nb_epoch))
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=1, nb_epoch=1)
    nb_epoch += 1
    #generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)
    if nb_epoch % 10 == 0:
      model.save_weights('checkpoint_layer_{}_hidden_{}_epoch_{}.hdf5'.format(LAYER_NUM, HIDDEN_DIM, nb_epoch))
# Else, loading the trained weights and perform generation only
elif WEIGHTS == '':
  # Loading the trained weights
  model.load_weights(WEIGHTS)
  generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)
  print('\n\n')
else:
  print('\n\nNothing to do!')



Epoch: 0



  # This is added back by InteractiveShellApp.init_path()


Epoch 1/1


Epoch: 1

Epoch 1/1


Epoch: 2

Epoch 1/1


Epoch: 3

Epoch 1/1


Epoch: 4

Epoch 1/1


Epoch: 5

Epoch 1/1


Epoch: 6

Epoch 1/1


Epoch: 7

Epoch 1/1


Epoch: 8

Epoch 1/1


Epoch: 9

Epoch 1/1


Epoch: 10

Epoch 1/1


Epoch: 11

Epoch 1/1


Epoch: 12

Epoch 1/1


Epoch: 13

Epoch 1/1


Epoch: 14

Epoch 1/1


Epoch: 15

Epoch 1/1


Epoch: 16

Epoch 1/1


Epoch: 17

Epoch 1/1


Epoch: 18

Epoch 1/1


Epoch: 19

Epoch 1/1


Epoch: 20

Epoch 1/1


Epoch: 21

Epoch 1/1


Epoch: 22

Epoch 1/1


Epoch: 23

Epoch 1/1


Epoch: 24

Epoch 1/1


Epoch: 25

Epoch 1/1


Epoch: 26

Epoch 1/1


Epoch: 27

Epoch 1/1


Epoch: 28

Epoch 1/1


Epoch: 29

Epoch 1/1


Epoch: 30

Epoch 1/1


Epoch: 31

Epoch 1/1


Epoch: 32

Epoch 1/1


Epoch: 33

Epoch 1/1


Epoch: 34

Epoch 1/1


Epoch: 35

Epoch 1/1


Epoch: 36

Epoch 1/1


Epoch: 37

Epoch 1/1


Epoch: 38

Epoch 1/1


Epoch: 39

Epoch 1/1


Epoch: 40

Epoch 1/1


Epoch: 41

Epoch 1/1


Epoch: 42

Epoch 1/1


Epoch: 43

Epoch 1/1


Epoch: 4

KeyboardInterrupt: 

In [14]:
generate_text(model, 20, VOCAB_SIZE, ix_to_chunk)

'${team} ${player_position} ${player_name} ' 'caught ${receptions} of ${rec_targets} targets for ${rec_yards} yards ' in the ${team}' week ${week} loss to the ${opp}.'${player_name} ' 'rushed ${rush_attempts} times for ${rush_yards} yards and ' 'caught ${receptions} of ${rec_targets} targets for ${rec_yards} yards ' in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. 'in week ${week} against the ${opp}.'

"'${team} ${player_position} ${player_name} ' 'caught ${receptions} of ${rec_targets} targets for ${rec_yards} yards ' in the ${team}' week ${week} loss to the ${opp}.'${player_name} ' 'rushed ${rush_attempts} times for ${rush_yards} yards and ' 'caught ${receptions} of ${rec_targets} targets for ${rec_yards} yards ' in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. in the ${team}' week ${week} loss to the ${opp}. 'in week ${week} against the ${opp}.' in ${t