In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import time
import csv
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.layers.wrappers import TimeDistributed


Using TensorFlow backend.


In [3]:
DATA_DIR = '../../../analysis/data/nbmodel_templates.csv'
BATCH_SIZE = 50
HIDDEN_DIM = 500
SEQ_LENGTH = 50
WEIGHTS = ''

GENERATE_LENGTH = 500
LAYER_NUM = 2

In [4]:
reports = []

with open(DATA_DIR, encoding='utf-8-sig') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            reports.append(row[3])
            line_count += 1
    print(f'Processed {line_count} lines.')

Column names are report, report_chunks, template_chunks, templates
Processed 1953 lines.


In [5]:
data = np.array(reports).flatten()
len(' '.join(data))

313727

In [6]:
# method for preparing the training data
def load_data(data, seq_length):
    chars = list(set(data))
    VOCAB_SIZE = len(chars)

    print('Data length: {} characters'.format(len(data)))
    print('Vocabulary size: {} characters'.format(VOCAB_SIZE))

    ix_to_char = {ix:char for ix, char in enumerate(chars)}
    char_to_ix = {char:ix for ix, char in enumerate(chars)}
    
    num_seq = len(data)//seq_length

    X = np.zeros((num_seq, seq_length, VOCAB_SIZE))
    y = np.zeros((num_seq, seq_length, VOCAB_SIZE))
    for i in range(0, num_seq):
        X_sequence = data[i*seq_length:(i+1)*seq_length]
        X_sequence_ix = [char_to_ix[value] for value in X_sequence]
        input_sequence = np.zeros((seq_length, VOCAB_SIZE))
        for j in range(seq_length):
            input_sequence[j][X_sequence_ix[j]] = 1.
            X[i] = input_sequence

        y_sequence = data[i*seq_length+1:(i+1)*seq_length+1]
        y_sequence_ix = [char_to_ix[value] for value in y_sequence]
        target_sequence = np.zeros((seq_length, VOCAB_SIZE))
        for j in range(seq_length):
            target_sequence[j][y_sequence_ix[j]] = 1.
            y[i] = target_sequence
    return X, y, VOCAB_SIZE, ix_to_char

In [7]:
# Creating training data
X, y, VOCAB_SIZE, ix_to_char = load_data(' '.join(data), SEQ_LENGTH)

Data length: 313727 characters
Vocabulary size: 71 characters


In [8]:
# method for generating text
def generate_text(model, length, vocab_size, ix_to_char):
	# starting with random character
	ix = [np.random.randint(vocab_size)]
	y_char = [ix_to_char[ix[-1]]]
	X = np.zeros((1, length, vocab_size))
	for i in range(length):
		# appending the last predicted character to sequence
		X[0, i, :][ix[-1]] = 1
		print(ix_to_char[ix[-1]], end="")
		ix = np.argmax(model.predict(X[:, :i+1, :])[0], 1)
		y_char.append(ix_to_char[ix[-1]])
	return ('').join(y_char)

In [9]:
# Creating and compiling the Network
model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
  model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

In [11]:
# Generate some sample before training to know how bad it is!
generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)

FkffyyyyyyTTSSSSsssss-----yyyyyyyyTTiTiTiT5"""uu_uqqqq5_VVeeNNNNN__EE''EnXX000kk)JJJl5rrkkfffyyyyyyTttiiiiPkkk8}AAAAkkyyyyyy4jjodddoou_uaaaFyyyyyyt8iii4PIIIIIoloIIIoIIoIIoIIIooII_IIoIIo_IIlo___Fu"__uuu_XX_))$$$$$ccccqqqqqq555eeccccccccczzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz

'FkffyyyyyyTTSSSSsssss-----yyyyyyyyTTiTiTiT5"""uu_uqqqq5_VVeeNNNNN__EE\'\'EnXX000kk)JJJl5rrkkfffyyyyyyTttiiiiPkkk8}AAAAkkyyyyyy4jjodddoou_uaaaFyyyyyyt8iii4PIIIIIoloIIIoIIoIIoIIIooII_IIoIIo_IIlo___Fu"__uuu_XX_))$$$$$ccccqqqqqq555eeccccccccczzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'

In [13]:
# Training if there is no trained weights specified
nb_epoch = 0
if WEIGHTS == '':
  while True:
    print('\n\nEpoch: {}\n'.format(nb_epoch))
    model.fit(X, y, batch_size=BATCH_SIZE, verbose=1, nb_epoch=1)
    nb_epoch += 1
    generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)
    if nb_epoch % 10 == 0:
      model.save_weights('checkpoint_layer_{}_hidden_{}_epoch_{}.hdf5'.format(LAYER_NUM, HIDDEN_DIM, nb_epoch))



Epoch: 0



  


Epoch 1/1
} ${res__arres} yardss yards  add ${{uas__aree} taasss  arees ${oppss. ${playyrr_aam}  ards  {reec ${opps}  ${player_name} caugee ${reec_tooss}  oass ff ${rec_yyrdss} yards  and  {aeek wweek wwe  vee  toe  ${ppp}. ${player_name} caugee ${reec_tooss}  oass ff ${rec_yyrdss} yards  and  {aeek wweek wwe  vee  toe  ${ppp}. ${player_name} caugee ${reec_tooss}  oass ff ${rec_yyrdss} yards  and  {aeek wweek wwe  vee  toe  ${ppp}. ${player_name} caugee ${reec_tooss}  oass ff ${rec_yyrdss} yards  and  {ae

Epoch: 1

Epoch 1/1
-${rec_targets} targets for ${rec_yards} yards in the ${team}' week ${week} loss to the ${opp}. ${player_name} caught ${receptions} of ${rec_targets} targets for ${rec_yards} yards in the ${team}' week ${week} loss to the ${opp}. ${player_name} caught ${receptions} of ${rec_targets} targets for ${rec_yards} yards in the ${team}' week ${week} loss to the ${opp}. ${player_name} caught ${receptions} of ${rec_targets} targets for ${rec_yards} yards in the ${team}' wee

KeyboardInterrupt: 