In [1]:
import urllib
import urllib.request

import numpy as np
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()

In [2]:
# the number of mini-batch training steps to run (one batch per step)
numTrainingIters = 10000

# the number of units in each of the two hidden layers of the
# feed-forward network built below
hiddenUnits = 1000

# the number of classes that we are learning over (one per text file)
numClasses = 3

# the number of data points in a batch; the input placeholders are
# built with this fixed batch dimension
batchSize = 100

# the learning rate handed to the optimizer
learning_rate = 0.005

In [3]:
def addToData (maxSeqLen, data, fileName, classNum, linesToUse):
    """Sample lines of text from a URL and add them to ``data`` as one-hot matrices.

    Each sampled line becomes a matrix with one row per encoded character
    and 256 columns; row j has a single 1 in the column given by the code
    point of the j-th encoded character.  Characters whose code point is
    256 or above are dropped, so they contribute no row.

    Args:
        maxSeqLen: the largest number of encoded characters seen in any
            line so far.
        data: dictionary of dataPointID -> (classNumber, matrix) entries.
            It is updated in place; new entries are keyed starting at
            len(data).
        fileName: URL of the text file to read (anything accepted by
            urllib.request.urlopen).
        classNum: the class label to attach to every line from this file.
        linesToUse: how many line indices to draw (with replacement).
            Draws that hit a blank line are skipped, so fewer than
            linesToUse entries may be added.

    Returns:
        The pair (maxSeqLen, data): the updated maximum sequence length and
        the same dictionary with the new lines added.
    """
    #
    # open the file and read it in; the with-block closes the connection
    with urllib.request.urlopen(fileName) as response:
        content = response.readlines ()
    #
    # nothing to sample from (np.random.randint would reject an empty range)
    if not content:
        return (maxSeqLen, data)
    #
    # sample linesToUse numbers; these will tell us what lines
    # from the text file we will use
    myInts = np.random.randint (0, len(content), linesToUse)
    #
    # i is the key of the next line of text to add to the dictionary
    i = len(data)
    #
    # loop thru and add the lines of text to the dictionary
    for whichLine in myInts.flat:
        #
        # get the line and ignore it if it has nothing in it
        line = content[whichLine].decode("utf-8")
        if not line.strip ():
            continue
        #
        # keep only the characters that fit in the 256-wide encoding, so
        # the matrix has no all-zero rows for dropped characters
        codes = np.asarray ([ord(ch) for ch in line if ord(ch) < 256], dtype=np.intp)
        #
        # take note if this is the longest line we've seen
        maxSeqLen = max (maxSeqLen, len(codes))
        #
        # one hot! row j gets a 1 in column codes[j]
        temp = np.zeros((len(codes), 256))
        temp[np.arange (len(codes)), codes] = 1
        #
        # remember the line of text and move onto the next key
        data[i] = (classNum, temp)
        i = i + 1
    #
    # and return the dictionary with the new data
    return (maxSeqLen, data)

In [4]:
def pad (maxSeqLen, data):
    """Left-pad every matrix in ``data`` to maxSeqLen rows, then transpose it.

    ``data`` uses the same (classNumber, matrix) encoding as addToData.
    Each matrix is prefixed with all-zero rows ("empty characters") until
    it has maxSeqLen rows, and is then stored transposed, i.e. with shape
    (256, maxSeqLen).  The dictionary is modified in place and also
    returned.
    """
    for key in data:
        entry = data[key]
        matrix = entry[1]
        classNumber = entry[0]
        #
        # how many empty characters go in front of this line
        missing = maxSeqLen - matrix.shape[0]
        filler = np.zeros ((missing, 256))
        #
        # empty characters first, then the text; store it transposed
        padded = np.concatenate ((filler, matrix), axis = 0)
        data[key] = (classNumber, padded.T)
    return data


In [5]:
def generateDataRNN (maxSeqLen, data, batchSize = batchSize):
    """Draw a random batch of lines and labels for RNN-style training.

    batchSize keys are drawn uniformly (with replacement) from
    0 .. len(data) - 1.  The matrices of the chosen entries are stacked
    into x, which keeps one matrix per data point so that it can be
    unstacked along the last axis with tf.unstack(inputX, axis=2); the
    class numbers are stacked into the vector y.  maxSeqLen is accepted
    but not used by this function.

    Returns:
        The pair (x, y).
    """
    #
    # pick which lines of text go into this batch
    chosen = np.random.randint (0, len(data), batchSize)
    picked = [data[k] for k in chosen.flat]
    #
    # one matrix of one-hot characters per data point
    x = np.stack ([entry[1] for entry in picked])
    #
    # and the matching vector of labels
    y = np.stack ([np.array((entry[0])) for entry in picked])
    return (x, y)

In [19]:
def generateDataFeedForward (maxSeqLen, data, batchSize=batchSize):
    """Draw a random batch of flattened lines and labels for a feed-forward net.

    batchSize keys are drawn uniformly (with replacement) from
    0 .. len(data) - 1.  Each chosen matrix is flattened into a single
    vector, so x has one row per data point with all of its characters
    appended; the class numbers are stacked into the vector y.
    maxSeqLen is accepted but not used by this function.

    Returns:
        The pair (x, y).
    """
    #
    # pick which lines of text go into this batch
    chosen = np.random.randint (0, len(data), batchSize)
    picked = [data[k] for k in chosen.flat]
    #
    # one flattened vector of one-hot characters per data point
    x = np.stack ([entry[1].flatten () for entry in picked])
    #
    # and the matching vector of labels
    y = np.stack ([np.array((entry[0])) for entry in picked])
    return (x, y)


In [7]:
# start from an empty data set with no lines seen yet
maxSeqLen, data = 0, {}

# load up the three data sets; the position of each URL in the tuple is
# the class number given to its lines, and 10000 lines are sampled from each
for _classNum, _fileName in enumerate ((
        "https://s3.amazonaws.com/chrisjermainebucket/text/Holmes.txt",
        "https://s3.amazonaws.com/chrisjermainebucket/text/war.txt",
        "https://s3.amazonaws.com/chrisjermainebucket/text/william.txt")):
    maxSeqLen, data = addToData (maxSeqLen, data, _fileName, _classNum, 10000)

# pad each entry in the dictionary with empty characters as needed so
# that the sequences are all of the same length
data = pad (maxSeqLen, data)



In [8]:
# number of data points to hold out as the test set
num_test = 3000
# report the longest line length and how many lines were actually loaded
print(maxSeqLen, len(data))

87 25049


In [9]:
def split_data(data, num_test):
    """
    Randomly splits the data into training and testing sets.

    The keys of ``data`` are shuffled with np.random.shuffle; the first
    num_test shuffled entries form the test set and the remainder form the
    training set. Both result dictionaries are re-keyed 0, 1, 2, ... so they
    can be indexed by position. The values are shared with ``data``, not
    copied.

    Args:
    data (dict): Dictionary containing the data to split.
    num_test (int): Number of entries to place in the test set.

    Returns:
    tuple: Two dictionaries, the first being the training set and the second the test set.

    Raises:
    ValueError: If num_test is negative or larger than len(data).
    """
    # A negative or oversized count would slice silently and yield a
    # split of the wrong size, so reject it up front
    if not 0 <= num_test <= len(data):
        raise ValueError(
            f"num_test must be between 0 and {len(data)}, got {num_test}")

    # Extract all keys (indices) from the data
    keys = list(data.keys())
    np.random.shuffle(keys)  # Randomly shuffle the keys

    # Split the keys into training and testing
    test_keys = keys[:num_test]
    train_keys = keys[num_test:]

    # Create dictionaries for train and test datasets
    train_data = {i: data[key] for i, key in enumerate(train_keys)}
    test_data = {i: data[key] for i, key in enumerate(test_keys)}

    return train_data, test_data


In [10]:
# Split the padded data into a training set and a test set of num_test
# entries; both come back re-keyed from 0
train_data, test_data = split_data(data, num_test)


# Report how many entries ended up in each set
print("Training Set Size:", len(train_data))
print("Test Set Size:", len(test_data))

Training Set Size: 22049
Test Set Size: 3000


In [11]:
def find_max_seq_len(data):
    """
    Returns the largest per-entry matrix sum in a dataset, as an int.

    Every encoded character contributes exactly one 1 to its matrix, so the
    sum of a matrix is the number of characters encoded in that line;
    all-zero padding adds nothing to it.

    Args:
    data (dict): Dictionary containing the data, where each entry is a tuple (classNumber, matrix).

    Returns:
    int: The maximum sequence length in the dataset (0 if the dataset is empty).
    """
    # The leading 0 is the floor of the search and the answer for empty data
    counts = [0]
    counts.extend(entry[1].sum() for entry in data.values())
    return int(max(counts))


In [12]:
# Longest line (counted in encoded characters) within each split.  Note
# that the network's input placeholder below is sized from the global
# maxSeqLen that every matrix was padded to, not from these two values.
maxSeqLen_train = find_max_seq_len(train_data)
maxSeqLen_test = find_max_seq_len(test_data)

print("Maximum length of sequence in Training set:", maxSeqLen_train)
print("Maximum length of sequence in Test set:", maxSeqLen_test)

Maximum length of sequence in Training set: 87
Maximum length of sequence in Test set: 78


In [13]:
# now we build the TensorFlow computation... there are two inputs,
# a batch of text lines and a batch of labels.  Each line is fed in
# flattened (256 * maxSeqLen values), and the batch dimension is fixed
# at batchSize, so every feed must contain exactly batchSize rows
inputX = tf.placeholder(tf.float32, [batchSize, 256 * maxSeqLen])
inputY = tf.placeholder(tf.int32, [batchSize])

# initialize weights and biases for the two hidden layers, drawn from a
# normal distribution with mean 0 and standard deviation 0.05
Wl1 = tf.Variable(np.random.normal(0, 0.05, [256 * maxSeqLen, hiddenUnits]), dtype=tf.float32)
b1 = tf.Variable(np.random.normal(0, 0.05, [1, hiddenUnits]), dtype=tf.float32)
Wl2 = tf.Variable(np.random.normal(0, 0.05, [hiddenUnits, hiddenUnits]), dtype=tf.float32)
b2  = tf.Variable(np.random.normal(0, 0.05, [1, hiddenUnits]), dtype=tf.float32)

# weights and bias of the output layer (one logit per class)
Wout = tf.Variable(np.random.normal(0, 0.05, [hiddenUnits, numClasses]), dtype=tf.float32)
bout = tf.Variable(np.random.normal(0, 0.05, [1, numClasses]), dtype=tf.float32)

# implement forward pass: two fully connected ReLU layers
act1 = tf.nn.relu(tf.matmul(inputX, Wl1) + b1 )
act2 = tf.nn.relu(tf.matmul(act1, Wl2) + b2)


# compute the set of outputs (unnormalized logits)
outputs = tf.matmul(act2, Wout) + bout

# class probabilities, used only for reporting accuracy
predictions = tf.nn.softmax(outputs)

# NOTE(review): the commented-out line below looks like a leftover from
# an RNN version of this notebook, where the input sequences were
# unpacked into one matrix of one-hot characters per time step; the
# feed-forward network here does not use it
#sequenceOfLetters = tf.unstack(inputX, axis=2)


# compute the loss: mean cross-entropy between the logits and the
# integer class labels
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=inputY)
totalLoss = tf.reduce_mean(losses)

# use the Adagrad optimizer to train
trainingAlg = tf.compat.v1.train.AdagradOptimizer(learning_rate).minimize(totalLoss)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [20]:
# Train the feed-forward network for numTrainingIters mini-batch steps,
# then report loss and accuracy on a draw of num_test test lines.
with tf.Session() as sess:
    #
    # initialize everything
    sess.run(tf.compat.v1.global_variables_initializer())
    #
    # and run the training iters; note that each "epoch" here is a single
    # mini-batch step, not a full pass over the training set
    for epoch in range(numTrainingIters):
        #
        # get some data
        x, y = generateDataFeedForward(maxSeqLen, train_data)
        #
        # do the training step
        _totalLoss, _trainingAlg, _predictions, _outputs = sess.run(
                [totalLoss, trainingAlg, predictions, outputs],
                feed_dict={
                    inputX:x,
                    inputY:y,
                })

        #
        # just FYI, compute the number of correct predictions: the
        # predicted class is the one with the highest probability
        numCorrect = int(np.sum(np.argmax(_predictions, axis=1) == y))
        #
        # print out to the screen
        print("Step", epoch, "Loss", _totalLoss, "Correct", numCorrect, "out of", batchSize)

    # Compute evaluation on test set.  The placeholders have a fixed batch
    # dimension, so the test data is fed through in batchSize-sized slices
    numTestIterations = num_test // batchSize

    totalNumCorrect = 0
    totalTestLoss = 0
    # NOTE(review): generateDataFeedForward samples with replacement, so this
    # evaluates a random draw of num_test test lines rather than visiting
    # every test line exactly once
    xtest, ytest = generateDataFeedForward(maxSeqLen, test_data, batchSize=num_test)
    for idx in range(numTestIterations):
        testSlice = slice(idx * batchSize, (idx + 1) * batchSize)
        _totalLoss, _predictions = sess.run(
                [totalLoss, predictions],
                feed_dict={
                    inputX:xtest[testSlice],
                    inputY:ytest[testSlice],
                })

        # compare against the labels of THIS slice; indexing ytest from 0
        # would score every batch against the first batch's labels
        numCorrect = int(np.sum(np.argmax(_predictions, axis=1) == ytest[testSlice]))

        totalNumCorrect += numCorrect
        # _totalLoss is a per-batch mean, so weight it by the batch size
        totalTestLoss += _totalLoss * batchSize

    print(f"Loss for {num_test} test samples is {totalTestLoss / num_test} and correct is {totalNumCorrect} out of {num_test}")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 5001 Loss 0.08060542 Correct 99 out of 100
Step 5002 Loss 0.09335725 Correct 98 out of 100
Step 5003 Loss 0.101655856 Correct 98 out of 100
Step 5004 Loss 0.10565256 Correct 96 out of 100
Step 5005 Loss 0.07681105 Correct 98 out of 100
Step 5006 Loss 0.07827316 Correct 98 out of 100
Step 5007 Loss 0.11116311 Correct 95 out of 100
Step 5008 Loss 0.07125621 Correct 99 out of 100
Step 5009 Loss 0.095936015 Correct 96 out of 100
Step 5010 Loss 0.077752836 Correct 99 out of 100
Step 5011 Loss 0.08276918 Correct 98 out of 100
Step 5012 Loss 0.08821717 Correct 98 out of 100
Step 5013 Loss 0.095649965 Correct 97 out of 100
Step 5014 Loss 0.14786048 Correct 97 out of 100
Step 5015 Loss 0.112385385 Correct 98 out of 100
Step 5016 Loss 0.14778309 Correct 99 out of 100
Step 5017 Loss 0.08120868 Correct 98 out of 100
Step 5018 Loss 0.117170736 Correct 98 out of 100
Step 5019 Loss 0.05656207 Correct 99 out of 100
Step 5020 Loss 0.