In [10]:
import numpy as np
from Assignments.Third.Model import MyFFLM, cross_entropy
import re
from tqdm import tqdm

In [11]:
# Read data
with open("lotrFotr.txt","r",encoding='utf-8') as file:
    raw_txt = file.read()


stripped_txt = raw_txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')\
    .replace(';', '').replace(':', '').replace('  ', ' ').replace('.', '').replace(',', '').replace('"', '').lower()
stripped_txt = re.sub(r'[^\w\s]', '', stripped_txt).split()
stripped_txt = stripped_txt[:2000]


In [12]:
# Create  vocabulary
vocab = list(set(stripped_txt))
vocab.sort()
vocab_dict = {}
for i, word in enumerate(vocab):
    identity_vector = np.zeros(len(vocab))
    identity_vector[i] = 1
    vocab_dict[word] = identity_vector
print("Vocab size: ", len(vocab))


Vocab size:  720


In [13]:
# create labels and dataset
memory_depth = 15
dataX = np.zeros((len(stripped_txt) - memory_depth, memory_depth, len(vocab_dict)), dtype='float32')
dataY = np.zeros((len(stripped_txt) - memory_depth, len(vocab_dict)), dtype='int16')
for i in range(len(stripped_txt) - memory_depth):
    x = np.zeros((memory_depth, len(vocab_dict)))
    for j in range(memory_depth):
        x[j] = vocab_dict[stripped_txt[i+j]]
    dataX[i] = x
    dataY[i] = vocab_dict[stripped_txt[i+memory_depth]]
print("Data set size:", dataX.shape[0])

Data set size: 1985


In [14]:
# Create a dictionary to reverse the one hot encoding
reversed_dict = {}
for key in vocab_dict:
    reversed_dict[np.argmax(vocab_dict[key])] = key
for x in dataX[0]:
    print(reversed_dict[np.argmax(x)])
    print(x)
print(reversed_dict[np.argmax(dataY[0])])
print(dataY[0], np.argmax(dataY[0]))

three
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [15]:
# Shuffle and split
perm = np.random.permutation(dataX.shape[0])
shuffledX = dataX[perm]
shuffledY = dataY[perm]
trainX, testX = shuffledX[:int(dataX.shape[0]*0.80)], shuffledX[int(dataX.shape[0]*0.80):]
trainY, testY = shuffledY[:int(dataX.shape[0]*0.80)], shuffledY[int(dataX.shape[0]*0.80):]
print("Train set length:", trainX.shape[0], "Test set length:", testX.shape[0])

Train set length: 1588 Test set length: 397


In [16]:
# Set up model
v_len = len(vocab)
data_set_length = len(stripped_txt) - memory_depth
model = MyFFLM(v_len, 128, learning_rate=0.001, memory_depth=memory_depth)

# Set up training
loss = 0
epochs = 100

# Do training
for epoch in range(epochs):
    epoch_progress = tqdm(total=dataX.shape[0], desc="epoch progression {}/{}, loss: {}, accuracy: 0".format(epoch+1, epochs, loss))
    loss = 0
    epoch_accuracy = 0
    for k, (x, y) in enumerate(zip(dataX, dataY)):
        y_pred = model.forward(x)
        model.backprop(y)
        # Exponentially weighted average loss
        loss += 0.005*(cross_entropy(y, y_pred) - loss)
        if y[np.argmax(y_pred)] == 1:
            epoch_accuracy += 1
        # This is just to update the progressbar every 100 case
        if k % 100 == 0:
            epoch_progress.set_description(desc="epoch progression {}/{}, loss: {}, accuracy: {}".format(epoch+1, epochs, loss, epoch_accuracy/(k+1)), refresh=True)
        epoch_progress.update()
    epoch_progress.close()


print(model.dL_dw['0'])
model.delta(dataY[-1])

epoch progression 1/100, loss: 6.527854477315307, accuracy: 0.028406102051551814: 100%|██████████| 1985/1985 [00:10<00:00, 190.56it/s]
epoch progression 2/100, loss: 6.50168329060948, accuracy: 0.0778537611783272: 100%|██████████| 1985/1985 [00:09<00:00, 205.28it/s]  
epoch progression 3/100, loss: 6.478628619539284, accuracy: 0.0778537611783272: 100%|██████████| 1985/1985 [00:09<00:00, 207.25it/s] 
epoch progression 4/100, loss: 6.453352429907901, accuracy: 0.0778537611783272: 100%|██████████| 1985/1985 [00:09<00:00, 199.62it/s] 
epoch progression 5/100, loss: 6.4257860344852595, accuracy: 0.0778537611783272: 100%|██████████| 1985/1985 [00:09<00:00, 204.13it/s] 
epoch progression 6/100, loss: 6.396002024500052, accuracy: 0.0778537611783272: 100%|██████████| 1985/1985 [00:09<00:00, 205.41it/s] 
epoch progression 7/100, loss: 6.364353503337075, accuracy: 0.0778537611783272: 100%|██████████| 1985/1985 [00:09<00:00, 202.53it/s]  
epoch progression 8/100, loss: 6.331657822674397, accuracy:

[array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32), array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0




In [17]:
# See results with test set
acc = 0
print("Predicted, true, predicted value")
for x, y in zip(testX[:30], testY[:30]):

    print(reversed_dict[np.argmax(model.forward(x))], reversed_dict[np.argmax(y)], model.forward(x).max())
    if reversed_dict[np.argmax(model.forward(x))] == reversed_dict[np.argmax(y)]:
        acc += 1
print("Test accuracy:", acc/testX.shape[0])

Predicted, true, predicted value
the necessary 0.07798319
the the 0.07798319
the would 0.07798319
the the 0.07798319
the not 0.07798319
the history 0.07798319
the the 0.07798319
the in 0.07798319
the and 0.07798319
the not 0.07798319
the about 0.07798319
the read 0.07798319
the on 0.07798319
the chapter 0.07798319
the yet 0.07798319
the attractive 0.07798319
the or 0.07798319
the yet 0.07798319
the own 0.07798319
the of 0.07798319
the there 0.07798319
the i 0.07798319
the and 0.07798319
the say 0.07798319
the had 0.07798319
the and 0.07798319
the the 0.07798319
the rate 0.07798319
the concerned 0.07798319
the to 0.07798319
Test accuracy: 0.010075566750629723


In [18]:
# Save embeddings for projector
out_v = ""
out_m = ""
weights = model.embedding_layer.weights
for index, word in enumerate(vocab):
  vec = weights[:, index]
  out_v = out_v + ('\t'.join([str(x) for x in vec]) + "\n")
  out_m = out_m +  (word + "\n")

with open("mbVec.tsv", 'w', encoding='utf-8') as file:
    file.write(out_v)
with open("mbMeta.tsv", 'w', encoding='utf-8') as file:
    file.write(out_m)