In [26]:
import numpy as np
from Assignments.Third.Model import MyFFLM, cross_entropy
import re
from tqdm import tqdm

In [27]:
# Read data
# Only the first 1/SUBSAMPLE_DIVISOR of the tokens is kept, so the one-hot
# data set built further down stays small.
SUBSAMPLE_DIVISOR = 50

with open("lotrFotr.txt", "r", encoding='utf-8') as file:
    raw_txt = file.read()

# Lower-case first, then drop every character that is neither a word character
# nor whitespace. This single regex already removes ; : . , " and all other
# punctuation, and str.split() with no argument splits on any run of
# whitespace (spaces, tabs, newlines), so no per-character replace() is needed.
cleaned_txt = re.sub(r'[^\w\s]', '', raw_txt.lower())
all_tokens = cleaned_txt.split()
stripped_txt_lotr = all_tokens[:len(all_tokens)//SUBSAMPLE_DIVISOR]


In [28]:
# Build the vocabulary: every distinct token in sorted order, each mapped to
# its one-hot vector (row i of the identity matrix belongs to the i-th word).
vocab = sorted(set(stripped_txt_lotr))
one_hot_rows = np.eye(len(vocab))
vocab_dict = {word: one_hot_rows[position] for position, word in enumerate(vocab)}
print("Vocab size: ", len(vocab))


Vocab size:  1136


In [29]:
# Build the data set: each sample is a window of `memory_depth` consecutive
# one-hot encoded words, and its label is the one-hot vector of the next word.
memory_depth = 5
n_windows = len(stripped_txt_lotr) - memory_depth
one_hot_width = len(vocab_dict)
dataX = np.zeros((n_windows, memory_depth, one_hot_width), dtype='float32')
dataY = np.zeros((n_windows, one_hot_width), dtype='int16')
for start in range(n_windows):
    context_words = stripped_txt_lotr[start:start + memory_depth]
    # stack the context vectors into a (memory_depth, one_hot_width) block
    dataX[start] = np.stack([vocab_dict[word] for word in context_words])
    dataY[start] = vocab_dict[stripped_txt_lotr[start + memory_depth]]
print("Data set size:", dataX.shape[0])

Data set size: 3738


In [30]:
# Invert the one-hot encoding: position of the hot entry -> word
reversed_dict = {np.argmax(one_hot): word for word, one_hot in vocab_dict.items()}

# Sanity check: decode the first context window and its label
for encoded_word in dataX[0]:
    print(reversed_dict[np.argmax(encoded_word)])
    print(encoded_word)
first_label = dataY[0]
print(reversed_dict[np.argmax(first_label)])
print(first_label, np.argmax(first_label))

three
[0. 0. 0. ... 0. 0. 0.]
rings
[0. 0. 0. ... 0. 0. 0.]
for
[0. 0. 0. ... 0. 0. 0.]
the
[0. 0. 0. ... 0. 0. 0.]
elvenkings
[0. 0. 0. ... 0. 0. 0.]
under
[0 0 0 ... 0 0 0] 1055


In [31]:
# Shuffle and split
# A dedicated, seeded random state makes the train/test split identical on
# every "Restart & Run All" and leaves NumPy's global random state untouched.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.80

n_samples = dataX.shape[0]
split_at = int(n_samples*TRAIN_FRACTION)  # index of the first test sample

perm = np.random.RandomState(SPLIT_SEED).permutation(n_samples)
# the same permutation is applied to inputs and labels so pairs stay aligned
shuffledX = dataX[perm]
shuffledY = dataY[perm]
trainX, testX = shuffledX[:split_at], shuffledX[split_at:]
trainY, testY = shuffledY[:split_at], shuffledY[split_at:]

# every sample must end up in exactly one of the two subsets
assert trainX.shape[0] + testX.shape[0] == n_samples
assert trainX.shape[0] == trainY.shape[0] and testX.shape[0] == testY.shape[0]
print("Train set length:", trainX.shape[0], "Test set length:", testX.shape[0])

Train set length: 2990 Test set length: 748


In [32]:
# Set up model
v_len = len(vocab)
data_set_length = len(stripped_txt_lotr) - memory_depth
model = MyFFLM(v_len, 128, learning_rate=0.01, memory_depth=memory_depth)

# Set up training
loss = 0
epochs = 20
loss_smoothing = 0.005  # step size of the exponentially weighted running loss

# Do training: one forward/backward pass per sample (no mini-batching)
for epoch in range(epochs):
    # The bar is used as a context manager so it is closed even when training
    # is interrupted (e.g. KeyboardInterrupt) or a step raises.
    # Its initial description still shows the previous epoch's running loss,
    # which is why `loss` is reset only after the bar has been created.
    with tqdm(total=trainX.shape[0], desc="epoch progression {}/{}, loss: {}, accuracy: 0".format(epoch+1, epochs, loss)) as epoch_progress:
        loss = 0
        epoch_accuracy = 0  # number of correctly predicted samples so far this epoch
        for k, (x, y) in enumerate(zip(trainX, trainY)):
            y_pred = model.forward(x)
            # NOTE(review): backprop presumably relies on state cached by the
            # forward call just above - confirm in Assignments.Third.Model
            model.backprop(y)
            # Exponentially weighted average loss
            loss += loss_smoothing*(cross_entropy(y, y_pred) - loss)
            # y is one-hot, so this checks whether the arg-max prediction is the true word
            if y[np.argmax(y_pred)] == 1:
                epoch_accuracy += 1
            # This is just to update the progressbar every 100 case
            if k % 100 == 0:
                epoch_progress.set_description(desc="epoch progression {}/{}, loss: {}, accuracy: {}".format(epoch+1, epochs, loss, epoch_accuracy/(k+1)), refresh=True)
            epoch_progress.update()



epoch progression 1/20, loss: 7.483842637761003, accuracy: 0.008319467554076539:  21%|██        | 627/2990 [00:17<00:43, 54.34it/s]

KeyboardInterrupt: 

In [None]:
# See results with test set
# Accuracy is measured over the WHOLE test set, while only the first
# `n_preview` predictions are printed. Previously hits were counted on 30
# samples but divided by the full test-set size, which understated the accuracy.
n_preview = 30
acc = 0
print("Predicted, true, predicted value")
for k, (x, y) in enumerate(zip(testX, testY)):
    # a single forward pass per sample; its result is reused below
    y_pred = model.forward(x)
    predicted_word = reversed_dict[np.argmax(y_pred)]
    true_word = reversed_dict[np.argmax(y)]
    if k < n_preview:
        print(predicted_word, true_word, y_pred.max())
    if predicted_word == true_word:
        acc += 1
# guard against an empty test set instead of raising ZeroDivisionError
print("Test accuracy:", acc/testX.shape[0] if testX.shape[0] else float("nan"))

In [None]:
# Export the embeddings for the projector: one tab-separated vector per line
# in mbVec.tsv and the matching word on the same line number in mbMeta.tsv.
weights = model.embedding_layer.weights
# column `column` of the weight matrix is read as the vector of vocab[column]
out_v = "".join(
    '\t'.join(str(component) for component in weights[:, column]) + "\n"
    for column in range(len(vocab))
)
out_m = "".join(token + "\n" for token in vocab)

with open("mbVec.tsv", 'w', encoding='utf-8') as file:
    file.write(out_v)
with open("mbMeta.tsv", 'w', encoding='utf-8') as file:
    file.write(out_m)