#### Read model

In [None]:
# Corpus folder / file stem and the parameter tag; the cells below combine
# them into the vocabulary, embedding and accuracy file names.
directory = filename = "europarl"
params = "8_200_25_skip"

In [None]:
# Load the vocabulary: one entry per line, surrounding whitespace stripped.
with open("corpora/{}/{}.vocab".format(directory, filename), "r") as fd_vocab:
    vocab = [entry.strip() for entry in fd_vocab]
# Show the vocabulary size and the entry in the middle as a quick sanity check.
len(vocab), vocab[len(vocab) // 2]

In [None]:
# Map each vocabulary entry to its position in `vocab`
# (for a repeated entry the last position wins).
lookup = {word: position for position, word in enumerate(vocab)}
lookup['the']

In [None]:
# Read the ".center" file: each line is whitespace-separated floats where the
# last field is stored as the bias and all preceding fields as the vector.
with open("corpora/{}/{}_{}.center".format(directory, filename, params), "r") as fd_center:
    embedding = [row.split() for row in fd_center]
center_embedding = [[float(field) for field in fields[:-1]] for fields in embedding]
center_bias = [float(fields[-1]) for fields in embedding]
len(center_embedding), len(center_embedding[0]), len(center_bias)

In [None]:
# Read the ".context" file: each line is whitespace-separated floats where the
# last field is stored as the bias and all preceding fields as the vector.
with open("corpora/{}/{}_{}.context".format(directory, filename, params), "r") as fd_context:
    embedding = [row.split() for row in fd_context]
context_embedding = [[float(field) for field in fields[:-1]] for fields in embedding]
context_bias = [float(fields[-1]) for fields in embedding]
len(context_embedding), len(context_embedding[0]), len(context_bias)

In [None]:
import numpy as np
# Add the center and context parameters elementwise, giving one combined
# vector and one combined bias per row.
embeddings = np.add(center_embedding, context_embedding)
biases = np.add(center_bias, context_bias)
print(embeddings.shape)

#### Open analogy file

In [None]:
# Parse the analogy file into parallel lists: analogy_titles[i] is the name of
# a section and analogies[i] holds that section's tokenised lines.
with open("question-words.txt", "r") as fd:
    all_elements = [line.split() for line in fd]
analogy_titles = []
analogies = []
for elements in all_elements:
    if not elements:
        # Blank line: nothing to parse, and elements[0] would raise IndexError.
        continue
    if elements[0] == ":":
        # A line of the form ": <title>" opens a new section.
        analogy_titles.append(elements[1])
        analogies.append([])
    else:
        # Any other line belongs to the most recently opened section.
        analogies[-1].append(elements)
len(analogy_titles), len(analogies), len(analogies[0]), len(analogies[-1])

#### Begin analogy test

In [None]:
import math
def euclidean(w, v):
    """Return the Euclidean distance between w and v.

    The components of w are paired with v by position, so v must be at
    least as long as w.
    """
    squared_total = 0.0
    for position, w_component in enumerate(w):
        squared_total += (w_component - v[position]) ** 2
    return math.sqrt(squared_total)


def nn(test_embedding, eucl=False, k=1, vectors=None):
    """Return the row indices of the k nearest neighbours of a query vector.

    Args:
        test_embedding: 1-D query vector with as many components as each
            row of the searched matrix.
        eucl: If True, neighbours are the rows with the smallest Euclidean
            distance to the query; otherwise the rows with the largest
            cosine similarity.
        k: Number of neighbours to return.  Values larger than the number
            of rows are clamped; k <= 0 yields an empty result.
        vectors: Optional 2-D matrix to search.  When omitted, the
            module-level ``embeddings`` matrix is used.

    Returns:
        NumPy array of row indices, not necessarily sorted by distance.
    """
    matrix = np.asarray(embeddings if vectors is None else vectors)
    query = np.asarray(test_embedding)
    k = min(k, len(matrix))
    if k <= 0:
        return np.empty(0, dtype=np.intp)
    if eucl:
        distances = np.linalg.norm(matrix - query, axis=1)
        # kth = k - 1 places the k smallest distances in the first k slots
        # and, unlike kth = k, stays in bounds when k equals the row count.
        return np.argpartition(distances, k - 1)[:k]
    # Cosine similarity needs the norm of every row as well as the query's;
    # dividing by the query norm alone ranks by dot product and favours
    # long vectors.
    scale = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # Zero-length vectors get similarity 0 instead of a division by zero.
    scale[scale == 0] = 1.0
    similarities = matrix.dot(query) / scale
    return np.argpartition(similarities, -k)[-k:]


# Query nn() for the 5 nearest neighbours of 'the' twice: once in Euclidean
# mode (eucl=True) and once in its default similarity mode (eucl=False),
# then translate the returned indices back into vocabulary words.
the_index = lookup['the']
the_eucl_indices = nn(embeddings[the_index], eucl=True, k=5)
the_cos_sim_indices = nn(embeddings[the_index], eucl=False, k=5)
the_eucl_words = [vocab[neighbour] for neighbour in the_eucl_indices]
the_cos_sim_words = [vocab[neighbour] for neighbour in the_cos_sim_indices]
the_eucl_indices, the_eucl_words, the_cos_sim_indices, the_cos_sim_words

In [None]:
def test(eucl=False, k=1):
    """Run the analogy benchmark and print per-category accuracy.

    For each analogy ``a : b :: c : d`` the vector ``b - a + c`` is passed
    to ``nn``; the analogy is correct when ``d`` is among the k words
    returned.  Analogies whose first three words are missing from
    ``lookup`` are skipped and do not count as attempted.

    Args:
        eucl: Forwarded to ``nn``; True selects its Euclidean branch.
        k: Number of nearest neighbours requested from ``nn``.

    Returns:
        Array of shape (len(analogies), 2): column 0 holds the number of
        correct analogies per category, column 1 the number attempted.
    """
    counts = np.zeros((len(analogies), 2))
    for i, category in enumerate(analogies):
        print("{}: ".format(analogy_titles[i]), end='')
        # Tally across the whole category.  Resetting inside the inner loop
        # would report only the last analogy and leave the names undefined
        # for an empty category.
        correct_count = 0
        count = 0
        for terms in category:
            # e.g. man : woman :: king : ? (queen)
            try:
                word0 = embeddings[lookup[terms[0]]]
                word1 = embeddings[lookup[terms[1]]]
                word2 = embeddings[lookup[terms[2]]]
            except KeyError:
                # One of the three query words is not in the vocabulary.
                continue
            indices = nn(word1 - word0 + word2, eucl, k)
            kwords = [vocab[j] for j in indices]
            if terms[3] in kwords:
                correct_count += 1
            count += 1
        counts[i, :] = [correct_count, count]
        print("{}/{} = {}".format(correct_count, count, 0 if count == 0 else correct_count/count))
    return counts

# Score the analogies with the 3 nearest neighbours, once per nn() mode.
# test() takes only (eucl, k); passing `embeddings` as an extra first
# argument raises TypeError.
k = 3
eucl_counts = test(eucl=True, k=k)
cos_sim_counts = test(eucl=False, k=k)

In [None]:
# Write one line per analogy category with the Euclidean-mode results.
with open("{}_{}_eucl_accuracy.txt".format(filename, params), "w") as fd_accuracy:
    for i, title in enumerate(analogy_titles):
        # Rows of the counts array are [correct, attempted], so the total
        # is column 1 and the number correct is column 0.
        fd_accuracy.write("{}: Total number: {}, Total Correct: {} \n".format(title, eucl_counts[i, 1], eucl_counts[i, 0]))

In [None]:
# Write one line per analogy category with the eucl=False (similarity) results.
with open("{}_{}_accuracy.txt".format(filename, params), "w") as fd_accuracy:
    for i, title in enumerate(analogy_titles):
        # Rows of the counts array are [correct, attempted], so the total
        # is column 1 and the number correct is column 0.
        fd_accuracy.write("{}: Total number: {}, Total Correct: {} \n".format(title, cos_sim_counts[i, 1], cos_sim_counts[i, 0]))