#### Read model

In [26]:
# Path prefix shared by every model file read or written in this notebook:
# "<filename>.vocab", "<filename>_<params>.center", "<filename>_<params>.context"
# and "<filename>_accuracy.txt".
filename = "../brown/brown"
# Suffix selecting which trained embedding files to load.
# NOTE(review): presumably encodes training hyper-parameters — TODO confirm.
params = "8_200_30"

In [27]:
# Load the vocabulary, one token per line (surrounding whitespace stripped).
with open("{}.vocab".format(filename), "r") as fd_vocab:
    vocab = [entry.strip() for entry in fd_vocab]
# Quick peek at the vocabulary size and its middle entry.
len(vocab), vocab[len(vocab) // 2]

# Reverse index: token -> row number (a later duplicate overrides an earlier one).
lookup = {token: position for position, token in enumerate(vocab)}

In [28]:
# Read the center-word parameters: each line holds whitespace-separated
# numbers, the last of which is the bias term for that row.
with open("{}.center".format(filename+"_"+params), "r") as fd_center:
    embedding = [row.split() for row in fd_center]
center_embedding = [[float(token) for token in row[:-1]] for row in embedding]
center_bias = [float(row[-1]) for row in embedding]
len(center_embedding), len(center_embedding[0]), len(center_bias)

(41506, 200, 41506)

In [29]:
# Read the context-word parameters: each line holds whitespace-separated
# numbers, the last of which is the bias term for that row.
with open("{}.context".format(filename+"_"+params), "r") as fd_context:
    embedding = [row.split() for row in fd_context]
context_embedding = [[float(token) for token in row[:-1]] for row in embedding]
context_bias = [float(row[-1]) for row in embedding]
len(context_embedding), len(context_embedding[0]), len(context_bias)

(41506, 200, 41506)

In [39]:
import numpy as np
# Final word vectors / biases are the element-wise sum of the center and
# context parameters.
embeddings = np.add(np.asarray(center_embedding), np.asarray(context_embedding))
biases = np.add(np.asarray(center_bias), np.asarray(context_bias))
print(embeddings.shape)

(200,)


#### Open analogy file

In [32]:
# Parse the analogy file. A line starting with ":" opens a new category
# (its second token is the title); every other non-blank line is one analogy
# made of whitespace-separated terms and is filed under the latest category.
with open("question-words.txt", "r") as fd:
    all_elements = [line.split() for line in fd]

analogy_titles = []
analogies = []
for elements in all_elements:
    if not elements:
        # Blank line: split() produced an empty list, so there is no
        # elements[0] to inspect — skip it instead of raising IndexError.
        continue
    if elements[0] == ":":
        analogy_titles.append(elements[1])
        analogies.append([])
    else:
        analogies[-1].append(elements)
len(analogy_titles), len(analogies), len(analogies[0]), len(analogies[-1])

(14, 14, 506, 870)

#### Begin analogy test

In [48]:
import math
def euclidean(w, v):
    """
    Return the Euclidean (L2) distance between two vectors.

    Iterates over the positions of ``w`` and reads the matching entry of
    ``v``, so ``v`` must be at least as long as ``w``.
    """
    squared_total = 0.0
    for position, w_value in enumerate(w):
        squared_total += (w_value - v[position])**2
    return math.sqrt(squared_total)

    

def nn(test_embedding, euclidean):
    """
    Nearest neighbor.

    Find the row of the module-level ``embeddings`` matrix that is closest to
    ``test_embedding``.

    Parameters
    ----------
    test_embedding : array-like
        Query vector with as many components as ``embeddings`` has columns.
    euclidean : bool
        If truthy, "closest" means smallest Euclidean distance; otherwise it
        means largest cosine similarity.

    Returns
    -------
    int
        Row index into ``embeddings`` (first one wins on ties).
    """
    # The ``euclidean`` flag shadows the module-level ``euclidean()`` helper
    # inside this function, so distances are computed with NumPy rather than
    # by calling that helper (calling the flag itself would raise TypeError).
    table = np.asarray(embeddings, dtype=float)
    query = np.asarray(test_embedding, dtype=float)

    if euclidean:
        # Search the numeric matrix directly; argmin indexes the full table,
        # so there is no off-by-one from skipping the first row.
        distances = np.linalg.norm(table - query, axis=1)
        return int(np.argmin(distances))

    # Cosine similarity: normalise by BOTH the row norm and the query norm.
    # Dividing by the query norm alone would rank rows by raw dot product and
    # favour large-norm rows.
    norms = np.linalg.norm(table, axis=1) * np.linalg.norm(query)
    similarities = np.full(norms.shape, -np.inf)
    # Rows with a zero denominator keep -inf so they are never selected
    # (and no division-by-zero warning / NaN is produced).
    np.divide(table.dot(query), norms, out=similarities, where=norms > 0)
    return int(np.argmax(similarities))


# Smoke test: look up the nearest neighbour of the vector for 'the'.
the_vector = embeddings[lookup['the']]
the_index = nn(the_vector, euclidean=False)
vocab[the_index]

KeyError: 'poop'

In [54]:
def test(embedding):
    counts = np.zeros((len(analogies),2))
    for i, category in enumerate(analogies):
        print("{}: ".format(analogy_titles[i]), end='')
        for terms in category:
            correct_count = 0
            count = 0
            # e.g. man : woman :: king : (queen)
            try:
                word0 = embeddings[lookup[terms[0]]]
                word1 = embeddings[lookup[terms[1]]]
                word2 = embeddings[lookup[terms[2]]]
                index = nn(word1 - word0 + word2, euclidean=False)
                if vocab[index] == terms[3]:
                    correct_count += 1
                count += 1
            except KeyError:
                pass
        counts[i,:] = [correct_count,count]
        print("{}/{} = {}".format(correct_count, count, 0 if count == 0 else correct_count/count))
    return counts
# Run the analogy benchmark; `counts` holds one [correct, total] row per
# category and is reused below when writing the accuracy file.
counts = test(embeddings)

capital-common-countries: 0/0 = 0
capital-world: 0/0 = 0
currency: 0/0 = 0
city-in-state: 0/0 = 0
family: 0/1 = 0.0
gram1-adjective-to-adverb: 0/1 = 0.0
gram2-opposite: 0/1 = 0.0
gram3-comparative: 0/1 = 0.0
gram4-superlative: 0/1 = 0.0
gram5-present-participle: 0/1 = 0.0
gram6-nationality-adjective: 0/0 = 0
gram7-past-tense: 0/1 = 0.0
gram8-plural: 0/1 = 0.0
gram9-plural-verbs: 0/1 = 0.0


In [58]:
# Persist the per-category results next to the model files.
with open("{}_accuracy.txt".format(filename), "w") as fd_accuracy:
    for i in range(len(analogies)):
        # Each row of `counts` is [correct_count, count] (see test()), so the
        # total is column 1 and the number correct is column 0.
        fd_accuracy.write("{}: Total number: {}, Total Correct: {} \n".format(analogy_titles[i], counts[i,1], counts[i,0]))