In [1]:
from flair.embeddings import WordEmbeddings
from flair.data import Sentence
import networkx as nx
import torch

In [2]:
def read_data(datafile):
    with open(datafile) as f:
        lines = f.readlines()
    return lines

In [5]:
glove_embedding = WordEmbeddings('glove')
words = read_data('../data/vertomul.txt')
words = [line.strip() for line in open('../data/vertomul.txt')]

In [6]:
word_emb = {}
for word in words:
    sentence = Sentence(word) # --> strip() removes the white space from beginning and end of word
     # embed a sentence using glove.
    glove_embedding.embed(sentence)
    for token in sentence:

        word_emb[word]=token.embedding
    

In [10]:
G = nx.Graph()

In [11]:
G.add_nodes_from(words)

In [12]:
# list(G.nodes)

## Compute the distance using euclidean distance

In [13]:
x = word_emb['sky']
y = word_emb['cloud']

print(torch.norm(x - x))
print(torch.norm(y - x))

tensor(0.)
tensor(4.8084)


A distance of 0 means that the words are the same. As the discrepancy increaces, the distance also increases. 
 

In [14]:
for item in word_emb:
    x = word_emb[item]
    
    for word in word_emb:
        if word is not item:
            y = word_emb[word]
            euclidean_dis = torch.norm(y - x)
            
            if euclidean_dis < 3:
                G.add_edge(item, word)
#             print("x:", item, "y:", word, "distance: ", euclidean_dis)

In [15]:
G.number_of_nodes()

529

In [16]:
G.number_of_edges()

45

In [17]:
list(G.edges)

[('a', 'another'),
 ('after', 'before'),
 ('after', 'last'),
 ('after', 'when'),
 ('ankle', 'knee'),
 ('away', 'back'),
 ('back', 'out'),
 ('be', 'not'),
 ('because', 'not'),
 ('because', 'what'),
 ('before', 'when'),
 ('better', 'good'),
 ('boy', 'girl'),
 ('bring', 'put'),
 ('brother', 'uncle'),
 ('cat', 'dog'),
 ('do', 'you'),
 ('down', 'up'),
 ('garbage', 'trash'),
 ('get', 'go'),
 ('give', 'make'),
 ('give', 'put'),
 ('give', 'take'),
 ('go', 'take'),
 ('grandma', 'grandpa'),
 ('hear', 'listen'),
 ('here', 'today'),
 ('hold', 'take'),
 ('how', 'what'),
 ('how', 'why'),
 ('little', 'much'),
 ('look', 'see'),
 ('make', 'take'),
 ('more', 'some'),
 ('no', 'not'),
 ('no', 'there'),
 ('off', 'out'),
 ('out', 'put'),
 ('out', 'up'),
 ('put', 'take'),
 ('say', 'why'),
 ('think', 'what'),
 ('think', 'why'),
 ('time', 'when'),
 ('what', 'why')]

## Compute the distance using cosine similarity

In [18]:
F = nx.Graph()

In [19]:
F.add_nodes_from(words)

In [20]:
len(list(G.nodes))

529

In [21]:
torch.cosine_similarity(x.unsqueeze(0),
                        y.unsqueeze(0))

tensor([0.1302])

A similarity of 1 means that the words are the same.
Cosine similarity value smaller than 80 makes the edges increase by a lot. 

In [None]:
for item in word_emb:
    x = word_emb[item]
    
    for word in word_emb:
        if word is not item:
            y = word_emb[word]
            cosine_sim = torch.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0))
            
            if cosine_sim > .8:
                F.add_edge(item, word)
#             print("x:", item, "y:", word, "distance: ", euclidean_dis)

In [None]:
F.number_of_nodes()

In [None]:
F.number_of_edges()

In [None]:
list(F.edges)