In [6]:
import time
import copy
from typing import List

from nltk.tokenize import word_tokenize
from tqdm import tqdm
from gensim.models import KeyedVectors
import torch
from torch.utils.data import Dataset
from torch import nn
import torch.nn.functional as F
import torch.optim as optimizer
import matplotlib.pyplot as plt
import gensim.downloader

import numpy as np

def get_embed():
    """Return the pre-trained ``glove-wiki-gigaword-200`` word vectors.

    The vectors are read from the local cache file
    ``glove-wiki-gigaword-200.kv`` when that file can be opened. Otherwise
    they are fetched with ``gensim.downloader.load`` and saved to the cache
    file so that later calls can skip the download.

    Returns:
        The object returned by ``KeyedVectors.load`` (cache hit) or by
        ``gensim.downloader.load`` (cache miss).

    Raises:
        Any non-``OSError`` exception raised while loading the cache, and any
        exception raised by the download or the save, propagates unchanged.
    """
    try:
        embed = KeyedVectors.load('glove-wiki-gigaword-200.kv')
    except OSError:
        # Only a missing or unreadable cache file triggers the download.
        # A bare ``except`` would also trap KeyboardInterrupt/SystemExit, so
        # interrupting the load would silently start a large download, and it
        # would hide unrelated failures behind the fallback.
        embed = gensim.downloader.load('glove-wiki-gigaword-200')
        embed.save('glove-wiki-gigaword-200.kv')
    return embed



In [8]:
# Load the word vectors once; later cells read this notebook-level `embed`.
embed = get_embed()

In [10]:
# Sanity check: look up a single word and display the shape of its vector.
embed["hi"].shape

(200,)

In [24]:
def get_word_db(path="words.txt", vocab=None):
    """Return the unique in-vocabulary tokens of a text file, in first-seen order.

    The file is read in full, lower-cased and split with ``word_tokenize``.
    A token is kept only if ``token in vocab``; repeated tokens are kept once,
    at the position of their first occurrence.

    Args:
        path: Text file to read. Defaults to ``"words.txt"``.
        vocab: Container queried with ``in`` to decide whether a token is
            kept. Defaults to the notebook-level ``embed``.

    Returns:
        A list of distinct token strings.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    if vocab is None:
        # Resolved at call time so the default tracks the current global.
        vocab = embed

    with open(path, "r") as f:
        text = f.read()

    # dict keys are unique and preserve insertion order, so this de-duplicates
    # in linear time; the previous `token not in list` check was quadratic.
    return list(dict.fromkeys(
        token for token in word_tokenize(text.lower()) if token in vocab
    ))

# Vocabulary used by later cells: the distinct tokens of words.txt that are present in `embed`.
words = get_word_db()

In [29]:
from annoy import AnnoyIndex
import random

# Dimensionality of the vectors to index, read from the first vocabulary word's vector.
f = embed[words[0]].shape[0]  # Length of item vector that will be indexed

# Index every vocabulary word under the 'angular' metric; item id i holds the vector of words[i].
t = AnnoyIndex(f, 'angular')
for i, word in enumerate(words):
    v = embed[word]
    t.add_item(i, v)

# Lookup tables between Annoy item ids (positions in `words`) and the words themselves.
index_to_word = dict(enumerate(words))
word_to_index = dict([(word, i) for i, word in enumerate(words)])

t.build(10) # 10 trees
t.save('test.ann')  # write the built index to disk; it is reloaded below

# ...

# Query word. NOTE(review): the dict lookup below raises KeyError if this word is not in `words`.
word = "machine"
this_word_index = word_to_index[word]

# Reload the saved index into a second AnnoyIndex created with the same `f` and metric.
u = AnnoyIndex(f, 'angular')
u.load('test.ann') # super fast, will just mmap the file
# NOTE(review): 1000 looks large relative to this vocabulary (item ids in the recorded
# output stay below 2000), so the tail of the result is unlikely to be meaningful — confirm.
nearest_i = u.get_nns_by_item(this_word_index, 1000) # will find the 1000 nearest neighbors
# Pair each returned item id with its word; the bare name on the last line is the cell's displayed output.
nearest_word = [(i, index_to_word[i]) for i in nearest_i]
nearest_word

[(450, 'machine'),
 (237, 'using'),
 (453, 'computer'),
 (1500, 'tool'),
 (174, 'uses'),
 (334, 'systems'),
 (772, 'system'),
 (209, 'used'),
 (82, 'answering'),
 (193, 'use'),
 (577, 'instead'),
 (383, 'single'),
 (1186, 'method'),
 (188, 'same'),
 (212, 'every'),
 (976, 'making'),
 (1229, 'multiple'),
 (1522, 'you'),
 (375, 'each'),
 (71, 'one'),
 (89, 'simple'),
 (28, 'model'),
 (70, 'just'),
 (707, 'out'),
 (1619, 'store'),
 (1679, 'up'),
 (230, 'paper'),
 (845, 'so'),
 (213, 'only'),
 (972, 'no'),
 (539, 'rather'),
 (97, 'processing'),
 (605, 'random'),
 (833, 'components'),
 (631, 'time'),
 (1055, 'like'),
 (471, 'running'),
 (1617, 'man'),
 (1648, '40'),
 (205, 'standard'),
 (340, 'scratch'),
 (586, 'either'),
 (653, '50'),
 (1678, 'speed'),
 (1590, 'purpose'),
 (1135, 'once'),
 (67, 'can'),
 (765, 'around'),
 (84, 'without'),
 (185, 'simply'),
 (91, 'powerful'),
 (1804, 'another'),
 (844, 'small'),
 (25, 'a'),
 (467, 'even'),
 (541, 'actual'),
 (606, 'then'),
 (210, 'example'),