In [1]:
import gensim
import random
import json
from request_img import get_img

In [2]:
class Embedding:
    """Wrap a word2vec-format model and build shuffled option sets around a word.

    ``get_options`` mixes three kinds of keys for a query word: a few keys
    similar to it, a few keys returned by a negative ``most_similar`` query,
    and random vocabulary keys as filler. Each option carries an image URL,
    its similarity score to the query word, and its name.
    """

    def __init__(self, wordfile, cache_url=None):
        """Load the vectors and, optionally, a JSON mapping of word -> URL.

        Args:
            wordfile: Path handed to ``KeyedVectors.load_word2vec_format``.
            cache_url: Optional path to a JSON file. Its decoded content is
                stored in ``self.cache_url`` and looked up before falling
                back to ``get_img``. When falsy, an empty dict is used.
        """
        self.model = gensim.models.KeyedVectors.load_word2vec_format(wordfile)
        if cache_url:
            # Read as UTF-8 explicitly so non-ASCII keys do not depend on the
            # platform's default locale encoding.
            with open(cache_url, 'r', encoding='utf-8') as f:
                self.cache_url = json.load(f)
        else:
            self.cache_url = {}
        self.postopn = 50   # how many most-similar keys to fetch
        self.posrann = 2    # how many of those to sample
        self.posmask = 5    # skip this many of the closest keys
        self.negtopn = 100  # how many keys to fetch from the negative query
        self.negrann = 4    # how many of those to sample
        self.optionn = 9    # total number of options to return

    def _vocab(self):
        """Return the model's vocabulary mapping.

        gensim 4.x exposes it as ``key_to_index``; older releases expose it
        as ``vocab``. Both support ``in`` and iteration over the keys.
        """
        vocab = getattr(self.model, 'key_to_index', None)
        if vocab is None:
            vocab = self.model.vocab
        return vocab

    @staticmethod
    def _pick(population, count):
        """Sample up to ``count`` distinct items, never more than available."""
        count = max(0, min(count, len(population)))
        return random.sample(population, count)

    def invocab(self, word):
        """Return True when ``word`` is a key of the model's vocabulary."""
        return word in self._vocab()

    def get_options(self, word):
        """Build a shuffled dict of options for ``word``.

        Args:
            word: The query key.

        Returns:
            ``None`` when ``word`` is not in the vocabulary. Otherwise a dict
            keyed by option name whose values are dicts with the keys
            ``"url"``, ``"score"`` and ``"name"``. It holds ``self.optionn``
            distinct options, or fewer when the vocabulary is too small to
            supply that many. The query word itself is never an option.
        """
        # check word exists in vocab
        if not self.invocab(word):
            return None

        # select positive keys; slicing tolerates fewer than postopn results
        high = self.model.most_similar(positive=[word], topn=self.postopn)
        highkeys = [key for key, _ in high[self.posmask:self.postopn]]
        choice = self._pick(highkeys, self.posrann)

        # select negative keys, skipping anything already chosen
        neg = self.model.most_similar(negative=[word], topn=self.negtopn)
        negkeys = [key for key, _ in neg[:self.negtopn] if key not in choice]
        choice.extend(self._pick(negkeys, self.negrann))

        # fill the remaining slots with random keys; exclude the query word
        # and earlier picks so that the result dict cannot lose entries to
        # duplicate keys. A list is required: random.sample rejects dict views.
        taken = set(choice)
        taken.add(word)
        otherkeys = [key for key in self._vocab() if key not in taken]
        choice.extend(self._pick(otherkeys, self.optionn - len(choice)))

        # shuffle keys so the option kinds are not grouped by position
        random.shuffle(choice)

        choice_dic = {}
        for name in choice:
            choice_dic[name] = {
                # get_img comes from request_img; presumably it looks up an
                # image URL for the word -- confirm against that module.
                "url": self.cache_url[name] if name in self.cache_url else get_img(name),
                "score": self.model.similarity(word, name),
                "name": name
            }

        return choice_dic

    def similarity(self, w1, w2):
        """Return the model similarity of two words, or None if either is unknown."""
        if not self.invocab(w1) or not self.invocab(w2):
            return None
        return self.model.similarity(w1, w2)
# gensim usage examples (here `m` is an Embedding instance):
# sim = m.model.most_similar(positive=['貴族', '女人'], negative=['男人'], topn=1000)
# sim = [(i,m.model.vocab[i[0]].count) for i in sim]

# m.model.similarity('大量', '分類')

# m.model.doesnt_match('歌手')

In [3]:
# m = Embedding('wiki.en.vec.small')

In [4]:
# print(m.get_options('bird'))

{'converts': {'url': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSvt5ZIuf-gyk9paibEg9md42cFL9fSQnI8Niy94gM3QJnwuz2s', 'score': 0.002942441380619139, 'name': 'converts'}, 'magpie': {'url': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSudP2eN_MLADKY4SweBGlin6BoXepkCeDr4ktO6InPPgOMXR86Xw', 'score': 0.5544420176237213, 'name': 'magpie'}, 'rlfc': {'url': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT52Q3Nfq-x86_IuMVGVCkw48J9tLPJ95h_x4KYyvSRGWj63Scp', 'score': 0.020485037630606415, 'name': 'rlfc'}, 'boilermakers': {'url': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQrqvqT0_SsQExHtEhGB_9Mf_m1rdjf6qwTKOT26Rm-vDvxq7hs', 'score': 0.11973190022905951, 'name': 'boilermakers'}, 'tribunals': {'url': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQomVAUQivEoRRAvCmofpArI7ZjImT-2KLPhfNB0WNOuv1_7c9o', 'score': 0.01906958710277818, 'name': 'tribunals'}, 'magnificent': {'url': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcScteklQYqDaYDEsvfTsQBTbjj

In [11]:
# m.model.vocab['bird'].count

28477

In [5]:
# m.similarity('bird', 'tree')

0.3866276199754389