In [49]:
import gensim
from nltk.data import find
from nltk.metrics.distance import edit_distance
from nltk.corpus import wordnet as wn

# Resolve the path of the pruned word2vec sample through NLTK's data finder,
# then load it as text-format (binary=False) keyed vectors.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_sample,
    binary=False,
)

In [154]:
# Word groups used when ranking hints: similarity to `good` words raises a
# candidate's score, similarity to `bad`, `okay` and `assassin` words lowers it.
good = [
    'millionaire', 'agent', 'paper', 'lace', 'oil',
    'flood', 'telescope', 'window', 'fiddle',
]
bad = [
    'soup', 'arm', 'drum', 'bowl',
    'memory', 'school', 'luck', 'plastic',
]
okay = [
    'root', 'bowler', 'maracas', 'second',
    'bacon', 'tutu', 'kid',
]
assassin = ['match']

In [21]:
# Baseline, very bad: query the model once with every good word as a
# positive example and every bad word as a negative example.
model.most_similar(
    positive=good,
    negative=bad,
)


[('multimillionaire', 0.2880403995513916),
 ('agents', 0.27311012148857117),
 ('appraise', 0.24996958673000336),
 ('millionaires', 0.24939695000648499),
 ('painter', 0.2443416863679886),
 ('gloomily', 0.24415557086467743),
 ('dilettante', 0.2424042969942093),
 ('property', 0.24002386629581451),
 ('financier', 0.23953421413898468),
 ('industrialist', 0.2389102429151535)]

In [109]:
def get_hints(word, topn=10):
    """Return the words the embedding model ranks as most similar to ``word``.

    Args:
        word: Query word, passed to ``model.most_similar`` as the only
            positive example.
        topn: How many neighbours to request from the model. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        The neighbour words in the order the model returns them, with the
        similarity scores dropped.
    """
    neighbours = model.most_similar(positive=[word], topn=topn)
    return [neighbour for neighbour, _score in neighbours]

def filter_hints(hints, good, threshold=0.7):
    """Drop hints that are spelled too similarly to any word in ``good``.

    A hint is rejected when its edit distance to a good word, divided by the
    length of that good word, is below ``threshold``.

    Args:
        hints: Candidate hint strings.
        good: Words the hints must not resemble.
        threshold: Minimum normalised edit distance a hint must keep from
            every good word. Defaults to 0.7.

    Returns:
        A new list holding the surviving hints in their original order.
    """
    keep = [True] * len(hints)
    for word in good:
        if not word:
            # An empty word has length 0 and cannot normalise a distance;
            # skipping it avoids a ZeroDivisionError.
            continue
        word_length = len(word)
        for i, hint in enumerate(hints):
            # Hints already rejected by an earlier word need no further
            # (comparatively expensive) edit-distance computation.
            if keep[i] and edit_distance(hint, word) / word_length < threshold:
                keep[i] = False

    return [hint for hint, kept in zip(hints, keep) if kept]


In [111]:
# Read the candidate vocabulary, treating each stripped line as one word, and
# discard every entry spelled too similarly to one of the good words.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open('google-10000-english.txt', 'r', encoding='utf-8') as f:
    google_words = [line.strip() for line in f]
google_words = filter_hints(google_words, good)
google_words


['the',
 'to',
 'a',
 'that',
 'by',
 'this',
 'you',
 'be',
 'as',
 'your',
 'was',
 'we',
 'home',
 'can',
 'us',
 'my',
 'has',
 'search',
 'but',
 'information',
 'they',
 'he',
 'up',
 'may',
 'what',
 'which',
 'their',
 'news',
 'use',
 'see',
 'so',
 'contact',
 'here',
 'business',
 'web',
 'pm',
 'c',
 'e',
 'am',
 'would',
 'were',
 'me',
 's',
 'services',
 'click',
 'its',
 'x',
 'than',
 'price',
 'had',
 'list',
 'just',
 'state',
 'year',
 'day',
 'email',
 'two',
 'health',
 'n',
 'world',
 're',
 'used',
 'go',
 'b',
 'work',
 'most',
 'products',
 'music',
 'buy',
 'data',
 'them',
 'should',
 'product',
 'system',
 'post',
 'city',
 't',
 'policy',
 'number',
 'such',
 'available',
 'copyright',
 'support',
 'software',
 'jan',
 'well',
 'd',
 'where',
 'rights',
 'public',
 'books',
 'high',
 'school',
 'through',
 'm',
 'she',
 'review',
 'years',
 'very',
 'privacy',
 'items',
 'company',
 'r',
 'read',
 'group',
 'sex',
 'need',
 'many',
 'said',
 'does',
 'gene

In [113]:
# Gather the model's neighbours of every good word, after removing the
# neighbours that are spelled too similarly to any good word.
combined = []
for target in good:
    combined.extend(filter_hints(get_hints(target), good))
combined

['businessman',
 'wealthy',
 'entrepreneur',
 'baron',
 'industrialist',
 'financier',
 'broker',
 'realtor',
 'signing',
 'investigator',
 'contract',
 'Realtor',
 'unsigned',
 'printed',
 'printing',
 'cardboard',
 'journal',
 'satin',
 'dresses',
 'sequins',
 'organdy',
 'pinafores',
 'beaded',
 'dress',
 'petroleum',
 'gas',
 'hydrocarbon',
 'hydrocarbons',
 'gasoline',
 'barrel',
 'barrels',
 'storm',
 'disaster',
 'inundations',
 'inundated',
 'interferometer',
 'astronomy',
 'spectrometer',
 'spacecraft',
 'astronomer',
 'satellites',
 'planetarium',
 'orbiting',
 'Astronomy',
 'doorway',
 'windshield',
 'doors',
 'skylight',
 'porch',
 'harp',
 'clarinet',
 'trumpet',
 'saxophone',
 'flutist']

In [136]:
def to_synsets(words):
    """Map each word to the first synset WordNet lists for it.

    The result has one synset per input word, in input order. A word for
    which ``wn.synsets`` returns an empty list raises ``IndexError``.
    """
    first_senses = []
    for entry in words:
        first_senses.append(wn.synsets(entry)[0])
    return first_senses

# First WordNet synset of every board word and of every model-derived hint.
good_synsets = to_synsets(good)
bad_synsets = to_synsets(bad)
okay_synsets = to_synsets(okay)
assassin_synsets = to_synsets(assassin)
hints_synsets = to_synsets(combined)

# Candidate pool: the model-derived hints come first, followed by every
# vocabulary word that WordNet knows. The two lists are kept index-aligned
# (final_google_words[i] is the word behind google_synsets[i]).
google_synsets = list(hints_synsets)
final_google_words = list(combined)
for candidate in google_words:
    senses = wn.synsets(candidate)
    if senses:
        google_synsets.append(senses[0])
        final_google_words.append(candidate)
google_synsets

[Synset('businessman.n.01'),
 Synset('affluent.s.01'),
 Synset('entrepreneur.n.01'),
 Synset('baron.n.01'),
 Synset('industrialist.n.01'),
 Synset('financier.n.01'),
 Synset('agent.n.04'),
 Synset('realtor.n.01'),
 Synset('sign_language.n.01'),
 Synset('research_worker.n.01'),
 Synset('contract.n.01'),
 Synset('realtor.n.01'),
 Synset('unsigned.a.01'),
 Synset('print.v.01'),
 Synset('printing.n.01'),
 Synset('cardboard.n.01'),
 Synset('diary.n.01'),
 Synset('satin.n.01'),
 Synset('dress.n.01'),
 Synset('sequin.n.01'),
 Synset('organdy.n.01'),
 Synset('jumper.n.07'),
 Synset('bead.v.01'),
 Synset('dress.n.01'),
 Synset('petroleum.n.01'),
 Synset('gas.n.01'),
 Synset('hydrocarbon.n.01'),
 Synset('hydrocarbon.n.01'),
 Synset('gasoline.n.01'),
 Synset('barrel.n.01'),
 Synset('barrels.n.01'),
 Synset('storm.n.01'),
 Synset('catastrophe.n.02'),
 Synset('flood.n.01'),
 Synset('deluge.v.01'),
 Synset('interferometer.n.01'),
 Synset('astronomy.n.01'),
 Synset('mass_spectrometer.n.01'),
 Synset(

In [139]:
# Weight of each word group in a candidate's score: similarity to good words
# is rewarded, similarity to the other groups is penalised.
GOOD_WEIGHT = 2
BAD_WEIGHT = 2
OKAY_WEIGHT = 1
ASSASSIN_WEIGHT = 5


def _squared_similarity_sum(synset, targets):
    """Sum the squared WordNet path similarity of ``synset`` to each target.

    ``wn.path_similarity`` returns ``None`` when it finds no path between two
    synsets; such pairs contribute nothing to the sum instead of raising
    ``TypeError`` on ``None ** 2``.
    """
    total = 0
    for target in targets:
        similarity = wn.path_similarity(synset, target)
        if similarity is not None:
            total += similarity ** 2
    return total


# Score every candidate and collect (word, synset, score) triples.
to_sort = []
for word, synset in zip(final_google_words, google_synsets):
    good_sum = _squared_similarity_sum(synset, good_synsets)
    bad_sum = _squared_similarity_sum(synset, bad_synsets)
    okay_sum = _squared_similarity_sum(synset, okay_synsets)
    assassin_sum = _squared_similarity_sum(synset, assassin_synsets)

    index = (
        good_sum * GOOD_WEIGHT
        - bad_sum * BAD_WEIGHT
        - okay_sum * OKAY_WEIGHT
        - assassin_sum * ASSASSIN_WEIGHT
    )
    to_sort.append((word, synset, index))

# Highest score first.
to_sort.sort(key=lambda entry: entry[2], reverse=True)
to_sort

[('inundations', Synset('flood.n.01'), 1.942399225523185),
 ('skylight', Synset('skylight.n.01'), 0.43158328115101474),
 ('petroleum', Synset('petroleum.n.01'), 0.42045587959049474),
 ('petroleum', Synset('petroleum.n.01'), 0.42045587959049474),
 ('cardboard', Synset('cardboard.n.01'), 0.41708685643792187),
 ('card', Synset('card.n.01'), 0.41708685643792187),
 ('wallpaper', Synset('wallpaper.n.01'), 0.41708685643792187),
 ('wallpapers', Synset('wallpaper.n.01'), 0.41708685643792187),
 ('chad', Synset('chad.n.01'), 0.41708685643792187),
 ('material', Synset('material.n.01'), 0.3742868990600424),
 ('materials', Synset('material.n.01'), 0.3742868990600424),
 ('stuff', Synset('material.n.01'), 0.3742868990600424),
 ('fat', Synset('fat.n.01'), 0.18395696636185313),
 ('wax', Synset('wax.n.01'), 0.18395696636185313),
 ('deposit', Synset('deposit.n.01'), 0.16462144774540716),
 ('deposits', Synset('deposit.n.01'), 0.16462144774540716),
 ('earthquake', Synset('earthquake.n.01'), 0.16462144774540

In [140]:
# Keep a single entry per synset. `to_sort` is ordered best-first, so walking
# it backwards lets the earliest entry for a synset overwrite the later ones.
# The mapping is deliberately not called `dict`: that name would shadow the
# builtin for every cell executed afterwards.
best_by_synset = {}
for entry in reversed(to_sort):
    best_by_synset[entry[1]] = entry

# Stable sort, highest score first; ties keep the mapping's insertion order.
vc = sorted(best_by_synset.values(), key=lambda entry: entry[2], reverse=True)
vc

[('inundations', Synset('flood.n.01'), 1.942399225523185),
 ('skylight', Synset('skylight.n.01'), 0.43158328115101474),
 ('petroleum', Synset('petroleum.n.01'), 0.42045587959049474),
 ('chad', Synset('chad.n.01'), 0.41708685643792187),
 ('wallpaper', Synset('wallpaper.n.01'), 0.41708685643792187),
 ('card', Synset('card.n.01'), 0.41708685643792187),
 ('cardboard', Synset('cardboard.n.01'), 0.41708685643792187),
 ('material', Synset('material.n.01'), 0.3742868990600424),
 ('wax', Synset('wax.n.01'), 0.18395696636185313),
 ('fat', Synset('fat.n.01'), 0.18395696636185313),
 ('earthquake', Synset('earthquake.n.01'), 0.16462144774540716),
 ('deposit', Synset('deposit.n.01'), 0.16462144774540716),
 ('chemical', Synset('chemical.n.01'), 0.14605778932036012),
 ('racks', Synset('rack.n.01'), 0.1410514981551237),
 ('stocks', Synset('stocks.n.01'), 0.1410514981551237),
 ('frames', Synset('frame.n.01'), 0.1410514981551237),
 ('individual', Synset('person.n.01'), 0.13954511357649896),
 ('virus', Sy

In [141]:
# Keep only the word of each (word, synset, score) entry and preview the
# ten best-ranked ones.
to_give = [word for word, _synset, _score in vc]
to_give[:10]

['inundations',
 'skylight',
 'petroleum',
 'chad',
 'wallpaper',
 'card',
 'cardboard',
 'material',
 'wax',
 'fat']

In [156]:
# Reload the local `nishy` module before use, then ask a bot built from the
# four word groups for its hints.
import importlib
import nishy

importlib.reload(nishy)
n = nishy.NishyBot(good, bad, okay, assassin)
n.get_hints()

['inundations',
 'skylight',
 'petroleum',
 'chad',
 'wallpaper',
 'card',
 'cardboard',
 'material',
 'wax',
 'fat']