In [192]:
import gensim
from nltk.corpus import wordnet as wn
from nltk.metrics.distance import edit_distance

# word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
# Load the word vectors from the binary word2vec file 'model.bin'; the
# NishyBot2 class below queries this module-level `model` for neighbours.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'model.bin',
    binary=True,
)

In [193]:
# Board words, grouped by how NishyBot2.score treats them.
# `good` neighbours raise a hint's score; the other three groups lower it.
good = [
    'giant', 'thumb', 'nail', 'lock', 'plane',
    'ship', 'cell', 'state', 'capital',
]
bad = [
    'Aztec', 'court', 'chocolate', 'space', 'snow',
]
okay = [
    'shop', 'genius', 'ambulance', 'button', 'heart', 'pupil', 'vet',
]
assassin = [
    'microscope',
]

In [194]:
class NishyBot2:
    """Hint scorer driven by nearest-neighbour lists from a word-vector model.

    For each board word the bot stores the ``TOP_N`` most similar words
    returned by the module-level ``model``. A candidate hint earns points for
    ranking high in the neighbour lists of ``good`` words and loses points for
    ranking high in the lists of ``bad``, ``okay`` and ``assassin`` words.
    """

    # How many neighbours to request from the model for each board word.
    TOP_N = 3000
    # A hint is rejected when edit_distance(hint, word) / len(word) falls
    # below this value for any good word (i.e. it is spelled too similarly).
    SAMENESS_THRESHOLD = 0.5

    @staticmethod
    def find_similars(words):
        """Return one neighbour list per entry of ``words``, in the same order.

        Each list holds the first element of every item returned by
        ``model.most_similar(positive=[word], topn=TOP_N)``. When the model
        raises ``KeyError`` for a word (presumably out of vocabulary), a
        message is printed and an empty list is stored so positions stay
        aligned with ``words``.
        """
        similars = []
        for word in words:
            try:
                neighbours = model.most_similar(positive=[word], topn=NishyBot2.TOP_N)
            except KeyError:
                similars.append([])
                print('No similar words found for', word)
            else:
                similars.append([entry[0] for entry in neighbours])
        return similars

    @staticmethod
    def count_matches(wordlists, test):
        """Sum a rank-based weight for ``test`` over every list containing it.

        A list of length ``L`` that holds ``test`` at index ``i`` contributes
        ``L - i``, so earlier (more similar) positions weigh more. Lists that
        do not contain ``test`` contribute nothing.
        """
        count = 0
        for wordlist in wordlists:
            try:
                # One scan instead of a membership test followed by index().
                rank = wordlist.index(test)
            except ValueError:
                continue
            count += len(wordlist) - rank
        return count

    def __init__(self, good, bad, okay, assassin):
        """Store the four word groups and precompute their neighbour lists."""
        self.good = good
        self.bad = bad
        self.okay = okay
        self.assassin = assassin

        self.good_similars = self.find_similars(good)
        self.bad_similars = self.find_similars(bad)
        self.okay_similars = self.find_similars(okay)
        self.assassin_similars = self.find_similars(assassin)

    def good_matches(self, test):
        """Return the good words whose neighbour list contains ``test``."""
        return [
            self.good[i]
            for i, similar in enumerate(self.good_similars)
            if test in similar
        ]

    def score(self, test):
        """Return the weighted score of ``test`` as a hint.

        Computed as ``2*good - 2*bad - 1*okay - 5*assassin`` where each term
        is the ``count_matches`` total for that group's neighbour lists.
        """
        good_count = self.count_matches(self.good_similars, test)
        bad_count = self.count_matches(self.bad_similars, test)
        okay_count = self.count_matches(self.okay_similars, test)
        assassin_count = self.count_matches(self.assassin_similars, test)

        return 2 * good_count - 2 * bad_count - 1 * okay_count - 5 * assassin_count

    def score_all(self, wordset):
        """Score every word in ``wordset``.

        Returns a dict mapping word -> score whose insertion order is highest
        score first (ties keep the iteration order of ``wordset``).
        """
        # Use self rather than a notebook-level instance so the method works
        # for any NishyBot2 object and on a fresh kernel.
        ranked = sorted(
            ((word, self.score(word)) for word in wordset),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return dict(ranked)

    def _resembles_good_word(self, hint):
        """Return True when ``hint`` is spelled too similarly to a good word."""
        return any(
            # Skip empty words: dividing by len('') would raise.
            word and edit_distance(hint, word) / len(word) < self.SAMENESS_THRESHOLD
            for word in self.good
        )

    def score_all_pruned(self, wordset):
        """Like ``score_all`` but with unusable hints removed.

        A hint is dropped when ``wn.synsets(hint)`` is empty, or when its edit
        distance to any of this instance's good words, divided by that word's
        length, is below ``SAMENESS_THRESHOLD``. Score order is preserved.
        """
        pruned = {}
        for hint, value in self.score_all(wordset).items():
            if not wn.synsets(hint):
                continue
            if self._resembles_good_word(hint):
                continue
            pruned[hint] = value
        return pruned

In [195]:
# Build the bot for the current board; this computes the neighbour lists
# for all four word groups.
n = NishyBot2(
    good=good,
    bad=bad,
    okay=okay,
    assassin=assassin,
)

In [196]:
# Candidate hints: the union of the neighbour lists of all good words.
s = set()
for similar in n.good_similars:
    s.update(similar)
len(s)

23139

In [197]:
# Score every candidate hint in `s`; score_all_pruned drops hints that have
# no WordNet synset or are spelled too similarly to a good word.
sc = n.score_all_pruned(s)
sc

{'claw': 19728,
 'gunwale': 17961,
 'protruding': 17542,
 'mortise': 16796,
 'windlass': 16342,
 'shoelace': 16085,
 'bandsaw': 16040,
 'tenon': 15740,
 'shackle': 15618,
 'ferrule': 15562,
 'pliers': 15507,
 'long-handled': 15464,
 'claws': 15218,
 'screws': 15125,
 'vise': 15076,
 'scissors': 14964,
 'dowel': 14912,
 'wrists': 14757,
 'dowels': 14737,
 'gouges': 14728,
 'eyeballs': 14628,
 'wrench': 14531,
 'neck': 14501,
 'screw': 14432,
 'knobbed': 14406,
 'strap': 14403,
 'nicks': 14338,
 'retractor': 14318,
 'collet': 14304,
 'reamer': 14270,
 'halyard': 14253,
 'dangles': 14212,
 'tacks': 14104,
 'rivet': 14093,
 'crimping': 14072,
 'jaws': 14002,
 'unscrewing': 13971,
 'gloved': 13966,
 'cleat': 13963,
 'tucking': 13950,
 'snips': 13948,
 'rope': 13900,
 'forefinger': 13820,
 'ankles': 13728,
 'toe': 13706,
 'grommets': 13704,
 'mallet': 13673,
 'unscrewed': 13614,
 'jabbed': 13606,
 'clamps': 13572,
 'hairbrush': 13565,
 'forestay': 13517,
 'eyelet': 13504,
 'manacles': 13460,

In [198]:
# Show which good words each of the 20 highest-scoring hints matches.
# Slice the keys first so good_matches runs for 20 hints instead of for
# every entry in `sc` (the original computed them all, then kept 20).
[(x, n.good_matches(x)) for x in list(sc)[:20]]

[('claw', ['giant', 'thumb', 'nail', 'lock']),
 ('gunwale', ['thumb', 'nail', 'lock', 'plane', 'ship']),
 ('protruding', ['giant', 'thumb', 'nail', 'lock']),
 ('mortise', ['thumb', 'nail', 'lock']),
 ('windlass', ['thumb', 'nail', 'lock', 'ship']),
 ('shoelace', ['thumb', 'nail', 'lock']),
 ('bandsaw', ['thumb', 'nail', 'lock']),
 ('tenon', ['thumb', 'nail', 'lock']),
 ('shackle', ['thumb', 'nail', 'lock']),
 ('ferrule', ['thumb', 'nail', 'lock']),
 ('pliers', ['thumb', 'nail', 'lock']),
 ('long-handled', ['thumb', 'nail', 'lock']),
 ('claws', ['giant', 'thumb', 'nail']),
 ('screws', ['thumb', 'nail', 'lock']),
 ('vise', ['thumb', 'nail', 'lock']),
 ('scissors', ['thumb', 'nail', 'lock']),
 ('dowel', ['thumb', 'nail', 'lock']),
 ('wrists', ['thumb', 'nail', 'lock']),
 ('dowels', ['thumb', 'nail', 'lock']),
 ('gouges', ['thumb', 'nail', 'lock'])]

In [83]:
# NOTE(review): this cell's execution count (In [83]) predates the cells
# above, and its stored output lists 'chair' and 'tag', which are not in the
# current `good` list, so the output is presumably stale. Re-run to refresh.
n.good_matches('thumbtacks')

['chair', 'tag']

In [191]:
# NOTE(review): only a cell's last expression is displayed, so the count
# computed on the next line is evaluated but never shown.
NishyBot2.count_matches(n.okay_similars, 'protruding')
# Tail (index 2000 onward) of the neighbour list of the first okay word.
n.okay_similars[0][2000:]

['creel',
 'craftwork',
 'hire',
 'clamming',
 'Garages',
 'auctioneering',
 'Sazali',
 'worker-owned',
 'blenders',
 'Bersham',
 'ASDA',
 'Idora',
 'onsite',
 'pumphouse',
 'planer',
 'machine',
 '450-seat',
 'Sawmill',
 'Haçienda',
 'realtor',
 'Movieland',
 'newly-weds',
 'wallets',
 'handbag',
 'stonemasonry',
 'ironing',
 'co-working',
 'alehouses',
 'mucking',
 'Migros',
 'resells',
 'Spacely',
 'manufactures',
 'pittance',
 'Coney',
 'Severs',
 '3-story',
 'chemist',
 'Shecky',
 'housekeeping',
 'Manufacturing',
 'trimmers',
 'roof-top',
 'Andronico',
 'townhouse',
 'liquor',
 'electronics',
 'intercoms',
 'chauffeur',
 'piggy',
 'chainsaw',
 'Showroom',
 'Feech',
 'low-priced',
 'orphanage',
 'wood-carving',
 'Endsleigh',
 'janitors',
 'Pies',
 'Dispensary',
 'Snack',
 'schoolteacher',
 'Pancakes',
 'Auntie',
 'unscrew',
 'Delmonico',
 'Pawnshop',
 'pimp',
 'kiln',
 'umbrellas',
 'Tog',
 'Housewares',
 'Squee',
 'pastries',
 'Lutie',
 'cabin',
 'burglarized',
 'Shiatzy',
 'chin

In [ ]:
# Scratch notes kept as a bare string (never assigned or used): hint words
# followed by the board words they matched, presumably copied from earlier runs.
"""
tour: ['concert', 'play', 'trip']
wrestle: ['play', 'tag']
wrestle: ['play', 'tag']
robe: ['dress', 'chair']

'claw', ['giant', 'thumb', 'nail', 'lock'])
('freighter', ['plane', 'ship'])
('republic', ['state', 'capital'])
('apoptosis', ['cell'])
('protruding', ['giant', 'thumb', 'lock'])

[('claw', ['giant', 'thumb', 'nail', 'lock']),
 ('gunwale', ['thumb', 'nail', 'lock', 'plane', 'ship']),
 ('protruding', ['giant', 'thumb', 'nail', 'lock']),
 ('shoelace', ['thumb', 'nail', 'lock']),
 ('mortise', ['thumb', 'nail', 'lock']),
 ('bandsaw', ['thumb', 'nail', 'lock']),
 ('long-handled', ['thumb', 'nail', 'lock']),
 ('vise', ['thumb', 'nail', 'lock']),
 ('shackle', ['thumb', 'nail', 'lock']),
 ('tenon', ['thumb', 'nail', 'lock']),
 ('ferrule', ['thumb', 'nail', 'lock']),
 ('pliers', ['thumb', 'nail', 'lock']),
 ('screws', ['thumb', 'nail', 'lock']),
 ('wrists', ['thumb', 'nail', 'lock']),
 ('claws', ['giant', 'thumb', 'nail']),
 ('halyard', ['thumb', 'nail', 'lock', 'ship']),
 ('scissors', ['thumb', 'nail', 'lock']),
 ('dowel', ['thumb', 'nail', 'lock']),
 ('forepaws', ['giant', 'thumb', 'nail']),
 ('dowels', ['thumb', 'nail', 'lock'])]
"""

