In [13]:
import gensim
from nltk.corpus import wordnet as wn

# Load pre-trained vectors stored in the binary word2vec format. The resulting
# module-level `model` is read by NishyBot2.find_similars via model.most_similar.
# NOTE(review): the path is relative, so the file must sit in the kernel's working directory.
model = gensim.models.KeyedVectors.load_word2vec_format('gigaword-nocase-26.bin', binary=True)

In [23]:
class NishyBot2:
    """Rank one-word hints for a board split into good/bad/okay/assassin words.

    For every board word the bot stores a ``{neighbour: rank}`` dict built from
    the module-level ``model.most_similar`` result (rank 0 is the first entry
    returned).  A candidate hint earns ``len(neighbours) - rank`` points for
    each board word whose neighbour dict contains it; the points are then
    combined with a positive weight for good words and penalties for the rest.

    NOTE(review): the good/bad/okay/assassin split looks like a Codenames-style
    game -- confirm with the author.
    """

    TOP_N = 10000  # number of neighbours requested per board word
    SAMENESS_THRESHOLD = 0  # not referenced by any method of this class

    # Weights applied in score(); good and bad words both use a weight of 1.
    OKAY_PENALTY = 0.5
    ASSASSIN_PENALTY = 3

    @staticmethod
    def find_similars(words):
        """Return one ``{neighbour: rank}`` dict per entry of ``words``.

        Words missing from the model vocabulary (``KeyError``) are reported on
        stdout and get an empty dict, so every element of the result has the
        same type and the result stays aligned with ``words``.
        """
        similars = []
        for word in words:
            try:
                neighbours = model.most_similar(positive=[word], topn=NishyBot2.TOP_N)
            except KeyError:
                # Keep the slot (as an empty dict) so indices still line up with `words`.
                similars.append({})
                print('No similar words found for', word)
            else:
                similars.append({entry[0]: rank for rank, entry in enumerate(neighbours)})
        return similars

    @staticmethod
    def count_matches(wordlists, test):
        """Sum ``len(ranks) - ranks[test]`` over every dict in ``wordlists`` containing ``test``.

        Lower (better) ranks therefore contribute more points; dicts that do
        not contain ``test`` contribute nothing.
        """
        return sum(len(ranks) - ranks[test] for ranks in wordlists if test in ranks)

    def __init__(self, good, bad, okay, assassin):
        """Store the four word lists and pre-compute neighbour dicts for each."""
        self.good = good
        self.bad = bad
        self.okay = okay
        self.assassin = assassin

        self.good_similars = NishyBot2.find_similars(good)
        self.bad_similars = NishyBot2.find_similars(bad)
        self.okay_similars = NishyBot2.find_similars(okay)
        self.assassin_similars = NishyBot2.find_similars(assassin)

    def good_matches(self, test):
        """Return the good words whose neighbour dict contains ``test``, in board order."""
        return [word for word, ranks in zip(self.good, self.good_similars) if test in ranks]

    def score(self, test):
        """Return the weighted score of ``test`` as a hint (higher is better)."""
        good_count = NishyBot2.count_matches(self.good_similars, test)
        bad_count = NishyBot2.count_matches(self.bad_similars, test)
        okay_count = NishyBot2.count_matches(self.okay_similars, test)
        assassin_count = NishyBot2.count_matches(self.assassin_similars, test)

        return (
            good_count
            - bad_count
            - NishyBot2.OKAY_PENALTY * okay_count
            - NishyBot2.ASSASSIN_PENALTY * assassin_count
        )

    def score_all(self, wordset):
        """Return ``{word: score}`` for ``wordset``, ordered from best to worst score."""
        ranked = sorted(
            ((word, self.score(word)) for word in wordset),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return dict(ranked)

    def score_all_pruned(self, wordset):
        """Like ``score_all`` but without hints that fail either filter.

        A hint is dropped when WordNet has no synset for it (treated as "not a
        real word"), or when it contains / is contained in any good word
        (too close to a word on the board).  Ordering of the survivors is
        unchanged.
        """
        def overlaps_board(hint):
            # Substring test in both directions, against good words only.
            return any(word in hint or hint in word for word in self.good)

        return {
            hint: value
            for hint, value in self.score_all(wordset).items()
            if len(wn.synsets(hint)) > 0 and not overlaps_board(hint)
        }


In [24]:
def pregame(good, bad, okay, assassin):
    """Print a sequence of hints that together cover the good words.

    Each round builds a fresh NishyBot2 for the good words not yet covered,
    takes the best-scoring pruned hint among their neighbours, prints the hint
    with the good words it matches, and removes those words from the pool.
    The loop ends when no good words remain, or early (with a message) when no
    usable hint is left -- e.g. every candidate was pruned or the remaining
    words are missing from the model vocabulary.

    ``good`` is copied first, so the caller's list is not modified.
    """
    remaining = good.copy()
    while remaining:
        bot = NishyBot2(remaining, bad, okay, assassin)

        # Candidate hints: every neighbour of every still-uncovered good word.
        candidates = set()
        for neighbours in bot.good_similars:
            candidates.update(neighbours)

        ranked = bot.score_all_pruned(candidates)
        if not ranked:
            # Previously this case crashed with IndexError on an empty list.
            print('No usable hint found for', remaining)
            break

        # score_all_pruned returns best-first, so the first key is the top hint.
        hint = next(iter(ranked))
        matches = bot.good_matches(hint)
        print(hint, matches)
        if not matches:
            # A hint that covers nothing would never shrink the pool; stop instead of spinning.
            break
        remaining = [word for word in remaining if word not in matches]


In [25]:
# Current board. Every word is lower-cased before use -- presumably because the
# vectors file ('gigaword-nocase') has an uncased vocabulary; TODO confirm.
good, bad, okay, assassin = (
    board_words.lower().split(' ')
    for board_words in (
        'sack makeup bottle cuckoo cast cone jockey America',
        'nut Russia fog break spider bear rip tube plane',
        'Christmas pool Beijing trip nyc fever peanut',
        'link',
    )
)

# good = 'spot,blade,chain,record,magician,jeweler,fiddle,apple'.lower().split(',')
# bad = 'wonderland,Newton,glacier,pig,spy,lead,mess,duck,stable'.lower().split(',')
# okay = 'India,millionaire,rainbow,razor,bridge,polo,Notre,Dame'.lower().split(',')
# assassin = 'ice,cream'.lower().split(',')

# good = ['giant', 'thumb', 'nail', 'lock','plane', 'ship','cell','state', 'capital']
# bad = ['Aztec', 'court','chocolate','space','snow']
# okay = ['shop','genius','ambulance','button','heart','pupil','vet']
# assassin = ['microscope']

In [26]:
# Print one "hint [covered good words]" line per round for the board defined above.
pregame(good, bad, okay, assassin)

comedian ['makeup', 'cast', 'jockey', 'america']
confiscate ['sack', 'bottle']
trilling ['cuckoo', 'cone']


In [27]:
# Score the whole board once (no round-by-round removal) to inspect the ranking.
n = NishyBot2(good, bad, okay, assassin)

# Candidate hints: the union of every good word's neighbours.
s = set()
for similar in n.good_similars:
    s.update(similar)
print(len(s))

sc = n.score_all_pruned(s)
# Take the 20 best hints first, then look up their matches; the earlier version
# called good_matches for every surviving hint and threw almost all of it away.
[(hint, n.good_matches(hint), score) for hint, score in list(sc.items())[:20]]

55971


[('comedian', ['makeup', 'cast', 'jockey', 'america'], 31748.5),
 ('entertainer', ['makeup', 'cast', 'jockey', 'america'], 31160.0),
 ('anoint', ['sack', 'makeup', 'cast', 'jockey'], 30661.0),
 ('hollywood', ['makeup', 'cuckoo', 'cast', 'jockey', 'america'], 29910.5),
 ('outsider', ['makeup', 'cast', 'jockey', 'america'], 28967.0),
 ('audition', ['makeup', 'cuckoo', 'cast', 'jockey'], 28097.5),
 ('animator', ['makeup', 'cuckoo', 'cast', 'jockey'], 27839.0),
 ('conceit', ['makeup', 'cuckoo', 'cast', 'america'], 27759.0),
 ('shuffle', ['sack', 'makeup', 'jockey'], 27575.0),
 ('whippersnapper', ['makeup', 'cast', 'jockey', 'america'], 27572.0),
 ('incumbent', ['sack', 'cast', 'jockey'], 27510.0),
 ('actor', ['makeup', 'cuckoo', 'cast', 'jockey', 'america'], 27351.0),
 ('contestant', ['makeup', 'cast', 'jockey'], 27335.0),
 ('streep', ['makeup', 'cuckoo', 'cast'], 27232.0),
 ('reality', ['makeup', 'cast', 'america'], 26870.0),
 ('booth', ['makeup', 'bottle', 'cast', 'cone'], 26861.0),
 ('p

In [224]:
# Spot check: points 'spoiled' collects from the bad words' neighbour dicts
# (0 means it appears in none of them).
NishyBot2.count_matches(n.bad_similars, 'spoiled')

0

In [227]:
# Rank of 'encase' among the neighbours of 'cone'. Each element of good_similars
# is a {neighbour: rank} dict, so the rank is a key lookup; the former
# `.index('encase')` call only worked when these were lists and fails on a fresh run.
n.good_similars[good.index('cone')]['encase']

1