In [2]:
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import word_tokenize, pos_tag
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords
import numpy as np
import re

In [2]:
def least_similar(synsets):
    # returns entries with highest wup similarity score
    the_max = 0
    for i in range(len(synsets)):
        for j in range(i + 1, len(synsets)):
            this_similarity = synsets[i].wup_similarity(synsets[j])
            if this_similarity is not None and this_similarity > the_max:
                the_max = this_similarity
                s1 = synsets[i]
                s2 = synsets[j]
    return (s1, s2)

def best_corresponding_pos(synset):
    # inspects definition and returns most similar word with
    # same pos as input word excluding the input word
    # right now only works for nouns, verbs, adjectives
    # returns a tuple with a boolean describing whether
    # the same pos was found, similarity, and word
    this_pos = synset.pos()
    def_token = word_tokenize(synset.definition())
    
    if  this_pos == wn.NOUN:
        pos_token = 'NN'
    elif this_pos == wn.VERB:
        pos_token = 'V'
    elif this_pos == wn.ADJ:
        pos_token = 'JJ'
    else:
        # don't care about pos
        pos_token = 'z'
#     else:
#         raise ValueError('Input synset must be a Noun, Verb, or Adjective')
        
    lm = WordNetLemmatizer()
    min_similarity = 2
    min_similarity_other_pos = 2
    best_corr_pos = None
    best_corr_other_pos = None
    
    for tagged_word in pos_tag(def_token):
        if pos_token in tagged_word[1]:
            lemma = lm.lemmatize(tagged_word[0], pos=synset.pos())
            other_synsets = wn.synsets(lemma, pos=synset.pos())
            if len(other_synsets) > 0:
                # just pick the first synset
                this_similarity = synset.wup_similarity(other_synsets[0])
                if this_similarity is not None and this_similarity < min_similarity:
                    min_similarity = this_similarity
                    best_corr_pos = tagged_word[0]
        elif best_corr_pos is None:
            lemma = lm.lemmatize(tagged_word[0])
            other_synsets = wn.synsets(lemma)
            if len(other_synsets) > 0:
                # just pick the first synset
                this_similarity = synset.wup_similarity(other_synsets[0])
                if this_similarity is not None and this_similarity < min_similarity_other_pos:
                    min_similarity_other_pos = this_similarity
                    best_corr_other_pos = tagged_word[0]
    # if no word with same pos found
    if best_corr_pos is None:
        return False, min_similarity_other_pos, best_corr_other_pos
    return True, min_similarity, best_corr_pos

def first_corresponding_pos(synset):
    # inspects definition and returns first word with same pos
    # as input word
    # right now only works for nouns, verbs, adjectives
    this_pos = synset.pos()
    def_token = word_tokenize(synset.definition())
    
    if this_pos == 'n':
        pos_token = 'NN'
    elif this_pos == 'v':
        pos_token = 'V'
    elif this_pos == 'a':
        pos_token = 'JJ'
    else:
        raise ValueError('Input synset must be a Noun, Verb, or Adjective')
    
    # in case same pos does not exist
    if this_pos == 'n':
        alt_pos = 'JJ'
    else:
        alt_pos = 'NN'
        
    first_alt = None
    for tagged_word in pos_tag(def_token):
        if pos_token in tagged_word[1]:
            return tagged_word[0], synset.definition()
        elif first_alt is None and alt_pos in tagged_word:
            first_alt = tagged_word[0]
    return first_alt, synset.definition()
        
def get_two_senses(seed_word):
    synsets = wn.synsets(seed_word)
    pair = least_similar(synsets)
#     return best_corresponding_pos(pair[0])[2], best_corresponding_pos(pair[1])[2]
    return first_corresponding_pos(pair[0]), first_corresponding_pos(pair[1])

def traverse_wn(word):
    # traverses wn synsets for word and returns best
    # word in definition of synsets with same pos
    
    # best word
#     min_similarity_other_pos = 2
#     min_similarity = 2
#     best_other_pos = None
#     best_pos = None
#     for synset in wn.synsets(word):
#         is_same_pos, this_similarity, this_pos = best_corresponding_pos(synset)
#         if is_same_pos and this_similarity < min_similarity:
#             min_similarity = this_similarity
#             best_pos = this_pos
#         elif not is_same_pos and this_similarity < min_similarity_other_pos:
#             min_similarity_other_pos = this_similarity
#             best_other_pos = this_pos
#     if best_pos is None:
#         return best_other_pos
#     return best_pos

    # first word
    for synset in wn.synsets(word):
        first_pos = first_corresponding_pos(synset)
        if first_pos is not None and first_pos != word:
            return first_pos
        
def five_word_algorithm(seed_word):
    word_c, word_d = get_two_senses(seed_word)
    word_b = traverse_wn(word_c[0])
    word_a = traverse_wn(word_b[0])
    word_e = traverse_wn(word_d[0])
    return word_a, word_b, word_c, word_d, word_e

def print_five_words(seed_word):
    words = five_word_algorithm(seed_word)
    print(words[0][0] + '->' + words[1][0] + '->\033[4m' + words[2][0] + 
          '\033[0m\033[1m~~>\033[0m\033[4m' + words[3][0] + '\033[0m->' + words[4][0])

In [91]:
five_word_algorithm('chicken')

(('vertebrates',
  'warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified as wings'),
 ('bird',
  'a domesticated gallinaceous bird thought to be descended from the red jungle fowl'),
 ('fowl',
  'a domestic fowl bred for flesh or eggs; believed to have been developed from the red jungle fowl'),
 ('person', 'a person who lacks confidence, is irresolute and wishy-washy'),
 ('human', 'a human being'))

In [76]:
# first word
print_five_words('chicken')
print_five_words('fan')
print_five_words('Bat')
print_five_words('Battery')
print_five_words('Trip')
print_five_words('Cook')
print_five_words('Straw')
print_five_words('Fiddle')

vertebrates->bird->[4mfowl[0m[1m~~>[0m[4mperson[0m->human
person->follower->[4mdevotee[0m[1m~~>[0m[4mfollower[0m->person
sound->noise->[4mracket[0m[1m~~>[0m[4mclub[0m->team
property->number->[4mgroup[0m[1m~~>[0m[4munit[0m->division
document->act->[4mjourney[0m[1m~~>[0m[4mlight[0m->physics
kind->make->[4mprepare[0m[1m~~>[0m[4mapplying[0m->put
structure->buildings->[4mplant[0m[1m~~>[0m[4mconsisting[0m->originate
power->influence->[4mmanipulate[0m[1m~~>[0m[4mfalsify[0m->make


In [56]:
# best word
print_five_words('chicken')
print_five_words('fan')
print_five_words('Bat')
print_five_words('Battery')
print_five_words('Trip')
print_five_words('Cook')
print_five_words('Straw')
print_five_words('Fiddle')

person->names->[4mflesh[0m[1m~~>[0m[4mconfidence[0m->state
organisms->biology->[4msports[0m[1m~~>[0m[4madmirer[0m->team
biology->sports->[4mracket[0m[1m~~>[0m[4mclub[0m->golf
operate->pedal->[4mguns[0m[1m~~>[0m[4mcatcher[0m->baseball
dance->ball->[4mreturn[0m[1m~~>[0m[4mlight[0m->sensation
make->spoken->[4mprepare[0m[1m~~>[0m[4mapplying[0m->convey
plants->crop->[4mfodder[0m[1m~~>[0m[4mseed[0m->tournament
piece->performance->[4mor[0m[1m~~>[0m[4mfalsify[0m->falsifying


In [92]:
wn.synsets('chicken')

[Synset('chicken.n.01'),
 Synset('chicken.n.02'),
 Synset('wimp.n.01'),
 Synset('chicken.n.04'),
 Synset('chicken.s.01')]

In [93]:
wn.synset('wimp.n.01').definition()

'a person who lacks confidence, is irresolute and wishy-washy'

In [79]:
for synset in wn.synsets('Fiddle'):
    print(synset)
    print(synset.definition())
    print('')

Synset('violin.n.01')
bowed stringed instrument that is the highest member of the violin family; this instrument has four strings and a hollow body and an unfretted fingerboard and is played with a bow

Synset('fiddle.v.01')
avoid (one's assigned duties)

Synset('fiddle.v.02')
commit fraud and steal from one's employer

Synset('fiddle.v.03')
play the violin or fiddle

Synset('fiddle.v.04')
play on a violin

Synset('toy.v.02')
manipulate manually or in one's mind or imagination

Synset('tamper.v.01')
play around with or alter or falsify, usually secretively or dishonestly

Synset('tinker.v.03')
try to fix or mend



In [84]:
print(wn.synset('toy.v.02').wup_similarity(wn.synset('tamper.v.01')))
print(wn.synset('toy.v.02').wup_similarity(wn.synset('violin.n.01')))

0.75
0.11764705882352941


# Using Glove for similarity

In [36]:
class Meta_Poetry_Glove:
    glove_model = KeyedVectors.load_word2vec_format('~/Downloads/glove.6B/glove.6B.300d.w2v.txt', binary=False)
    glove_dim = len(glove_model['man'])
    punct = re.compile(r'[^\w\s]')
    ps = PorterStemmer()

    def __init__(self):
        self.already_seen = set()

    def get_glove_sim(self, w1, w2):
        """
        splits WordNet words or definitions
        and returns cosine similarity for
        the averages of all words in w1 and w2
        """
        split_re = re.compile(r'[\s|_]')
        avg_w1 = np.zeros(self.glove_dim)
        avg_w2 = np.zeros(self.glove_dim)
        for word in split_re.split(w1):
            if word not in self.glove_model:
                continue
            avg_w1 += self.glove_model.word_vec(word)
        avg_w1 /= np.sqrt(np.sum(avg_w1 ** 2))

        for word in split_re.split(w2):
            if word not in self.glove_model:
                continue
            avg_w2 += self.glove_model.word_vec(word)
        avg_w2 /= np.sqrt(np.sum(avg_w2 ** 2))
        return avg_w1.dot(avg_w2)

    def least_similar_glove(self, synsets):
        """
        finds two least similar synset
        among given synsets in Glove space
        """
        min_cos_sim = 1
        for i in range(len(synsets)):
            for j in range(i + 1, len(synsets)):
                word1 = remove_stopwords(self.punct.sub('', synsets[i].definition()))
                word2 = remove_stopwords(self.punct.sub('', synsets[j].definition()))
                this_sim = self.get_glove_sim(word1, word2)
                # assumes that Glove vocabulary has all words in wn,
                # may need to add error handling
                if this_sim < min_cos_sim:
                    min_cos_sim = this_sim
                    s1 = synsets[i]
                    s2 = synsets[j]
        return s1, s2

    def least_similar_glove_specify(self, synset):
        """
        returns synsets with lowest cosine
        similarity to specified synset in
        Glove space
        """
        word = synset.name().split('.')[0]
        min_sim = 1
        least_sim_synset = None
        for other_synset in wn.synsets(word):
            this_sim = self.get_glove_sim(synset.definition(), other_synset.definition())
            if this_sim < min_sim:
                min_sim = this_sim
                least_sim_synset = other_synset
        return self.get_sense_from_def(least_sim_synset), least_sim_synset.definition()

    def traverse_wn_glove(self, word):
        """
        finds most similar word among definitions
        of given words synsets
        """
        max_sim = -1
        best_word = None
        best_word_def = None
        for synset in wn.synsets(word):
            clean_def = remove_stopwords(self.punct.sub('', synset.definition()))
            for other_word in clean_def.split():
                this_sim = self.get_glove_sim(word, other_word)
                if self.ps.stem(other_word) not in self.already_seen and other_word != word and this_sim > max_sim:
                    max_sim = this_sim
                    best_word = other_word
                    best_word_def = synset.definition()
        return best_word, best_word_def

    def get_two_senses_glove(self, seed_word):
        """
        finds least similar synsets of seed word
        """
        synsets = wn.synsets(seed_word)
        pair = self.least_similar_glove(synsets)
        sense1 = self.get_sense_from_def(pair[0])
        sense2 = self.get_sense_from_def(pair[1])
        return (sense1, pair[0].definition()), (sense2, pair[1].definition())

    def get_sense_from_def(self, synset):
        """
        finds word that is most similar to given
        synset's name in Glove space
        """
        max_sim = -1
        best_word = None
        synset_word = synset.name().split('.')[0]
        # strip punctuation
        clean_def = remove_stopwords(self.punct.sub('', synset.definition()))
        for other_word in clean_def.split():
            if self.ps.stem(other_word) in self.already_seen:
                continue
            this_sim = self.get_glove_sim(synset_word, other_word)
            if other_word != synset_word and this_sim > max_sim:
                max_sim = this_sim
                best_word = other_word
        return best_word

    def five_word_algorithm_glove_specify(self, synset):
        """
        given a specific synset, traverse Wordnet to
        create five word outline for meta poetry
        """
        word_c = synset.name().split('.')[0], synset.definition()
        self.already_seen.add(self.ps.stem(word_c[0]))

        word_d = self.least_similar_glove_specify(synset)
        self.already_seen.add(self.ps.stem(word_d[0]))

        word_b = self.get_sense_from_def(synset)
        word_b = word_b, wn.synsets(word_b)[0].definition()
        self.already_seen.add(self.ps.stem(word_b[0]))

        word_a = self.traverse_wn_glove(word_b[0])
        self.already_seen.add(self.ps.stem(word_a[0]))

        word_e = self.traverse_wn_glove(word_d[0])

        self.already_seen.clear()
        return word_a, word_b, word_c, word_d, word_e

    def five_word_algorithm_glove(self, seed_word):
        """
        given a seed word, traverses Wordnet to create
        five word outline for meta poetry
        """
        word_c, word_d = self.get_two_senses_glove(seed_word)
        self.already_seen.add(self.ps.stem(word_c[0]))
        self.already_seen.add(self.ps.stem(word_d[0]))

        word_b = self.traverse_wn_glove(word_c[0])
        self.already_seen.add(self.ps.stem(word_b[0]))

        word_a = self.traverse_wn_glove(word_b[0])
        self.already_seen.add(self.ps.stem(word_a[0]))

        word_e = self.traverse_wn_glove(word_d[0])

        self.already_seen.clear()
        return word_a, word_b, word_c, word_d, word_e

    def print_five_words_glove(self, seed_word):
        words = self.five_word_algorithm_glove(seed_word)
        print(words[0][0] + '->' + words[1][0] + '->\033[4m' + words[2][0] +
              '\033[0m\033[1m~~>\033[0m\033[4m' + words[3][0] + '\033[0m->' + words[4][0])

    def print_five_words_glove_specify(self, synset):
        words = self.five_word_algorithm_glove_specify(synset)
        print(words[0][0] + '->' + words[1][0] + '->\033[4m' + words[2][0] +
              '\033[0m\033[1m~~>\033[0m\033[4m' + words[3][0] + '\033[0m->' + words[4][0])

In [37]:
mp_glove = Meta_Poetry_Glove()
mp_glove.print_five_words_glove('chicken')
mp_glove.print_five_words_glove('fan')
mp_glove.print_five_words_glove('Bat')
mp_glove.print_five_words_glove('Battery')
mp_glove.print_five_words_glove('Trip')
mp_glove.print_five_words_glove('Cook')
mp_glove.print_five_words_glove('Straw')
mp_glove.print_five_words_glove('Fiddle')

organism->animal->[4mfood[0m[1m~~>[0m[4mirresolute[0m->uncertain
excitement->enthusiasm->[4mardent[0m[1m~~>[0m[4mchaff[0m->countermeasure
sound->echo->[4mecholocation[0m[1m~~>[0m[4mbriefly[0m->time
league->division->[4munit[0m[1m~~>[0m[4mmortar[0m->shells
return->trip->[4mjourney[0m[1m~~>[0m[4membarrassing[0m->shame
temperature->heat->[4mhot[0m[1m~~>[0m[4mtamper[0m->falsify
red->orange->[4myellow[0m[1m~~>[0m[4mcover[0m->provide
owners->stolen->[4msteal[0m[1m~~>[0m[4mviolin[0m->instrument




In [223]:
# without already_seen
print_five_words_glove_specify(wn.synset('chicken.n.01'))
print_five_words_glove_specify(wn.synset('fan.n.01'))
print_five_words_glove_specify(wn.synset('bat.n.05'))
print_five_words_glove_specify(wn.synset('battery.n.02'))
print_five_words_glove_specify(wn.synset('trip.n.01'))
print_five_words_glove_specify(wn.synset('cook.v.01'))
print_five_words_glove_specify(wn.synset('straw.n.01'))
print_five_words_glove_specify(wn.synset('fiddle.v.05'))

animal->food->[4mchicken[0m[1m~~>[0m[4measily[0m->easy
aircraft->air->[4mfan[0m[1m~~>[0m[4memotion[0m->feeling
pitch->ball->[4mbat[0m[1m~~>[0m[4mbriefly[0m->time
invented->device->[4mbattery[0m[1m~~>[0m[4munit[0m->division
trip->journey->[4mtrip[0m[1m~~>[0m[4membarrassing[0m->embarrassment
heat->hot->[4mcook[0m[1m~~>[0m[4mheating[0m->heat
hat->hats->[4mstraw[0m[1m~~>[0m[4mscattering[0m->particles
number->ones->[4mtoy[0m[1m~~>[0m[4mindifferently[0m->indifferent




In [38]:
mp_glove.print_five_words_glove_specify(wn.synset('chicken.n.01'))
mp_glove.print_five_words_glove_specify(wn.synset('fan.n.01'))
mp_glove.print_five_words_glove_specify(wn.synset('bat.n.05'))
mp_glove.print_five_words_glove_specify(wn.synset('battery.n.02'))
mp_glove.print_five_words_glove_specify(wn.synset('trip.n.01'))
mp_glove.print_five_words_glove_specify(wn.synset('cook.v.01'))
mp_glove.print_five_words_glove_specify(wn.synset('straw.n.01'))
mp_glove.print_five_words_glove_specify(wn.synset('fiddle.v.05'))

animal->food->[4mchicken[0m[1m~~>[0m[4measily[0m->easy
aircraft->air->[4mfan[0m[1m~~>[0m[4memotion[0m->feeling
pitch->ball->[4mbat[0m[1m~~>[0m[4mbriefly[0m->time
invented->device->[4mbattery[0m[1m~~>[0m[4munit[0m->division
traveling->journey->[4mtrip[0m[1m~~>[0m[4membarrassing[0m->shame
temperature->hot->[4mcook[0m[1m~~>[0m[4mheating[0m->warmer
wear->hats->[4mstraw[0m[1m~~>[0m[4mscattering[0m->particles
number->ones->[4mtoy[0m[1m~~>[0m[4mindifferently[0m->manner


