In [1]:
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords, genesis
import numpy as np

In [2]:
help(wn)

Help on LazyCorpusLoader in module nltk.corpus.util object:

wordnet = class LazyCorpusLoader(builtins.object)
 |  wordnet(name, reader_cls, *args, **kwargs)
 |  
 |  To see the API documentation for this lazily loaded corpus, first
 |  run corpus.ensure_loaded(), and then run help(this_corpus).
 |  
 |  LazyCorpusLoader is a proxy object which is used to stand in for a
 |  corpus object before the corpus is loaded.  This allows NLTK to
 |  create an object for each corpus, but defer the costs associated
 |  with loading those corpora until the first time that they're
 |  actually accessed.
 |  
 |  The first time this object is accessed in any way, it will load
 |  the corresponding corpus, and transform itself into that corpus
 |  (by modifying its own ``__class__`` and ``__dict__`` attributes).
 |  
 |  If the corpus can not be found, then accessing this object will
 |  raise an exception, displaying installation instructions for the
 |  NLTK data package.  Once they've properly ins

In [3]:
synsets = list(wn.all_synsets())
print(len(synsets))

117659


In [4]:
print(synsets[:10])
help(synsets[0])

[Synset('able.a.01'), Synset('unable.a.01'), Synset('abaxial.a.01'), Synset('adaxial.a.01'), Synset('acroscopic.a.01'), Synset('basiscopic.a.01'), Synset('abducent.a.01'), Synset('adducent.a.01'), Synset('nascent.a.01'), Synset('emergent.s.02')]
Help on Synset in module nltk.corpus.reader.wordnet object:

class Synset(_WordNetObject)
 |  Synset(wordnet_corpus_reader)
 |  
 |  Create a Synset from a "<lemma>.<pos>.<number>" string where:
 |  <lemma> is the word's morphological stem
 |  <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
 |  <number> is the sense number, counting from 0.
 |  
 |  Synset attributes, accessible via methods with the same name:
 |  
 |  - name: The canonical name of this synset, formed using the first lemma
 |    of this synset. Note that this may be different from the name
 |    passed to the constructor if that string used a different lemma to
 |    identify the synset.
 |  - pos: The synset's part of speech, matching one of the module le

In [5]:
e = wn.synsets("dog")

In [69]:
def find_highest_uncommon_ancestors(word, noun_only = True):
    if noun_only:
        synsets = wn.synsets(word, pos=wn.NOUN)
    else:
        # TODO
        synsets = wn.synsets(word)
    print(len(synsets))
    synsets_lca = {syn: set() for syn in synsets}
    possible_relations = list(itertools.combinations(synsets, 2))
    print(len(possible_relations))
    for a, b in possible_relations:
        lch = a.lowest_common_hypernyms(b)
        synsets_lca[a].add(lch[0])
        synsets_lca[b].add(lch[0])
    synsets_hua = dict()
    for synset in synsets:
        tree = synset.tree(lambda s:s.hypernyms())
        current_entity = tree[0]
        trees = tree[1:]
        synsets_hua[synset] = highest_uncommon_ancestor(trees, synsets_lca[synset])
    return synsets_hua

def highest_uncommon_ancestor(trees, synsets_lca):
    ua_list = set()
    for further_tree in trees:
        while True:
            next_entity = further_tree[0]
            further_tree = further_tree[1:]
            if len(further_tree) > 1:
                ua_list.update(highest_uncommon_ancestor(further_tree, synsets_lca))
            else:
                further_tree = further_tree[0]
            if next_entity not in synsets_lca:
                current_entity = next_entity
            else:
                ua_list.add(current_entity)
                break
    return ua_list

hua = find_highest_uncommon_ancestors("dog")
for k, v in hua.items():
    print(k.definition(), v)

7
21
a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds {Synset('animal.n.01')}
a dull unattractive unpleasant girl or woman {Synset('unpleasant_person.n.01')}
informal term for a man {Synset('causal_agent.n.01'), Synset('male.n.02')}
someone who is morally reprehensible {Synset('villain.n.01')}
a smooth-textured sausage of minced beef or pork usually smoked; often served on a bread roll {Synset('matter.n.03')}
a hinged catch that fits into a notch of a ratchet to move a wheel forward or prevent it from moving backward {Synset('restraint.n.06')}
metal supports for logs in a fireplace {Synset('support.n.10')}


In [6]:
examples_count = np.array([len(synset.examples()) for synset in synsets])

In [7]:
examples = np.array([(synset, len(synset.examples())) for synset in synsets])
new_examples = examples[examples[:,1]>0]
np.sum(new_examples[:,1])

48339

In [8]:
from random import choice
e = choice(new_examples)[0]
print(e, e.examples())

Synset('cheeseparing.s.01') ['our cheeseparing administration', 'very close (or near) with his money', 'a penny-pinching miserly old man']


In [9]:
import itertools
len(set(list(itertools.chain(*[x.examples() for x in synsets]))))

48224

In [10]:
domestic_animal = wn.synsets('dog')[0].hypernyms()[1]
animal = domestic_animal.hypernyms()[0]
domestic_animal.root_hypernyms()

[Synset('entity.n.01')]

In [11]:
domestic_animal.tree(lambda s:s.hypernyms(), depth=3)

[Synset('domestic_animal.n.01'),
 [Synset('animal.n.01'),
  [Synset('organism.n.01'), [Synset('living_thing.n.01')]]]]

In [12]:
animal.tree(lambda s:s.hypernyms())

mouse = wn.synsets('mouse')[0]
prey_animal = wn.synsets('prey')[1]
prey_human = wn.synsets('prey')[0]

In [17]:
prey_human.lowest_common_hypernyms(prey_animal)

[Synset('organism.n.01')]

In [13]:
mouse.tree(lambda s:s.hypernyms())

[Synset('mouse.n.01'),
 [Synset('rodent.n.01'),
  [Synset('placental.n.01'),
   [Synset('mammal.n.01'),
    [Synset('vertebrate.n.01'),
     [Synset('chordate.n.01'),
      [Synset('animal.n.01'),
       [Synset('organism.n.01'),
        [Synset('living_thing.n.01'),
         [Synset('whole.n.02'),
          [Synset('object.n.01'),
           [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]]]]]

In [37]:
prey_animal.tree(lambda s:s.hypernyms())

list

In [15]:
prey_human.tree(lambda s:s.hypernyms())

[Synset('prey.n.01'),
 [Synset('victim.n.01'),
  [Synset('unfortunate.n.01'),
   [Synset('person.n.01'),
    [Synset('causal_agent.n.01'),
     [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]],
    [Synset('organism.n.01'),
     [Synset('living_thing.n.01'),
      [Synset('whole.n.02'),
       [Synset('object.n.01'),
        [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]]

In [16]:
help(prey_human)


Help on Synset in module nltk.corpus.reader.wordnet object:

class Synset(_WordNetObject)
 |  Synset(wordnet_corpus_reader)
 |  
 |  Create a Synset from a "<lemma>.<pos>.<number>" string where:
 |  <lemma> is the word's morphological stem
 |  <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
 |  <number> is the sense number, counting from 0.
 |  
 |  Synset attributes, accessible via methods with the same name:
 |  
 |  - name: The canonical name of this synset, formed using the first lemma
 |    of this synset. Note that this may be different from the name
 |    passed to the constructor if that string used a different lemma to
 |    identify the synset.
 |  - pos: The synset's part of speech, matching one of the module level
 |    attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
 |  - lemmas: A list of the Lemma objects for this synset.
 |  - definition: The definition for this synset.
 |  - examples: A list of example strings for this synset.
 |  - offset: The offset