In [2]:
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords, genesis
import numpy as np
import itertools

In [50]:
def find_highest_uncommon_ancestors(word, noun_only = True):
    if noun_only:
        synsets = wn.synsets(word, pos=wn.NOUN)
    else:
        # TODO
        synsets = wn.synsets(word)
    if len(synsets) <= 1:
        return dict()
    synsets_lca = {syn: set() for syn in synsets}
    possible_relations = list(itertools.combinations(synsets, 2))
    # print(len(possible_relations))
    for a, b in possible_relations:
        lch = a.lowest_common_hypernyms(b)
        synsets_lca[a].add(lch[0])
        synsets_lca[b].add(lch[0])
    synsets_hua = dict()
    for synset in synsets:
        tree = synset.tree(lambda s:s.hypernyms())
        current_entity = tree[0]
        trees = tree[1:]
        synsets_hua[synset] = highest_uncommon_ancestor(trees, synsets_lca[synset], current_entity)
    return synsets_hua

def highest_uncommon_ancestor(trees, synsets_lca, previous_entity):
    ua_list = set()
    for further_tree in trees:
        while True:
            next_entity = further_tree[0]
            further_tree = further_tree[1:]
            if len(further_tree) > 1:
                ua_list.update(highest_uncommon_ancestor(further_tree, synsets_lca, previous_entity))
                break
            elif len(further_tree) == 1:
                further_tree = further_tree[0]
            if next_entity not in synsets_lca:
                previous_entity = next_entity
                if len(further_tree) == 0:
                    ua_list.add(previous_entity)
                    break
            else:
                ua_list.add(previous_entity)
                break
    return ua_list

hua = {}
lemmas_hua = {}
rev_cnt = 0
for l in wn.all_lemma_names(pos='n'):
    try:
        int(l)
    except ValueError:
        hua_dict = find_highest_uncommon_ancestors(l)
        hua.update(hua_dict)
        if len(list(hua_dict.items())):
            lemmas_hua[l] = {}
            for k, v in hua_dict.items():
                for val in v:
                    if val in lemmas_hua[l].keys():
                        rev_cnt += 1
                    lemmas_hua[l][val] = k
print(rev_cnt)
flatten_hua = set()
cnt = 0
for k, v in hua.items():
    if len(k.examples()):
        flatten_hua.add(k)
        flatten_hua.update(v)
        if len(v) == 0:
            print(k)
            cnt += 1
print(cnt)
print(len(flatten_hua))

1686
Synset('continent.n.02')
Synset('natal.n.01')
Synset('union.n.02')
Synset('earth.n.01')
Synset('anchorage.n.03')
Synset('athens.n.01')
Synset('boreas.n.02')
Synset('bunyan.n.02')
Synset('caliphate.n.01')
Synset('danube.n.01')
Synset('death.n.06')
Synset('downing_street.n.01')
Synset('elizabeth.n.01')
Synset('national_liberation_army.n.01')
Synset('basque_homeland_and_freedom.n.01')
Synset('fall.n.03')
Synset('father.n.06')
Synset('georgetown.n.02')
Synset('ghana.n.01')
Synset('indus.n.02')
Synset('al-gama'a_al-islamiyya.n.01')
Synset('al-jihad.n.01')
Synset('kali.n.02')
Synset('lashkar-e-taiba.n.01')
Synset('mammon.n.02')
Synset('mars.n.01')
Synset('missouri.n.02')
Synset('moon.n.01')
Synset('neptune.n.02')
Synset('paul.n.02')
Synset('pluto.n.03')
Synset('roosevelt.n.03')
Synset('reign_of_terror.n.02')
Synset('saturn.n.02')
Synset('zion.n.01')
Synset('shining_path.n.01')
Synset('sun.n.01')
Synset('tangier.n.01')
Synset('liberation_tigers_of_tamil_eelam.n.01')
Synset('uranus.n.02')

In [None]:
def cleaning(data):
    clean = re.sub('<.*?>', ' ', str(data))
#removes HTML tags
    clean = re.sub('\'.*?\s',' ', clean)
#removes all hanging letters afer apostrophes (s in it's)
    clean = re.sub(r'http\S+',' ', clean)
#removes URLs
    clean = re.sub('\W+',' ', clean)
#replacing the non alphanumeric characters
    return html.unescape(clean)
data['cleaned'] = data['review'].apply(cleaning)


def tokenizing(data):
    review = data['cleaned']
#tokenizing is done
    tokens = nltk.word_tokenize(review)
    return tokens
data['tokens'] = data.apply(tokenizing, axis=1)


stop_words = set(stopwords.words('english'))
def remove_stops(data):
    my_list = data['tokens']
    meaningful_words = [w for w in my_list if not w in stop_words]           #stopwords are removed from the tokenized data
    return (meaningful_words)
data['tokens'] = data.apply(remove_stops, axis=1)


lemmatizer = WordNetLemmatizer()
def lemmatizing(data):
    my_list = data['tokens']
    lemmatized_list = [lemmatizer.lemmatize(word) for word in my_list]
#lemmatizing is performed. It's more efficient than stemming.
    return (lemmatized_list)
data['tokens'] = data.apply(lemmatizing, axis=1)

def rejoin_words(data):
    my_list = data['tokens']
    joined_words = ( " ".join(my_list))
#rejoins all stemmed words
    return joined_words
data['cleaned'] = data.apply(rejoin_words, axis=1)

In [48]:
print(find_highest_uncommon_ancestors('earth'))

{Synset('earth.n.01'): set(), Synset('earth.n.02'): {Synset('matter.n.03'), Synset('relation.n.01')}, Synset('land.n.04'): {Synset('land.n.04')}, Synset('earth.n.04'): {Synset('location.n.01')}, Synset('earth.n.05'): {Synset('matter.n.03'), Synset('relation.n.01')}, Synset('worldly_concern.n.01'): {Synset('attribute.n.02')}, Synset('ground.n.09'): {Synset('artifact.n.01')}}
