# Examine anchor quality based on WordNet 


- https://wn.readthedocs.io/en/latest/setup.html
- https://wn.readthedocs.io/en/latest/api/wn.html


# Environment setup (notebook cell: lines starting with `!` are shell commands).
# NOTE(review): these lines install and download data for the standalone `wn`
# package, while the analysis code further down imports NLTK's WordNet reader
# under the same alias `wn` -- confirm which backend is intended. No import of
# `wn` is visible before the `wn.download` call in this file.
!pip install wn
!pip install wn[web]
wn.download('ewn:2020')


In [10]:
from nltk.corpus import wordnet as wn
import pandas as pd

# from inflection import singularize, pluralize 
def read_data(path = '../data/BLESS/bless.csv'):
    """Load the BLESS pairs and keep only the coordinate and hypernym rows.

    The file is parsed as a header-less, ';'-separated table whose three
    columns are named ``sub_label``, ``obj_label`` and ``relation``. As a
    side effect the number of rows per kept relation is shown (rich display
    inside a notebook, plain ``print`` otherwise).

    Args:
        path: location of the BLESS CSV file.

    Returns:
        A DataFrame restricted to the rows whose ``relation`` is ``'coord'``
        or ``'hyper'``; the row index of the original file is preserved.
    """
    df = pd.read_csv(path, sep=';', names=['sub_label', 'obj_label', 'relation'])
    query_relation = ['coord', 'hyper']
    # Boolean mask instead of a string-built query: same rows, no reliance on
    # the list's repr being valid query syntax.
    df = df[df['relation'].isin(query_relation)]

    # `display` only exists when running under IPython/Jupyter; fall back to
    # print so the function also works in a plain Python process.
    try:
        show = display
    except NameError:
        show = print
    show(df['relation'].value_counts())
    return df

def get_sister_terms(word, distance_to_hypernym=1):
    '''
    Collect the single-word coordinate (sister) terms of a noun from WordNet.

    "Coordinate (sister) terms: share the same hypernym"
    "The sister relation is the usual one encountered when working with tree structures: sisters are word forms (either simple words or collocations) that are both immediate hyponyms of the same node"

    For every noun synset of `word`, the first `distance_to_hypernym` entries
    of that synset's direct-hypernym list are taken, and the lemma names of
    all hyponyms of those hypernyms are gathered. Multi-word lemmas (names
    containing "_" or "-") and the query word itself are dropped.

    Args:
        word: the input word
        distance_to_hypernym: how many entries of each synset's direct
            hypernym list to use (it slices `synset.hypernyms()`); the default
            1 means only the first hypernym. Despite the name, it does not
            climb further up the hierarchy.

    Returns:
        A list of unique, lowercased lemma names in no particular order.
        NOTE(review): the hyponyms of a hypernym include the query synset
        itself, so single-word synonyms of `word` are returned as well --
        confirm this is intended.
    '''
    # Compare in lowercase so a differently-cased lemma of the query word
    # cannot slip into its own sister list.
    target = word.lower()
    sister_terms = set()
    for synset in wn.synsets(word, "n"):
        for hypernym in synset.hypernyms()[:distance_to_hypernym]:
            for sister_synset in hypernym.hyponyms():
                for lemma in sister_synset.lemmas():
                    name = lemma.name().lower()
                    # Keep single tokens only: no collocations or hyphenated forms.
                    if "_" in name or "-" in name:
                        continue
                    if name != target:
                        sister_terms.add(name)
    return list(sister_terms)


# def test_sister_terms():
#     for k in range(1,6):
#         print(k, get_sister_terms('dog', k))
#         print()


# Build one row per BLESS concept holding its gold hypernyms, its gold
# coordinate terms, and the sister terms proposed by WordNet, then report the
# concepts whose WordNet sisters overlap with their gold hypernyms.
query_relation = ['coord', 'hyper']
df_bless = read_data(path = '../data/BLESS/bless.csv')

# NOTE(review): the WordNet lookup receives the raw group key; whitespace is
# only stripped from `sub_label` afterwards.
df = pd.DataFrame([
    {
        'sub_label': concept,
        'obj_label': group.query("relation == 'hyper'")['obj_label'].tolist(),
        'sub_sister': group.query("relation == 'coord'")['obj_label'].tolist(),
        'sub_sister_wn': get_sister_terms(concept, 6),
        'relation': 'IsA',
    }
    for concept, group in df_bless.groupby('sub_label')
])
df['sub_label'] = [label.strip() for label in df['sub_label']]

df.head()

# Print every concept for which at least one WordNet sister term is also
# listed as one of its hypernyms.
columns = ['sub_label', 'sub_sister_wn', 'obj_label']
for concept, wn_sisters, hypernyms in df[columns].itertuples(index=False, name=None):
    shared = set(wn_sisters) & set(hypernyms)
    if not shared:
        continue
    print("sub_label:", concept)
    print(wn_sisters, hypernyms)
    print(len(shared), shared)
    print('-' * 40)
# df['shared_']
# # df['obj_label'] = df['obj_label'].apply(lambda x: x + [pluralize(word) for word in x ])
# df['masked_sentences'] = df['sub_label'].apply(lambda x: [f"{_get_article(x)} {x}  is a [MASK].", f"{_get_article(x)} {x} is an [MASK]."])
# df['sub_label_pl'] = df['sub_label'].apply(lambda x: pluralize(x))
# display(df.head())


coord    3602
hyper    1279
Name: relation, dtype: int64

sub_label: beetle
['bug', 'earwig', 'queen', 'ephemeral', 'mallet', 'trichopteron', 'dipteran', 'trichopteran', 'percussor', 'odonate', 'defoliator', 'sledgehammer', 'termite', 'flea', 'telsontail', 'clawhammer', 'sledge', 'lepidopteran', 'thysanopter', 'ephemeropteran', 'thysanopteron', 'hymenopter', 'homopteran', 'plecopteran', 'proturan', 'pupa', 'neuropteron', 'hemipteron', 'worker', 'dipteron', 'gallfly', 'plessor', 'louse', 'hymenopteron', 'imago', 'lepidopteron', 'pollinator', 'thysanuron', 'ephemerid', 'stonefly', 'maul', 'orthopteron', 'neuropteran', 'metabola', 'mecopteran', 'ephemeron', 'springtail', 'hymenopteran', 'orthopteran', 'plexor', 'hemipteran', 'holometabola', 'collembolan', 'phasmid'] ['animal', 'arthropod', 'bug', 'creature', 'insect', 'invertebrate']
1 {'bug'}
----------------------------------------
sub_label: bowl
['cup', 'lapful', 'vessel', 'dispenser', 'thimbleful', 'tower', 'bathtub', 'cartload', 'balcony', 'heave', 'basket', 'wiffle', 'depression', 'corner

In [18]:
# Print the WordNet sister terms of 'yacht' that are also among its gold
# hypernym labels.
yacht_rows = df.query("sub_label=='yacht'")

# Each `obj_label` cell is itself a list of labels, so flatten the cells into
# one list of individual terms before testing membership.
obj_labels = [label for labels in yacht_rows['obj_label'] for label in labels]
obj_label_set = set(obj_labels)

# Each `sub_sister_wn` cell is likewise a list; compare term by term rather
# than whole list against whole list (which would never match).
for sister_terms in yacht_rows['sub_sister_wn']:
    for x in sister_terms:
        if x in obj_label_set:
            print(x)