
# Hypernymy Suite
HypernymySuite is a tool for evaluating some hypernymy detection modules. Its predominant focus is reproducing the results for the following paper.

Stephen Roller, Douwe Kiela, and Maximilian Nickel. 2018. Hearst Patterns Revisited: Automatic Hypernym Detection from Large Text Corpora. ACL. (arXiv)
https://github.com/facebookresearch/hypernymysuite 

In [5]:
# ! rm ../data/hypernymysuite/data/*.json
# !scp -r ../data/hypernymysuite/data/hypernymysuite spartan:~/cogsci/DAP/data/
# !rm -rf ../data/hypernymysuite/data/hypernymsuite/*
# Recursively copy the local `hypernymsuite` output directory (the directory the
# conversion loop in this notebook writes its IsA.jsonl files into) to
# ~/cogsci/DAP/data/ on the `spartan` host. Run this only after the conversion
# cell has produced the files.
!scp -r ../data/hypernymysuite/data/hypernymsuite spartan:~/cogsci/DAP/data/

IsA.jsonl                                     100%  949KB  51.6MB/s   00:00    
.DS_Store                                     100% 6148     2.0MB/s   00:00    
IsA.jsonl                                     100% 1017KB  67.0MB/s   00:00    
IsA.jsonl                                     100% 1499KB  35.6MB/s   00:00    
IsA-checkpoint.jsonl                          100%   36KB  11.9MB/s   00:00    
IsA.jsonl                                     100% 2847KB  79.1MB/s   00:00    
IsA.jsonl                                     100% 1700KB  75.7MB/s   00:00    
IsA-checkpoint.jsonl                          100%  127KB  36.4MB/s   00:00    
IsA.jsonl                                     100% 7442KB  94.7MB/s   00:00    


In [1]:
def save_dict_to_json(examples, output_path):
    '''
    Write `examples` to `output_path` as JSON Lines: one JSON object per line.

    examples: a list of dicts (each one is serialised with json.dump)
    output_path: path of the file to write; it is opened in 'w' mode, so any
        existing content is replaced

    Prints a one-line summary with the path and the number of examples.
    '''
    with open(output_path, 'w') as handle:
        for record in examples:
            # Serialise straight into the file handle, then terminate the line.
            json.dump(record, handle)
            handle.write("\n")
        print(f"save {output_path} with {len(examples)} lines")

def _get_article(word):
    
    if word[0] in ['a', 'e', 'i', 'o', 'u']:
        return 'an'.capitalize()
    return 'a'.capitalize()

In [2]:
import os 
import json 
from collections import defaultdict
import pandas as pd
from pathlib import Path
import string
from inflection import pluralize
from util_wordnet import get_sister_terms

def add_plural(word):
    '''
    Return `word` together with its plural form.

    The plural comes from `inflection.pluralize`. When pluralising leaves the
    word unchanged, only the word itself is returned so the list has no
    duplicate.

    word: str
    returns: [word, plural] or [word]
    '''
    plural_form = pluralize(word)
    if plural_form == word:
        return [word]
    return [word, plural_form]


def merge_multiple_labels(df, relations, output_path):
    '''
    Collapse the rows of `df` into one multi-label example per `sub_label`.

    df: DataFrame with `relation`, `sub_label` and `obj_label` columns. Each
        `obj_label` cell is indexed with [0], so it is presumably a list whose
        first element is the label (verify against the caller).
    relations: list of relation names; rows with any other relation are dropped
    output_path: not used inside this function

    returns: DataFrame with one row per `sub_label` and the columns
        `sub_label`, `sub_sister`, `obj_label`, `masked_sentences`
    '''
    selected = df.query(f"relation in {relations}")
    merged = []
    for hyponym, rows in selected.groupby(by='sub_label'):
        article = _get_article(hyponym)
        merged.append({
            'sub_label': hyponym,
            'sub_sister': get_sister_terms(hyponym, distance_to_hypernym=6),
            # Only the first element of each row's obj_label is kept.
            'obj_label': [labels[0] for labels in rows['obj_label'].values],
            # Two prompts so the masked token can follow either "a" or "an".
            'masked_sentences': [
                f"{article} {hyponym} is a [MASK].",
                f"{article} {hyponym} is an [MASK].",
            ],
        })
    return pd.DataFrame(merged)

def remove_noisy_examples(examples):
    '''
    Drop examples whose subject label or any object label contains punctuation.

    A label counts as noisy when it contains at least one character from
    `string.punctuation` (e.g. 'a.w.o.l.', 'm-34', '.hack//sign'). Every entry
    of `obj_label` is checked, not just the first, so multi-label examples are
    cleaned as well. An example with an empty `obj_label` list is judged on
    its `sub_label` alone.

    examples: list of dicts with a 'sub_label' string and an 'obj_label' list
        of strings
    returns: new list with the clean examples in their original order; each
        dropped example is printed once for inspection
    '''
    # Build the lookup once instead of a translation table per example.
    punctuation = set(string.punctuation)

    def _has_punctuation(text):
        # True when `text` contains at least one punctuation character.
        return any(char in punctuation for char in text)

    clean_examples = []
    for example in examples:
        labels = [example['sub_label'], *example['obj_label']]
        if any(_has_punctuation(label) for label in labels):
            print(example)
            continue
        clean_examples.append(example)
    return clean_examples


# Convert every HypernymySuite *.tsv file into <data_dir>/hypernymsuite/<DATASET>/IsA.jsonl.
data_dir = '../data/hypernymysuite/data'
add_plural_ground_truth = False #True
multi_label = False #True
p = Path(data_dir)
# pathlib returns glob matches in no particular order; sort them so datasets
# are processed (and logged) in the same order on every run.
paths = sorted(p.glob('*.tsv'))
for path in paths:
    # These two datasets are skipped.
    if 'hyperlex_rnd' in path.stem or 'wbless' in path.stem: continue
    print(path)
    dataset_name = path.stem.upper()
    out_dir = f"{path.parents[0]}/hypernymsuite/{dataset_name}/"
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    out_path = out_dir + "IsA.jsonl"

    # Keep positive pairs only; when a `relation` column exists, keep 'hyper' rows only.
    df = pd.read_csv(path, sep='\t').query("label==True")
    if 'relation' in df.columns:
        df = df.query("relation=='hyper'")
    df = df.drop_duplicates().reset_index(drop=True)

    # Two prompts per hyponym so the masked token can follow either "a" or "an".
    # The column is selected by name: positional `row[0]` on a label-indexed
    # Series is deprecated in recent pandas.
    df['masked_sentences'] = df['word1'].apply(
        lambda word: [
            f"{_get_article(word)} {word} is a [MASK].",
            f"{_get_article(word)} {word} is an [MASK].",
        ]
    )

    word_pairs = [name for name, group in df.groupby(['word1', 'word2'])]
    hyper = set(df['word2'])

    df = df.rename(columns={'word1': 'sub_label', 'word2': 'obj_label'})
    print(f"#Hypo {len(set(df['sub_label']))}")
    print(f"#Hyper {len(hyper)}")
    print(f"#Hypo-Hyper pairs:",len(word_pairs) )

    # uuid is the row position after reset_index above.
    df['uuid'] = df.index
    df['relation'] = 'IsA'

    # obj_label is always stored as a list, optionally extended with the plural form.
    if add_plural_ground_truth:
        df['obj_label'] = df['obj_label'].apply(lambda x: add_plural(x))
    else:
        df['obj_label'] = df['obj_label'].apply(lambda x: [x])

    # Sister terms come from the project helper; what distance_to_hypernym=6
    # means is defined in util_wordnet, not here.
    df['sub_sister'] = df['sub_label'].apply(lambda x: get_sister_terms(x, distance_to_hypernym=6))

    if not multi_label:
        # One example per (hyponym, hypernym) pair.
        df = df[['sub_label', 'obj_label', 'relation', 'masked_sentences', 'sub_sister', 'uuid']]
        display(df.head())
        examples = df.to_dict(orient='records')
    else:
        # One example per hyponym, with all of its hypernyms merged.
        examples = merge_multiple_labels(df, ['IsA'],output_path=out_path ).to_dict(orient="records")

    # Only SHWARTZ gets the punctuation-based cleaning.
    if dataset_name == 'SHWARTZ':
        examples = remove_noisy_examples(examples)
    save_dict_to_json(examples=examples, output_path=out_path)
    print("-"*80)
    print()

../data/hypernymysuite/data/bibless.tsv
#Hypo 192
#Hyper 108
#Hypo-Hyper pairs: 834


Unnamed: 0,sub_label,obj_label,relation,masked_sentences,sub_sister,uuid
0,pheasant,[game],IsA,"[A pheasant is a [MASK]., A pheasant is an [MA...","[partridge, quail, grouse]",0
1,pheasant,[bird],IsA,"[A pheasant is a [MASK]., A pheasant is an [MA...","[partridge, quail, grouse]",1
2,pheasant,[vertebrate],IsA,"[A pheasant is a [MASK]., A pheasant is an [MA...","[partridge, quail, grouse]",2
3,pheasant,[creature],IsA,"[A pheasant is a [MASK]., A pheasant is an [MA...","[partridge, quail, grouse]",3
4,sparrow,[creature],IsA,"[A sparrow is a [MASK]., A sparrow is an [MASK].]","[tyrannid, lyrebird, wren, dunnock, oscine, br...",4


save ../data/hypernymysuite/data/hypernymsuite/BIBLESS/IsA.jsonl with 834 lines
--------------------------------------------------------------------------------

../data/hypernymysuite/data/bless.tsv
#Hypo 200
#Hyper 132
#Hypo-Hyper pairs: 1337


Unnamed: 0,sub_label,obj_label,relation,masked_sentences,sub_sister,uuid
0,goose,[waterbird],IsA,"[A goose is a [MASK]., A goose is an [MASK].]","[volaille, chicken, wally, meshuggeneh, bozo, ...",0
1,robin,[passerine],IsA,"[A robin is a [MASK]., A robin is an [MASK].]","[ouzel, fieldfare, solitaire, throstle, nighti...",1
2,catfish,[fish],IsA,"[A catfish is a [MASK]., A catfish is an [MASK].]","[snakefish, mudcat, salmonid, smelt, moonfish,...",2
3,musket,[firearm],IsA,"[A musket is a [MASK]., A musket is an [MASK].]","[arquebus, firelock, hagbut, flintlock, harque...",3
4,lemon,[produce],IsA,"[A lemon is a [MASK]., A lemon is an [MASK].]","[construction, covering, extra, fixture, squar...",4


save ../data/hypernymysuite/data/hypernymsuite/BLESS/IsA.jsonl with 1337 lines
--------------------------------------------------------------------------------

../data/hypernymysuite/data/eval.tsv
#Hypo 623
#Hyper 350
#Hypo-Hyper pairs: 957


Unnamed: 0,sub_label,obj_label,relation,masked_sentences,sub_sister,uuid
0,accident,[error],IsA,"[An accident is a [MASK]., An accident is an [...","[collapse, eventuality, stroke, concomitant, a...",0
1,accident,[mistake],IsA,"[An accident is a [MASK]., An accident is an [...","[collapse, eventuality, stroke, concomitant, a...",1
2,action,[event],IsA,"[An action is a [MASK]., An action is an [MASK].]","[emergence, merchantability, unification, suit...",2
3,action,[work],IsA,"[An action is a [MASK]., An action is an [MASK].]","[emergence, merchantability, unification, suit...",3
4,actor,[person],IsA,"[An actor is a [MASK]., An actor is an [MASK].]","[yearner, dupe, nonresident, welcomer, romanti...",4


save ../data/hypernymysuite/data/hypernymsuite/EVAL/IsA.jsonl with 957 lines
--------------------------------------------------------------------------------

../data/hypernymysuite/data/eval.revisit_hypernym.tsv
#Hypo 1020
#Hyper 577
#Hypo-Hyper pairs: 1766


Unnamed: 0,sub_label,obj_label,relation,masked_sentences,sub_sister,uuid
0,ice,[solid],IsA,"[An ice is a [MASK]., An ice is an [MASK].]","[ganja, makeweight, lollipop, finding, icing, ...",0
1,pan,[container],IsA,"[A pan is a [MASK]., A pan is an [MASK].]","[blarina, dasyprocta, leontocebus, suricata, m...",1
2,touch,[act],IsA,"[A touch is a [MASK]., A touch is an [MASK].]","[smidgen, emergence, spray, raptus, diddlyshit...",2
3,sex,[class],IsA,"[A sex is a [MASK]., A sex is an [MASK].]","[levity, sadness, stab, unconcern, aliveness, ...",3
4,drive,[event],IsA,"[A drive is a [MASK]., A drive is an [MASK].]","[potence, intractableness, cellularity, autocu...",4


save ../data/hypernymysuite/data/hypernymsuite/EVAL.REVISIT_HYPERNYM/IsA.jsonl with 1766 lines
--------------------------------------------------------------------------------

../data/hypernymysuite/data/leds.tsv
#Hypo 1142
#Hyper 406
#Hypo-Hyper pairs: 1385


Unnamed: 0,sub_label,obj_label,relation,masked_sentences,sub_sister,uuid
0,etching,[art],IsA,"[An etching is a [MASK]., An etching is an [MA...","[gravure, linecut, photoengraving, woodcut, ha...",0
1,panda,[vertebrate],IsA,"[A panda is a [MASK]., A panda is an [MASK].]","[raccoon, ringtail, potto, bassarisk, kinkajou...",1
2,decrease,[change],IsA,"[A decrease is a [MASK]., A decrease is an [MA...","[evolution, margin, inadequacy, processing, po...",2
3,dinghy,[boat],IsA,"[A dinghy is a [MASK]., A dinghy is an [MASK].]","[cockleshell, dory, canoe, skiff, coracle, gig...",3
4,islander,[inhabitant],IsA,"[An islander is a [MASK]., An islander is an [...","[plainsman, asiatic, landsman, australian, naz...",4


save ../data/hypernymysuite/data/hypernymsuite/LEDS/IsA.jsonl with 1385 lines
--------------------------------------------------------------------------------

../data/hypernymysuite/data/shwartz.tsv
#Hypo 11375
#Hyper 1312
#Hypo-Hyper pairs: 13104


Unnamed: 0,sub_label,obj_label,relation,masked_sentences,sub_sister,uuid
0,golo,[river],IsA,"[A golo is a [MASK]., A golo is an [MASK].]",[],0
1,kerrobert,[town],IsA,"[A kerrobert is a [MASK]., A kerrobert is an [...",[],1
2,geometria,[film],IsA,"[A geometria is a [MASK]., A geometria is an [...",[],2
3,evaporation,[place],IsA,"[An evaporation is a [MASK]., An evaporation i...","[thawing, elution, beneficiation, vapor, freez...",3
4,bonehead,[band],IsA,"[A bonehead is a [MASK]., A bonehead is an [MA...","[berk, blockhead, dunderhead, muttonhead, shit...",4


{'sub_label': 'respect.', 'obj_label': ['magazine'], 'relation': 'IsA', 'masked_sentences': ['A respect. is a [MASK].', 'A respect. is an [MASK].'], 'sub_sister': [], 'uuid': 50}
{'sub_label': 'a.w.o.l.', 'obj_label': ['album'], 'relation': 'IsA', 'masked_sentences': ['An a.w.o.l. is a [MASK].', 'An a.w.o.l. is an [MASK].'], 'sub_sister': [], 'uuid': 251}
{'sub_label': '7-tease', 'obj_label': ['album'], 'relation': 'IsA', 'masked_sentences': ['A 7-tease is a [MASK].', 'A 7-tease is an [MASK].'], 'sub_sister': [], 'uuid': 370}
{'sub_label': "sant'eustachio", 'obj_label': ['church'], 'relation': 'IsA', 'masked_sentences': ["A sant'eustachio is a [MASK].", "A sant'eustachio is an [MASK]."], 'sub_sister': [], 'uuid': 505}
{'sub_label': 'm-34', 'obj_label': ['road'], 'relation': 'IsA', 'masked_sentences': ['A m-34 is a [MASK].', 'A m-34 is an [MASK].'], 'sub_sister': [], 'uuid': 523}
{'sub_label': 'm-39', 'obj_label': ['road'], 'relation': 'IsA', 'masked_sentences': ['A m-39 is a [MASK].', 

save ../data/hypernymysuite/data/hypernymsuite/SHWARTZ/IsA.jsonl with 12994 lines
--------------------------------------------------------------------------------



# Debug SHWARTZ (noisy data)

The `sub_label` values are noisy:
shaadi.com
.hack//sign
risk/reward
dmz//38
s.w.a.t.
f.l.m.
s.r.o.
i.o.u.s.a.
c.r.a.z.y.
brother/sister
m.sc
m.p.g.
m.b.b.s.
d.s.
d.p.o.
n.i.b.
t.n.t.
shaadi.com
.hack//sign
risk/reward
dmz//38
s.w.a.t.
f.l.m.
s.r.o.
i.o.u.s.a.
c.r.a.z.y.
brother/sister
m.sc
m.p.g.
m.b.b.s.
d.s.
d.p.o.
n.i.b.
t.n.t.

In [23]:
import os 
import json 
from collections import defaultdict
import pandas as pd
from pathlib import Path
from inflection import pluralize
from util_wordnet import get_sister_terms
import string

def add_plural(word):
    '''
    Return `word` together with its plural form.

    The plural comes from `inflection.pluralize`. When pluralising leaves the
    word unchanged, only the word itself is returned so the list has no
    duplicate.

    word: str
    returns: [word, plural] or [word]
    '''
    plural_form = pluralize(word)
    if plural_form == word:
        return [word]
    return [word, plural_form]

def merge_multiple_labels(df, relations, output_path):
    '''
    Collapse the rows of `df` into one multi-label example per `sub_label`.

    This variant does not attach sister terms.

    df: DataFrame with `relation`, `sub_label` and `obj_label` columns. Each
        `obj_label` cell is indexed with [0], so it is presumably a list whose
        first element is the label (verify against the caller).
    relations: list of relation names; rows with any other relation are dropped
    output_path: not used inside this function

    returns: DataFrame with one row per `sub_label` and the columns
        `sub_label`, `obj_label`, `masked_sentences`
    '''
    selected = df.query(f"relation in {relations}")
    merged = []
    for hyponym, rows in selected.groupby(by='sub_label'):
        article = _get_article(hyponym)
        merged.append({
            'sub_label': hyponym,
            # Only the first element of each row's obj_label is kept.
            'obj_label': [labels[0] for labels in rows['obj_label'].values],
            # Two prompts so the masked token can follow either "a" or "an".
            'masked_sentences': [
                f"{article} {hyponym} is a [MASK].",
                f"{article} {hyponym} is an [MASK].",
            ],
        })
    return pd.DataFrame(merged)






# Debug run: rebuild only the SHWARTZ dataset, in multi-label form, with noisy labels removed.
data_dir = '../data/hypernymysuite/data'
add_plural_ground_truth = False #True
multi_label = True
p = Path(data_dir)
# pathlib returns glob matches in no particular order; sort them so datasets
# are visited (and logged) in the same order on every run.
paths = sorted(p.glob('*.tsv'))
for path in paths:
    # These two datasets are skipped.
    if 'hyperlex_rnd' in path.stem or 'wbless' in path.stem: continue
    print(path)
    dataset_name = path.stem.upper()
    print(dataset_name)
    # Every dataset is logged above, but only SHWARTZ is converted.
    if dataset_name !='SHWARTZ': continue
    out_dir = f"{path.parents[0]}/hypernymsuite/{dataset_name}/"
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    out_path = out_dir + "IsA.jsonl"

    # Keep positive pairs only; when a `relation` column exists, keep 'hyper' rows only.
    df = pd.read_csv(path, sep='\t').query("label==True")
    if 'relation' in df.columns:
        df = df.query("relation=='hyper'")
    df = df.drop_duplicates().reset_index(drop=True)

    # Two prompts per hyponym so the masked token can follow either "a" or "an".
    # The column is selected by name: positional `row[0]` on a label-indexed
    # Series is deprecated in recent pandas.
    df['masked_sentences'] = df['word1'].apply(
        lambda word: [
            f"{_get_article(word)} {word} is a [MASK].",
            f"{_get_article(word)} {word} is an [MASK].",
        ]
    )

    word_pairs = [name for name, group in df.groupby(['word1', 'word2'])]
    hyper = set(df['word2'])

    df = df.rename(columns={'word1': 'sub_label', 'word2': 'obj_label'})
    print(f"#Hypo {len(set(df['sub_label']))}")
    print(f"#Hyper {len(hyper)}")
    print(f"#Hypo-Hyper pairs:",len(word_pairs) )

    # uuid is the row position after reset_index above.
    df['uuid'] = df.index
    df['relation'] = 'IsA'

    # obj_label is always stored as a list, optionally extended with the plural form.
    if add_plural_ground_truth:
        df['obj_label'] = df['obj_label'].apply(lambda x: add_plural(x))
    else:
        df['obj_label'] = df['obj_label'].apply(lambda x: [x])

    if not multi_label:
        # One example per (hyponym, hypernym) pair.
        df = df[['sub_label', 'obj_label', 'relation', 'masked_sentences', 'uuid']]
        display(df.head())
        examples = df.to_dict(orient='records')
    else:
        # One example per hyponym, with all of its hypernyms merged.
        examples = merge_multiple_labels(df, ['IsA'],output_path=out_path ).to_dict(orient="records")
    # Cleaning is unconditional here because only SHWARTZ reaches this point.
    examples = remove_noisy_examples(examples)
    save_dict_to_json(examples=examples, output_path=out_path)
    print("-"*80)
    print()


# a_string = '!hi. wh?at is the weat[h]er lik?e.'
# new_string = a_string.translate(str.maketrans('', '', string.punctuation))

# df.query("{}")
# Sanity check on the cleaned examples: print any subject label that still
# contains '/' or '.'.
df_new = pd.DataFrame(examples)
sub_labels = set(df_new['sub_label'])
for x in sub_labels:
    if any(marker in x for marker in ('/', '.')):
        print(x)

../data/hypernymysuite/data/eval.tsv
EVAL
../data/hypernymysuite/data/bibless.tsv
BIBLESS
../data/hypernymysuite/data/bless.tsv
BLESS
../data/hypernymysuite/data/leds.tsv
LEDS
../data/hypernymysuite/data/eval.revisit_hypernym.tsv
EVAL.REVISIT_HYPERNYM
../data/hypernymysuite/data/shwartz.tsv
SHWARTZ
#Hypo 11375
#Hyper 1312
#Hypo-Hyper pairs: 13104
{'sub_label': '.hack//sign', 'obj_label': ['anime'], 'masked_sentences': ['A .hack//sign is a [MASK].', 'A .hack//sign is an [MASK].']}
{'sub_label': '.info', 'obj_label': ['magazine'], 'masked_sentences': ['A .info is a [MASK].', 'A .info is an [MASK].']}
{'sub_label': '3oh!3', 'obj_label': ['band'], 'masked_sentences': ['A 3oh!3 is a [MASK].', 'A 3oh!3 is an [MASK].']}
{'sub_label': '7-tease', 'obj_label': ['album'], 'masked_sentences': ['A 7-tease is a [MASK].', 'A 7-tease is an [MASK].']}
{'sub_label': 'a*teens', 'obj_label': ['band'], 'masked_sentences': ['An a*teens is a [MASK].', 'An a*teens is an [MASK].']}
{'sub_label': 'a.w.o.l.', 'o

In [15]:

# NOTE(review): in the part of the notebook visible here, `new_string` is only
# assigned in commented-out code, so this cell raises NameError on a fresh
# kernel unless the name is defined elsewhere.
new_string

'hi what is the weather like'

In [None]:
import re
import json

# Scratch experiment: split a sample noisy label into word-like tokens and
# single punctuation marks. The result is only stored in `l`.
l = re.findall(r"[\w']+|[.,!?;]", ".hack//signs")

# Reload the generated SHWARTZ file to inspect it.
filepath = '../data/hypernymysuite/data/hypernymsuite/SHWARTZ/IsA.jsonl'
with open(filepath, 'r', encoding='utf-8') as fin:
    # Each non-empty line is one JSON document. Parse it as data with
    # json.loads: eval would execute the line as Python code and fails on
    # JSON literals such as true/false/null.
    data = [json.loads(line) for line in fin if line.strip()]
df = pd.DataFrame(data)
df.head()
    