In [1]:
import pandas as pd 
import json 
import copy
import re 
from pathlib import Path

from collections import Counter, defaultdict
from copy import deepcopy

pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth',500)
from tqdm import tqdm
tqdm.pandas()

import os 
from pathlib import Path
from inflection import pluralize, singularize


In [34]:
# Copy the hypernym and coordinate IsA.jsonl files to the remote host `spartan`.
# NOTE(review): both files are written by cells further down in this notebook, so this
# cell presumably has to be run after them — confirm the intended execution order.
!scp ../data/BLESS/hypernym/IsA.jsonl spartan:/home/chunhua/cogsci/DAP/data/BLESS/hypernym/IsA.jsonl
!scp ../data/BLESS/coordinate/IsA.jsonl spartan:/home/chunhua/cogsci/DAP/data/BLESS/coordinate//IsA.jsonl
# !ls ../data/BLESS/hypernym

IsA.jsonl                                     100% 3422KB  97.9MB/s   00:00    
IsA.jsonl                                     100%  538KB  60.8MB/s   00:00    


# Data exploration

In [2]:
def read_data(path = '../data/BLESS/bless.csv'):
    """
    Load the BLESS triples and keep only the coordinate / hypernym rows.

    path: location of the ';'-separated file; it is read without a header row and
          its three columns are named sub_label, obj_label and relation.
    return: the filtered DataFrame (row index of the full file is preserved).
            The per-relation row counts are displayed as a side effect.
    """
    column_names = ['sub_label', 'obj_label', 'relation']
    kept_relations = ['coord', 'hyper']

    triples = pd.read_csv(path, sep=';', names=column_names)
    selected = triples[triples['relation'].isin(kept_relations)]

    display(selected['relation'].value_counts())
    return selected

# Helper functions

In [3]:
def _get_article(word):
    if word[0] in ['a', 'e', 'i', 'o', 'u']:
        return 'an'
    return 'a'


def save_dict_to_json(examples, output_path):
    '''
    write a list of dicts to output_path in JSON Lines form (one dict per line)
    examples: a list of dicts
    output_path: path of the file to create or overwrite
    '''
    with open(output_path, 'w') as fout:
        for record in examples:
            json.dump(record, fout)
            fout.write("\n")
        # report how many records were written
        print(f"save {output_path} with {len(examples)} lines")
        
def add_period_at_the_end_of_sentence(sentence):
    '''
    make sure a sentence ends with a period
    sentence: a string
    return: the sentence unchanged if it is empty or already ends with '.',
            otherwise the sentence with '.' appended; the result is always a string
    '''
    # Previously the "already terminated" branch returned [sentence] (a list),
    # so the return type depended on the input; both branches now return a str.
    if not sentence or sentence.endswith('.'):
        return sentence
    return sentence + '.'


def add_plural(word):
    '''
    return the word(s) together with their plural form(s)

    word: a single string, or a list of strings
    return: for a string, [word, plural] (or just [word] when the plural equals the word);
            for a list, a NEW list holding the original words followed by every plural
            that differs from its singular. The input list is left untouched.
            Any other input type falls through and returns None.

    For example:
    add_plural('apple')
    add_plural(['apple', 'pear', 'cherry'])
    '''
    if isinstance(word, str):
        word_plural = pluralize(word)
        return [word, word_plural] if word_plural != word else [word]
    elif isinstance(word, list):
        # Copy first: appending to the caller's list would both mutate their data
        # and grow the very list being iterated over.
        word_plurals = list(word)
        for w in word:
            w_plural = pluralize(w)
            if w_plural != w:
                word_plurals.append(w_plural)
        return word_plurals

# Process data for anchor/coordinate evaluation

In [4]:
# Build the coordinate (co-hyponym) probing set: one "<subject> and [MASK]." prompt per row.
df_coord = read_data(path = '../data/BLESS/bless.csv').query("relation == 'coord'")
multi_label = False  # True: one row per subject with the list of all gold co-hyponyms; False: one row per pair

if multi_label:
    dfc = pd.DataFrame([
        {'sub_label': name, 'obj_label': list(group['obj_label'].values), 'relation': 'Coordinate'}
        for name, group in df_coord.groupby('sub_label')
    ])
else:
    dfc = df_coord.reset_index(drop = True)

dfc['sub_label'] = dfc['sub_label'].apply(lambda x: x.strip())
dfc['masked_sentences'] = dfc['sub_label'].apply(lambda x: [f"{x} and [MASK]."])
# dfc['sub_label_pl'] = dfc['sub_label'].apply(lambda x: pluralize(x))
# gold labels are always stored as a list, also in the single-label setting
dfc['obj_label'] = dfc['obj_label'].apply(lambda x: [x] if not isinstance(x,list) else x)
dfc['sub_sister'] = dfc['obj_label']
dfc['uuid'] = dfc.index + 1
display(dfc.head())
# df_coord['obj_label'] = df['obj_label'].apply(lambda x:  add_plural(x))

output_path = '../data/BLESS/coordinate/IsA.jsonl'
# Create the target directory here so the cell also works on a fresh checkout,
# where ../data/BLESS/coordinate does not exist yet.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
save_dict_to_json(dfc.to_dict(orient='records'), output_path=output_path)

# dfn = pd.read_csv(output_path)

# dfn = pd.read_csv(output_path)

coord    3602
hyper    1279
Name: relation, dtype: int64

Unnamed: 0,sub_label,obj_label,relation,masked_sentences,sub_sister,uuid
0,alligator,[crocodile],coord,[alligator and [MASK].],[crocodile],1
1,alligator,[frog],coord,[alligator and [MASK].],[frog],2
2,alligator,[lizard],coord,[alligator and [MASK].],[lizard],3
3,alligator,[snake],coord,[alligator and [MASK].],[snake],4
4,alligator,[toad],coord,[alligator and [MASK].],[toad],5


save ../data/BLESS/coordinate/IsA.jsonl with 3602 lines


In [8]:
# -p: do not fail when the directory already exists (e.g. when the notebook is re-run)
!mkdir -p ../data/BLESS/coordinate

# Process data for comparing SAP and DAP (true anchors)

In [5]:
def insert_anchors_into_def_dap(x, subj_anchors):
    """
    Build the definitional double-anchor prompts for a subject.

    x: the subject word
    subj_anchors: iterable of anchor words to pair with the subject
    return: a flat list with two prompts per anchor, in anchor order:
            "<article> <x> or <anchor> is a [MASK]." followed by the "is an" variant.
            The article is chosen for x only.
    """
    return [
        f"{_get_article(x)} {x} or {subj_anchor} is {mask_article} [MASK]."
        for subj_anchor in subj_anchors
        for mask_article in ('a', 'an')
    ]

def insert_anchors_into_lsp_dap(x, subj_anchors):
    """
    Instantiate every pattern in the module-level list lsp_dap for each anchor.

    x: text substituted for the [X] placeholder
    subj_anchors: iterable of texts substituted, one at a time, for [Z]
    return: a flat list of prompts, grouped by anchor and ordered like lsp_dap
            within each group; the [Y] placeholder becomes [MASK].
    """
    return [
        template.replace("[X]", x).replace("[Z]", anchor).replace("[Y]", "[MASK]")
        for anchor in subj_anchors
        for template in lsp_dap
    ]

def get_def_sap(x):
    """
    Instantiate every pattern in the module-level list lsp_sap for one subject.

    x: text substituted for the [X] placeholder
    return: list of prompts in lsp_sap order, with [Y] replaced by [MASK]
    """
    return [
        template.replace("[X]", x).replace("[Y]", "[MASK]")
        for template in lsp_sap
    ]


# Lexico-syntactic patterns with a single anchor: [X] = subject, [Y] = slot to be masked.
lsp_sap = [
    "[Y] such as [X].",
    "[Y], including [X].",
    "[Y], especially [X].",
    "[X] or other [Y].",
    "[X] and other [Y].",
    "such [Y] as [X].",
]

# The same patterns with a second anchor [Z] next to the subject.
# NOTE(review): the "and other" / "or other" entries appear in the opposite order
# to lsp_sap — confirm nothing downstream pairs the two lists by index.
lsp_dap = [
    "[Y] such as [X] and [Z].",
    "[Y], including [X] and [Z].",
    "[Y], especially [X] and [Z].",
    "[X], [Z] and other [Y].",
    "[X], [Z] or other [Y].",
    "such [Y] as [X] and [Z].",
]

# read the data, fill x into the placeholder and fill Y with [MASK]
# Build the hypernym probing set (SAP vs. DAP with gold anchors):
# read the data, fill x into the placeholder and fill Y with [MASK]
import random

ANCHOR_SEED = 42      # fixed seed so the sampled anchors, and therefore the saved files, are reproducible
MAX_SUBJ_ANCHORS = 5  # at most this many co-hyponyms are sampled as anchors per row
# a private generator keeps the global `random` state untouched
anchor_rng = random.Random(ANCHOR_SEED)

query_relation = ['coord', 'hyper']
df_coord = read_data(path = '../data/BLESS/bless.csv')
multi_label = False

# sub_label -> list of its co-hyponyms ("sisters")
sub_to_sister = {
    name: list(group['obj_label'].values)
    for name, group in df_coord.query("relation == 'coord'").groupby('sub_label')
}

records = []
if multi_label:
    # one row per subject, carrying all of its hypernyms
    for name, group in df_coord.groupby('sub_label'):
        records.append({'sub_label': name,
                        'obj_label': list(group.query("relation == 'hyper'")['obj_label'].values),
                        'sub_sister': list(group.query("relation == 'coord'")['obj_label'].values),
                        'relation': 'IsA'})
else:
    # one row per distinct (subject, hypernym) pair
    for (sub, obj), _ in df_coord.query("relation == 'hyper'").groupby(['sub_label', 'obj_label']):
        records.append({'sub_label': sub,
                        'obj_label': obj,
                        # a subject without coordinate rows gets an empty list instead of None,
                        # which would make the anchor sampling below raise TypeError
                        'sub_sister': sub_to_sister.get(sub, []),
                        'relation': 'IsA'})

df = pd.DataFrame(records)

df['sub_label'] = df['sub_label'].apply(lambda x: x.strip())
df['subj_anchors'] = df['sub_sister'].apply(lambda x: anchor_rng.sample(x, min(MAX_SUBJ_ANCHORS, len(x))))

# Definitional prompts. `masked_sentences*` and `def_*` hold the same prompts under two names.
# (The "is a" prompt used to contain a stray double space before "is".)
df['masked_sentences'] = df['sub_label'].apply(lambda x: [f"{_get_article(x)} {x} is a [MASK].", f"{_get_article(x)} {x} is an [MASK]."])
# rows are addressed by column label; positional x[0] / x[1] on a label-indexed Series is deprecated in pandas
df['masked_sentences_with_subj_anchor'] = df[['sub_label', 'subj_anchors']].apply(lambda row: insert_anchors_into_def_dap(row['sub_label'], row['subj_anchors']), axis=1)
# list(...) gives each column its own list objects rather than aliases of the ones above
df['def_sap'] = df['masked_sentences'].apply(list)
df['def_dap_with_subj_anchor'] = df['masked_sentences_with_subj_anchor'].apply(list)

# Lexico-syntactic prompts are built from the plural forms.
df['sub_label_pl'] = df['sub_label'].apply(lambda x: pluralize(x))
df['subj_anchors_pl'] = df['subj_anchors'].apply(lambda x: [pluralize(word) for word in x])

df['lsp_sap'] =  df['sub_label_pl'].apply(lambda x: get_def_sap(x))
df['lsp_dap_with_subj_anchor'] = df[['sub_label_pl', 'subj_anchors_pl']].apply(lambda row: insert_anchors_into_lsp_dap(row['sub_label_pl'], row['subj_anchors_pl']), axis=1)

# gold labels are always stored as a list, also in the single-label setting
df['obj_label'] = df['obj_label'].apply(lambda x: [x] if not isinstance(x, list) else x)
display(df.head())

output_path = '../data/BLESS/hypernym/IsA.jsonl'
# make sure the target directory exists when running on a fresh checkout
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
save_dict_to_json(df.to_dict(orient='records'), output_path=output_path)
df.to_csv(output_path.replace(".jsonl", ".csv"))

coord    3602
hyper    1279
Name: relation, dtype: int64

Unnamed: 0,sub_label,obj_label,sub_sister,relation,subj_anchors,masked_sentences,masked_sentences_with_subj_anchor,def_sap,def_dap_with_subj_anchor,sub_label_pl,subj_anchors_pl,lsp_sap,lsp_dap_with_subj_anchor
0,acacia,[tree],"[birch, cedar, cypress, elm, oak, pine, poplar, willow]",IsA,"[pine, cedar, poplar, willow, birch]","[an acacia is a [MASK]., an acacia is an [MASK].]","[an acacia or pine is a [MASK]., an acacia or pine is an [MASK]., an acacia or cedar is a [MASK]., an acacia or cedar is an [MASK]., an acacia or poplar is a [MASK]., an acacia or poplar is an [MASK]., an acacia or willow is a [MASK]., an acacia or willow is an [MASK]., an acacia or birch is a [MASK]., an acacia or birch is an [MASK].]","[an acacia is a [MASK]., an acacia is an [MASK].]","[an acacia or pine is a [MASK]., an acacia or pine is an [MASK]., an acacia or cedar is a [MASK]., an acacia or cedar is an [MASK]., an acacia or poplar is a [MASK]., an acacia or poplar is an [MASK]., an acacia or willow is a [MASK]., an acacia or willow is an [MASK]., an acacia or birch is a [MASK]., an acacia or birch is an [MASK].]",acacias,"[pines, cedars, poplars, willows, birches]","[[MASK] such as acacias., [MASK], including acacias., [MASK], especially acacias., acacias or other [MASK]., acacias and other [MASK]., such [MASK] as acacias.]","[[MASK] such as acacias and pines., [MASK], including acacias and pines., [MASK], especially acacias and pines., acacias, pines and other [MASK]., acacias, pines or other [MASK]., such [MASK] as acacias and pines., [MASK] such as acacias and cedars., [MASK], including acacias and cedars., [MASK], especially acacias and cedars., acacias, cedars and other [MASK]., acacias, cedars or other [MASK]., such [MASK] as acacias and cedars., [MASK] such as acacias and poplars., [MASK], including acacia..."
1,alligator,[animal],"[crocodile, frog, lizard, snake, toad, turtle]",IsA,"[frog, turtle, snake, toad, crocodile]","[an alligator is a [MASK]., an alligator is an [MASK].]","[an alligator or frog is a [MASK]., an alligator or frog is an [MASK]., an alligator or turtle is a [MASK]., an alligator or turtle is an [MASK]., an alligator or snake is a [MASK]., an alligator or snake is an [MASK]., an alligator or toad is a [MASK]., an alligator or toad is an [MASK]., an alligator or crocodile is a [MASK]., an alligator or crocodile is an [MASK].]","[an alligator is a [MASK]., an alligator is an [MASK].]","[an alligator or frog is a [MASK]., an alligator or frog is an [MASK]., an alligator or turtle is a [MASK]., an alligator or turtle is an [MASK]., an alligator or snake is a [MASK]., an alligator or snake is an [MASK]., an alligator or toad is a [MASK]., an alligator or toad is an [MASK]., an alligator or crocodile is a [MASK]., an alligator or crocodile is an [MASK].]",alligators,"[frogs, turtles, snakes, toads, crocodiles]","[[MASK] such as alligators., [MASK], including alligators., [MASK], especially alligators., alligators or other [MASK]., alligators and other [MASK]., such [MASK] as alligators.]","[[MASK] such as alligators and frogs., [MASK], including alligators and frogs., [MASK], especially alligators and frogs., alligators, frogs and other [MASK]., alligators, frogs or other [MASK]., such [MASK] as alligators and frogs., [MASK] such as alligators and turtles., [MASK], including alligators and turtles., [MASK], especially alligators and turtles., alligators, turtles and other [MASK]., alligators, turtles or other [MASK]., such [MASK] as alligators and turtles., [MASK] such as alli..."
2,alligator,[beast],"[crocodile, frog, lizard, snake, toad, turtle]",IsA,"[crocodile, frog, toad, snake, lizard]","[an alligator is a [MASK]., an alligator is an [MASK].]","[an alligator or crocodile is a [MASK]., an alligator or crocodile is an [MASK]., an alligator or frog is a [MASK]., an alligator or frog is an [MASK]., an alligator or toad is a [MASK]., an alligator or toad is an [MASK]., an alligator or snake is a [MASK]., an alligator or snake is an [MASK]., an alligator or lizard is a [MASK]., an alligator or lizard is an [MASK].]","[an alligator is a [MASK]., an alligator is an [MASK].]","[an alligator or crocodile is a [MASK]., an alligator or crocodile is an [MASK]., an alligator or frog is a [MASK]., an alligator or frog is an [MASK]., an alligator or toad is a [MASK]., an alligator or toad is an [MASK]., an alligator or snake is a [MASK]., an alligator or snake is an [MASK]., an alligator or lizard is a [MASK]., an alligator or lizard is an [MASK].]",alligators,"[crocodiles, frogs, toads, snakes, lizards]","[[MASK] such as alligators., [MASK], including alligators., [MASK], especially alligators., alligators or other [MASK]., alligators and other [MASK]., such [MASK] as alligators.]","[[MASK] such as alligators and crocodiles., [MASK], including alligators and crocodiles., [MASK], especially alligators and crocodiles., alligators, crocodiles and other [MASK]., alligators, crocodiles or other [MASK]., such [MASK] as alligators and crocodiles., [MASK] such as alligators and frogs., [MASK], including alligators and frogs., [MASK], especially alligators and frogs., alligators, frogs and other [MASK]., alligators, frogs or other [MASK]., such [MASK] as alligators and frogs., [..."
3,alligator,[carnivore],"[crocodile, frog, lizard, snake, toad, turtle]",IsA,"[toad, turtle, lizard, frog, snake]","[an alligator is a [MASK]., an alligator is an [MASK].]","[an alligator or toad is a [MASK]., an alligator or toad is an [MASK]., an alligator or turtle is a [MASK]., an alligator or turtle is an [MASK]., an alligator or lizard is a [MASK]., an alligator or lizard is an [MASK]., an alligator or frog is a [MASK]., an alligator or frog is an [MASK]., an alligator or snake is a [MASK]., an alligator or snake is an [MASK].]","[an alligator is a [MASK]., an alligator is an [MASK].]","[an alligator or toad is a [MASK]., an alligator or toad is an [MASK]., an alligator or turtle is a [MASK]., an alligator or turtle is an [MASK]., an alligator or lizard is a [MASK]., an alligator or lizard is an [MASK]., an alligator or frog is a [MASK]., an alligator or frog is an [MASK]., an alligator or snake is a [MASK]., an alligator or snake is an [MASK].]",alligators,"[toads, turtles, lizards, frogs, snakes]","[[MASK] such as alligators., [MASK], including alligators., [MASK], especially alligators., alligators or other [MASK]., alligators and other [MASK]., such [MASK] as alligators.]","[[MASK] such as alligators and toads., [MASK], including alligators and toads., [MASK], especially alligators and toads., alligators, toads and other [MASK]., alligators, toads or other [MASK]., such [MASK] as alligators and toads., [MASK] such as alligators and turtles., [MASK], including alligators and turtles., [MASK], especially alligators and turtles., alligators, turtles and other [MASK]., alligators, turtles or other [MASK]., such [MASK] as alligators and turtles., [MASK] such as alli..."
4,alligator,[chordate],"[crocodile, frog, lizard, snake, toad, turtle]",IsA,"[frog, snake, turtle, toad, lizard]","[an alligator is a [MASK]., an alligator is an [MASK].]","[an alligator or frog is a [MASK]., an alligator or frog is an [MASK]., an alligator or snake is a [MASK]., an alligator or snake is an [MASK]., an alligator or turtle is a [MASK]., an alligator or turtle is an [MASK]., an alligator or toad is a [MASK]., an alligator or toad is an [MASK]., an alligator or lizard is a [MASK]., an alligator or lizard is an [MASK].]","[an alligator is a [MASK]., an alligator is an [MASK].]","[an alligator or frog is a [MASK]., an alligator or frog is an [MASK]., an alligator or snake is a [MASK]., an alligator or snake is an [MASK]., an alligator or turtle is a [MASK]., an alligator or turtle is an [MASK]., an alligator or toad is a [MASK]., an alligator or toad is an [MASK]., an alligator or lizard is a [MASK]., an alligator or lizard is an [MASK].]",alligators,"[frogs, snakes, turtles, toads, lizards]","[[MASK] such as alligators., [MASK], including alligators., [MASK], especially alligators., alligators or other [MASK]., alligators and other [MASK]., such [MASK] as alligators.]","[[MASK] such as alligators and frogs., [MASK], including alligators and frogs., [MASK], especially alligators and frogs., alligators, frogs and other [MASK]., alligators, frogs or other [MASK]., such [MASK] as alligators and frogs., [MASK] such as alligators and snakes., [MASK], including alligators and snakes., [MASK], especially alligators and snakes., alligators, snakes and other [MASK]., alligators, snakes or other [MASK]., such [MASK] as alligators and snakes., [MASK] such as alligators..."


save ../data/BLESS/hypernym/IsA.jsonl with 1276 lines


# Process data for comparing SAP and DAP (no true anchors)

In [55]:
# Build the subject-level hypernym set (no sampled anchors): one row per subject with
# all of its hypernyms as gold labels and all of its co-hyponyms as sisters.
query_relation = ['coord', 'hyper']
df_coord = read_data(path = '../data/BLESS/bless.csv')

df = pd.DataFrame([
    {'sub_label': name,
     'obj_label': list(group.query("relation == 'hyper'")['obj_label'].values),
     'sub_sister': list(group.query("relation == 'coord'")['obj_label'].values),
     'relation': 'IsA'}
    for name, group in df_coord.groupby('sub_label')
])
df['sub_label'] = df['sub_label'].apply(lambda x: x.strip())
# df['obj_label'] = df['obj_label'].apply(lambda x: x + [pluralize(word) for word in x ])
# (the "is a" prompt used to contain a stray double space before "is")
df['masked_sentences'] = df['sub_label'].apply(lambda x: [f"{_get_article(x)} {x} is a [MASK].", f"{_get_article(x)} {x} is an [MASK]."])
df['sub_label_pl'] = df['sub_label'].apply(lambda x: pluralize(x))
display(df.head())

# distribution of the number of co-hyponyms per subject
df['sub_sister_num'] = df['sub_sister'].apply(lambda x: len(x))
print(Counter(df['sub_sister_num']))
display(df['sub_sister_num'].describe())
output_path = '../data/BLESS/IsA.jsonl'
save_dict_to_json(df.to_dict(orient='records'), output_path=output_path)

coord    3602
hyper    1279
Name: relation, dtype: int64

Unnamed: 0,sub_label,obj_label,sub_sister,relation,masked_sentences,sub_label_pl
0,acacia,[tree],"[birch, cedar, cypress, elm, oak, pine, poplar, willow]",IsA,"[an acacia is a [MASK]., an acacia is an [MASK].]",acacia
1,alligator,"[animal, beast, carnivore, chordate, creature, predator, reptile, vertebrate]","[crocodile, frog, lizard, snake, toad, turtle]",IsA,"[an alligator is a [MASK]., an alligator is an [MASK].]",alligators
2,ambulance,"[artefact, artifact, conveyance, vehicle]","[battleship, bicycle, bike, bomber, bus, car, ferry, fighter, frigate, glider, helicopter, jet, moped, motorbike, motorcycle, scooter, tanker, tractor, train, truck, van, yacht]",IsA,"[an ambulance is a [MASK]., an ambulance is an [MASK].]",ambulances
3,ant,"[animal, arthropod, bug, creature, insect, invertebrate]","[bee, beetle, butterfly, cockroach, cricket, dragonfly, fly, grasshopper, hornet, ladybug, locust, mosquito, moth, silverfish, wasp]",IsA,"[an ant is a [MASK]., an ant is an [MASK].]",ants
4,apple,"[food, fruit, produce]","[apricot, banana, cherry, coconut, cranberry, grape, grapefruit, lemon, lime, peach, pear, pineapple, plum, strawberry]",IsA,"[an apple is a [MASK]., an apple is an [MASK].]",apples


Counter({22: 20, 16: 19, 17: 15, 10: 14, 14: 13, 28: 10, 19: 10, 9: 10, 25: 8, 8: 7, 11: 7, 26: 7, 12: 7, 15: 6, 27: 6, 18: 5, 21: 5, 13: 5, 24: 5, 20: 5, 6: 4, 30: 3, 23: 2, 29: 2, 31: 2, 32: 1, 36: 1, 42: 1})


count    200.000000
mean      18.010000
std        6.767488
min        6.000000
25%       13.000000
50%       17.000000
75%       22.000000
max       42.000000
Name: sub_sister_num, dtype: float64

save ../data/BLESS/IsA.jsonl with 200 lines


## Plural hypernyms

In [None]:
oov_path = '../data/BLESS/consistency/plural_hypernyms_not_in_bert.txt'
with open(oov_path, 'r') as fin:
    lines = fin.readlines()
    for line in lines: 
        line = line.strip()
        target_token = ''
    