In [21]:
import pandas as pd 
import json 
import copy
import re 
from pathlib import Path

from collections import Counter, defaultdict
from copy import deepcopy

pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth',500)

# Convert the LM-Diagnostic-Extended data into LM probing format

In [19]:

def save_dict_to_json(examples, output_path):
    ''' 
    save a list of dicts into otuput_path, orient='records' (each line is a dict) 
    examples: a list of dicts
    output_path: 
    '''

    with open(output_path, 'w') as fout:
        for example in examples:
            json.dump(example, fout)
            fout.write("\n")
        print(f"save {output_path} with {len(examples)} lines")

def add_period_at_the_end_of_sentence(sentence):
    last_token = sentence[-1]
    if last_token != '.': 
        return sentence + '.'
    return [sentence]

def process_data_to_lm():
    data_dir = '../probe-generalization/Syntagmatic/LM-Diagnostic-Extended/' 
    files = ['singular.tsv', 'plural.tsv'] #, 'contextual.tsv']

    for file in files: 
        path = f"{data_dir}/{file}"
        df = pd.read_csv(path, sep='\t', names=['masked_sentences', 'obj_label'])
        df['masked_sentences'] =  df['masked_sentences'].apply(lambda x: add_period_at_the_end_of_sentence(x))
        vocab = set(df['obj_label'].to_list())
        print(vocab)
        if 'singular' in file: 
             df['sub_label'] = df['masked_sentences'].apply(lambda x: x.split()[1])
        elif 'plural' in file: 
             df['sub_label'] = df['masked_sentences'].apply(lambda x: x.split()[0])

        df['obj_label'] =  df['obj_label'].apply(lambda x: [x])
        df['masked_sentences'] =  df['masked_sentences'].apply(lambda x: [x])
        df['relation'] = 'IsA'
        df['uuid'] = df.index + 1
        display(df.head())
        
        out_file = file.replace(".tsv", "")
        out_dir = f"{data_dir}/{out_file}/"
        Path( out_dir ).mkdir( parents=True, exist_ok=True )
        
        save_dict_to_json(examples=df.to_dict(orient='records'), output_path=out_dir + 'IsA.jsonl')   
    
process_data_to_lm()

{'vehicle', 'bird', 'building', 'flower', 'fish', 'tool', 'insect', 'vegetable', 'tree'}


Unnamed: 0,masked_sentences,obj_label,sub_label,relation,uuid
0,[A graver is a [MASK].],[tool],graver,IsA,1
1,[A smallmouth is a [MASK].],[fish],smallmouth,IsA,2
2,[A pelican is a [MASK].],[bird],pelican,IsA,3
3,[A sapsucker is a [MASK].],[bird],sapsucker,IsA,4
4,[A mako is a [MASK].],[fish],mako,IsA,5


save ../probe-generalization/Syntagmatic/LM-Diagnostic-Extended//singular/IsA.jsonl with 576 lines
{'tools', 'insects', 'vehicles', 'buildings', 'flowers', 'vegetables', 'fish', 'trees', 'birds'}


Unnamed: 0,masked_sentences,obj_label,sub_label,relation,uuid
0,[gravers are [MASK].],[tools],gravers,IsA,1
1,[smallmouths are [MASK].],[fish],smallmouths,IsA,2
2,[pelicans are [MASK].],[birds],pelicans,IsA,3
3,[sapsuckers are [MASK].],[birds],sapsuckers,IsA,4
4,[makoes are [MASK].],[fish],makoes,IsA,5


save ../probe-generalization/Syntagmatic/LM-Diagnostic-Extended//plural/IsA.jsonl with 576 lines


# Merge the singular and plural obj label

In [17]:
def merge_singular_plural_objects(singular, plural):
    return [singular] if singular == plural else [singular, plural]

def merge_singular_plural():
    data_dir = '../probe-generalization/Syntagmatic/LM-Diagnostic-Extended/' 
    files = ['sgpl.tsv'] 

    for file in files: 
        path = f"{data_dir}/{file}"
        df = pd.read_csv(path, sep='\t', names=['masked_sentences', 'obj_label_singular', 'obj_label_plural'])
        df['masked_sentences'] =  df['masked_sentences'].apply(lambda x: add_period_at_the_end_of_sentence(x))
        
        
        df['sub_label'] = df['masked_sentences'].apply(lambda x: x.split()[1])
        
        df['obj_label'] =  df[['obj_label_singular', 'obj_label_plural']].apply(lambda x: merge_singular_plural_objects(x[0], x[1]), axis=1)
#         vocab = set(df['obj_label'].to_list())
        df['masked_sentences'] =  df['masked_sentences'].apply(lambda x: [x])
        df['relation'] = 'IsA'
        df['uuid'] = df.index + 1
        
        
        df = df[['sub_label', 'obj_label', 'masked_sentences', 'uuid', 'relation']]
        display(df.head())
        out_file = file.replace(".tsv", "")
        out_dir = f"{data_dir}/{out_file}/"

        Path( out_dir ).mkdir( parents=True, exist_ok=True )
        save_dict_to_json(examples=df.to_dict(orient='records'), output_path=out_dir+"IsA.jsonl") 

merge_singular_plural()


Unnamed: 0,sub_label,obj_label,masked_sentences,uuid,relation
0,graver,"[tool, tools]",[A graver is a [MASK].],1,IsA
1,smallmouth,[fish],[A smallmouth is a [MASK].],2,IsA
2,pelican,"[bird, birds]",[A pelican is a [MASK].],3,IsA
3,sapsucker,"[bird, birds]",[A sapsucker is a [MASK].],4,IsA
4,mako,[fish],[A mako is a [MASK].],5,IsA


save Syntagmatic/LM-Diagnostic-Extended//sgpl/IsA.jsonl with 576 lines


In [21]:
# !scp Syntagmatic/LM-Diagnostic-Extended/plural/IsA.jsonl spartan:/home/chunhua/cogsci/DAP/data/lm_diagnostic_extended/plural/IsA.jsonl
# !scp Syntagmatic/LM-Diagnostic-Extended/singular/IsA.jsonl spartan:/home/chunhua/cogsci/DAP/data/lm_diagnostic_extended/singular/IsA.jsonl
!scp Syntagmatic/LM-Diagnostic-Extended/sgpl/IsA.jsonl spartan:/home/chunhua/cogsci/DAP/data/lm_diagnostic_extended/sgpl/IsA.jsonl
    

IsA.jsonl                                     100%   78KB  15.8MB/s   00:00    


In [None]:
df = pd.DataFrame(examples) 
df['obj_label']
# df.query("relation == 'cohyponym'").head()


# Prcoess data for consitency check

1. DEF-SAP '
`A(a) X is a(n) Y.`


2. X are Y.
DEF-DAP

A(n) X or Z is a(n) Y.
X or Z are Y.

LSP-SAP
Y such as X.
Y such as X.
LSP-DAP
Y such as X or Z.
Y such as X or Z.


In [75]:

def definition_sap_singular_plural(data_dir, file):
   
    path = f"{data_dir}/{file}"
    df = pd.read_csv(path, sep='\t', names=['mask_sentences_singular', 'obj_label_singular', 
                                            'mask_sentences_plural', 'obj_label_plural'])
    
    df['mask_sentences_singular'] =  df['mask_sentences_singular'].apply(lambda x: add_period_at_the_end_of_sentence(x))
    df['mask_sentences_plural'] =  df['mask_sentences_plural'].apply(lambda x: add_period_at_the_end_of_sentence(x))


    df['sub_label_singular'] = df['mask_sentences_singular'].apply(lambda x: x.split()[1])
    df['sub_label_plural'] = df['mask_sentences_plural'].apply(lambda x: x.split()[0])


    df['relation'] = 'IsA'
    df['uuid'] = df.index + 1


    df = df[['sub_label_singular', 'obj_label_singular', 'mask_sentences_singular', 
             'sub_label_plural', 'obj_label_plural', 'mask_sentences_plural', 
             'uuid', 'relation']]
    display(df.head())
    out_file = file.replace(".tsv", "")
    out_dir = f"{data_dir}/{out_file}/"

    Path( out_dir ).mkdir( parents=True, exist_ok=True )
    save_dict_to_json(examples=df.to_dict(orient='records'), output_path=out_dir+"IsA.jsonl") 
    return df 



def definition_dap_singular_plural(df):
    df['mask_sentences_singular'] = df[['sub_label_singular', 'mask_sentences_singular']].apply(lambda x: x[1].replace(x[0], f"{x[0]} or [Z]"), axis=1)
    df['mask_sentences_plural'] = df[['sub_label_plural', 'mask_sentences_plural']].apply(lambda x: x[1].replace(x[0], f"{x[0]} or [Z]"), axis=1)

    df = df[['sub_label_singular', 'obj_label_singular', 'mask_sentences_singular', 
            'sub_label_plural', 'obj_label_plural', 'mask_sentences_plural', 
             'uuid', 'relation']]

    return df 

def lsp_sap_singular_plural(df):
    '''
    sap: Y such as X 

    '''
    df['mask_sentences_singular'] = df['sub_label_singular'].apply(lambda x: f"[MASK] such as {x}.")
    df['mask_sentences_plural'] = df['sub_label_plural'].apply(lambda x:f"[MASK] such as {x}.")

    df = df[['sub_label_singular', 'obj_label_singular', 'mask_sentences_singular', 
            'sub_label_plural', 'obj_label_plural', 'mask_sentences_plural', 
             'uuid', 'relation']]

    return df 


def lsp_dap_singular_plural(df):
    '''
    sap: Y such as X 

    '''
    df['mask_sentences_singular'] = df['sub_label_singular'].apply(lambda x: f"[MASK] such as {x} or [Z].")
    df['mask_sentences_plural'] = df['sub_label_plural'].apply(lambda x:f"[MASK] such as {x} or [Z].")

    df = df[['sub_label_singular', 'obj_label_singular', 'mask_sentences_singular', 
            'sub_label_plural', 'obj_label_plural', 'mask_sentences_plural', 
             'uuid', 'relation']]

    return df 



Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,A graver is a [MASK].,gravers,tools,gravers are [MASK].,1,IsA
1,smallmouth,fish,A smallmouth is a [MASK].,smallmouths,fish,smallmouths are [MASK].,2,IsA
2,pelican,bird,A pelican is a [MASK].,pelicans,birds,pelicans are [MASK].,3,IsA
3,sapsucker,bird,A sapsucker is a [MASK].,sapsuckers,birds,sapsuckers are [MASK].,4,IsA
4,mako,fish,A mako is a [MASK].,makoes,fish,makoes are [MASK].,5,IsA


save ../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended//singular_plural/IsA.jsonl with 576 lines


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,A graver is a [MASK].,gravers,tools,gravers are [MASK].,1,IsA
1,smallmouth,fish,A smallmouth is a [MASK].,smallmouths,fish,smallmouths are [MASK].,2,IsA
2,pelican,bird,A pelican is a [MASK].,pelicans,birds,pelicans are [MASK].,3,IsA
3,sapsucker,bird,A sapsucker is a [MASK].,sapsuckers,birds,sapsuckers are [MASK].,4,IsA
4,mako,fish,A mako is a [MASK].,makoes,fish,makoes are [MASK].,5,IsA


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,A graver or [Z] is a [MASK].,gravers,tools,gravers or [Z] are [MASK].,1,IsA
1,smallmouth,fish,A smallmouth or [Z] is a [MASK].,smallmouths,fish,smallmouths or [Z] are [MASK].,2,IsA
2,pelican,bird,A pelican or [Z] is a [MASK].,pelicans,birds,pelicans or [Z] are [MASK].,3,IsA
3,sapsucker,bird,A sapsucker or [Z] is a [MASK].,sapsuckers,birds,sapsuckers or [Z] are [MASK].,4,IsA
4,mako,fish,A mako or [Z] is a [MASK].,makoes,fish,makoes or [Z] are [MASK].,5,IsA


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,[MASK] such as graver.,gravers,tools,[MASK] such as gravers.,1,IsA
1,smallmouth,fish,[MASK] such as smallmouth.,smallmouths,fish,[MASK] such as smallmouths.,2,IsA
2,pelican,bird,[MASK] such as pelican.,pelicans,birds,[MASK] such as pelicans.,3,IsA
3,sapsucker,bird,[MASK] such as sapsucker.,sapsuckers,birds,[MASK] such as sapsuckers.,4,IsA
4,mako,fish,[MASK] such as mako.,makoes,fish,[MASK] such as makoes.,5,IsA


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,[MASK] such as graver or [Z].,gravers,tools,[MASK] such as gravers or [Z].,1,IsA
1,smallmouth,fish,[MASK] such as smallmouth or [Z].,smallmouths,fish,[MASK] such as smallmouths or [Z].,2,IsA
2,pelican,bird,[MASK] such as pelican or [Z].,pelicans,birds,[MASK] such as pelicans or [Z].,3,IsA
3,sapsucker,bird,[MASK] such as sapsucker or [Z].,sapsuckers,birds,[MASK] such as sapsuckers or [Z].,4,IsA
4,mako,fish,[MASK] such as mako or [Z].,makoes,fish,[MASK] such as makoes or [Z].,5,IsA


In [85]:
def read_anchors(path):
    df = pd.read_csv(path)
    df['subj_anchors'] = df['subj_anchors'].apply(lambda x: eval(x))
    return dict(zip(df['sub_label'], df['subj_anchors']))
dic_sub_to_anchors_singular

def insert_anchors(dic_sub_to_anchors, df, mask_col, sub_col, anchor_col):
    df[mask_col].head()
    df[anchor_col] = df[sub_col].apply(lambda x: dic_sub_to_anchors.get(x) )
    df[mask_col] =  df[[anchor_col, mask_col]].apply(lambda x: [ x[1].replace('[Z]', anchor)  for anchor in x[0]], axis=1)
    return df 
    
# create the dataset 
data_dir = '../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/' 
file = 'singular_plural.tsv'
df_def_sap = definition_sap_singular_plural(data_dir, file)
df_def_dap = definition_dap_singular_plural(deepcopy(df_def_sap))
df_lsp_sap = lsp_sap_singular_plural(deepcopy(df_def_sap))
df_lsp_dap = lsp_dap_singular_plural(deepcopy(df_def_sap))

display(df_def_sap.head() )
display(df_def_dap.head() )
display(df_lsp_sap.head())
display(df_lsp_dap.head() )

# insert anchors
path = '../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/anchors.singular.csv'
dic_sub_to_anchors_singular = read_anchors(path)
df_def_dap = insert_anchors(dic_sub_to_anchors=dic_sub_to_anchors_singular, df= df_def_dap, mask_col = 'mask_sentences_singular', sub_col = 'sub_label_singular', anchor_col='subj_anchors_singular')

df_lsp_dap = insert_anchors(dic_sub_to_anchors=dic_sub_to_anchors_singular, df= df_lsp_dap, mask_col = 'mask_sentences_singular', sub_col = 'sub_label_singular', anchor_col='subj_anchors_singular')

df_lsp_dap[['sub_label_singular', 'subj_anchors_singular', 'mask_sentences_singular']].head() #''



path = '../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/anchors.plural.csv'
dic_sub_to_anchors_plural = read_anchors(path)
df_def_dap = insert_anchors(dic_sub_to_anchors=dic_sub_to_anchors_plural, df= df_def_dap, mask_col = 'mask_sentences_plural', sub_col = 'sub_label_plural', anchor_col='subj_anchors_plural')

df_lsp_dap = insert_anchors(dic_sub_to_anchors=dic_sub_to_anchors_plural, df= df_lsp_dap, mask_col = 'mask_sentences_plural', sub_col = 'sub_label_plural', anchor_col='subj_anchors_plural')

df_lsp_dap[['sub_label_plural', 'subj_anchors_plural', 'mask_sentences_plural']].head() #''


# save files 
out_dir = '../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/consistency/'
save_dict_to_json(examples=df_def_sap.to_dict(orient='records'), output_path=out_dir + 'IsA.def_sap.jsonl') 
save_dict_to_json(examples=df_def_dap.to_dict(orient='records'), output_path=out_dir + 'IsA.def_dap.jsonl') 
save_dict_to_json(examples=df_lsp_sap.to_dict(orient='records'), output_path=out_dir + 'IsA.lsp_sap.jsonl') 
save_dict_to_json(examples=df_lsp_dap.to_dict(orient='records'), output_path=out_dir + 'IsA.lsp_dap.jsonl') 


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,A graver is a [MASK].,gravers,tools,gravers are [MASK].,1,IsA
1,smallmouth,fish,A smallmouth is a [MASK].,smallmouths,fish,smallmouths are [MASK].,2,IsA
2,pelican,bird,A pelican is a [MASK].,pelicans,birds,pelicans are [MASK].,3,IsA
3,sapsucker,bird,A sapsucker is a [MASK].,sapsuckers,birds,sapsuckers are [MASK].,4,IsA
4,mako,fish,A mako is a [MASK].,makoes,fish,makoes are [MASK].,5,IsA


save ../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended//singular_plural/IsA.jsonl with 576 lines


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,A graver is a [MASK].,gravers,tools,gravers are [MASK].,1,IsA
1,smallmouth,fish,A smallmouth is a [MASK].,smallmouths,fish,smallmouths are [MASK].,2,IsA
2,pelican,bird,A pelican is a [MASK].,pelicans,birds,pelicans are [MASK].,3,IsA
3,sapsucker,bird,A sapsucker is a [MASK].,sapsuckers,birds,sapsuckers are [MASK].,4,IsA
4,mako,fish,A mako is a [MASK].,makoes,fish,makoes are [MASK].,5,IsA


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,A graver or [Z] is a [MASK].,gravers,tools,gravers or [Z] are [MASK].,1,IsA
1,smallmouth,fish,A smallmouth or [Z] is a [MASK].,smallmouths,fish,smallmouths or [Z] are [MASK].,2,IsA
2,pelican,bird,A pelican or [Z] is a [MASK].,pelicans,birds,pelicans or [Z] are [MASK].,3,IsA
3,sapsucker,bird,A sapsucker or [Z] is a [MASK].,sapsuckers,birds,sapsuckers or [Z] are [MASK].,4,IsA
4,mako,fish,A mako or [Z] is a [MASK].,makoes,fish,makoes or [Z] are [MASK].,5,IsA


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,[MASK] such as graver.,gravers,tools,[MASK] such as gravers.,1,IsA
1,smallmouth,fish,[MASK] such as smallmouth.,smallmouths,fish,[MASK] such as smallmouths.,2,IsA
2,pelican,bird,[MASK] such as pelican.,pelicans,birds,[MASK] such as pelicans.,3,IsA
3,sapsucker,bird,[MASK] such as sapsucker.,sapsuckers,birds,[MASK] such as sapsuckers.,4,IsA
4,mako,fish,[MASK] such as mako.,makoes,fish,[MASK] such as makoes.,5,IsA


Unnamed: 0,sub_label_singular,obj_label_singular,mask_sentences_singular,sub_label_plural,obj_label_plural,mask_sentences_plural,uuid,relation
0,graver,tool,[MASK] such as graver or [Z].,gravers,tools,[MASK] such as gravers or [Z].,1,IsA
1,smallmouth,fish,[MASK] such as smallmouth or [Z].,smallmouths,fish,[MASK] such as smallmouths or [Z].,2,IsA
2,pelican,bird,[MASK] such as pelican or [Z].,pelicans,birds,[MASK] such as pelicans or [Z].,3,IsA
3,sapsucker,bird,[MASK] such as sapsucker or [Z].,sapsuckers,birds,[MASK] such as sapsuckers or [Z].,4,IsA
4,mako,fish,[MASK] such as mako or [Z].,makoes,fish,[MASK] such as makoes or [Z].,5,IsA


save ../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/consistency/IsA.def_sap.jsonl with 576 lines
save ../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/consistency/IsA.def_dap.jsonl with 576 lines
save ../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/consistency/IsA.lsp_sap.jsonl with 576 lines
save ../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/consistency/IsA.lsp_dap.jsonl with 576 lines


In [88]:
path = '../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/consistency/IsA.def_dap.jsonl'
dftest = pd.read_json(path)
dftest.head()

ValueError: Trailing data

In [86]:
!scp -r ../data/probe-generalization/Syntagmatic/LM-Diagnostic-Extended/consistency spartan:/home/chunhua/cogsci/DAP/data/lm_diagnostic_extended/

# out_dir
# print(df_def_sap.columns)
# print(df_def_dap.columns)
# print(df_lsp_sap.columns)
# print(df_lsp_dap.columns)

IsA.lsp_dap.jsonl                             100%  418KB  42.5MB/s   00:00    
IsA.def_sap.jsonl                             100%  148KB  40.4MB/s   00:00    
IsA.def_dap.jsonl                             100%  405KB  57.5MB/s   00:00    
IsA.lsp_sap.jsonl                             100%  151KB  20.3MB/s   00:00    
IsA.def_dap-checkpoint.jsonl                  100%  405KB  58.9MB/s   00:00    
