In [17]:
import pandas as pd 
import json 
import copy
import re 
from collections import Counter, defaultdict
pd.set_option('display.max_columns',100)
pd.set_option('display.max_colwidth',500)
from util_wordnet import get_sister_terms

# Read LAMA 

In [27]:
def save_dict_to_json(examples, output_path):
    ''' 
    save a list of dicts into otuput_path, orient='records' (each line is a dict) 
    examples: a list of dicts
    output_path: 
    '''
    with open(output_path, 'w') as fout:
        for i, example in enumerate(examples):
            json.dump(example, fout)
            fout.write("\n")
        print(f"save {output_path} with {len(examples)} lines")
        
path = '../data/LAMA/IsA.jsonl'

def read_jsonl_to_df(filepath):
    with open(filepath, 'r', encoding='utf-8') as fin:
        data = fin.readlines()
        data = [eval(x) for x in data]
        df = pd.DataFrame(data)
    return df 

df = read_jsonl_to_df(path)
df['sub_sister'] = df['sub_label'].apply(lambda x: get_sister_terms(x))
df['obj_label'] = df['obj_label'].apply(lambda x: [x])
output_path = '../data/LAMA/IsA.jsonl'
save_dict_to_json(df.to_dict(orient='records'), output_path)
df.head()

save ../data/LAMA/IsA.jsonl with 1577 lines


Unnamed: 0,sub_label,obj_label,masked_sentences,uuid,sub_sister
0,buildiings,[factories],[Some buildiings are [MASK].],0,[]
1,laser,[device],[A laser is a [MASK] which creates coherent light.],1,"[collimator, diffuser, finder, lense, polarimeter, polariscope, stereoscope, lens, coelostat, viewer, prism, autofocus, planetarium, projector, diffusor, biprism, grating, viewfinder]"
2,urge,[desire],[An urge is the strong [MASK] of something.],2,"[life, yearning, temptation, impulse, hungriness, eros, concupiscence, aspiration, wishing, caprice, morality, dream, morals, craving, ethics, want, bloodlust, longing, ambition, wish, itch, whim]"
3,truth,[virtue],[Truth is a [MASK].],3,"[aridity, regularity, positivity, finitude, explanation, materiality, mundanity, pathos, naivety, solution, pleading, brachycephaly, morbidity, combustibleness, commentary, difficulty, falsity, agreement, resolution, extraordinariness, domesticity, excellence, fruitfulness, impotency, sanctity, ulteriority, comprehensibility, ultimate, particularity, difference, sweetness, romance, bidding, ability, boundlessness, bid, humanness, promulgation, fruitlessness, sanctitude, point, corporeality, ..."
4,geometry,[subject],[Geometry is a [MASK] studied in school.],4,"[metamathematics, trigonometry, arithmetic, topology, calculus, algebra, trig]"


In [28]:
!scp ../data/LAMA/IsA.jsonl spartan:~/cogsci/DAP/data/LAMA/

IsA.jsonl                                     100% 1412KB  66.7MB/s   00:00    


# Data Exploration 

In [15]:
from inflection import singularize, pluralize 
df = read_jsonl_to_df(path)

df['sub_label_sg'] = df['sub_label'].apply(lambda x: singularize(x))
df['sub_label_pl'] = df['sub_label'].apply(lambda x: pluralize(x))


# sg = getInflection(
    
    
dfsg = df.query("sub_label == sub_label_sg")
dfpl = df.query("sub_label == sub_label_pl")
print(df.index)
print(f"#Singular sub_label: {len(dfsg.index)}")
print(f"#Plural sub_label: {len(dfpl.index)}")

unique = set(df['sub_label']) - set(df['sub_label_sg']) - set(df['sub_label_pl'])
unique = list(unique)
print(f"{len(unique)}")
dfu = df.query(f"sub_label in {unique}")
display(dfu)

RangeIndex(start=0, stop=1577, step=1)
#Singular sub_label: 1337
#Plural sub_label: 505
30


Unnamed: 0,sub_label,obj_label,masked_sentences,uuid,sub_label_sg,sub_label_pl
65,acting,art,[Acting is an [MASK].],65,act,acts
106,jogging,hobby,[Jogging is a [MASK].],106,jog,jogs
173,reading,activity,[Reading fiction is a favorite [MASK] of millions of people.],173,read,reads
180,skating,activity,[An [MASK] someone can do is roller skating.],180,skate,skates
253,courting,dating,[Courting is [MASK].],253,court,courts
264,shot,treatment,[A shot is a kind of [MASK].],264,shoot,shoots
294,broiling,cooking,"[Another way to say ""gene is [MASK]"" is ""Gene is frying, baking, broiling, brazing, roasting and stewing up a storm in the kitchen."".]",294,broil,broils
350,swaying,motion,[Swaying is a type of [MASK].],350,sway,sways
360,dusting,cleaning,[Something that might happen while [MASK] the house is dusting.],360,dust,dusts
377,painting,image,[The first thing you do when you produce an [MASK] or text on paper is choose a writing or painting implement.],377,paint,paints


In [5]:
!python -m spacy info

[1m

spaCy version    3.2.1                         
Location         /Users/chunhua/anaconda3/lib/python3.8/site-packages/spacy
Platform         macOS-10.16-x86_64-i386-64bit 
Python version   3.8.8                         
Pipelines        en_core_web_sm (3.2.0)        



In [7]:
from lemminflect import getInflection

import spacy
import lemminflect
nlp = spacy.load('en_core_web_sm')
# doc = nlp('I am testing this example.')
# # > doc[2]._.lemma()
# # test
# > doc[4]._.inflect('NNS')
# examples

df = read_jsonl_to_df(path)
df['sub_label_sg'] = df['sub_label'].apply(lambda x: nlp(x)[0]._.inflect('NN'))
df['sub_label_pl'] = df['sub_label'].apply(lambda x: nlp(x)[0]._.inflect('NNS'))

    
dfsg = df.query("sub_label == sub_label_sg")
dfpl = df.query("sub_label == sub_label_pl")
print(df.index)
print(f"#Singular sub_label: {len(dfsg.index)}")
print(f"#Plural sub_label: {len(dfpl.index)}")
print(f"#sub_label != sg and !=pl", len(unique))

unique = set(df['sub_label']) - set(df['sub_label_sg']) - set(df['sub_label_pl'])
unique = list(unique)

dfu = df.query(f"sub_label in {unique}")
display(dfu)

RangeIndex(start=0, stop=1577, step=1)
#Singular sub_label: 1308
#Plural sub_label: 360
#sub_label != sg and !=pl 45


Unnamed: 0,sub_label,obj_label,masked_sentences,uuid,sub_label_sg,sub_label_pl
13,moldavia,country,[Moldavia is a kind of [MASK].],13,moldavium,moldaviums
65,acting,art,[Acting is an [MASK].],65,act,acts
106,jogging,hobby,[Jogging is a [MASK].],106,jog,jogs
173,reading,activity,[Reading fiction is a favorite [MASK] of millions of people.],173,read,reads
180,skating,activity,[An [MASK] someone can do is roller skating.],180,skate,skates
195,striving,achievement,[Striving is [MASK].],195,strive,strives
253,courting,dating,[Courting is [MASK].],253,court,courts
264,shot,treatment,[A shot is a kind of [MASK].],264,shoot,shoots
294,broiling,cooking,"[Another way to say ""gene is [MASK]"" is ""Gene is frying, baking, broiling, brazing, roasting and stewing up a storm in the kitchen."".]",294,broil,broils
321,listening,activity,[An [MASK] someone can do is listening to the music.],321,listen,listens


In [11]:

| Package          | Verb  |  Noun | ADJ/ADV | Overall |  Speed  |
|----------------------------------------------------------------|
| LemmInflect      | 96.1% | 95.4% |  93.9%  |  95.6%  | 42.0 uS |
| CLiPS/pattern.en | 93.6% | 91.1% |   0.0%  |  n/a    |  3.0 uS |
| Stanford CoreNLP | 87.6% | 93.1% |   0.0%  |  n/a    |  n/a    |
| spaCy            | 79.4% | 88.9% |  60.5%  |  84.7%  |  5.0 uS |
| NLTK             | 53.3% | 52.2% |  53.3%  |  52.6%  | 13.0 uS |
|----------------------------------------------------------------|

from lemminflect import getInflection
df = read_jsonl_to_df(path)
df['sub_label_sg'] = df['sub_label'].apply(lambda x: getInflection(x, tag='NN')[0])
df['sub_label_pl'] = df['sub_label'].apply(lambda x:getInflection(x, tag='NNS')[0])

    
dfsg = df.query("sub_label == sub_label_sg")
dfpl = df.query("sub_label == sub_label_pl")
print(df.index)
print(f"#Singular sub_label: {len(dfsg.index)}")
print(f"#Plural sub_label: {len(dfpl.index)}")
print(f"#sub_label != sg and !=pl", len(unique))

unique = set(df['sub_label']) - set(df['sub_label_sg']) - set(df['sub_label_pl'])
unique = list(unique)

dfu = df.query(f"sub_label in {unique}")
display(dfu)

RangeIndex(start=0, stop=1577, step=1)
#Singular sub_label: 1577
#Plural sub_label: 147
#sub_label != sg and !=pl 1373


Unnamed: 0,sub_label,obj_label,masked_sentences,uuid,sub_label_sg,sub_label_pl


In [12]:
dfpl

Unnamed: 0,sub_label,obj_label,masked_sentences,uuid,sub_label_sg,sub_label_pl
8,butane,gas,[Butane is a kind of [MASK].],8,butane,butane
12,ginseng,plant,[Ginseng is a [MASK].],12,ginseng,ginseng
32,polyethylene,plastic,[Polyethylene is a [MASK].],32,polyethylene,polyethylene
36,garbage,refuse,[You are likely to find [MASK] in a garbage can.],36,garbage,garbage
65,acting,art,[Acting is an [MASK].],65,acting,acting
...,...,...,...,...,...,...
1502,jazz,music,[Jazz is form of [MASK].],1502,jazz,jazz
1531,dew,thing,[Dew is a [MASK].],1531,dew,dew
1536,archery,sport,[Archery is [MASK].],1536,archery,archery
1538,advertising,communication,[Advertising is [MASK] used for commercial purposes.],1538,advertising,advertising


In [9]:
# 'reading'
# from lemminflect import getInflection

sg = getInflection('reading', tag='NN')
pl =  getInflection('reading', tag='NNS')
x = 'reading'
print(nlp(x)[0]._.inflect('NN'))
print(nlp(x)[0]._.inflect('NNS'))

print(sg, pl)

# sg = getInflection('readings', tag='NN')
# pl = getInflection('readings', tag='NNS')
# print(sg, pl)

# text = "Acting is a hobby."
# for token in nlp(text):
#     print(token, token._.inflect("NN"))

# # getInflection('xxwatch', tag='VBD')
# # # ('xxwatched',)


# # import spacy
# # import lemminflect
# # nlp = spacy.load('en_core_web_sm')
# # doc = nlp('I am testing this example.')
# # doc[2]._.lemma()
# # test

# doc[4]._.inflect('NNS')
# examples

read
reads
('reading',) ('readings', 'reading')
