In [4]:
import pandas as pd 
import json 
from collections import Counter, defaultdict 
import os, sys 
from copy import copy, deepcopy

from inflection import singularize,  pluralize 
import spacy
# import pyinflect
# import lemminflect

nlp = spacy.load('en_core_web_sm')
def word_to_pos(words):
    word_to_pos_dict = defaultdict()
    for word in words:
        if word == 'null': continue
        pos = nlp(str(word))[0].pos_ 
        word_to_pos_dict[word] = pos
    return word_to_pos_dict


def sort_similar_words(df, word_to_pos_dict=None, top_k = 10):
    '''
    sort simialr words and return the top_k
    '''
    cue_to_similar_words = defaultdict()
    for i, (k,v) in enumerate(df.to_dict().items()):
        if k=='null': continue
        #k_pos = word_to_pos_dict.get(k)
        #if k_pos !='NOUN': continue 

        v_sorted = Counter(v).most_common()
        v_sorted_noun = []
        for v in v_sorted:
            if len( str(v[0]).split())>1: continue 
            #v_pos = word_to_pos_dict.get(v[0])
            #if v_pos!='NOUN': continue 
            v_sorted_noun.append(v)
        cue_to_similar_words[k] = dict(v_sorted_noun[1:top_k+1])
    print('dog: ', cue_to_similar_words['dog'])
    return cue_to_similar_words

# path = '../output/2018/S_RW.R1.csv'


def get_similar_words(path):
    
    df1 = pd.read_csv(path)

    df1.index = df1["Unnamed: 0"]
    df1 = df1.drop(columns=["Unnamed: 0"], axis=1)
    #word_to_pos_dict = word_to_pos(list(df1.index))
    print("sorting similar words ..")
    cue_to_similar_words = sort_similar_words(df1) #1.head(20)) #, word_to_pos_dict )
    return cue_to_similar_words 
    #cue_to_similar_words['bicycle']
# cue_to_similar_words.keys()


def load_cue_to_similar_words(path, path_rw='../../data/S_RW.R123.csv'):
    if os.path.exists(path):
        print(f"loading {path}...")
        cue_to_similar_words = json.load(open(path))
    else:
        print(f"loading {path_rw} to get similar words ...")
        cue_to_similar_words = get_similar_words(path_rw)
        save_similar_words(cue_to_similar_words, path)
    return cue_to_similar_words 


def save_similar_words(cue_to_similar_words, output_file, indent=4):
    
    with open(output_file, 'w') as fout:
        json.dump(cue_to_similar_words, fout, indent=indent)
    print(f"save {output_file}")
    
def save_anchors_from_swow(json_path = '../../data/swow/swow.en.similar_words.json',  path_rw='../../data/swow/S_RW.R123.csv', debug=False):
    cue_to_similar_words_score = load_cue_to_similar_words(json_path, path_rw)
    
    vocab_cues = set(cue_to_similar_words_score.keys())
    vocab_res = set()
    for k,v in cue_to_similar_words_score.items():
        vocab_res.update(v.keys() )
    vocab = vocab_cues.union(vocab_res)
    vocab_to_singular = {k: singularize(k) for k in vocab}
    vocab_to_plural = {k: pluralize(k) for k in vocab}

    cue_to_similar_words_score_sgpl = defaultdict()
    cue_to_similar_words_score_sg = defaultdict()
    for k,v in cue_to_similar_words_score.items():
        k = vocab_to_singular.get(k)
        v = { vocab_to_singular.get(k1): v1 for k1, v1 in v.items()}
        cue_to_similar_words_score_sg[k] = v 
        cue_to_similar_words_score_sgpl[k] =v 

    cue_to_similar_words_score_pl = defaultdict()
    for k,v in cue_to_similar_words_score.items():
        k = vocab_to_plural.get(k)
        v = { vocab_to_plural.get(k1): v1 for k1, v1 in v.items()}
        cue_to_similar_words_score_pl[k] = v 
        cue_to_similar_words_score_sgpl[k] =v 

    return dic_sub_to_anchors_singular,  dic_sub_to_anchors_plural


def save_foward_associaitons_from_swow():
    path = '../../data/swow/strength.SWOW-EN.R123.csv'
    df = pd.read_csv(path, sep='\t')
    df.head()

    df['cue'] = df['cue'].apply(lambda x: str(x).strip().lower())
    df['response'] = df['response'].apply(lambda x: str(x).strip().lower())
    df['cue_token_num'] = df['cue'].apply(lambda x: len(str(x).split(" ")))
    df['response_token_num'] = df['response'].apply(lambda x: len(str(x).split(" ")))
    df = df.query("response_token_num ==1 and cue_token_num==1")    

    cue_association_dict = defaultdict()
    for name, group in df.groupby(['cue']):
        cue = name
        response_strength =  dict(zip(group['response'], group['R123.Strength']))
        cue_association_dict[cue] = response_strength

    vocab_cues = set(cue_association_dict.keys())
    vocab_res = set()
    for k,v in cue_association_dict.items():
        vocab_res.update(v.keys() )

    vocab = vocab_cues.union(vocab_res)
    vocab_to_singular = {k: singularize(k) for k in vocab}
    vocab_to_plural = {k: pluralize(k) for k in vocab}

    cue_association_dict_sgpl = defaultdict()
    for k,v in cue_association_dict.items():
        cue_association_dict_sgpl[vocab_to_singular[k]] = v
        cue_association_dict_sgpl[vocab_to_plural[k]] = v 


    output_file = '../../data/swow/swow.en.strength.R123.json'
    with open(output_file, 'w') as fout:
        json.dump(cue_association_dict_sgpl, fout, indent=4)
    print(f"save {output_file}")


# if __name__ == '__main__':
#     association_type =  sys.argv[1]
if association_type == 'similar':
    print("generating the related/similar words in swow")
    json_path = '../../data/swow/swow.en.similar_words.json'
    dic_sub_to_anchors_singular,  dic_sub_to_anchors_plural = save_anchors_from_swow(json_path)

    # cue_to_similar_words_score = load_cue_to_similar_words(json_path, path_rw='../../data/S_RW.R123.csv')
    print("test: ")
    #print( dic_sub_to_anchors_singular['bicycle'] )
    #print( dic_sub_to_anchors_plural['bicycles'] )
    print( dic_sub_to_anchors_singular['cartoon'] )
    print( dic_sub_to_anchors_plural['cartoons'] )
elif association_type == 'strength':
    print('generating forward associations')
    save_foward_associaitons_from_swow()



loading ../../data/swow/S_RW.R123.csv to get similar words ...


NameError: name 'word_to_pos' is not defined

save ../../data/swow/swow.en.strength.R123.json


In [27]:
json_path = '../../data/swow/swow.en.similar_words.json'
path_rw='../../data/swow/S_RW.R123.csv'

cue_to_similar_words_score = load_cue_to_similar_words(json_path, path_rw)
    
#cue_to_similar_words_score= {'cartoon': cue_to_similar_words_score['cartoon']}

#cue_debug = ['cartoon', 'animal', 'dog']
# cue_to_similar_words = defaultdict()
# for k,v in cue_to_similar_words_score.items():
#     #if k in cue_debug:
#     cue_to_similar_words[k]= list(v.keys()) 

# add a vocab, voab_to_singular, vocab_to_plural to save time
#  

# return dic_sub_to_anchors_singular,  dic_sub_to_anchors_plural

loading ../../data/swow/swow.en.similar_words.json...


In [36]:
# set([list(v.keys())  for v in cue_to_similar_words.values() for x in ]


In [46]:

    
save_similar_words(cue_to_similar_words_score_sg, json_path.replace(".json", ".sg.json"), indent=4)
save_similar_words(cue_to_similar_words_score_pl, json_path.replace(".json", ".pl.json"), indent=4)
save_similar_words(cue_to_similar_words_score_sgpl, json_path.replace(".json", ".sgpl.json"), indent=4)

# print(cue_to_similar_words_score_sgpl['rabbits'])
# print(cue_to_similar_words_score_sgpl['rabbit'])

save ../../data/swow/swow.en.similar_words.sg.json
save ../../data/swow/swow.en.similar_words.pl.json
save ../../data/swow/swow.en.similar_words.sgpl.json


In [None]:

dic_sub_to_anchors_singular = {vocab_to_singular(k): {vocab_to_singular(x) for k1,v1 in v.items()} for k,v in deepcopy(cue_to_similar_words_score).items()}
dic_sub_to_anchors_plural = {vocab_to_plural(k): [vocab_to_plural(x) for x in v] for k,v in deepcopy(cue_to_similar_words).items()}

dic_sub_to_anchors_singular = {singularize(k): [singularize(x) for x in v] for k,v in deepcopy(cue_to_similar_words).items()}
dic_sub_to_anchors_plural = {pluralize(k): [pluralize(x) for x in v] for k,v in deepcopy(cue_to_similar_words).items()}



In [11]:
path_rw='../../data/swow/S_RW.R123.csv'
df1 = pd.read_csv(path_rw)
df1.index = df1["Unnamed: 0"]
df1 = df1.drop(columns=["Unnamed: 0"], axis=1)

# return cue_to_similar_words 

In [12]:
def word_to_pos(words):
    word_to_pos_dict = defaultdict()
    for word in words:
        if word=='null': continue 
        pos = nlp(str(word))[0].pos_ 
        word_to_pos_dict[word] = pos
    return word_to_pos_dict

word_to_pos_dict = word_to_pos(list(df1.index))
print("sorting similar words ..")
cue_to_similar_words = sort_similar_words(df1.head(20), word_to_pos_dict )

sorting similar words ..


KeyError: 'umpire'

In [16]:

def get_swow_score(cue, response, score_dict):
    if cue in score_dict and response in score_dict[cue]:
        return score_dict[cue][response]
    return 0

path = '../../data/swow/swow.en.strength.R123.json'
path = '../../data/swow/swow.en.similar_words.json'

score_dict = json.load(open(path))
score = get_swow_score('dog', 'cat', score_dict)


print(score)
# score_dict['abdomen']
print (len( list(score_dict.keys())))

0.377987526653529
12215


In [15]:
if 'dogs' in score_dict:
    print("yes")
else:
    print("no")
# len(word_to_pos_dict.keys())

no


In [19]:
path='../../log/bert-large-uncased/clsb/swow_rw/exp_data_results_anchor_type_Coordinate_remove_Y_PUNC_FULL_concate_or_single_max_anchor_num_5_anchor_scorer_probAvg_filter_obj_True_filter_objects_with_input_True_wnp_False_cpt_False_anchor_source_LM_swow_score_source_SWOWSimilar.CLSB.csv'
dfp = pd.read_csv(path)
dfp[['sub_label', 'subj_anchors', 'subj_anchors_swow']].head()

Unnamed: 0,sub_label,subj_anchors,subj_anchors_swow
0,alligators,"['snakes', 'turtles', 'frogs', 'lizards', 'sha...","['crocodile', 'reptile', 'reptile', 'cayman', ..."
1,alligators,"['snakes', 'turtles', 'frogs', 'lizards', 'sha...","['crocodile', 'reptile', 'reptile', 'cayman', ..."
2,ambulances,"['police', 'buses', 'helicopters', 'cars', 'em...","['emergency', 'medic', 'hospital', 'firetruck'..."
3,ambulances,"['police', 'buses', 'helicopters', 'cars', 'em...","['emergency', 'medic', 'hospital', 'firetruck'..."
4,anchors,"['reporters', 'ropes', 'pilots', 'boats', 'pro...","['nautical', 'sailboat', 'sail', 'boat', 'ship..."


In [26]:

# cue_to_similar_words_score = load_cue_to_similar_words(json_path, path_rw='../../data/S_RW.R123.csv')

# print( dic_sub_to_anchors_singular['bicycle'] )
# print( dic_sub_to_anchors_plural['bicycles'] )
# coat

dict1 = json.load(open('../../data/swow/swow.en.similar_words.sg.json'))
dict2 = json.load(open('../../data/swow/swow.en.similar_words.pl.json'))

# dict_all = dict1+dict2 
print( len(dict_all.keys ) )
dict2['rabbits']

TypeError: unsupported operand type(s) for +: 'dict' and 'dict'

In [None]:
anchor_num = 5
k='absent'
cue_to_similar_words= {k: [str(x[0]) for x in cue_to_similar_words_score[k]] }
print(cue_to_similar_words)
dic_sub_to_anchors_singular = {singularize(k): [singularize(x) for x in v[:anchor_num]] for k,v in cue_to_similar_words.items()}
dic_sub_to_anchors_plural = {pluralize(k): [pluralize(x) for x in v[:anchor_num]] for k,v in cue_to_similar_words.items()}
print(dic_sub_to_anchors_singular)
print(dic_sub_to_anchors_plural)

In [None]:
dic_sub_to_anchors_singular,  dic_sub_to_anchors_plural = read_anchors_from_swow()

In [None]:
print(dic_sub_to_anchors_singular['bicyble'])
print("\n")
print(dic_sub_to_anchors_plural['bicycles'])

In [None]:
cue_to_similar_words_score = load_cue_to_similar_words(json_path, path_rw='../../data/swow/S_RW.R123.csv')

In [None]:
for k,v in cue_to_similar_words_score.items():
    for x in v:
        if isinstance(str(x[0]), float):
            print(k,x[0])

In [None]:
# absent
cue_to_similar_words_score['absent']


## Analyzing frequency

#Vocab: 800071


Unnamed: 0,0,1,2,3
index,HIGH,LOW,MEDI,UNSEEN
sub_freq_level,1.0,0.0,0.0,0.0


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

import spacy
import pyinflect

nlp = spacy.load('en_core_web_sm')

word = 'yes'

def pluralize_test(word):
    '''
    sg is None if the word is not in the vocab 
    '''
    
    pl= nlp(word)[0]._.inflect('NNS')
    return pl if pl is not None else word

pl = pluralize_test(word)
# print(pl)



In [None]:
def pluralize(word):
    pl = nlp(word)
    if pl[0].pos_ not in ["NOUN"] or x.tag_ == 'NNS':
        return word
    else:
        return pl[0]._.inflect('NNS')

def singularize(word):
    '''
    sg is None if the word is not in the vocab 
    '''
    sg =  nlp(word) #[0]._.inflect('NN')

    if sg[0].pos_ not in ["NOUN"] or x.tag_ == 'NN':
        return word
    else:
        return sg[0]._.inflect('NN')
    
    return sg if sg is not None else word

word = 'flower'
print( pluralize_test2(word) )
print( singularize_test2(word) )
pl = nlp(word)
for x in pl:
    print(x, x.pos_, x.lemma_, x.tag_, x._.inflect('NNS'))

# Unifying datapoins from five datasets

In [1]:
from collections import defaultdict, Counter 
import pandas as pd
import json 
import os, sys 

def read_cohyponyms(path = '../log/word_to_cohyponyms.txt'):
    if os.path.exists(path):
        print(f"reading cohyponyms: {path}")
        df = pd.read_csv(path)
        df['cohyponyms'] = df['cohyponyms'].apply(lambda x: eval(x))
        word_to_cohyponym = dict(zip(df['word'].to_list(), df['cohyponyms'].to_list()))
        return word_to_cohyponym
    print(f"{path} not found")
    
def read_bert_vocab(bert_vocab_path = 'data/bert-large-uncased-vocab.txt'):
    
    vocab = set()
    with open(bert_vocab_path, 'r') as fin: 
        lines = fin.readlines()
        for line in lines: 
            line = line.strip()
            vocab.add(line)
    return vocab        


def save_dict_to_json(examples, output_path):
    ''' 
    save a list of dicts into otuput_path, orient='records' (each line is a dict) 
    examples: a list of dicts
    output_path: 
    '''
    with open(output_path, 'w') as fout:
        for example in examples:
            json.dump(example, fout)
            fout.write("\n")
        print(f"save {output_path} with {len(examples)} lines")
        
        
dataset_to_orig_paths = {
    'CLSB': '../../data/clsb/singular/IsA.jsonl',
    'BLESS': '../../data/hypernymsuite/BLESS/IsA.jsonl',
    'EVAL': '../../data/hypernymsuite/EVAL/IsA.jsonl',
    'LEDS': '../../data/hypernymsuite/LEDS/IsA.jsonl',
    'SHWARTZ': '../../data/hypernymsuite/SHWARTZ/IsA.jsonl',
    'DIAG': '../../data/lm_diagnostic_extended/singular/IsA.jsonl'
}

def load_data(filepath):
    with open(filepath, 'r', encoding='utf-8') as fin:
        data = fin.readlines()
        data = [eval(x) for x in data]
        df = pd.DataFrame(data)

        df['obj_label'] = df['obj_label'].apply(lambda x: x if isinstance(x, str) else x[0])
        return df 

dataset_to_df = defaultdict()
for dataset, path in dataset_to_orig_paths.items():
    df = load_data(path)
    #display (df.head())
    dataset_to_df[dataset] = df 
    
atom_datasets = ['BLESS', 'DIAG', 'CLSB', 'LEDS','EVAL'] #, 'SHWARTZ'] exluding SHWARTZ as it's too noisy 
dataset_to_df['ALL'] = pd.concat([dataset_to_df[dataset] for dataset in atom_datasets], axis=0)
dataset_to_df['ALL'] = dataset_to_df['ALL'][['sub_label', 'obj_label', 'relation']].drop_duplicates(keep=False)


dataset_to_df['ALL'].head()
print(len(dataset_to_df['ALL'].index))
bert_vocab= read_bert_vocab(bert_vocab_path = '../../data/bert-large-uncased-vocab.txt')


# path = '../../data/swow/swow.en.fbs.json'
path = '../../data/swow/swow.en.strength.R123.json'
# path = "../../data/swow/vocab_to_plural.json"

swow_cue_vocab = list(json.load(open(path, 'r')).keys())
print(len(swow_cue_vocab))

word_to_cohyponyms = read_cohyponyms(path = '../log/word_to_cohyponyms.txt')

########## filter the subjects and objects 
dataset_to_df['ALL']['obj_in_BERT'] = dataset_to_df['ALL']['obj_label'].apply(lambda x: 1 if x in bert_vocab else 0)
df = dataset_to_df['ALL'].query(f"sub_label in {swow_cue_vocab} and obj_in_BERT==1").reset_index(drop=True)
print("#instances shared in SWOW and BERT", len(df.index))

df['masked_sentences'] = df['sub_label'].apply(lambda x: [f"A {x} is a [MASK]"])
df['uuid'] = df.index + 1

df['subj_sister'] = df['sub_label'].apply(lambda x: word_to_cohyponyms.get(x))
print(len(df.loc[df.subj_sister.str.len()==0].index))
# save_dict_to_json(df.to_dict(orient='records'), output_path = '../../data/hypernymsuite/ALL/swow_rw/IsA.jsonl')




# word_to_plural = {word: pluralize(word) for word in set(df['sub_label']).union(df['obj_label'])}

# swow_type_to_paths = {"Stregnth": '../../data/swow/swow.en.strength.R123.pl.json', 
#                       "FW+BW": '../../data/swow/swow.en.fbs.json',
#                       "Similar": '../../data/swow/swow.en.similar_words.sgpl.json'}

# df['sub_label_pl'] = df['sub_label'].apply(lambda x: word_to_plural[x])

# label_col = 'subj_anchors_label'
# df['obj_label'] = df['obj_label'].apply(lambda x: [x] )

# ######################################### 
# for swow_type, path in swow_type_to_paths.items():
#     print(swow_type)
#     word_to_swow = json.load( open(path, 'r'))
#     word_to_swow = {k: list(v.keys()) for k,v in word_to_swow.items()}
#     df['subj_anchors_pred'] = df['sub_label_pl'].progress_apply(lambda x: word_to_swow.get(x, []))
#     df_new = df.loc[df['subj_anchors_pred'].str.len() >0]
#     print("#instances with non empty swow anchors", len(df_new))

#     df = df.loc[df['subj_anchors_label'].str.len() >0]
#     print("#instances with non empty wordnet anchors", len(df))

#     pred_cols = ['subj_anchors_pred']# ['subj_anchors_all_sg'] #['subj_anchors_swow']
#     #df = df.loc[df['subj_anchors_swow'].str.len() >0]
#     df_prec_anchor = get_precision_at_k_concept(df, relation, pred_cols, label_col, k_list=[1, 5, 10],pred_col_suffix=pred_col_suffix ) ##note that this would be super slow when top_k is large (>1000) 
#     df_mrr =  get_mrr(df, relation, pred_cols, label_col, pred_col_suffix)
#     df_prec_anchor['mrr'] = df_prec_anchor['mask_type'].apply(lambda x:  df_mrr.loc[df_mrr['mask_type']==x, f'mrr'].values[0])
#     df_prec_anchor_display = df_prec_anchor[["mask_type", 'mrr', "p@1", "p@5", "p@10"]] 
#     print(tabulate(df_prec_anchor_display, tablefmt='latex', headers=df_prec_anchor_display.columns).replace("\\", "").replace("&", "\t"))
    

4549
11292
reading cohyponyms: ../log/word_to_cohyponyms.txt
#instances shared in SWOW and BERT 2094
269


In [2]:
# df.loc[df.subj_sister.str.len()==0]

dataset_to_df['ALL']['sub_in_BERT'] = dataset_to_df['ALL']['sub_label'].apply(lambda x: 1 if x in bert_vocab else 0)
dataset_to_df['ALL']["sub_in_BERT"].value_counts()

1    3136
0    1413
Name: sub_in_BERT, dtype: int64

In [3]:
dataset_to_df['ALL'].query("sub_in_BERT==0")

Unnamed: 0,sub_label,obj_label,relation,obj_in_BERT,sub_in_BERT
3,musket,firearm,IsA,1,0
21,giraffe,creature,IsA,1,0
26,sieve,object,IsA,1,0
60,hatchet,object,IsA,1,0
61,cockroach,arthropod,IsA,0,0
...,...,...,...,...,...
397,gent,gentleman,IsA,1,0
459,humility,emotion,IsA,1,0
627,pant,trouser,IsA,0,0
777,skateboard,sport,IsA,1,0


# Stats between SWOW and six hypernym dataset

In [None]:
import json
import pandas as pd 

swow_path = '../../data/swow/swow.en.similar_words.json'
cue_to_similar_words = json.load(open(swow_path))


# from inflection import singularize, pluralize 

import pandas as pd
from sklearn.metrics import accuracy_score

import spacy
# import pyinflect
import lemminflect
nlp = spacy.load('en_core_web_sm')


def pluralize(word):
    pl = nlp(word)
    if pl[0].pos_ not in ["NOUN"] or pl[0].tag_ == 'NNS':
        return word
    else:
        return pl[0]._.inflect('NNS')

def singularize(word):
    '''
    sg is None if the word is not in the vocab 
    '''
    sg =  nlp(word) #[0]._.inflect('NN')

    if sg[0].pos_ not in ["NOUN"] or sg[0].tag_ == 'NN':
        return word
    else:
        return sg[0]._.inflect('NN')
    
    
word = 'cartoon'#'animation'
print(cue_to_similar_words[word])
for v in cue_to_similar_words[word]:
    #print(v)
    print (v[0], singularize(v[0]))
    print (v[0], pluralize(v[0]))
    
print( len(cue_to_similar_words.keys()) )
import os, sys 
from utils_path import dataset_to_respath

cues = set(cue_to_similar_words.keys())
df_stats = []
dfs = []
for dataset, path in dataset_to_respath.items():
    path = "../../"+path
    df = pd.read_csv(path)
    sub_label_sg = set(df['sub_label_sg'].to_list())
    shared = sub_label_sg.intersection(cues)
    shared_rate = round( len(shared)/ len(sub_label_sg), 3)
    stats = {"dataset": dataset, '#sub_label': len(sub_label_sg), "#SWOW_shared_sub_label": len(shared), "#shared_rate": shared_rate}
    #print(stats)
    df['dataset'] = dataset
    shared_list = list(shared)
    dfs.append(df.query(f'sub_label_sg in {shared_list}').reset_index(drop=True))
    df_stats.append(stats)
df_stats = pd.DataFrame(df_stats)

display(df_stats.sort_values(by=['#shared_rate'], ascending=False))
dfs = pd.concat(dfs)

# Examples

In [13]:
import os, sys
import json 
import pandas as pd
from inflection import singularize, pluralize 

def get_strength_dict(path, source_path):
    if os.path.exists(path):
        swow_score_dict_pl = json.load(open(path, 'r'))
    else: 
        swow_score_dict = json.load( open (source_path))

        vocab_cues = set(swow_score_dict.keys())
        vocab_res = set()
        for k,v in swow_score_dict.items():
            vocab_res.update(v.keys() )
            
        vocab = vocab_cues.union(vocab_res)
        vocab = set({str(k) for k in vocab})
        vocab_to_plural = {k: pluralize(k) for k in vocab }

        swow_score_dict_pl = defaultdict()
        for k,v in swow_score_dict.items():
            v_pl = {vocab_to_plural.get(k1):v1 for k1, v1 in v.items() }
            swow_score_dict_pl[vocab_to_plural.get(k)] = v_pl
        output_path = input_path.replace('.json', '.pl.json')
        json.dump(swow_score_dict_pl, open(output_path, 'w'))
    return swow_score_dict_pl 

def query_strength_score(cue, response, score_dict):
    if cue in score_dict and response in score_dict[cue]:
        return score_dict[cue][response]
    return 0



def get_sim_matrix(path_rw='../../data/swow/S_RW.R123.csv'):
    df = pd.read_csv(path_rw)
    sim_matrix = df.to_numpy()
    vocab = df.columns[1:]
    vocab_pl = [pluralize(word) for word in vocab] #this is normalized to fit into the anchor mining module
    word2id = {word:i for i, word in enumerate(vocab)}
    return word2id, sim_matrix


def query_sim(word1, word2, swow_score_tuple ):
    '''
    query the similary between two words in a similarity matrix 
    '''
    word2id, sim_matrix = swow_score_tuple
    sim_score =0 
    if word1 in word2id and word2 in word2id:
        id1 = word2id.get(word1)
        id2 = word2id.get(word2)

        sim_score = sim_matrix[id1][id2+1] #the first col is the word 
    return sim_score 



In [14]:

# swow_score_source = 'AddSWOWStrength'
swow_score_source = 'AddSWOWSimilar'
if swow_score_source =='AddSWOWStrength':
    swow_score_tuple = get_strength_dict(path='../../data/swow/swow.en.fbs.json', source_path = 'data/swow/swow.en.strength.R123.json')
    query_swow_score = query_strength_score
elif swow_score_source  =='AddSWOWSimilar': #similar words
    swow_score_tuple = get_sim_matrix(path_rw='../../data/swow/S_RW.R123.csv')   #word2id, sim_matrix 
    query_swow_score = query_sim
    
cue = 'lime'
responses = swow_score_tuple[cue]
print(responses)
responses = ['lemon', 'zest', 'citrus', 'margarita']
for response in responses:
    score = query_swow_score(cue =cue,  response= response, score_dict=swow_score_tuple)
    print(cue, response, score)

KeyboardInterrupt: 

In [16]:
data = json.load( open('../../data/swow/swow.en.similar_words.sg.json', 'r'))
data['lime']

{'lemon': 0.583459797300129,
 'citrus': 0.583859542567453,
 'lemony': 0.487566375532904,
 'tequila': 0.474077408141611,
 'juice': 0.470873289673751,
 'guacamole': 0.468157988084381,
 'lemonade': 0.467682347096958,
 'tangy': 0.451454685461879,
 'sour': 0.434657464571513}