In [59]:
import sys
import re
from tqdm import tqdm
import json
import spacy
import networkx as nx
from collections import Counter
import pandas as pd 
pd.set_option('display.max_rows', 500)
def is_negated(node, pattern="seeds"):
    """Tell whether a concept label counts as a negation cue.

    Args:
        node: Concept label; multi-word labels use "_" between words.
        pattern: With "seeds" the whole label must equal one of the cue
            words. With any other value the label is accepted as soon as
            one of its "_"-separated parts is a cue word.

    Returns:
        True when the label matches under the selected mode, else False.
    """
    # Cue lexicon. Ideas noted earlier but not adopted: 'no one', 'lacks',
    # 'except', 'prevent', 'neglected', 'refused', 'absence', "n't", and
    # affix cues (in-/un-/im-/dis-/ir-, -less).
    cues = (
        'no', 'not', 'none', 'nor', 'no_one', 'nobody', 'nothing',
        'neither', 'nowhere', 'never', 'hardly', 'barely', 'scarcely',
        'non', 'without', 'fail', 'cannot', 'cant', 'nolonger', 'dont',
        'wont',
    )

    if pattern != "seeds":
        # Part-wise mode: one matching word is enough.
        return any(part in cues for part in node.split("_"))

    # Seed mode: exact match on the full label (so 'no_one' matches, 'not_here' does not).
    return node in cues

    # else:
        # pattern = re.compile(r'(?:{})_(?:{}) ' % '|'.join(neg_list))
        # node = node.replace("_", " ")
        # pattern = re.compile(r'*({} )*' .format('|'.join(neg_list)))
        # pattern = re.compile(r'( .?{0}(_|$))'.format('|'.join(neg_list)))
        # return  bool(re.match(pattern, node)) 

# Load the 'en_core_web_sm' spaCy pipeline once at module level; is_negated_spacy() calls it for every label.
nlp = spacy.load('en_core_web_sm')

def is_negated_spacy(node, pattern="seeds"):
    """Detect negation in a concept label via the dependency parse.

    Args:
        node: Concept label; "_" separators are replaced by spaces before
            the text is handed to the module-level `nlp` pipeline.
        pattern: Unused here; kept so the signature mirrors is_negated().

    Returns:
        True when at least one parsed token carries the 'neg' dependency
        label, otherwise False (including for an empty parse).
    """
    doc = nlp(node.replace("_", " "))
    # A single pass is enough: the previous version rebuilt the same list of
    # 'neg' tokens once per token in the doc and never used their heads.
    return any(tok.dep_ == 'neg' for tok in doc)


def load_negated_nodes(path, outpath, debug, pattern="seeds"):
    """Scan a one-label-per-line file and collect the labels is_negated() accepts.

    Args:
        path: Input text file; each line is stripped and tested as one label.
        outpath: Output file; every accepted label is written on its own line.
        debug: Debug switch. Both the string 'True' (as passed from the
            command line) and the boolean True enable it; any other value
            disables it. When enabled, scanning stops early, shortly after
            the first 10000 lines.
        pattern: Matching mode forwarded unchanged to is_negated().

    Returns:
        Set of accepted labels.
    """
    # Accept a real boolean as well as the CLI-style string so that
    # `debug=True` is not silently ignored.
    debug_on = debug is True or debug == 'True'
    debug_line_limit = 10000

    negated_words = set()
    with open(path, 'r') as fo, open(outpath, 'w') as fout:
        # readlines() gives tqdm a length, so the progress bar shows a total.
        for i, line in enumerate(tqdm(fo.readlines())):
            line = line.strip()
            if is_negated(line, pattern):
                negated_words.add(line)
                fout.write(line+"\n")
            if debug_on and i > debug_line_limit:
                break

    print("write {}".format(outpath))
    return negated_words


# def get_cpnet_simple(nx_graph):
#     cpnet_simple = nx.Graph()
#     for u, v, data in nx_graph.edges(data=True):
#         w = data['weight'] if 'weight' in data else 1.0
#         if cpnet_simple.has_edge(u, v):
#             cpnet_simple[u][v]['weight'] += w
#         else:
#             cpnet_simple.add_edge(u, v, weight=w)
#     return cpnet_simple

def load_negated_pairs(negation_nodes, path, outpath, kg_name):
    """Keep the triples whose subject or object is a negated node.

    Args:
        negation_nodes: Collection of node labels treated as negated.
        path: Input file with one tab-separated `rel, subj, obj, weight`
            record per line.
        outpath: Output file; every kept record is written back verbatim
            (stripped, followed by a newline).
        kg_name: When exactly 'swow', records with relation
            'bidirectionalassociated' or with weight <= 1 are skipped
            before the node test.

    Returns:
        List of the kept (stripped) lines, in file order.
    """
    is_swow = kg_name == 'swow'
    kept = []
    with open(path, "r") as fo, open(outpath, 'w') as fout:
        for raw in tqdm(fo.readlines()):
            triple = raw.strip()
            rel, subj, obj, weight = triple.split("\t")
            # The weight is only parsed for SWOW rows that passed the relation check.
            if is_swow and (rel == 'bidirectionalassociated' or float(weight) <= 1):
                continue
            if not (subj in negation_nodes or obj in negation_nodes):
                continue
            kept.append(triple)
            fout.write(triple+"\n")
    print("write {}".format(outpath))
    return kept




In [14]:
def load_graph(path, graph_name):
    """Read tab-separated triples into a directed multigraph.

    Args:
        path: File with one `rel, subj, obj, weight` record per line.
        graph_name: Only the exact value 'swow' has an effect: records with
            weight <= 1.0 are then skipped. Any other name loads every record.

    Returns:
        nx.MultiDiGraph with one edge per distinct (subj, obj, rel); the edge
        key is the relation and the attributes are `rel` and `weight`
        (the weight is stored as the unparsed string from the file).
    """
    drop_weak = graph_name == 'swow'
    kg = nx.MultiDiGraph()
    with open(path, "r") as fo:
        for raw in tqdm(fo.readlines()):
            rel, subj, obj, weight = raw.strip().split("\t")
            if drop_weak and float(weight) <= 1.0:
                continue
            # Keyed lookup: a repeated (subj, obj, rel) keeps the attributes
            # of its first occurrence instead of being overwritten.
            if not kg.has_edge(subj, obj, key=rel):
                kg.add_edge(subj, obj, key=rel, rel=rel, weight=weight)
    return kg



In [15]:
def detect_neg(data_dirs, kg_names, debug=None, pattern=None):
    """Run negated-node and negated-triple extraction for each knowledge graph.

    For every (data_dir, kg_name) pair this reads `concept.txt` and
    `conceptnet.en.csv` from the directory and writes `negated_nodes.json`
    and `negated_triples.json` next to them, then prints a one-line summary.

    Args:
        data_dirs: Directory prefixes; file names are appended by plain string
            concatenation, so each prefix must end with a path separator.
        kg_names: Graph names paired positionally with `data_dirs`.
        debug: Debug switch forwarded to load_negated_nodes(). When omitted,
            the module-level `debug` is used if it exists, else 'False'.
        pattern: Matching mode forwarded to load_negated_nodes(). When
            omitted, the module-level `pattern` is used if it exists,
            else 'seeds'.
    """
    # Earlier versions read `debug` and `pattern` as free globals assigned in a
    # later cell. Explicit arguments are preferred; the globals remain a
    # fallback so existing two-argument calls behave as before.
    if debug is None:
        debug = globals().get('debug', 'False')
    if pattern is None:
        pattern = globals().get('pattern', 'seeds')

    for data_dir, kg_name in zip(data_dirs, kg_names):
        path_node = data_dir + 'concept.txt'
        path_triple = data_dir + 'conceptnet.en.csv'
        out_node = data_dir + 'negated_nodes.json'
        out_triple = data_dir + 'negated_triples.json'
        negated_nodes = load_negated_nodes(path_node, out_node, debug, pattern)
        negated_pairs = load_negated_pairs(negated_nodes, path_triple, out_triple, kg_name)

        print("{}: neg  {} nodes appearning in {} triples".format(kg_name, len(negated_nodes), len(negated_pairs)))

In [16]:
def get_negpairs_rel(graph, swow_neg_path, outpath):
    """Relabel triples from a file with the relations found in `graph`.

    For each `rel, subj, obj, weight` record in `swow_neg_path`: when `graph`
    has an edge subj -> obj, one output row is written per parallel edge using
    that edge's `rel` and `weight`; otherwise the input record is written
    unchanged. Afterwards the per-relation counts of the matched edges and
    their total are printed.

    Args:
        graph: Multigraph whose edge data carry `rel` and `weight`.
        swow_neg_path: Input file of tab-separated triples.
        outpath: Output file of tab-separated triples.
    """
    row = "{}\t{}\t{}\t{}\n"
    rel_seen = Counter()
    with open(swow_neg_path) as fo, open(outpath, 'w') as fout:
        for raw in tqdm(fo.readlines()):
            rel, subj, obj, weight = raw.strip().split("\t")
            if not graph.has_edge(subj, obj):
                # No counterpart in the graph: pass the record through as is.
                fout.write(row.format(rel, subj, obj, weight))
                continue
            for attrs in graph[subj][obj].values():
                rel_cn = attrs['rel']
                weight_cn = attrs['weight']
                rel_seen[rel_cn] += 1
                fout.write(row.format(rel_cn, subj, obj, weight_cn))
    print()
    for rel_name, count in rel_seen.most_common():
        print("{} \t {}".format(rel_name, count))
    print("total {} neg sw triples recalled from CN".format(sum(rel_seen.values())))


In [17]:
def load_negated_nodes_2(path):
    """Return the set of whitespace-stripped lines of the file at `path`."""
    with open(path, 'r') as fin:
        return {entry.strip() for entry in fin}

# Negation detection

In [18]:
# Run configuration. Positional command-line arguments are used when present;
# otherwise fall back to defaults instead of failing with IndexError.
# NOTE(review): inside a Jupyter kernel sys.argv usually holds the kernel
# launcher's own arguments (e.g. '-f' and a connection-file path), so these
# two values may not be what was intended there — confirm and consider
# setting them explicitly.
debug = sys.argv[1] if len(sys.argv) > 1 else 'False'
pattern = sys.argv[2] if len(sys.argv) > 2 else 'seeds'
data_dirs=[ 'swow/','cpnet47rel/']
kg_names = [ 'swow', 'cpnet']
# Writes negated_nodes.json / negated_triples.json into each data directory.
detect_neg(data_dirs, kg_names)
# Reload the node lists just written and report: overlap, SWOW count, CN count.
sw_nodes = load_negated_nodes_2(data_dirs[0]+"negated_nodes.json")
cn_nodes = load_negated_nodes_2(data_dirs[1]+"negated_nodes.json")
print(len(sw_nodes.intersection(cn_nodes)), len(sw_nodes), len(cn_nodes))
# cn=rel_distribution(data_dirs[1]+"negated_triples.json")
# all = 0
# for x in cn:
#     print("{} \t {}".format(x[0], x[1]))
#     all +=x[1]
# print("total {} neg cn triples".format(all))

  0%|          | 0/124627 [00:00<?, ?it/s]

100%|██████████| 124627/124627 [00:00<00:00, 718990.74it/s]
  0%|          | 0/1593564 [00:00<?, ?it/s]

write swow/negated_nodes.json


100%|██████████| 1593564/1593564 [00:01<00:00, 1307750.24it/s]
  0%|          | 0/1080870 [00:00<?, ?it/s]

write swow/negated_triples.json
swow: neg  2833 nodes appearning in 2307 triples


100%|██████████| 1080870/1080870 [00:01<00:00, 903713.78it/s]


write cpnet47rel/negated_nodes.json


100%|██████████| 3054300/3054300 [00:02<00:00, 1418057.27it/s]

write cpnet47rel/negated_triples.json
cpnet: neg  4116 nodes appearning in 10962 triples
318 2833 4116





In [19]:

# Build both full graphs. Only the exact name 'swow' makes load_graph drop
# edges with weight <= 1.0, so the ConceptNet graph keeps every record.
graph_cn =  load_graph(data_dirs[1]+  'conceptnet.en.csv', 'conceptnet')
# The SWOW directory uses the same file name ('conceptnet.en.csv') as the ConceptNet one.
graph_sw =  load_graph(data_dirs[0]+  'conceptnet.en.csv', 'swow')

# graph_cn_sw =  load_graph('overlap_cn.en.csv')
# For every SWOW negated triple, write the ConceptNet relation(s) on the same
# ordered (subj, obj) pair when the CN graph has such an edge; otherwise the
# SWOW record is copied through unchanged.
get_negpairs_rel(graph_cn, data_dirs[0]+"negated_triples.json", data_dirs[0] + "negated_triples_cn_rel.json")

100%|██████████| 3054300/3054300 [00:20<00:00, 146915.82it/s]
100%|██████████| 1593564/1593564 [00:05<00:00, 317052.51it/s]
100%|██████████| 2307/2307 [00:00<00:00, 367178.66it/s]


relatedto 	 191
antonym 	 58
distinctfrom 	 36
synonym 	 33
similarto 	 12
etymologicallyrelatedto 	 9
derivedfrom 	 8
mannerof 	 3
isa 	 2
formof 	 2
hascontext 	 1
hasa 	 1
total 356 neg sw triples recalled from CN





In [20]:
# Graphs built from the negated-triple files only. Neither name equals 'swow',
# so load_graph applies no weight filter here.
graph_cn_neg = load_graph(data_dirs[1] + "negated_triples.json", 'ConceptNet-Neg')
graph_sw_neg = load_graph(data_dirs[0] + "negated_triples.json", 'SWOW-Neg')

# Densities, printed in the order: SWOW, ConceptNet, SWOW-Neg, ConceptNet-Neg.
for _graph in (graph_sw, graph_cn, graph_sw_neg, graph_cn_neg):
    print(nx.density(_graph))

100%|██████████| 10962/10962 [00:00<00:00, 219201.54it/s]
100%|██████████| 2307/2307 [00:00<00:00, 252465.87it/s]




0.0004555362269282503
2.5454008122660245e-06
0.0008072671539896214
9.336158987165134e-05


In [21]:

# Compare the out-neighbourhood of one seed word in SWOW and in ConceptNet.
# Requires graph_sw and graph_cn from an earlier cell. All names assigned here
# (including the loop variables) become notebook globals.
sw_neihgbors=set()
# Gs collects the seed's SWOW edges; it is not read again within this cell.
Gs = nx.MultiDiGraph()
seed='stunt'
# Pass 1: every SWOW successor of the seed, printing one line per parallel edge
# (k1 is the edge key, i.e. the relation name assigned by load_graph).
for k,v in graph_sw[seed].items():
    # print(k,v['forwardassociated']['rel'])
    
    sw_neihgbors.add(k)
    for k1, v1 in v.items():
        Gs.add_edge(seed,k,rel=k1,)
        print(seed, k,k1,)
# Pass 2: ConceptNet successors of the seed. Edges whose target is also a SWOW
# neighbour are recorded in sw_cn; edges with one of the listed relations are
# printed regardless of whether SWOW shares the target.
sw_cn=[]
for k,v in graph_cn[seed].items():
    for k1,v1 in v.items():
        if k in sw_neihgbors:
            sw_cn.append((seed,k,k1))
        if k1 in ['derivedfrom', 'antonym', 'hascontext', 'relatedto','distinctfrom', 'formof']:
            print(k,k1)
                     
print("#######")
# Neighbours present in both graphs (one printed tuple per shared CN edge).
shared_neighbor=set()
if len(sw_cn)>0:
    for x in sw_cn:
        print(x)
        shared_neighbor.add(x[1])

print("#######SW only######")
# Set difference: SWOW neighbours with no ConceptNet edge from the seed.
for x in sw_neihgbors-shared_neighbor:
    print(x)
       
# graph = load_graph(data_dirs[1]+  'conceptnet.en.csv')
# # %%
# def rel_distribution(path):
#     rel_count = Counter()
#     with open(path, "r") as fo:
#         for line in tqdm(fo.readlines()):
#             rel, subj, obj, weight = line.strip().split("\t")
#             rel_count[rel] +=1
#     return rel_count.most_common()

stunt growth forwardassociated
stunt man forwardassociated
stunt driver forwardassociated
stunt explosion forwardassociated
stunt trick forwardassociated
stunt trick bidirectionalassociated
stunt motorbike forwardassociated
stunt act forwardassociated
stunt crazy forwardassociated
stunt cunning forwardassociated
stunt dangerous forwardassociated
stunt dangerous bidirectionalassociated
stunt movies forwardassociated
stunt prank forwardassociated
stunt prank bidirectionalassociated
stunt double forwardassociated
stunt motorcycle forwardassociated
stunt fire forwardassociated
stunt actor forwardassociated
stunt movie forwardassociated
stunt bike forwardassociated
stunt risk forwardassociated
stunt publicity forwardassociated
stunt publicity bidirectionalassociated
stunt circus forwardassociated
stunt antics forwardassociated
stunt silly forwardassociated
stunt car forwardassociated
stunt danger forwardassociated
stunt action forwardassociated
stunt risky forwardassociated
stunt jump forwa

In [8]:
# Export every edge of graph_cn_sw whose `rel` attribute is 'mannerof' as a
# tab-separated `rel, subj, obj, weight` line.
# NOTE(review): graph_cn_sw is not assigned anywhere in the visible notebook —
# the only candidate assignment is commented out in an earlier cell — so this
# cell presumably fails with NameError on a fresh kernel. Confirm which graph
# is intended and restore its definition.
out_file="swow/overlap_cn_high_recall_mannerof.en.txt"
with open(out_file, "w") as fout:
    for k,v, data in graph_cn_sw.edges(data=True):
        # if data["rel"] in ['antonym','distinctfrom']:
        if data["rel"] in ['mannerof']:
            # print(k,k1,data["rel"])
            fout.write("{}\t{}\t{}\t{}\n".format(data["rel"],k,v,  data["weight"]))
print("save {}".format(out_file))

save swow/overlap_cn_high_recall_mannerof.en.txt


# CN nodes

In [3]:
import pandas as pd
path = './cpnet47rel/conceptnet.en.csv'
# keep_default_na=False: by default read_csv turns strings such as "null",
# "nan" or "n/a" into NaN. Here those are legitimate node labels, and the
# default would collapse them into a single NaN entry in the node set.
df_cn = pd.read_csv(path, sep='\t', names=['relation', 'head', 'tail', 'weight'],
                    keep_default_na=False)

# Every distinct label that occurs as head or tail. No weight/count filter is
# applied in this cell, so the printed label no longer claims "(count>=2)".
nodes_df_cn = set(df_cn['head']).union(set(df_cn['tail']))
print("nodes_df_cn", len(nodes_df_cn))


nodes_df_cn  (count>=2) 1080869


# SWOW nodes (count>=2)

In [1]:
import pandas as pd
path = './swow/conceptnet.en.csv'
# keep_default_na=False: by default read_csv turns strings such as "null",
# "nan" or "n/a" into NaN; here they are legitimate node labels and must stay
# distinct strings. The numeric `weight` column is still inferred as numeric.
df_sw = pd.read_csv(path, sep='\t', names=['relation', 'head', 'tail', 'weight'],
                    keep_default_na=False)
all_triples = len(df_sw.index)
# Keep only triples whose weight is at least 2.
df_sw2 = df_sw.query("weight>=2")
triples_2 = len(df_sw2.index)
print(f"SWOW all triples: {all_triples}")
print(f"SWOW triples (count>=2): {triples_2 }")

# Distinct labels occurring as head or tail of the filtered triples.
nodes_df_sw2 = set(df_sw2['head']).union(set(df_sw2['tail']))
print("Nodes (count>=2)", len(nodes_df_sw2))


SWOW all triples: 1593564
SWOW triples (count>=2): 538985
Nodes (count>=2) 34397


## Load SWOW negated triples

Examine which negated triples appear in SWOW alone and which are shared with ConceptNet.

In [55]:
import pandas as pd
path = './swow/negated_triples.json'
# Both inputs are tab-separated triples despite the .json extension.
# keep_default_na=False: by default read_csv turns strings such as "null",
# "nan" or "n/a" into NaN. Here they are legitimate node labels, and groupby
# below would silently drop every row whose head or tail became NaN.
df_sw_neg = pd.read_csv(path, sep='\t', names=['relation', 'head', 'tail', 'weight'],
                        keep_default_na=False)

df_sw_neg_cnrel = pd.read_csv('./swow/negated_triples_cn_rel.json', sep='\t',
                              names=['relation', 'head', 'tail', 'weight'],
                              keep_default_na=False)

# Partition by (head, tail): a pair whose only row is a 'forwardassociated'
# record is treated as SWOW-only; every other pair is treated as shared.
sw_only_groups = []
shared_groups = []
for key, value in df_sw_neg_cnrel.groupby(['head', 'tail']):
    if len(value.index)==1 and value.relation.values[0]=='forwardassociated':
        sw_only_groups.append(value)
    else:
        shared_groups.append(value)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when a partition has no groups.
df_sw_neg_sw_only = (pd.concat(sw_only_groups, axis=0) if sw_only_groups
                     else df_sw_neg_cnrel.iloc[0:0].copy())
df_sw_neg_cnrel_shared = (pd.concat(shared_groups, axis=0) if shared_groups
                          else df_sw_neg_cnrel.iloc[0:0].copy())
display(df_sw_neg_sw_only.head())
display(df_sw_neg_cnrel_shared.head())


print(len(df_sw_neg_sw_only.index))
print(len(df_sw_neg_cnrel_shared.index))

Unnamed: 0,relation,head,tail,weight
2260,forwardassociated,abdicate,fail,2.0
2311,forwardassociated,abhor,no,2.0
867,forwardassociated,abnormal,not_normal,3.0
1116,forwardassociated,absence,none,2.0
1115,forwardassociated,absence,not_here,2.0


Unnamed: 0,relation,head,tail,weight
638,relatedto,absent,without,1.0
182,antonym,again,never,0.188
50,antonym,all,none,0.953
51,distinctfrom,all,none,0.668
52,relatedto,all,none,1.104


2057
352


In [65]:
# Show the 100 highest-weight rows of the shared partition (descending by `weight`).
df_sw_neg_cnrel_shared.sort_values(by=['weight'], ascending=False).head(100) #[100:200]
# df_sw_neg_sw_only.sort_values(by=['weight'], ascending=False)[100:200]
# head(100)

Unnamed: 0,relation,head,tail,weight
977,relatedto,neither,nor,4.213
1077,relatedto,nor,neither,4.203
13,relatedto,none,nothing,3.344
323,relatedto,not,negative,3.141
95,antonym,always,never,3.049
1647,synonym,no_one,nobody,2.828
8,relatedto,none,zero,2.431
213,antonym,no,yes,2.33
334,relatedto,not,negation,2.173
96,distinctfrom,always,never,2.036
