In [171]:
from ete3 import Tree
import re
import json
import codecs
from collections import Counter

In [131]:
def to_newick_format(parse_tree):
    parse_tree = parse_tree.replace(",", "*COMMA*")
    parse_tree = parse_tree.replace(":", "*COLON*")
    tree_list = load_syntax_tree(parse_tree)
    if tree_list == None:
        return None
    tree_list = tree_list[1] #去 root
    s = syntax_tree_to_newick(tree_list)
    s = s.replace(",)",")")
    if s[-1] == ",":
        s = s[:-1] + ";"
    return s

def load_syntax_tree(raw_text):
    stack = ["ROOT"]
    text = re.sub(r"\(", " ( ", raw_text)
    text = re.sub(r"\)", " ) ", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"^\(\s*\(\s*", "", text)
    text = re.sub(r"\s*\)\s*\)$", "", text)
    for c in text.strip(" ").split(" "):
        if c == ")":
            node = []
            while(1):
                popped = stack.pop()
                if popped == "(":
                    break
                node.append(popped)
            node.reverse()
            if len(node) > 1:
                stack.append(node)
            else:
                if node == []:
                    return None
                stack.append(node[0])
        else:
            stack.append(c)
    return stack

def syntax_tree_to_newick(syntax_tree):
    s = "("
    for child in syntax_tree[1:]:
        if not isinstance(child,list):
            s += child
        else:
            s += syntax_tree_to_newick(child)
    s += ")" + str(syntax_tree[0]) + ","
    return s


def get_all_tree(parse_tree):
    parse_tree_text = to_newick_format(parse_tree)
    tree = Tree(parse_tree_text, format=1)
    treelist = []
    tree_dict = {o:str(i) for i,o in enumerate(tree.get_leaves())}
    return [[int(i) for i in o.split()] for o in set(_get_all_tree(tree, treelist, tree_dict))]

def _get_all_tree(tree, treelist, tree_dict):
    punct = ['.', ',']
    treelist.append(' '.join([tree_dict[o] for o in tree.get_leaves() if str(o).split('-')[-1] not in punct]))
    if tree.get_children() == []:
        return treelist
    else:
        for child in tree.get_children():
            treelist = _get_all_tree(child, treelist, tree_dict)
        return treelist
    

def merge3dicts(x, y, z):
    m = x
    m.update(y)
    m.update(z)
    return m

def get_related_doc(parse_data, docid):
    ret = []
    for i, r in enumerate(parse_data):
        if r['DocID'] == docid:
            ret.append(r)
    return ret

In [163]:
# parse_tree = "( (S (S (NP (DT Some)) (VP (MD may) (VP (VB have) (VP (VBN forgotten))))) (: --) (CC and) (S (NP (DT some) (JJR younger) (NNS ones)) (VP (MD may) (ADVP (RB never)) (VP (VB have) (ADJP (JJ experienced)) (: --) (SBAR (WHNP (WP what)) (S (NP (PRP it)) (VP (VBZ 's) (VP (VB like) (S (VP (TO to) (VP (VB invest) (PP (IN during) (NP (DT a) (NN recession))))))))))))) (. .)) )"
# all_tree = get_all_tree(parse_tree)

In [104]:
conll_train = '/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-train/pdtb-parses.json'
parse_dict_train = json.loads(codecs.open(conll_train, encoding='utf-8', errors='ignore').read())
conll_dev = '/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-dev/pdtb-parses.json'
parse_dict_dev = json.loads(codecs.open(conll_dev, encoding='utf-8', errors='ignore').read())
conll_test = '/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-test/pdtb-parses.json'
parse_dict_test = json.loads(codecs.open(conll_test, encoding='utf-8', errors='ignore').read())
parse_dict = merge3dicts(parse_dict_train, parse_dict_dev, parse_dict_test)

parse_data_path = "/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-train/relations.json"
parse_data_dev_path = '/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-dev/relations.json'
parse_data_test_path = '/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-test/relations.json'
parse_data = [json.loads(line) for line in codecs.open(parse_data_path).readlines()]
parse_data_dev = [json.loads(line) for line in codecs.open(parse_data_dev_path).readlines()]
parse_data_test = [json.loads(line) for line in codecs.open(parse_data_test_path).readlines()]
parse_data.extend(parse_data_dev)
parse_data.extend(parse_data_test)

datasets loaded


In [164]:
true = 0
false = 0
k=0
for DocID in list(parse_dict.keys()):
    k=0
    related_doc = get_related_doc(parse_data, DocID)
    doc_trees = {}
    for i, sentence in enumerate(parse_dict[DocID]['sentences']):
        try:
            doc_trees[i] = get_all_tree(sentence['parsetree'])
        except Exception:
            k = 1
    if k == 1:
        continue
    for relation in related_doc:
        arg1_sentence_id = [o[3] for o in relation['Arg1']['TokenList']]
        if len(set(arg1_sentence_id)) == 1:
            arg1_token = [o[4] for o in relation['Arg1']['TokenList']]

        if arg1_token in doc_trees[arg1_sentence_id[0]]:
#             print(True)
            true += 1
        else:
            false += 1
#             print(' '.join([parse_dict[DocID]['sentences'][arg1_sentence_id[0]]['words'][o][0] for o in arg1_token]))
#             print()
#             for tree_index in doc_trees[arg1_sentence_id[0]]:
#                 print(' '.join([parse_dict[DocID]['sentences'][arg1_sentence_id[0]]['words'][o][0] for o in tree_index]))
#             print(False)
#             print('========================================')
            

In [165]:
parse_dict[DocID]['sentences'][0]['parsetree']

"( (S (NP (DT The) (NN following)) (VP (VBD were) (PP (IN among) (NP (NP (NP (NNP Friday) (POS 's)) (NNS offerings) (CC and) (NNS pricings)) (PP (IN in) (NP (DT the) (UCP (NNP U.S.) (CC and) (JJ non-U.S.)) (NN capital) (NNS markets))))) (, ,) (PP (IN with) (NP (NP (NNS terms)) (CC and) (NP (NN syndicate) (NN manager)))) (, ,) (SBAR (IN as) (S (VP (VBN compiled) (PP (IN by) (NP (NNP Dow) (NNP Jones) (NNP Capital) (NNP Markets) (NNP Report))))))) (: :)) )\n"

In [166]:
true / (true + false)

0.6426481620405101

In [158]:
conll_train = '/home/pengfei/data/PDTB-3.0/all/conll/train/pdtb-parses.json'
parse_dict_train = json.loads(codecs.open(conll_train, encoding='utf-8', errors='ignore').read())
conll_dev = '/home/pengfei/data/PDTB-3.0/all/conll/dev/pdtb-parses.json'
parse_dict_dev = json.loads(codecs.open(conll_dev, encoding='utf-8', errors='ignore').read())
conll_test = '/home/pengfei/data/PDTB-3.0/all/conll/test/pdtb-parses.json'
parse_dict_test = json.loads(codecs.open(conll_test, encoding='utf-8', errors='ignore').read())
print("datasets loaded")
parse_dict = merge3dicts(parse_dict_train, parse_dict_dev, parse_dict_test)

parse_data_path = "/home/pengfei/data/PDTB-3.0/all/conll/train/relations.json"
parse_data_dev_path = '/home/pengfei/data/PDTB-3.0/all/conll/dev/relations.json'
parse_data_test_path = '/home/pengfei/data/PDTB-3.0/all/conll/test/relations.json'
parse_data = [json.loads(line) for line in codecs.open(parse_data_path).readlines()]
parse_data_dev = [json.loads(line) for line in codecs.open(parse_data_dev_path).readlines()]
parse_data_test = [json.loads(line) for line in codecs.open(parse_data_test_path).readlines()]
parse_data.extend(parse_data_dev)
parse_data.extend(parse_data_test)

datasets loaded


In [161]:
true = 0
false = 0
k=0
for DocID in list(parse_dict.keys()):
    k=0
    related_doc = get_related_doc(parse_data, DocID)
    doc_trees = {}
    for i, sentence in enumerate(parse_dict[DocID]['sentences']):
        try:
            doc_trees[i] = get_all_tree(sentence['parsetree'])
        except Exception:
            k = 1
    if k == 1:
        continue
    for relation in related_doc:
        arg1_sentence_id = [o[3] for o in relation['Arg1']['TokenList']]
        if len(set(arg1_sentence_id)) == 1:
            arg1_token = [o[4] for o in relation['Arg1']['TokenList']]

        if arg1_token in doc_trees[arg1_sentence_id[0]]:
#             print(True)
            true += 1
        else:
            false += 1
#             print(' '.join([parse_dict[DocID]['sentences'][arg1_sentence_id[0]]['words'][o][0] for o in arg1_token]))
#             print()
#             for tree_index in doc_trees[arg1_sentence_id[0]]:
#                 print(' '.join([parse_dict[DocID]['sentences'][arg1_sentence_id[0]]['words'][o][0] for o in tree_index]))
#             print(False)
#             print('========================================')
            

In [162]:
true / (true + false)

0.6426481620405101

In [167]:
conll_train = '/home/pengfei/data/PDTB-3.0/all/conll/train/pdtb-parses.json'
parse_dict_train = json.loads(codecs.open(conll_train, encoding='utf-8', errors='ignore').read())
conll_dev = '/home/pengfei/data/PDTB-3.0/all/conll/dev/pdtb-parses.json'
parse_dict_dev = json.loads(codecs.open(conll_dev, encoding='utf-8', errors='ignore').read())
conll_test = '/home/pengfei/data/PDTB-3.0/all/conll/test/pdtb-parses.json'
parse_dict_test = json.loads(codecs.open(conll_test, encoding='utf-8', errors='ignore').read())
print("datasets loaded")
parse_dict = merge3dicts(parse_dict_train, parse_dict_dev, parse_dict_test)

parse_data_path = "/home/pengfei/data/PDTB-3.0/all/conll/train/relations.json"
parse_data_dev_path = '/home/pengfei/data/PDTB-3.0/all/conll/dev/relations.json'
parse_data_test_path = '/home/pengfei/data/PDTB-3.0/all/conll/test/relations.json'
parse_data = [json.loads(line) for line in codecs.open(parse_data_path).readlines()]
parse_data_dev = [json.loads(line) for line in codecs.open(parse_data_dev_path).readlines()]
parse_data_test = [json.loads(line) for line in codecs.open(parse_data_test_path).readlines()]
parse_data.extend(parse_data_dev)
parse_data.extend(parse_data_test)

datasets loaded


# senses and their distribution

In [170]:
senses = []
for r in parse_data:
    for s in r['Sense']:
        senses.append(s)

In [173]:
Counter(senses).most_common()

[('Expansion.Conjunction', 11931),
 ('Comparison.Concession.Arg2-as-denier', 4942),
 ('EntRel', 4858),
 ('Contingency.Cause.Reason', 4503),
 ('Contingency.Cause.Result', 3573),
 ('Expansion.Level-of-detail.Arg2-as-detail', 3018),
 ('Temporal.Synchronous', 2290),
 ('Comparison.Contrast', 2097),
 ('Temporal.Asynchronous.Precedence', 1925),
 ('Expansion.Instantiation.Arg2-as-instance', 1707),
 ('Contingency.Condition.Arg2-as-cond', 1528),
 ('Contingency.Purpose.Arg2-as-goal', 1493),
 ('Temporal.Asynchronous.Succession', 1238),
 ('Comparison.Concession.Arg1-as-denier', 780),
 ('Expansion.Manner.Arg2-as-manner', 643),
 ('Expansion.Manner.Arg1-as-manner', 536),
 ('Expansion.Substitution.Arg2-as-subst', 532),
 ('Expansion.Equivalence', 343),
 ('Expansion.Disjunction', 282),
 ('Expansion.Level-of-detail.Arg1-as-detail', 279),
 ('NoRel', 243),
 ('Comparison.Similarity', 171),
 ('Contingency.Cause+Belief.Reason+Belief', 163),
 ('Hypophora', 122),
 ('Contingency.Purpose.Arg1-as-goal', 118),
 ('Ex