In [1]:
import os
from pathlib import Path
import pandas as pd
import json
import codecs
import numpy as np
import re
from collections import OrderedDict

In [14]:
def get_list(parse_dict, doc):
    """Collect the character offsets of every word in one parsed document.

    Looks up ``parse_dict[doc]['sentences']`` and walks each sentence's
    ``'words'`` in order, reading the offsets from the second element of
    every word entry.

    Args:
        parse_dict: Mapping from document id to a parse record that has a
            ``'sentences'`` list.
        doc: Key of the document inside ``parse_dict``.

    Returns:
        A ``(begins, ends)`` pair of parallel lists holding the
        ``'CharacterOffsetBegin'`` and ``'CharacterOffsetEnd'`` values.
    """
    word_infos = [
        entry[1]
        for sent in parse_dict[doc]['sentences']
        for entry in sent['words']
    ]
    begins = [info['CharacterOffsetBegin'] for info in word_infos]
    ends = [info['CharacterOffsetEnd'] for info in word_infos]
    return begins, ends


def get_span_string(span_list):
    """Serialise spans into the ``'start..end;start..end'`` text form.

    Args:
        span_list: Iterable of indexable pairs; element 0 is the start and
            element 1 is the end of a span.

    Returns:
        One ``'start..end'`` piece per span, joined with ``';'``. An empty
        ``span_list`` yields the empty string.
    """
    return ';'.join(
        str(pair[0]) + '..' + str(pair[1])
        for pair in span_list
    )

def get_span_list(span):
    """Parse a ``'start..end;start..end'`` string into lists of ints.

    Args:
        span: Span string; pieces are separated by ``';'`` and the numbers
            inside a piece by ``'..'``.

    Returns:
        One list of ints per non-empty piece, in the order they appear.

    Raises:
        AssertionError: If ``span`` is the empty string.
    """
    # An empty span string is treated as a caller error.
    assert span != ''
    parsed = []
    for chunk in span.split(';'):
        # Empty pieces (e.g. from a trailing ';') are skipped.
        if chunk == '':
            continue
        parsed.append([int(number) for number in chunk.split('..')])
    return parsed

def get_doc_word_dict(parse_dict, DocID):
    """Index every token of one document by its character offsets.

    Args:
        parse_dict: Mapping from document id to a parse record that has a
            ``'sentences'`` list, each sentence holding a ``'words'`` list.
        DocID: Key of the document inside ``parse_dict``.

    Returns:
        A dict keyed by ``(start, end)`` character offsets. Each value is
        ``[start, end, doc_token_index, sent_index, token_index]``, where
        ``doc_token_index`` counts tokens across the whole document and
        ``token_index`` counts them within their sentence.
    """
    # Flatten the sentence/word nesting while remembering both positions.
    flattened = (
        (sent_pos, word_pos, word[1])
        for sent_pos, sent in enumerate(parse_dict[DocID]['sentences'])
        for word_pos, word in enumerate(sent['words'])
    )
    word_index = {}
    # The outer enumerate supplies the running document-level token index.
    for doc_pos, (sent_pos, word_pos, info) in enumerate(flattened):
        begin = info['CharacterOffsetBegin']
        stop = info['CharacterOffsetEnd']
        word_index[(begin, stop)] = [begin, stop, doc_pos, sent_pos, word_pos]
    return word_index

def get_token_list(char_span_list, doc_word_dict):
    """Return the token records that lie inside the given character spans.

    Args:
        char_span_list: Sequence of spans; element 0 of a span is its start
            character offset and element 1 its end offset.
        doc_word_dict: Mapping from ``(start, end)`` token offsets to a
            token record, as built by ``get_doc_word_dict``.

    Returns:
        A list with the record of every token that is fully contained in a
        span (``span[0] <= start`` and ``end <= span[1]``). Records are
        grouped by span in the order the spans are given and ordered by
        character offset within each span. The records are the same objects
        stored in ``doc_word_dict``, not copies.
    """
    # Sort on the (start, end) key so the scan over a span can stop as soon
    # as a token starts beyond the span's end.
    ordered_tokens = sorted(doc_word_dict.items(), key=lambda item: item[0])
    tokenlist = []
    for span in char_span_list:
        span_begin, span_end = span[0], span[1]
        for (tok_begin, tok_end), record in ordered_tokens:
            if tok_begin > span_end:
                # Every later token starts even further right.
                break
            if tok_begin >= span_begin and tok_end <= span_end:
                tokenlist.append(record)
    return tokenlist

def merge3dicts(x, y, z):
    """Merge three mappings into one new dict.

    Later arguments win on duplicate keys: entries of ``y`` override ``x``
    and entries of ``z`` override both.

    Args:
        x: First mapping.
        y: Second mapping.
        z: Third mapping.

    Returns:
        A new dict holding the combined entries. None of the inputs is
        modified; the copy is shallow, so values are shared with the inputs.
    """
    # Copy first so the caller's `x` is not silently turned into the merge.
    merged = dict(x)
    merged.update(y)
    merged.update(z)
    return merged

In [3]:
# Read the relation table from pdtb3.csv (path is relative to the kernel's
# working directory). The conversion loop below addresses rows with
# pdtb3.loc[i, ...] for i in range(len(pdtb3)), so it relies on the default
# 0..n-1 index produced here.
pdtb3 = pd.read_csv('pdtb3.csv')
# Preview the first rows as the cell output.
pdtb3.head()

Unnamed: 0,DocID,Relation_Type,Conn_SpanList,Conn_Src,Conn_Type,Conn_Pol,Conn_Det,Conn_Feat_SpanList,Conn1,SClass1A,...,Arg2_Det,Arg2_Feat_SpanList,Sup2_SpanList,Adju_Reason,Adju_Disagr,PB_Role,PB_Verb,Offset,Provenance,Link
0,wsj_0793,EntRel,,,,,,,,,...,,,,,,,,166,PDTB2::wsj_0793::166::SAME,
1,wsj_0793,Explicit,197..204,,,,,,instead,Expansion.Substitution.Arg2-as-subst,...,,,,,,,,197..204,PDTB2::wsj_0793::197..204::CHANGED,
2,wsj_0793,Implicit,,,,,,,then,Temporal.Asynchronous.Precedence,...,,,,,,,,281,PDTB3,
3,wsj_0793,Implicit,,,,,,,as a result,Contingency.Cause.Result,...,,,,,,,,333,PDTB2::wsj_0793::333::SAME,
4,wsj_0793,Implicit,,,,,,,but,Comparison.Concession.Arg2-as-denier,...,,,,,,,,740,PDTB2::wsj_0793::740::CHANGED,


In [7]:
def _load_parses(path):
    """Read one parses JSON file and return the decoded object."""
    # The context manager closes the file handle once the JSON is decoded.
    with codecs.open(path, encoding='utf-8', errors='ignore') as handle:
        return json.load(handle)


# NOTE(review): absolute, machine-specific paths; adjust when running elsewhere.
conll_train = '/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-train/pdtb-parses.json'
parse_dict_train = _load_parses(conll_train)
conll_dev = '/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-dev/pdtb-parses.json'
parse_dict_dev = _load_parses(conll_dev)
conll_test = '/home/pengfei/data/2015-2016_conll_shared_task/data/conll16st-en-03-29-16-test/pdtb-parses.json'
parse_dict_test = _load_parses(conll_test)
print("datasets loaded")
# Combine the three splits into a single lookup keyed by DocID.
parse_dict = merge3dicts(parse_dict_train, parse_dict_dev, parse_dict_test)

datasets loaded


In [8]:
# Folder with the raw document text; the conversion loop opens
# rawtext_foldername / DocID for each relation it converts.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
rawtext_foldername = Path('/home/pengfei/data/PDTB-3.0/all/raw')

In [20]:
def _build_argument(span_string, rawtext, doc_word_dict):
    """Build the CharacterSpanList / RawText / TokenList entry of one argument."""
    char_span_list = get_span_list(span_string)
    return {
        'CharacterSpanList': char_span_list,
        # Discontinuous spans are concatenated with a single space.
        'RawText': ' '.join([rawtext[o[0]:o[1]] for o in char_span_list]),
        'TokenList': get_token_list(char_span_list, doc_word_dict),
    }


# Convert every row of `pdtb3` whose document has a parse into a relation dict
# with the keys DocID, ID, Sense, Type, Connective, Arg1 and Arg2.
relations = []
# DocID of each row skipped because its document is missing from `parse_dict`
# (one entry per skipped row, so a DocID can appear several times).
unattended = []
# One-entry cache: consecutive rows that share a DocID reuse the same raw text
# and token index instead of re-reading and re-indexing the document.
cached_doc_id = None
doc_word_dict = None
rawtext = None
for i in range(len(pdtb3)):
    if i % 1000 == 0:
        print(i)  # progress marker every 1000 rows
    row = pdtb3.loc[i]
    doc_id = row['DocID']
    if doc_id not in parse_dict:
        # Record the row before skipping it; a statement placed after
        # `continue` would never run.
        unattended.append(doc_id)
        continue

    if doc_id != cached_doc_id:
        doc_word_dict = get_doc_word_dict(parse_dict, doc_id)
        # The context manager closes the raw-text file after reading.
        with codecs.open(rawtext_foldername / doc_id, encoding='utf-8', errors='ignore') as handle:
            rawtext = handle.read()
        cached_doc_id = doc_id

    relation = {}
    relation['DocID'] = doc_id
    relation['ID'] = i

    # SClass1A is always kept; the optional sense columns are added only when
    # present. pd.notna also recognises numpy.float64 NaN, which a
    # `type(x) != float` test lets through.
    Sense = [row['SClass1A']]
    for sense_column in ('SClass1B', 'SClass2A', 'SClass2B'):
        if pd.notna(row[sense_column]):
            Sense.append(row[sense_column])
    relation['Sense'] = Sense

    relation['Type'] = row['Relation_Type']

    # Connective: only these relation types carry a connective span.
    if relation['Type'] in ['Explicit', 'AltLex', 'AltLexC']:
        connective_spans = get_span_list(row['Conn_SpanList'])
        relation['Connective'] = {
            'CharacterSpanList': connective_spans,
            'RawText': row['Conn1'],
            'TokenList': get_token_list(connective_spans, doc_word_dict),
        }
    else:
        relation['Connective'] = {
            'CharacterSpanList': [],
            'RawText': row['Conn1'],
        }

    relation['Arg1'] = _build_argument(row['Arg1_SpanList'], rawtext, doc_word_dict)
    relation['Arg2'] = _build_argument(row['Arg2_SpanList'], rawtext, doc_word_dict)

    relations.append(relation)


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
