In [1]:
import spacy,en_core_web_sm
from pathlib import Path
import math
import pandas as pd
import os
from stanfordcorenlp import StanfordCoreNLP
import re
import json
from benepar.spacy_plugin import BeneparComponent

Problems: 
    1. from wsj_1001: sentence: 
    Rated Ba-3 by Moody's Investors Service Inc. and single-B-plus by Standard & Poor's Corp., the issue will be sold through Merrill Lynch Capital Markets. 
    tokenizer separate the sentence to: "plus by Standard & Poor's Corp.

In [2]:
nlp = StanfordCoreNLP(r'/home/pengfei/documents/stanford-corenlp-full-2018-10-05/')
tokenizor = spacy.load('en')
tokenizor.add_pipe(BeneparComponent("benepar_en2"))
DATA_PATH = Path("/home/pengfei/data/experiment/")
PDTB_PARSE = 'pdtb-parses.json'
PDTB_DATA = 'pdtb-data.json'

In [3]:
pdtb2 = pd.read_csv("../../pdtb2/pdtb2.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def get_linker(words, doc_lookup):
    ## TODO: connective also has linker
    """
    get look up for each word per doc level, if word does not below to any arg, it will append []
    Args:
            words(dict): {word_index_in_doc: [word_start_char, word_end_char]}
            doc_lookup: from function @get_data_prototype
    Returns: 
            ret: {word_index_in_doc: ['arg1_argID', 'arg2_argID', ...]}
    """
    lookup_list = doc_lookup.keys()
    ret = {}
    
    for w_idx, w_range in words.items():
        w_linkers = []
        for r in doc_lookup.keys():
            r_list = _get_span_list(r)
            if _in_between(w_range, r_list):
                linker = doc_lookup[r]
                w_linkers.append(linker)
        ret[w_idx] = w_linkers
    return ret

def _in_between(inner_list, outer_list):
    """
    Args:
            inner_list(list[int]): [3,4]
            outer_list(list[list[int]]): [[2,5]]
    Returns: 
            boolean
    """
    if len(outer_list) < 1:
        return False
    elif type(outer_list[0]) == str:
        return (inner_list[0] >= int(outer_list[0])) & (inner_list[1] <= int(outer_list[1]))
    elif len(outer_list) == 2:
        for r in outer_list:
            if _in_between(inner_list, r):
                return True
        return False
    
    return (inner_list[0] >= int(outer_list[0][0])) & (inner_list[1] <= int(outer_list[0][1]))
        
    
def _get_span_list(span):
    """
    Args:
            span(str): "34..96;97..101"
    Returns:
            ret(list[list[str]]):
            if one arg has one segment: [[34,96]]
            if one arg has two segment: example [[34, 96], [97, 101]]
    """
    if type(span) == float:
        return []
    spans = span.split(';')
    return [o.split('..') for o in spans]
        
def get_batch(section, filenumber):
    """
    Args: 
            section(int)
            filenumber(int)
    Returns:
            (list[int]): list of index with all the file with the same format in that batch"""
    return pdtb2.index[(pdtb2['Section'] == section) & (pdtb2.FileNumber == filenumber)].tolist()
# get_batch(0, 4)


def get_data_prototype(section, filenumber, relation_id, batch_idx):
    """
    Args:
            section(int)
            filenumber(int)
            relation_id(int): relation_id is unique accross every relation, this is start relation_id
            batch_idx(list[int]): list of corresponding int from same file
    Returns:
            doc_data: [{"Arg1": {"CharacterSpanList": [[4564, 4610]], 
                "RawText": "", 
                "TokenList": []}, # example: [4612, 4616, 888, 32, 11]
                "DocID": "wsj_1000", 
                "ID": 15025, 
                "Sense": ["Contingency.Condition"], 
                "Type": "Explicit"},
                {}, ...]
            doc_lookup: dictionary contains all the span for argument 1 or argument 2 or conn
                used for function @get_linker
                {"34..36;90..107": ["arg1_1234"]}
                
            
    """
    doc_data = []
    doc_lookup = {}
    for i, idx in enumerate(batch_idx):
        arg1_span = pdtb2.loc[idx, 'Arg1_SpanList']
        arg1_char_span_list = _get_span_list(arg1_span)
        arg1_rawtext = pdtb2.loc[idx, 'Arg1_RawText']
        arg1_token_list = []
        
        arg2_span = pdtb2.loc[idx, 'Arg2_SpanList']
        arg2_char_span_list = _get_span_list(arg2_span)
        arg2_rawtext = pdtb2.loc[idx, 'Arg2_RawText']
        arg2_token_list = []
        
        conn_span = pdtb2.loc[idx, 'Connective_SpanList']
        conn_char_span_list = _get_span_list(conn_span)
        conn_rawtext = pdtb2.loc[idx, 'Connective_RawText']
        conn_token_list = []
        
        relation_type = pdtb2.loc[idx, 'Relation']
        relation_sense = [pdtb2.loc[idx, 'ConnHeadSemClass1']]
        if relation_type == 'NoRel' or relation_type == 'EntRel':  
            relation_sense = [relation_type]
        elif type(pdtb2.loc[idx, 'ConnHeadSemClass2']) == str:
            relation_sense.append(pdtb2.loc[idx, 'ConnHeadSemClass2'])
        Doc_id = str(section) + '/' + str(filenumber)
        
        
        ID = relation_id+i
        
        doc_lookup[arg1_span] = 'Arg1_' + str(ID)
        doc_lookup[arg2_span] = 'Arg2_' + str(ID)
        doc_lookup[conn_span] = 'Connective_' + str(ID)
        
        arg1 = {"CharacterSpanList": arg1_char_span_list, "RawText": arg1_rawtext, "TokenList": arg1_token_list}
        arg2 = {"CharacterSpanList": arg2_char_span_list, "RawText": arg2_rawtext, "TokenList": arg2_token_list}
        conn = {"CharacterSpanList": conn_span, "RawText": conn_rawtext, "TokenList": conn_token_list}
        relation_dict = {"Arg1": arg1, "Arg2": arg2, "Connective": conn, "DocID": Doc_id, "ID": ID, 
                      "Sense": relation_sense, "Type": relation_type}
        doc_data.append(relation_dict)
        
    return doc_data, doc_lookup

def _get_files(data_path=DATA_PATH):
    """Returns (list): [('00', 'wsj_0000'), ..., ('24', 'wsj_2400']"""
    sections = os.listdir(data_path)
    ret = []
    for sec in sections:
        for filename in os.listdir(data_path/sec):
            ret.append((sec, filename))
    return ret

def read_file(section, filename, data_path=DATA_PATH):
    """Returns the content in that file"""
    with open(data_path/section/filename) as f:
        file = f.read()
    return file

In [5]:
def constituent_parsing(sent):
    return str(sent._.parse_string).replace('\n', " ").replace('   ', ' ').replace('  ', ' ')

def dependency_parse(sent):
    s = nlp.word_tokenize(sent)
    tokens = [token for token in s]
    tokens.insert(0,'ROOT')
    dependency = []
    for tag, start, end in nlp.dependency_parse(sent):
        dependency.append([tag, f'{tokens[start]}-{start}', f'{tokens[end]}-{end}'])
    return dependency


In [6]:
def get_word_index(doc):
    """
    Args:
            doc: after tokenized document
    Returns: 
            words(dict): {word_index_in_doc: [word_start_char, word_end_char]}
    """
    words = {}
    for tok in doc:
        words[tok.i] = [tok.idx, tok.idx+len(tok)]
    return words

def add_token_list(doc_data, linkers, token_list):
    """
    Args: 
            doc_data: return value from @function get_data_prototype
            linker(list[str]): ["arg1_1234", "conn_1235"]
            token_list: append at token_list
    """
    for linker in linkers:
        arg_id = linker.split('_')[1]
        
        for rel in doc_data:
            if rel["ID"] == int(arg_id):
                rel[linker.split('_')[0]]["TokenList"].append(token_list)

def list_strip_punctuation(list):
    """Args:list[str]"""
    punctuation = """!"#&'*+,-..../:;<=>?@[\]^_`|~""" + "``" + "''"
    i = 0
    while i < len(list) and list[i][0] in punctuation + "-LCB--LRB-":
        i += 1
    if i == len(list):
        return []
    j = len(list) - 1
    while j >= 0 and list[j][0] in punctuation + "-RRB--RCB-":
        j -= 1
    return list[i: j+1]


def data_generator_per_doc_token_level(section, filenumber, relation_id, batch_idx, rawtext):
    """generate json format data for one document
    Args:
            section(str): 0~24
            filenumber(str): 0~99
            relation_id(int): relation_id is unique accross every relation, this is start relation_id
            batch_idx(list[int]): list of corresponding int from same file
            rawtext(str): rawtext of that doc, this rawtext is purged of \n
    Returns:
            pdtb-json
            pdtb-data
    """
    # for data
    doc_data, doc_lookup = get_data_prototype(section, filenumber, relation_id, batch_idx)
    doc = tokenizor(rawtext)
    words = get_word_index(doc)
    linkers = get_linker(words, doc_lookup)
    
    # for parse
    sentences = []
    # for each token in document
    for sentence_idx, sentence in enumerate(doc.sents):
        ## TODO: here need to change
        if sentence_idx == 0:
            continue
        else:   
            # begin of the new round
            words = []
            sent = str(sentence)
            dependencies = dependency_parse(sent)
            parsetree = constituent_parsing(sentence)
            
            if section == 10 and filenumber == 1 and sentence_idx == 3:
                print(parsetree)
                print(sent)
            for tok_idx_in_sentence, tok in enumerate(sentence):
                ## 5 attribute of each word
                char_offset_begin = tok.idx
                char_offset_end = tok.idx + len(tok)
                token_offset_in_doc = tok.i
                sentence_offset = sentence_idx
                token_offset_in_sent = tok_idx_in_sentence

                linker = linkers[token_offset_in_doc]
                token_dict = {'CharacterOffsetBegin': char_offset_begin, "CharacterOffsetEnd": char_offset_end, 
                              "PartOfSpeech": tok.pos_, "Linkers": linker}
                words.append([tok.text, token_dict])

                token_list = [char_offset_begin, char_offset_end, token_offset_in_doc, 
                              sentence_offset, token_offset_in_sent]
                add_token_list(doc_data, linker, token_list)
            
            if list_strip_punctuation(words) == []:
                continue
            sentences.append({"dependencies": dependencies, "parsetree": parsetree, "words": words})
    return {'sentences': sentences} , doc_data

            
def data_generator():
    """
    Returns:
            docs_parse(dict{docid: parse})
            docs_data(list[relation])
    """
    doc_paths = _get_files()
    relation_id = 0
    docs_parse = {}
    docs_data = []
    for doc_path in doc_paths:
        section = doc_path[0]
        filename = doc_path[1]
        print("section: ", section, " filename: ", filename)
        rawtext = read_file(section, filename)
        rawtext = rawtext.replace('\n', '')
        section = int(section)
        filenumber = int(filename[-2:])
        batch_idx = get_batch(section, filenumber)
        
        doc_parse, doc_data = data_generator_per_doc_token_level(section, filenumber, relation_id, batch_idx, rawtext)
        relation_id += len(batch_idx)
        
        docs_parse[filename] = doc_parse
        docs_data.extend(doc_data)
        
    return docs_parse, docs_data

In [7]:
def write_parse(docs_parse, filename=PDTB_PARSE):
    with open(filename, 'w') as f:
        content = json.dumps(docs_parse)
        f.write(content)

def write_data(relations, filename=PDTB_DATA):
    with open(filename, 'w') as f:
        for relation in relations:
            content = json.dumps(relation)
            f.write(content)
            f.write('\n')

In [8]:
docs_parse, docs_data = data_generator()
nlp.close()

section:  01  filename:  wsj_0138
section:  01  filename:  wsj_0139
section:  00  filename:  wsj_0001
section:  00  filename:  wsj_0002
section:  10  filename:  wsj_1001
(S (NP (DT 8%.The) (NNS notes)) (VP (VP (VBP are) (NP (JJ zero) (: -) (NN coupon) (NNS securities))) (CC and) (VP (MD will) (RB not) (VP (VB pay) (NP (NN interest)) (ADVP (RB periodically))))) (. .))
8%.The notes are zero-coupon securities and will not pay interest periodically.
section:  10  filename:  wsj_1000


In [26]:
docs_parse['wsj_1001']['sentences'][3]['parsetree'] = "(S (NP (NP (DT The) (NN size)) (PP (IN of) (NP (DT the) (NN offering)))) (VP (VBD was) (VP (VBN increased) (PP (IN from) (NP (NP (DT the) (ADJP (RB originally) (VBN planned)) (QP ($ $) (CD 250) (CD million))) (PRN (-LRB- -LRB-) (NP (NN redemption) (NN amount)) (-RRB- -RRB-)))))) (. .)) (S (NP (DT The) (NNS notes)) (VP (VBP are) (ADJP (JJ convertible) (PP (IN into) (NP (NP (JJ common) (NN stock)) (PP (IN of) (NP (NNP Blockbuster) (NNP Entertainment))))) (PP (IN at) (NP (NP (NP ($ $) (CD 22.26)) (NP (DT a) (NN share))) (, ,) (VP (VBG representing) (NP (NP (DT a) (ADJP (CD 12) (NN %)) (NN conversion) (NN premium)) (PP (IN over) (NP (NP (NN yesterday) (POS 's)) (NN closing) (NN price))))))))) (. .))"

In [27]:
write_parse(docs_parse)

In [28]:
write_data(docs_data)

In [9]:
with open(DATA_PATH/'01'/'wsj_0138') as f:
    rawtext = f.read()

rawtext has error

In [24]:
sent = "The size of the offering was increased from the originally planned $ 250 million ( redemption amount ) . The notes are convertible into common stock of Blockbuster Entertainment at $ 22.26 a share , representing a 12 % conversion premium over yesterday 's closing price ."
doc = tokenizor(sent)
for sent in doc.sents:
    print(constituent_parsing(sent))

(S (NP (NP (DT The) (NN size)) (PP (IN of) (NP (DT the) (NN offering)))) (VP (VBD was) (VP (VBN increased) (PP (IN from) (NP (NP (DT the) (ADJP (RB originally) (VBN planned)) (QP ($ $) (CD 250) (CD million))) (PRN (-LRB- -LRB-) (NP (NN redemption) (NN amount)) (-RRB- -RRB-)))))) (. .))
(S (NP (DT The) (NNS notes)) (VP (VBP are) (ADJP (JJ convertible) (PP (IN into) (NP (NP (JJ common) (NN stock)) (PP (IN of) (NP (NNP Blockbuster) (NNP Entertainment))))) (PP (IN at) (NP (NP (NP ($ $) (CD 22.26)) (NP (DT a) (NN share))) (, ,) (VP (VBG representing) (NP (NP (DT a) (ADJP (CD 12) (NN %)) (NN conversion) (NN premium)) (PP (IN over) (NP (NP (NN yesterday) (POS 's)) (NN closing) (NN price))))))))) (. .))
