# ongoing implementation of ExportXML importer (discoursegraphs branch: exportxml)

* initially, I was working on Tueba-D/Z 5.0, but now I have version 8.0 available
* the whole corpus is available as a single XML file, which would result in a graph  
  that is way too large for networkx (ca. 1.7 million edges)
  
## Tueba-D/Z 5.0

* I assumed that the corpus could not be split into documents and therefore wrote  
  a parser using igraph instead of networkx
* it turns out that each sentence has an ``origin`` attribute, e.g. ``T990507.2``,  
  which translates into (collection ID: T990507, document ID: 2)
* all documents within a collection (NB: I will use those terms, Tueba doesn't)  
  have consecutively numbered token node IDs, i.e. if document 1 contains sentences  
  1 to 12, document 2 might contain sentences 13 to 43
  
## Tueba-D/Z 8.0

* ``tuebadz-8.0-mit-NE+Anaphern+Diskurs.exml.xml`` contains bad XML

```python
XMLSyntaxError: ID text_145 already defined, line 4663422, column 20
```

* two text IDs occur twice: ``text_3160`` and ``text_145``

In [8]:
import os
from collections import Counter
from operator import itemgetter
from lxml import etree, html
import igraph as ig

import discoursegraphs as dg

# path to the Tueba-D/Z 5.0 corpus file ('~' is expanded to the home directory)
TUEBADZ5_FILE = os.path.expanduser(
    '~/corpora/tueba/tuebadz-5.0/data/XML/tuebadz-5.0.anaphora.export.xml')

# path to the Tueba-D/Z 8.0 corpus file ('~' is expanded to the home directory)
TUEBADZ8_FILE = os.path.expanduser(
    '~/corpora/tueba/TuebaDZ8.0/tuebadz-8.0-mit-NE+Anaphern+Diskurs.exml.xml')

# NOTE(review): HTML_PARSER is not referenced in any of the visible cells
HTML_PARSER = html.HTMLParser()

In [9]:
def add_ns(key, ns='http://www.w3.org/XML/1998/namespace'):
    """
    Return ``key`` prefixed with the namespace ``ns`` in curly braces.

    With the default namespace, 'foo' is turned into
    '{http://www.w3.org/XML/1998/namespace}foo'.
    """
    qualified_name = '{{{0}}}{1}'.format(ns, key)
    return qualified_name

# represent the whole corpus in an iterable (over documents)

In [10]:
from lxml import etree

class ExportXMLCorpus(object):
    """
    represents an ExportXML corpus file as an iterator over the documents
    (i.e. the <text> elements) it contains.

    The parse context is created once in __init__(), so ``for`` loops and
    next() calls on the same instance share one position in the file.
    """
    def __init__(self, exportxml_file, parse=True):
        """
        Parameters
        ----------
        exportxml_file : str
            path to an ExportXML formatted corpus file
        parse : bool
            If True, create an iterator that parses the documents
            contained in the file into ExportXMLDocumentGraph instances.
            Otherwise, yield the etree element representations of the <text>
            elements found in the document.
        """
        self.parse = parse
        self.__context = etree.iterparse(exportxml_file, events=('end',), tag='text', recover=True)
        # the generator currently serving next(); created on first use
        self.__documents = None

    def __iter__(self):
        """returns the corpus itself, which implements the iterator protocol"""
        return self

    def next(self):
        """
        returns the next document of the corpus.

        All calls are served by one generator. Creating a new generator per
        call (as an earlier version did) abandons each generator at its
        ``yield``, which means that the memory clean-up in text_iter()
        would never run.

        Raises
        ------
        StopIteration
            if there are no <text> elements left
        """
        if self.__documents is None:
            self.__documents = self.text_iter(self.__context)
        try:
            return next(self.__documents)
        except BaseException:
            # a generator that raised is finished for good. drop it, so
            # that the following call continues with the remaining
            # <text> elements of the shared parse context
            self.__documents = None
            raise

    # Python 3 name of the iterator protocol method
    __next__ = next

    def text_iter(self, context):
        """
        iterates over all the elements in an iterparse context (here: <text> elements)
        and yields an ExportXMLDocumentGraph instance for each of them
        (or the element itself, if ``self.parse`` is False).
        afterwards, the elements are removed from the DOM / main memory,
        i.e. an element must not be used after the next one was requested.

        Parameters
        ----------
        context : iterable of (event, element) tuples
            e.g. the result of ``etree.iterparse``
        """
        for _event, elem in context:
            try:
                if self.parse:
                    yield ExportXMLDocumentGraph(elem)
                else:
                    yield elem
            finally:
                # removes element (and references to it) from memory after
                # processing it -- also if parsing it failed or if the
                # generator is closed while waiting at the yield
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]

In [27]:
import warnings

import pudb # TODO: remove after debugging

import discoursegraphs as dg
from discoursegraphs import DiscourseDocumentGraph


class ExportXMLDocumentGraph(DiscourseDocumentGraph):
    """
    represents an ExportXML document as a document graph.
    """
    def __init__(self, text_element, name=None, namespace='exportxml',
                 precedence=False):
        """
        builds the document graph of one <text> element of an ExportXML file.

        Parameters
        ----------
        text_element : lxml.etree._Element
            a <text> element from an ExportXML file parsed with lxml
        name : str or None
            the name or ID of the graph to be generated. If no name is
            given, the xml:id of the <text> element is used
        namespace : str
            the namespace of the document (default: exportxml)
        precedence : bool
            intended to add precedence relation edges
            (root precedes token1, which precedes token2 etc.).
            NOTE: this parameter is currently not evaluated by this method
        """
        # initialise the base class (DiscourseDocumentGraph) first
        super(ExportXMLDocumentGraph, self).__init__()

        if name:
            self.name = name
        else:
            self.name = text_element.attrib[add_ns('id')]
        self.ns = namespace
        root_node_id = self.ns+':root_node'
        self.root = root_node_id
        self.add_node(root_node_id, layers={self.ns}, label=root_node_id)

        # IDs of the sentence root nodes / token nodes, in document order
        self.sentences = []
        self.tokens = []

        # maps the tag of an element to the method that handles it
        self.parsers = dict([
            ('connective', self.add_connective),
            ('discRel', self.add_discrel),
            ('edu', self.add_edu),
            ('edu-range', self.add_edurange),
            ('ne', self.add_ne),
            # add_node() is already present in the graph class
            ('node', self.add_node_element),
            ('relation', self.add_relation),
            ('secEdge', self.add_secedge),
            ('sentence', self.add_sentence),
            ('splitRelation', self.add_splitrelation),
            ('topic', self.add_topic),
            ('word', self.add_word),
        ])

        self.parse_descedant_elements(text_element)

    def parse_child_elements(self, element):
        """
        calls the tag-specific parser method (see ``self.parsers``) on
        each direct child of the given etree element.
        """
        for child in element.iterchildren():
            handle_child = self.parsers[child.tag]
            handle_child(child)

    def parse_descedant_elements(self, element):
        """
        calls the tag-specific parser method (see ``self.parsers``) on
        each descendant of the given etree element.
        """
        for descendant in element.iterdescendants():
            handle_descendant = self.parsers[descendant.tag]
            handle_descendant(descendant)
  
    def add_connective(self, connective):
        """
        stores the attributes of a <connective> element (except for 'konn')
        under the key 'connective' of the token node it annotates and adds
        that token to the namespace:connective layer.

        Parameters
        ----------
        connective : etree.Element
            etree representation of a <connective> element
            (annotates connective tokens)

        Example
        -------
          <word xml:id="s29_1" form="Als" pos="KOUS" lemma="als" func="-"
                parent="s29_500" dephead="s29_14" deprel="KONJ">
              <connective konn="als" rel1="Temporal" rel2="enable"/>
          </word>
        """
        # a <connective> has no ID of its own, so this is the ID of its parent
        token_id = self.get_element_id(connective)
        relation_info = dict(
            (attr_name, attr_val)
            for (attr_name, attr_val) in connective.attrib.items()
            if attr_name != 'konn')
        token_attrs = self.node[token_id]
        token_attrs['layers'].add(self.ns+':connective')
        token_attrs['connective'] = relation_info
            
    def add_discrel(self, discrel):
        """
        adds a discourse relation: the attributes of the <discRel> element
        are copied to the node of its parent element (arg1), which is then
        connected to arg2 with a pointing relation edge.

        Parameters
        ----------
        discrel : etree.Element
            etree representation of a <discRel> element
            Describes the relation between two EDUs.
            The ID of the other EDU is given in the arg2 attribute.
            Note, that arg2 can either reference an EDU (e.g. edu_9_3_2)
            or an EDU range (e.g. edus9_3_1-5_0).

        Example
        -------

           <edu xml:id="edu_9_3_0">
            <discRel relation="Explanation-Speechact" marking="-" arg2="edus9_3_1-5_0"/>
            <node xml:id="s128_504" cat="SIMPX" func="--">
            ...
            </node>
            <word xml:id="s128_3" form=":" pos="$." lemma=":" func="--" deprel="ROOT"/>
           </edu>

             <edu xml:id="edu_9_3_1">
              <discRel relation="Continuation" marking="-" arg2="edu_9_3_2"/>
              <node xml:id="s128_506" cat="VF" func="-" parent="s128_525">
              ...
              </node>
              ...
             </edu>
        """
        # a <discRel> has no ID of its own, so this is the ID of its parent
        source_id = self.get_element_id(discrel)
        target_id = discrel.attrib['arg2']
        relation_name = discrel.attrib['relation']

        self.node[source_id].update(self.element_attribs_to_dict(discrel))
        for layer_suffix in (':discourse', ':relation'):
            self.add_layer(source_id, self.ns+layer_suffix)

        self.add_edge(
            source_id, target_id,
            layers={self.ns, self.ns+':discourse', self.ns+':relation'},
            edge_type=dg.EdgeTypes.pointing_relation,
            relation=relation_name,
            label='discourse:'+relation_name)

    def add_edu(self, edu):
        """
        adds a node for an <edu> element, lets it span all <word>
        descendants of the element and stores their IDs under the node's
        'tokens' key.

        Parameters
        ----------
        edu : etree.Element
            etree representation of a <edu> element
            (annotates an EDU)
            Note: the arg1 EDU has a discRel child, the arg2 doesn't

        Example
        -------
        <edu xml:id="edu_55_21_1">
         <discRel relation="Explanation-Cause" marking="-|*um zu" arg2="edu_55_21_2"/>
         <word xml:id="s905_9" form="und" pos="KON" lemma="und" func="-" parent="s905_526" dephead="s905_3" deprel="KON"/>
         <node xml:id="s905_525" cat="FKONJ" func="KONJ" parent="s905_526" span="s905_10..s905_19">

        ...

       <edu xml:id="edu_55_21_2" span="s905_14..s905_20">
        <node xml:id="s905_524" cat="NF" func="-" parent="s905_525">
        """
        edu_layer = self.ns+':edu'
        edu_id = self.get_element_id(edu)
        # the attributes contain 'span' or nothing
        self.add_node(edu_id, layers={self.ns, edu_layer},
                      attr_dict=self.element_attribs_to_dict(edu))

        token_ids = []
        for token in edu.iterdescendants('word'):
            token_id = self.get_element_id(token)
            self.add_edge(edu_id, token_id, layers={self.ns, edu_layer},
                          edge_type=dg.EdgeTypes.spanning_relation)
            token_ids.append(token_id)

        self.node[edu_id]['tokens'] = token_ids
    
    def add_edurange(self, edurange):
        """
        adds a node for an <edu-range> element and lets it span all <edu>
        descendants of the element.

        Parameters
        ----------
        edurange : etree.Element
            etree representation of a <edu-range> element
            (annotation that groups a number of EDUs)
            <edu-range> seems to glue together a number of <edu> elements,
            which may be scattered over a number of sentences
            <edu-range> may or may not contain a span attribute
            (it seems that the span attribute is present, when <edu-range> is
            a descendant of <sentence>)

        Example
        -------

           <edu-range xml:id="edus9_3_1-5_0" span="s128_4..s130_7">
            <node xml:id="s128_525" cat="SIMPX" func="--">
             <edu xml:id="edu_9_3_1">
              <discRel relation="Continuation" marking="-" arg2="edu_9_3_2"/>
              <node xml:id="s128_506" cat="VF" func="-" parent="s128_525">
               <node xml:id="s128_505" cat="NX" func="ON" parent="s128_506">
                <relation type="expletive"/>
                <word xml:id="s128_4" form="Es" pos="PPER" morph="nsn3" lemma="es" func="HD" parent="s128_505" dephead="s128_5" deprel="SUBJ"/>
               </node>
              </node>

            ...

          <edu-range xml:id="edus37_8_0-8_1">
           <discRel relation="Restatement" marking="-" arg2="edu_37_9_0"/>
           <sentence xml:id="s660">
        """
        range_layer = self.ns+':edu:range'
        range_id = self.get_element_id(edurange)
        # the attributes contain 'span' or nothing
        self.add_node(range_id, layers={self.ns, range_layer},
                      attr_dict=self.element_attribs_to_dict(edurange))

        for member_edu in edurange.iterdescendants('edu'):
            self.add_edge(range_id, self.get_element_id(member_edu),
                          layers={self.ns, range_layer},
                          edge_type=dg.EdgeTypes.spanning_relation)

    def add_ne(self, ne):
        """
        adds a node for a named entity and lets it span the direct
        children of the <ne> element.

        Parameters
        ----------
        ne : etree.Element
            etree representation of a <ne> element
            (marks a text span -- (one or more <node> or <word> elements) as a named entity)

        Example
        -------
            <ne xml:id="ne_23" type="PER">
             <word xml:id="s3_2" form="Ute" pos="NE" morph="nsf" lemma="Ute" func="-" parent="s3_501" dephead="s3_1" deprel="APP"/>
             <word xml:id="s3_3" form="Wedemeier" pos="NE" morph="nsf" lemma="Wedemeier" func="-" parent="s3_501" dephead="s3_2" deprel="APP"/>
            </ne>
        """
        entity_layer = self.ns+':ne'
        entity_id = self.get_element_id(ne)
        entity_label = 'ne:'+ne.attrib['type']
        self.add_node(entity_id, layers={self.ns, entity_layer},
                      attr_dict=self.element_attribs_to_dict(ne),
                      label=entity_label)

        # possible children: [('word', 78703), ('node', 11152), ('ne', 49)]
        for member in ne.iterchildren():
            self.add_edge(entity_id, self.get_element_id(member),
                          layers={self.ns, entity_layer},
                          edge_type=dg.EdgeTypes.spanning_relation,
                          label=entity_label)

    def add_node_element(self, node):
        """
        adds a syntax node for a <node> element and a dominance edge from
        its parent to it.

        Parameters
        ----------
        node : etree.Element
            etree representation of a <node> element
            A <node> describes an element of a syntax tree.
            The root <node> element does not have a parent attribute,
            while non-root nodes do

        Example
        -------
        <node xml:id="s1_505" cat="SIMPX" func="--">
            <node xml:id="s1_501" cat="LK" func="-" parent="s1_505">

            # this is the root of the syntax tree of the sentence, but
            # it is not the root node of the sentence, since there might
            # be nodes outside of the tree which are children of the
            # sentence root node (e.g. <word> elements representing a
            # quotation mark)

        """
        syntax_node_id = self.get_element_id(node)

        # a <node> without a parent attribute is the root of the syntax
        # tree of a sentence, but it might be embedded in a <edu> or
        # <edu-range>. we want to attach it directly to the <sentence>
        has_explicit_parent = 'parent' in node.attrib
        find_parent = (self.get_parent_id if has_explicit_parent
                       else self.get_sentence_id)
        dominating_id = find_parent(node)

        self.add_node(syntax_node_id, layers={self.ns, self.ns+':syntax'},
                      attr_dict=self.element_attribs_to_dict(node),
                      label=node.attrib['cat'])
        self.add_edge(dominating_id, syntax_node_id,
                      edge_type=dg.EdgeTypes.dominance_relation)

    def add_relation(self, relation):
        """
        annotates the parent of a <relation> element with the relation type
        and, if the relation has a target, adds a pointing relation edge
        from the parent to the target.

        Parameters
        ----------
        relation : etree.Element
            etree representation of a <relation> element
            A <relation> always has a type attribute and inherits
            its ID from its parent element. In the case of a non-expletive
            relation, it also has a target attribute.

        Example
        -------

          <node xml:id="s29_501" cat="NX" func="ON" parent="s29_523">
           <relation type="expletive"/>
           <word xml:id="s29_2" form="es" pos="PPER" morph="nsn3" lemma="es"
                 func="HD" parent="s29_501" dephead="s29_14" deprel="SUBJ"/>
          </node>

          ...

         <node xml:id="s4_507" cat="NX" func="ON" parent="s4_513">
          <relation type="coreferential" target="s1_502"/>
          <node xml:id="s4_505" cat="NX" func="HD" parent="s4_507">
          ...
          </node>
         </node>
        """
        markable_id = self.get_parent_id(relation)
        relation_type = relation.attrib['type']
        type_layer = self.ns+':'+relation_type

        # add relation type information to the parent node
        self.node[markable_id]['relation'] = relation_type
        self.add_layer(markable_id, type_layer)

        antecedent_id = relation.attrib.get('target')
        if antecedent_id is None:
            # a relation without a target is either 'expletive' or
            # 'inherent_reflexive', both of which should not be part of
            # the 'markable' layer
            return

        markable_layer = self.ns+':markable'
        self.add_layer(markable_id, markable_layer)
        self.add_edge(markable_id, antecedent_id,
                      layers={self.ns, type_layer, self.ns+':coreference'},
                      label=relation_type,
                      edge_type=dg.EdgeTypes.pointing_relation)
        self.add_layer(antecedent_id, markable_layer)

    def add_secedge(self, secedge):
        """
        adds a pointing relation edge for a secondary edge. The edge runs
        from the node named in the parent attribute of the <secEdge> to the
        element that contains the <secEdge>.

        Parameters
        ----------
        secedge : etree.Element
            etree representation of a <secEdge> element
        A <secEdge> element has a cat and a parent attribute,
        but inherits its ID from its parent element.
        It describes a secondary edge in a tree-like syntax representation.

        Example
        -------
           <node xml:id="s10_505" cat="VXINF" func="OV" parent="s10_507">
            <secEdge cat="refvc" parent="s10_504"/>
            <word xml:id="s10_6" form="worden" pos="VAPP" lemma="werden%passiv" func="HD" parent="s10_505" dephead="s10_7" deprel="AUX"/>
           </node>
        """
        self.add_edge(
            self.get_parent_id(secedge),   # value of the parent attribute
            self.get_element_id(secedge),  # ID of the enclosing element
            layers={self.ns, self.ns+':secedge'},
            label='secedge:'+secedge.attrib['cat'],
            edge_type=dg.EdgeTypes.pointing_relation)

    def add_sentence(self, sentence):
        """
        connects the document root to the root node of a sentence, registers
        the sentence in ``self.sentences`` and stores the IDs of all <word>
        descendants under the 'tokens' key of the sentence root node.

        Parameters
        ----------
        sentence : etree.Element
            etree representation of a sentence
            (syntax tree with coreference annotation)
        """
        sentence_id = sentence.attrib[add_ns('id')]
        # add edge from document root to sentence root
        self.add_edge(self.root, sentence_id,
                      edge_type=dg.EdgeTypes.spanning_relation)
        self.sentences.append(sentence_id)

        self.node[sentence_id]['tokens'] = [
            self.get_element_id(word)
            for word in sentence.iterdescendants('word')]

    def add_splitrelation(self, splitrelation):
        """
        adds a relation with several targets: an additional node that spans
        all target nodes is created and the annotated element points to it.

        Parameters
        ----------
        splitrelation : etree.Element
            etree representation of a <splitRelation> element
            A <splitRelation> annotates its parent element (e.g. as an anaphora).
            Its parent can be either a <word> or a <node>.
            A <splitRelation> has a target attribute, which describes
            the targets (plural! e.g. antecedents) of the relation.

        Example
        -------
            <node xml:id="s2527_528" cat="NX" func="-" parent="s2527_529">
             <splitRelation type="split_antecedent" target="s2527_504 s2527_521"/>
             <word xml:id="s2527_32" form="beider" pos="PIDAT" morph="gpf" lemma="beide" func="-" parent="s2527_528" dephead="s2527_33" deprel="DET"/>
             <word xml:id="s2527_33" form="Firmen" pos="NN" morph="gpf" lemma="Firma" func="HD" parent="s2527_528" dephead="s2527_31" deprel="GMOD"/>
            </node>

            <word xml:id="s3456_12" form="ihr" pos="PPOSAT" morph="nsm" lemma="ihr" func="-" parent="s3456_507" dephead="s3456_14" deprel="DET">
             <splitRelation type="split_antecedent" target="s3456_505 s3456_9"/>
            </word>
        """
        # a <splitRelation> has no ID of its own, so this is the ID of its parent
        source_id = self.get_element_id(splitrelation)
        # the target attribute looks like this: target="s2527_504 s2527_521"
        target_node_ids = splitrelation.attrib['target'].split()
        # we'll create an additional node which spans all target nodes
        target_span_id = '__'.join(target_node_ids)
        reltype = splitrelation.attrib['type']
        self.add_node(source_id,
                      layers={self.ns, self.ns+':relation', self.ns+':'+reltype, self.ns+':markable'})
        self.add_node(target_span_id,
                      layers={self.ns, self.ns+':targetspan', self.ns+':'+reltype, self.ns+':markable'})
        self.add_edge(source_id, target_span_id,
                      layers={self.ns, self.ns+':coreference', self.ns+':'+reltype},
                      edge_type=dg.EdgeTypes.pointing_relation)

        for target_node_id in target_node_ids:
            # the layer name needs the ':' separator, like the layers of
            # the nodes and of the pointing relation edge above
            self.add_edge(target_span_id, target_node_id,
                          layers={self.ns, self.ns+':'+reltype},
                          edge_type=dg.EdgeTypes.spanning_relation)

    def add_topic(self, topic):
        """
        adds a node for a <topic> element, lets it span all <word>
        descendants of the element and stores their IDs under the node's
        'tokens' key.

        Parameters
        ----------
        topic : etree.Element
            etree representation of a <topic> element
            (topic annotation of a text span, e.g. a sentence, edu or edu-range)

        Example
        -------
            <topic xml:id="topic_9_0" description="Kuli">
                <sentence xml:id="s128">

            ...

            <topic xml:id="topic_37_1" description="Die Pläne der AG">
                <edu-range xml:id="edus37_8_0-8_1">
                    <discRel relation="Restatement" marking="-" arg2="edu_37_9_0"/>
                        <sentence xml:id="s660">
        """
        topic_layer = self.ns+':topic'
        topic_id = self.get_element_id(topic)
        self.add_node(topic_id, layers={self.ns, topic_layer},
                      description=topic.attrib['description'])

        token_ids = []
        for token in topic.iterdescendants('word'):
            token_id = self.get_element_id(token)
            self.add_edge(topic_id, token_id, layers={self.ns, topic_layer},
                          edge_type=dg.EdgeTypes.spanning_relation)
            token_ids.append(token_id)

        self.node[topic_id]['tokens'] = token_ids

    def add_word(self, word):
        """
        adds a token node for a <word> element, registers it in
        ``self.tokens``, attaches it to its parent with a dominance edge
        and parses the child elements of the <word>.

        Parameters
        ----------
        word : etree.Element
            etree representation of a <word> element
            (i.e. a token, which might contain child elements)
        """
        word_id = self.get_element_id(word)
        if word.getparent().tag in ('node', 'sentence'):
            parent_id = self.get_parent_id(word)
        else:
            # ExportXML is an inline XML format. Therefore, a <word>
            # might be embedded in weird elements. If this is the case,
            # attach it directly to the closest <node> or <sentence> node
            try:
                # the builtin next() works with Python 2.6+ and Python 3,
                # while the .next() method only exists in Python 2
                parent = next(word.iterancestors(tag=('node', 'sentence')))
            except StopIteration:
                # there's at least one weird edge case, where a <word> is
                # embedded like this: (text (topic (edu (word))))
                # here, we guess the sentence ID from the prefix of the
                # word ID (i.e. the part before the first underscore)
                parent_id = word_id.split('_')[0]
            else:
                parent_id = self.get_element_id(parent)

        self.tokens.append(word_id)
        # use all attributes except for the ID
        word_attribs = self.element_attribs_to_dict(word)
        # add the token string under the key namespace:token
        token_str = word_attribs['form']
        word_attribs.update({self.ns+':token': token_str, 'label': token_str})
        self.add_node(word_id, layers={self.ns, self.ns+':token'}, attr_dict=word_attribs)
        self.add_edge(parent_id, word_id, edge_type=dg.EdgeTypes.dominance_relation)
        # NOTE(review): if this method was called by
        # parse_descedant_elements(), the children of the <word> are
        # visited again by its iterdescendants() loop, i.e. they are
        # parsed twice -- confirm that this is harmless for the graph
        self.parse_child_elements(word)

    def element_attribs_to_dict(self, element):
        """
        returns the attributes of an etree element (``lxml.etree._Attrib``)
        as a new dict that does not contain the xml:id attribute.
        """
        attribs = dict(element.attrib.items())
        attribs.pop(add_ns('id'), None)
        return attribs
    
    def get_element_id(self, element):
        """
        Returns the xml:id of an element or, if the element doesn't have
        one, the xml:id of its parent.

        Raises
        ------
        KeyError
            if neither the element nor its parent has an ID
        """
        xml_id = add_ns('id')
        own_attribs = element.attrib
        if xml_id in own_attribs:
            return own_attribs[xml_id]

        parent = element.getparent()
        try:
            return parent.attrib[xml_id]
        except KeyError as e:
            raise KeyError(
                'Neither the element "{}" nor its parent "{}" '
                'have an ID'.format(element, parent))
    
    def get_parent_id(self, element):
        """
        returns the ID of the parent of the given element: the value of its
        parent attribute, if there is one, otherwise the xml:id of the
        element that contains it.
        """
        explicit_parent = element.attrib.get('parent')
        if explicit_parent is not None:
            return explicit_parent
        return element.getparent().attrib[add_ns('id')]
        
    def get_sentence_id(self, element):
        """
        returns the ID of the sentence the given element belongs to.

        If the element is not a descendant of a <sentence>, a warning is
        issued and the prefix of the element ID (i.e. the part before the
        first underscore) is returned instead.
        """
        try:
            # the builtin next() works with Python 2.6+ and Python 3,
            # while the .next() method only exists in Python 2
            sentence_elem = next(element.iterancestors('sentence'))
        except StopIteration:
            warnings.warn("<{}> element is not a descendant of a <sentence> "
                          "We'll try to extract the sentence ID from the "
                          "prefix of the element ID".format(element.tag))
            return self.get_element_id(element).split('_')[0]
        return self.get_element_id(sentence_elem)


In [28]:
# iterator over the documents of the Tueba-D/Z 8.0 corpus; parse=True
# makes it yield ExportXMLDocumentGraph instances
exml_corpus = ExportXMLCorpus(TUEBADZ8_FILE, parse=True)

In [29]:
# fetch the first ten documents of the corpus
texts = [exml_corpus.next() for i in range(10)]

In [30]:
# the tenth document that was fetched from the corpus
text = texts[9]

# export it as a CoNLL file, using the edges of the coreference layer
dg.write_conll(text, '/tmp/9.conll', coreference_layer='exportxml:coreference')

In [31]:
# export every document of the corpus as a CoNLL file in /tmp
for doc in ExportXMLCorpus(TUEBADZ8_FILE, parse=True):
    # join the directory and the file name (os.path.join() with a single,
    # already complete path does nothing)
    output_filepath = os.path.join('/tmp', '{}.conll'.format(doc.name))
    if os.path.isfile(output_filepath):
        # Tueba-D/Z ExportXML 8.0 uses two <text> IDs twice. write the
        # second document to a separate file instead of overwriting the
        # first one
        output_filepath = os.path.join('/tmp', '{}_a.conll'.format(doc.name))
    dg.write_conll(doc, output_filepath, coreference_layer='exportxml:coreference')

  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))
  docgraph))

RuntimeError: maximum recursion depth exceeded

In [15]:
# text = texts[9]

# for elem in dg.select_nodes_by_layer(text, 'exportxml:discourse'):
#     print elem
#     print text.node[elem]
#     targets = [target for source, target in text.out_edges(elem)]
#     for target in targets:
#         print '\ttarget: ', target, 'target text: ', dg.get_text(text, target)
#     print 'span: ', dg.get_span(text, elem)
#     print 'text: ', dg.get_text(text, elem)
#     print '\n\n'

In [16]:
# for exml_doc in exml_corpus:
# #     pass
#     dg.write_dot(exml_doc, '/tmp/{}.dot'.format(exml_doc.name))

In [17]:
# %load_ext gvmagic

In [18]:
# %dotstr dg.print_dot(text)

# What do those ExportXML elements represent?

In [19]:
# are all <relation> elements without a 'target' attribute of type="expletive"?

from collections import Counter

# relation type -> number of <relation> elements without a target attribute
elements = Counter()

for text_elem in ExportXMLCorpus(TUEBADZ8_FILE, parse=False):
    for relation in text_elem.iterdescendants('relation'):
        if 'target' in relation.attrib:
            continue
        elements[relation.attrib['type']] += 1

# most frequent relation type first
sorted(elements.items(), key=itemgetter(1), reverse=True)

[('inherent_reflexive', 7735), ('expletive', 6835)]

In [26]:
# do all type="inherent_reflexive" <relation> elements belong to a 'sich' token?

from collections import Counter

elements = Counter()

for text_elem in ExportXMLCorpus(TUEBADZ8_FILE, parse=False):
    for relation in text_elem.iterdescendants('relation'):
        if 'target' in relation.attrib:
            continue
        # the element that is annotated by the target-less relation
        parent_node = relation.getparent()
        words = list(parent_node.iterdescendants('word'))
        # does the annotated element consist of exactly one token?
        assert len(words) == 1, "{}".format(etree.tostring(parent_node))
        # not active yet: elements[relchildren[0].attrib['form']] += 1

sorted(elements.items(), key=itemgetter(1), reverse=True)

AssertionError: <node xml:id="s274_543" cat="FKONJ" func="KONJ" parent="s274_544">
       <relation type="expletive"/>
       <node xml:id="s274_521" cat="LK" func="-" parent="s274_543">
        <node xml:id="s274_504" cat="VXFIN" func="HD" parent="s274_521">
         <word xml:id="s274_10" form="sei" pos="VAFIN" morph="3sks" lemma="sein%aux" func="HD" parent="s274_504" dephead="s274_35" deprel="S"/>
        </node>
       </node>
       <node xml:id="s274_541" cat="MF" func="-" parent="s274_543">
        <node xml:id="s274_539" cat="NX" func="PRED" parent="s274_541">
         <word xml:id="s274_11" form="als" pos="KOKOM" lemma="als" func="-" parent="s274_539" dephead="s274_12" deprel="-UNKNOWN-"/>
         <node xml:id="s274_505" cat="NX" func="HD" parent="s274_539">
          <relation type="coreferential" target="s272_502"/>
          <word xml:id="s274_12" form="Beitrag" pos="NN" morph="nsm" lemma="Beitrag" func="HD" parent="s274_505" dephead="s274_20" deprel="PRED"/>
         </node>
         <node xml:id="s274_537" cat="PX" func="-" parent="s274_539">
          <word xml:id="s274_13" form="f&#252;r" pos="APPR" morph="a" lemma="f&#252;r" func="-" parent="s274_537" dephead="s274_12" deprel="PP"/>
          <node xml:id="s274_533" cat="NX" func="HD" parent="s274_537">
           <node xml:id="s274_522" cat="NX" func="HD" parent="s274_533">
            <word xml:id="s274_14" form="den" pos="ART" morph="asm" lemma="der" func="-" parent="s274_522" dephead="s274_16" deprel="DET"/>
            <node xml:id="s274_506" cat="ADJX" func="-" parent="s274_522">
             <word xml:id="s274_15" form="kulturellen" pos="ADJA" morph="asm" lemma="kulturell" func="HD" parent="s274_506" dephead="s274_16" deprel="ATTR"/>
            </node>
            <word xml:id="s274_16" form="Austausch" pos="NN" morph="asm" lemma="Austausch" func="HD" parent="s274_522" dephead="s274_13" deprel="PN"/>
           </node>
           <node xml:id="s274_523" cat="PX" func="-" parent="s274_533">
            <word xml:id="s274_17" form="zwischen" pos="APPR" morph="d" lemma="zwischen" func="-" parent="s274_523" dephead="s274_16" deprel="PP"/>
            <node xml:id="s274_507" cat="NX" func="HD" parent="s274_523">
             <word xml:id="s274_18" form="den" pos="ART" morph="dpn" lemma="das" func="-" parent="s274_507" dephead="s274_19" deprel="DET"/>
             <word xml:id="s274_19" form="V&#246;lkern" pos="NN" morph="dpn" lemma="Volk" func="HD" parent="s274_507" dephead="s274_17" deprel="PN"/>
            </node>
           </node>
          </node>
         </node>
        </node>
       </node>
       <node xml:id="s274_524" cat="VC" func="-" parent="s274_543">
        <node xml:id="s274_508" cat="VXINF" func="OV" parent="s274_524">
         <word xml:id="s274_20" form="gedacht" pos="VVPP" lemma="denken" func="HD" parent="s274_508" dephead="s274_10" deprel="AUX"/>
        </node>
       </node>
      </node>
      

In [22]:
# children of <ne>

from collections import Counter

# tag -> number of direct children of <ne> elements with that tag
elements = Counter()

# parse=False: iterate over the raw <text> elements
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE, parse=False):
    for ne in text_elem.iterdescendants('ne'):
        for ne_child in ne.iterchildren():
            elements[ne_child.tag] += 1
        
# most frequent tag first
sorted(elements.items(), key=itemgetter(1), reverse=True)

KeyboardInterrupt: 

In [None]:
# parents of <word>

from collections import Counter

# tally the tags of all elements that directly contain a <word>
elements = Counter()

for text_elem in ExportXMLCorpus(TUEBADZ8_FILE, parse=False):
    elements.update(word.getparent().tag
                    for word in text_elem.iterdescendants('word'))

# most frequent tag first
elements.most_common()

In [None]:
# internal structure of non-<sentence> children of <text>

from collections import defaultdict

elements = defaultdict(lambda : defaultdict(int))

In [None]:
# for every child of a ``text_elem`` that is not a <sentence>, count the
# tags of that child's own children: elements[child tag][grandchild tag]
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE, parse=False):
    for child in text_elem.iterchildren():
        if child.tag != 'sentence':
            for grandchild in child.iterchildren():
                elements[child.tag][grandchild.tag] += 1

# the values are nested count dicts, which cannot be ordered in a meaningful
# way themselves, so rank each child tag by its total number of grandchildren
sorted(elements.items(), key=lambda item: sum(item[1].values()), reverse=True)

In [None]:
# print each key of ``elements``, followed by the counts stored under that key
for child in elements:
    print child
    print elements[child], '\n\n'

```python
elements = Counter()
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE):
    for descendant in text_elem.iterdescendants():
        elements[descendant.tag] += 1
```

```python
from operator import itemgetter
sorted(elements.items(), key=itemgetter(1), reverse=True)

[('node', 1611076),
 ('word', 1365642),
 ('relation', 109239),
 ('sentence', 75408),
 ('ne', 66564),
 ('secEdge', 5450),
 ('edu', 1612),
 ('connective', 1522),
 ('discRel', 1458),
 ('edu-range', 323),
 ('splitRelation', 297),
 ('topic', 141)]
```

## Do ``<text>`` elements only contain ``<sentence>`` as children? NO!

* ``<topic>`` can occur as a parent of ``<sentence>`` and describes them
* ``<topic>`` never occurs elsewhere

```xml
...
   <word xml:id="s132_21" form="." pos="$." lemma="." func="--" deprel="ROOT"/>
   </edu>
  </sentence>
 </topic>
 <topic xml:id="topic_9_2" description="Probleme bei weiblichen Alkoholismus">
  <sentence xml:id="s133">
   <node xml:id="s133_530" cat="SIMPX" func="--">
    <edu xml:id="edu_9_8_0">
...
```

* ``<edu-range>`` seems to glue together a number of ``<edu>`` elements,  
  which may be scattered over a number of sentences
* ``<edu-range>`` may or may not contain a ``span`` attribute

```xml
   <edu-range xml:id="edus9_3_1-5_0" span="s128_4..s130_7">
    <node xml:id="s128_525" cat="SIMPX" func="--">
     <edu xml:id="edu_9_3_1">
      <discRel relation="Continuation" marking="-" arg2="edu_9_3_2"/>
      <node xml:id="s128_506" cat="VF" func="-" parent="s128_525">
       <node xml:id="s128_505" cat="NX" func="ON" parent="s128_506">
        <relation type="expletive"/>
        <word xml:id="s128_4" form="Es" pos="PPER" morph="nsn3" lemma="es" func="HD" parent="s128_505" dephead="s128_5" deprel="SUBJ"/>
       </node>
      </node>
```



## Do ``<topic>`` elements describe one or potentially more sentences?

In [None]:
# tally the tags of the direct children of each ``text_elem``
elements = Counter()
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE):
    elements.update(child.tag for child in text_elem.iterchildren())

# most frequent tag first
elements.most_common()

In [None]:
# which element types occur as direct children of <topic>?

elements = Counter()
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE):
    for topic in text_elem.iterdescendants('topic'):
        elements.update(child.tag for child in topic.iterchildren())

# most frequent tag first
elements.most_common()

# Do ``<sentence>`` elements only contain ``<node>`` as children? NO!

* ``<word>, <ne>, <edu> and <edu-range>`` can be children of ``<sentence>`` as well!
* e.g. quotation marks ``<word>`` elements occur outside of a ``<node>``

```xml
 <sentence xml:id="s2196">
  <word xml:id="s2196_1" form='"' pos="$(" lemma='"' func="--" deprel="ROOT"/>
  <node xml:id="s2196_526" cat="SIMPX" func="--">
```

In [None]:
# which element types occur as direct children of <sentence>?

sent_children = Counter()
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE):
    for sentence in text_elem.iterdescendants('sentence'):
        sent_children.update(child.tag for child in sentence.iterchildren())

# most frequent tag first
sent_children.most_common()

In [None]:
# which element types occur anywhere below a <sentence>?

sent_children = Counter()
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE):
    for sentence in text_elem.iterdescendants('sentence'):
        sent_children.update(child.tag for child in sentence.iterdescendants())

# most frequent tag first
sent_children.most_common()

# Do ``<word>`` elements have children? YES!

In [None]:
# tally the tags of all elements nested inside a <word>
word_children = Counter()
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE):
    for word in text_elem.iterdescendants('word'):
        word_children.update(child.tag for child in word.iterdescendants())

# most frequent tag first
word_children.most_common()

* when parsing ``<word>`` as tokens for a sentence, keep an eye on ``<relation>``, ``<connective>`` and ``<splitRelation>``
* ``<relation>`` can either occur as a daughter of ``<word>`` **OR** as a preceding sibling of ``<node>`` or ``<word>`` (i.e. as a child of ``<node>``)
* a ``<relation>`` has no ID of its own: it belongs to its parent element and names its target in the ``target`` attribute (``expletive`` relations have no target)

```xml
      <node xml:id="s149_507" cat="NX" func="ON" parent="s149_509">
       <relation type="anaphoric" target="s149_501"/>
       <word xml:id="s149_8" form="sie" pos="PPER" morph="np*3" lemma="sie" func="HD" parent="s149_507" dephead="s149_10" deprel="SUBJ"/>
      </node>
```

```xml
     <node xml:id="s4_507" cat="NX" func="ON" parent="s4_513">
      <relation type="coreferential" target="s1_502"/>
      <node xml:id="s4_505" cat="NX" func="HD" parent="s4_507">
       <word xml:id="s4_4" form="die" pos="ART" morph="nsf" lemma="die" func="-" parent="s4_505" dephead="s4_5" deprel="DET"/>
       <ne xml:id="ne_32" type="ORG">
        <word xml:id="s4_5" form="Arbeiterwohlfahrt" pos="NN" morph="nsf" lemma="Arbeiterwohlfahrt" func="HD" parent="s4_505" dephead="s4_3" deprel="SUBJ"/>
       </ne>
      </node>
      <node xml:id="s4_506" cat="NX" func="-" parent="s4_507">
       <ne xml:id="ne_33" type="GPE">
        <word xml:id="s4_6" form="Bremen" pos="NE" morph="nsn" lemma="Bremen" func="HD" parent="s4_506" dephead="s4_5" deprel="APP"/>
       </ne>
      </node>
     </node>
```

```xml
       <word xml:id="s4_7" form="ihren" pos="PPOSAT" morph="asm" lemma="ihr" func="-" parent="s4_509" dephead="s4_9" deprel="DET">
        <relation type="anaphoric" target="s4_507"/>
       </word>
```

* ``<connective>`` is an annotation of a word

```xml
      <word xml:id="s29_1" form="Als" pos="KOUS" lemma="als" func="-" parent="s29_500" dephead="s29_14" deprel="KONJ">
       <connective konn="als" rel1="Temporal" rel2="enable"/>
      </word>
```

* ``<splitRelation>`` can occur as a preceding sibling of ``<word>`` (i.e. as a child of ``<node>``) **OR**  
  as a child of ``<word>``;  
  its ``target`` attribute lists all targets (plural!) of the relation, separated by spaces

```xml
        <node xml:id="s2527_528" cat="NX" func="-" parent="s2527_529">
         <splitRelation type="split_antecedent" target="s2527_504 s2527_521"/>
         <word xml:id="s2527_32" form="beider" pos="PIDAT" morph="gpf" lemma="beide" func="-" parent="s2527_528" dephead="s2527_33" deprel="DET"/>
         <word xml:id="s2527_33" form="Firmen" pos="NN" morph="gpf" lemma="Firma" func="HD" parent="s2527_528" dephead="s2527_31" deprel="GMOD"/>
        </node>
```

```xml
        <word xml:id="s3456_12" form="ihr" pos="PPOSAT" morph="nsm" lemma="ihr" func="-" parent="s3456_507" dephead="s3456_14" deprel="DET">
         <splitRelation type="split_antecedent" target="s3456_505 s3456_9"/>
        </word>
```

# Are ``<connective>`` elements always children of ``<word>``? YES!

In [None]:
# tally the tags of all elements that directly contain a <connective>
connective_parents = Counter()
for text_elem in ExportXMLCorpus(TUEBADZ8_FILE):
    connective_parents.update(
        connective.getparent().tag
        for connective in text_elem.iterdescendants('connective'))

# most frequent tag first
connective_parents.most_common()

In [None]:
print text

In [None]:
def get_text(text_element, result_list):
    """
    renders an element as plain text and appends the result to a list.

    The rendering starts with a header naming the element's ``xml:id`` and
    ``origin`` attributes. For each <sentence> child, it then adds the
    sentence's source line number and, on the next line, the ``form``
    attributes of all <word> descendants, separated by single spaces.

    Parameters
    ----------
    text_element : etree.Element
        an element with ``xml:id`` and ``origin`` attributes
    result_list : list
        the list the rendered string is appended to (modified in place)
    """
    attributes = text_element.attrib
    text_str = 'ID: {}\nOrigin: {}\n\n'.format(attributes[add_ns('id')], attributes['origin'])
    for sentence in text_element.iterchildren('sentence'):
        tokens = [word.attrib['form'] for word in sentence.iterdescendants('word')]
        text_str += u'line: {}\n{}\n'.format(sentence.sourceline, u' '.join(tokens))
    result_list.append(text_str)

In [None]:
results = []

context = etree.iterparse(TUEBADZ8_FILE, events=('end',), tag='text', recover=True)

In [None]:
%load_ext memory_profiler

In [None]:
# %memit fast_iter(context, get_text, results)
# peak memory: 330.86 MiB, increment: 249.31 MiB
# %time fast_iter(context, get_text, results)
# Wall time: 2min 13s

In [None]:
%memit texts = list(fast_iter_yield(context, get_text, results))
# peak memory: 330.86 MiB, increment: 249.31 MiB
# %time fast_iter(context, get_text, results)
# Wall time: 2min 13s

## example sentences

In [None]:
from collections import Counter

# NOTE(review): the cells below search this tree for <anaphora> elements, which
# only show up in the Tueba-D/Z 5.0 samples of this notebook -- presumably this
# should parse TUEBADZ5_FILE instead; confirm before rerunning
tree = etree.parse(TUEBADZ8_FILE)
sentence_origins = Counter()  # not filled by any cell visible in this part of the notebook

# NOTE(review): the path 'sentence' should only match direct children of the
# root element -- confirm that this is intended
sentences_iter = tree.iterfind('sentence')

In [None]:
s0 = sentences_iter.next()

In [None]:
print etree.tostring(s0)

## ExportXML ``<element>`` counts

### What about `<relation>` and `<anaphora>`?

In [None]:
from collections import Counter, defaultdict

def count_element_positions(tree, element):
    """
    counts in which positions elements with the given tag occur, i.e.
    which tags their parents and their children have.

    Parameters
    ----------
    tree : etree._ElementTree or etree._Element
        the (sub)tree to search; its elements must provide ``getparent()``
    element : str
        tag name of the elements to inspect, e.g. ``relation``

    Returns
    -------
    element_positions : defaultdict(Counter)
        ``element_positions['parent']`` maps the tag of a parent element to
        the number of inspected elements found directly below it;
        ``element_positions['children']`` maps the tag of a child element
        to the number of times it occurs directly below an inspected element
    """
    element_positions = defaultdict(Counter)

    for elem_instance in tree.iter(element):
        parent = elem_instance.getparent()
        # the root element has no parent, so there is nothing to count for it
        if parent is not None:
            element_positions['parent'][parent.tag] += 1
        # iterating over an element yields its direct children
        # (this replaces the deprecated getchildren() method)
        for child in elem_instance:
            element_positions['children'][child.tag] += 1
    return element_positions

In [None]:
count_element_positions(tree, 'relation')

In [None]:
count_element_positions(tree, 'anaphora')

In [None]:
anaphora_iter = tree.iter('anaphora')

for i in range(3):
    print etree.tostring(anaphora_iter.next())

In [None]:
anaphora_iter = tree.iter('anaphora')
word_anaphora = [a for a in anaphora_iter if a.getparent().tag == 'word']
for word_ana in word_anaphora[:3]:
    print etree.tostring(word_ana)
        

## unusual examples

### expletive

```
~/corpora/tueba/tuebadz-5.0/data/XML $ ack-grep -A 5 s_11429_n_506 tuebadz-5.0.anaphora.export.xml
        <node cat="NX" comment="" func="ON" id="s_11429_n_506">
          <anaphora>
            <relation type="expletive" antecedent=""/>
          </anaphora>
          <word comment="" form="Es" func="HD" pos="PPER" morph="nsn3" id="s_11429_n_10"/>
        </node>
```

### split antecedent

```
27434-          <node cat="MF" comment="" func="-" id="s_382_n_511">
27435-            <node cat="NX" comment="" func="ON" id="s_382_n_501">
27436-              <anaphora>
27437:                <relation type="split_antecedent" antecedent="s_381_n_9,s_378_n_510"/>
27438-              </anaphora>
27439-              <word comment="" form="die" func="-" pos="ART" morph="npm" id="s_382_n_1"/>
27440-              <word comment="" form="Partner" func="HD" pos="NN" morph="npm" id="s_382_n_2"/>
27441-            </node>
27442-            <node cat="ADVX" comment="" func="OADVP" id="s_382_n_502">
27443-              <word comment="" form="miteinander" func="HD" pos="ADV" morph="--" id="s_382_n_3"/>
27444-            </node>
27445-          </node>
```

In [None]:
anaphora_iter = tree.iter('anaphora')
for anaphora in anaphora_iter:
    # there's only one <relation> child element
    antecedent = anaphora.getchildren()[0].attrib['antecedent']

## ExportXML <element> meanings

* `<node>`: a node in a syntax tree
* `<word>`: a token in a sentence / syntax tree
* `<relation>`: a child of an `<anaphora>` element; it's always a leaf node  
   it has a `type` attrib (relation type) and an `antecedent` attrib (antecedent's node ID)  
   NB: if an anaphora has no antecedent, (e.g. if it's an `expletive` relation) the `antecedent` attrib  
   is an empty string!
* `<anaphora>`: a child of a `<node>` or `<word>` element; always has one `<relation>` child;  
  the element itself contains no information
* `<sentence>`: a sentence / syntax tree

```
 ('secedge', 4647),
 ('originDef', 2213),
 ('morphDef', 437),
 ('posDef', 56),
 ('edgeDef', 50),
 ('nodeDef', 29),
 ('editorDef', 26),
 ('secedgeDef', 7),
 ('comment', 6),


 ('posList', 1),
 ('secedgeList', 1),
 ('editorList', 1),
 ('edgeList', 1),
 ('originList', 1),
 ('format', 1),
 ('morphList', 1),
 ('nodeList', 1),
 ('export', 1)]
```

In [None]:
def parse_anaphora(anaphora, source_id):
    """
    extracts the antecedent(s) and the relation type from an <anaphora>
    element.

    Parameters
    ----------
    anaphora : etree.Element
        an <anaphora> element
    source_id : str
        the node ID of the anaphora (points either to a <node> or a <word>);
        only used in the error message

    Returns
    -------
    antecedent : str
        the value of the ``antecedent`` attribute, i.e. the node ID of the
        antecedent, e.g. ``s_4_n_527``. The string is returned as is: it is
        empty if there is no antecedent (e.g. ``expletive`` relations) and
        contains several comma-separated IDs for split antecedents.
    relation_type : str
        anaphoric relation type, e.g. ``anaphoric`` or ``coreferential``

    Raises
    ------
    ValueError
        if the <anaphora> element doesn't contain a <relation> element
    KeyError
        if the <relation> lacks an ``antecedent`` or ``type`` attribute
    """
    # look the <relation> up by its tag instead of its position, so that
    # e.g. an XML comment preceding it doesn't get mistaken for it
    relation = anaphora.find('relation')
    if relation is None:
        raise ValueError(
            "<anaphora> element of '{}' has no <relation> child".format(source_id))
    return relation.attrib['antecedent'], relation.attrib['type']
    

def exportxml2igraph(exportxml_file):
    """
    converts an ExportXML file into one directed igraph graph.

    The file is expected to use plain ``id`` attributes and <anaphora>
    elements (as in the Tueba-D/Z 5.0 samples). Each <sentence> becomes a
    vertex named after its ``origin`` attribute, each <node> a vertex
    labelled with its ``cat`` and each <word> a vertex labelled with its
    ``form``. Edges point from a parent element to its child element.
    Each <anaphora> adds one edge per antecedent (from the element that
    contains the <anaphora> to the antecedent), which carries the
    attribute ``exportxml:relation_type``. If an anaphora has no antecedent,
    the vertex of its parent gets the attribute ``exportxml:anaphora_type``
    instead.

    TODO: add <node> and <word> attributes

    Parameters
    ----------
    exportxml_file : str
        path to the ExportXML file

    Returns
    -------
    idocgraph : igraph.Graph
        a directed graph representing the whole file
    """
    import sys  # only needed to report unexpected input on stderr

    # in igraph, adding a single edge is prohibitively slow,
    # as the whole index of the graph has to be rebuilt!
    # http://stackoverflow.com/questions/13974279/igraph-why-is-add-edge-function-so-slow-ompared-to-add-edges
    # to speed this up, store the edges in a list & call add_edges() once!
    edges = []
    relations = {}
    idocgraph = ig.Graph(directed=True)

    treeiter = etree.iterparse(exportxml_file, tag='sentence')
    for _action, sentence in treeiter:
        # NOTE(review): if several sentences share one ``origin`` value, this
        # adds several vertices with the same name -- confirm that's intended
        sent_root_id = sentence.attrib['origin']
        idocgraph.add_vertex(sent_root_id, label=sent_root_id)

        for element in sentence.iter('node', 'word', 'anaphora'):
            parent_element = element.getparent()
            # some <anaphora> are children of <word> elements
            if parent_element.tag in ('node', 'word'):
                parent_id = parent_element.attrib['id']
            elif parent_element.tag == 'sentence':
                parent_id = parent_element.attrib['origin']
            else:
                sys.stderr.write("Unexpected parent '{}' of element '{}'\n".format(parent_element, element))
                # without a usable parent ID, the element would get attached
                # to whatever parent was seen before it, so skip it
                continue
            element_id = element.attrib.get('id')  # <anaphora> doesn't have an ID

            if element.tag == 'node':
                idocgraph.add_vertex(element_id, label=element.attrib['cat'])
                edges.append((parent_id, element_id))
            elif element.tag == 'word':
                idocgraph.add_vertex(element_id, label=element.attrib['form'])
                edges.append((parent_id, element_id))
            else:  # element.tag == 'anaphora'
                # <anaphora> doesn't have an ID, but it's tied to its parent element
                antecedent_str, relation_type = parse_anaphora(element, parent_id)
                if antecedent_str:
                    # there might be more than one antecedent
                    for antecedent_id in antecedent_str.split(','):
                        edge = (parent_id, antecedent_id)
                        edges.append(edge)
                        relations[edge] = relation_type
                else:
                    # there's no antecedent in case of an expletive anaphoric relation
                    relations[(parent_id, None)] = relation_type

        # everything we need has been copied into Python strings, so release
        # the parsed sentence to keep memory usage flat on large corpora
        sentence.clear()

    idocgraph.add_edges(edges)

    # igraph doesn't store nodes/edge names in a dict, so a lookup would be O(n)
    node_name2id = {node['name']: node.index for node in idocgraph.vs}
    edge_endpoints2id = {(edge.source, edge.target): edge.index
                         for edge in idocgraph.es}

    for (source, target), relation_type in relations.items():
        if target:
            edge_endpoints = (node_name2id[source], node_name2id[target])
            idocgraph.es[edge_endpoints2id[edge_endpoints]]['exportxml:relation_type'] = relation_type
        else:
            # there's no antecedent in case of an expletive anaphoric relation,
            # so the relation type is stored on the anaphora's vertex instead
            idocgraph.vs[node_name2id[source]]['exportxml:anaphora_type'] = relation_type
    return idocgraph

            

In [None]:
%%time
tuebagraph = exportxml2igraph(TUEBADZ_FILE)

In [None]:
tuebagraph.ecount()

In [None]:
tuebagraph.vcount()

In [None]:
import igraph as ig

foo = ig.Graph(directed=True)
foo.add_vertices(['1','2','3','4'])
foo.add_edges([('1', '1'), ('1', '2'), ('3', '4')])

In [None]:
# for e in foo.es:
#     print e, e.index, e.source, e.target
foo.es[0]

In [None]:
for v in foo.vs:
    print v, v.index, v['name']

In [None]:
foo.vs.select(name='2')

In [None]:
foo.vs[0]

In [None]:
list(foo.es.select(_source='3', _target='4'))

In [None]:
from collections import defaultdict

def exportxml2dict(exportxml_file):
    """
    extracts plain node and edge lists from an ExportXML file (e.g. to
    feed them into a graph library in one go).

    The file is expected to use plain ``id`` attributes (as in the
    Tueba-D/Z 5.0 samples). Only <sentence>, <node> and <word> elements are
    considered; other elements (e.g. <anaphora>) are ignored for now.

    Parameters
    ----------
    exportxml_file : str
        path to the ExportXML file

    Returns
    -------
    nodes : list of (str, str) tuples
        (node ID, node label) tuples. A sentence uses its ``origin``
        attribute as both ID and label, a <node> is labelled with its
        ``cat`` and a <word> with its ``form`` attribute.
    edges : list of (str, str) tuples
        (parent node ID, child node ID) tuples
    """
    import sys  # only needed to report unexpected input on stderr

    # name of the attribute that serves as the label of an element
    label_attrib = {'node': 'cat', 'word': 'form'}
    nodes = []
    edges = []

    itree = etree.iterparse(exportxml_file, tag='sentence')
    for _action, sentence in itree:
        sent_root_id = sentence.attrib['origin']
        nodes.append((sent_root_id, sent_root_id))

        # iter() only yields elements with the given tags
        for element in sentence.iter('node', 'word'):
            parent_element = element.getparent()
            if parent_element.tag == 'node':
                parent_id = parent_element.attrib['id']
            elif parent_element.tag == 'sentence':
                parent_id = parent_element.attrib['origin']
            else:
                sys.stderr.write("Unexpected parent '{}' of element '{}'\n".format(parent_element, element))
                # without a usable parent ID, the element would get attached
                # to whatever parent was seen before it, so skip it
                continue

            element_id = element.attrib.get('id')
            element_label = element.attrib[label_attrib[element.tag]]
            nodes.append((element_id, element_label))
            edges.append((parent_id, element_id))

        # everything we need has been copied into Python strings, so release
        # the parsed sentence to keep memory usage flat on large corpora
        sentence.clear()
    return nodes, edges

# Speed comparison

In [None]:
%%time
nodes, edges = exportxml2dict(TUEBADZ_FILE) # 6.63s

## igraph: preprocessed node/edge lists, edges batch insert (total: 12.81 s)

- create node/edge lists
- add nodes one by one
- add edges in one go

```
%%time

idocgraph = ig.Graph()
for node_id, node_label in nodes:
    idocgraph.add_vertex(node_id, label=node_label)
idocgraph.add_edges(edges)
```

CPU times: user 6.06 s, sys: 112 ms, total: 6.18 s
Wall time: 6.18 s


## igraph: preprocessed node/edge lists, nodes & edges batch insert (total: 7.85 s)

- create node/edge lists
- add nodes in one go (without labels)
- add edges in one go

```python
%%time

idocgraph = ig.Graph()
node_ids = (node_id for (node_id, node_label) in nodes)
idocgraph.add_vertices(node_ids)
idocgraph.add_edges(edges)
```

CPU times: user 1.16 s, sys: 60 ms, total: 1.22 s
Wall time: 1.22 s

In [None]:
def exportxml2docgraph(exportxml_file):
    """
    converts an ExportXML file into one ``DiscourseDocumentGraph``.

    The file is expected to use plain ``id`` attributes (as in the
    Tueba-D/Z 5.0 samples). Each <sentence> becomes a node named after its
    ``origin`` attribute, each <node> a node labelled with its ``cat`` and
    each <word> a node labelled with its ``form``. Edges point from a parent
    element to its child element. Other elements (e.g. <anaphora>) are
    ignored for now.

    Parameters
    ----------
    exportxml_file : str
        path to the ExportXML file

    Returns
    -------
    docgraph : dg.DiscourseDocumentGraph
        a graph representing the whole file
    """
    import sys  # only needed to report unexpected input on stderr

    # name of the attribute that serves as the label of an element
    label_attrib = {'node': 'cat', 'word': 'form'}
    edges = []
    docgraph = dg.DiscourseDocumentGraph()
    # default edge attributes
    # NOTE(review): this one dict (and its set) is handed over for every
    # edge -- verify that add_edges_from() doesn't make the edges share it
    edge_attribs = {'layers': {docgraph.ns}}

    treeiter = etree.iterparse(exportxml_file, tag='sentence')
    for _action, sentence in treeiter:
        sent_root_id = sentence.attrib['origin']
        docgraph.add_node(sent_root_id, label=sent_root_id)

        # iter() only yields elements with the given tags
        for element in sentence.iter('node', 'word'):
            parent_element = element.getparent()
            if parent_element.tag == 'node':
                parent_id = parent_element.attrib['id']
            elif parent_element.tag == 'sentence':
                parent_id = parent_element.attrib['origin']
            else:
                sys.stderr.write("Unexpected parent '{}' of element '{}'\n".format(parent_element, element))
                # without a usable parent ID, the element would get attached
                # to whatever parent was seen before it, so skip it
                continue

            element_id = element.attrib.get('id')
            element_label = element.attrib[label_attrib[element.tag]]
            docgraph.add_node(element_id, label=element_label)
            edges.append((parent_id, element_id, edge_attribs))

        # everything we need has been copied into Python strings, so release
        # the parsed sentence to keep memory usage flat on large corpora
        sentence.clear()

    # add all edges in one go
    docgraph.add_edges_from(edges)
    return docgraph