In [1]:
from docria import Document, DataTypes as T, NodeSpan, set_large_screen, MsgpackCodec, MsgpackDocument
from docria.storage import MsgpackDocumentIO, MsgpackDocumentReader, MsgpackDocumentWriter
from lxml import etree
import regex as re

## Import

In [2]:
%%sh
zcat pubmed_mini/pubmed19n0080.xml.gz | head -n 100

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE PubmedArticleSet SYSTEM "http://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">
<PubmedArticleSet>
  <PubmedArticle>
    <MedlineCitation Status="MEDLINE" Owner="NLM">
      <PMID Version="1">2393846</PMID>
      <DateCompleted>
        <Year>1990</Year>
        <Month>10</Month>
        <Day>09</Day>
      </DateCompleted>
      <DateRevised>
        <Year>2013</Year>
        <Month>11</Month>
        <Day>21</Day>
      </DateRevised>
      <Article PubModel="Print">
        <Journal>
          <ISSN IssnType="Print">0008-5472</ISSN>
          <JournalIssue CitedMedium="Print">
            <Volume>50</Volume>
            <Issue>18</Issue>
            <PubDate>
              <Year>1990</Year>
              <Month>Sep</Month>
              <Day>15</Day>
            </PubDate>
          </JournalIssue>
          <Title>Cancer research</Title>
          <ISOAbbreviation>Cancer Res.</ISOAbbreviation>
        </Journal>
        <ArticleT

In [3]:
pubmed0080 = etree.parse("pubmed_mini/pubmed19n0080.xml.gz")

In [4]:
articles = pubmed0080.iterfind("PubmedArticle")

In [5]:
article = next(articles)

In [6]:
print(etree.tounicode(article))

<PubmedArticle>
    <MedlineCitation Status="MEDLINE" Owner="NLM">
      <PMID Version="1">2393846</PMID>
      <DateCompleted>
        <Year>1990</Year>
        <Month>10</Month>
        <Day>09</Day>
      </DateCompleted>
      <DateRevised>
        <Year>2013</Year>
        <Month>11</Month>
        <Day>21</Day>
      </DateRevised>
      <Article PubModel="Print">
        <Journal>
          <ISSN IssnType="Print">0008-5472</ISSN>
          <JournalIssue CitedMedium="Print">
            <Volume>50</Volume>
            <Issue>18</Issue>
            <PubDate>
              <Year>1990</Year>
              <Month>Sep</Month>
              <Day>15</Day>
            </PubDate>
          </JournalIssue>
          <Title>Cancer research</Title>
          <ISOAbbreviation>Cancer Res.</ISOAbbreviation>
        </Journal>
        <ArticleTitle>Differential growth-inhibitory effects of gallium on B-lymphocyte lines in high versus low iron concentrations.</ArticleTitle>
        <Pagination>
 

In [7]:
article.find(".//Article/ArticleTitle").text

'Differential growth-inhibitory effects of gallium on B-lymphocyte lines in high versus low iron concentrations.'

In [8]:
article.find(".//PMID").text

'2393846'

In [9]:
abstract = article.find(".//Abstract")

In [10]:
print(etree.tounicode(abstract))

<Abstract>
          <AbstractText>The growth inhibitory effects of gallium on a murine and human B-cell line were studied using two different serum-free culture systems: (a) ferric citrate medium containing 500 microM iron and (b) transferrin medium containing 5 micrograms/ml of iron-saturated transferrin (0.125 microM iron). For the human cell line in ferric citrate medium, 50% growth inhibition achieved in the presence of transferrin-gallium represented a gallium concentration 80-fold lower than the concentration required when gallium nitrate was added. In the transferrin system, significantly higher transferrin-gallium concentrations were required to achieve the same inhibitory effects. Monoclonal antibody to the transferrin receptor significantly decreased the growth inhibiting effect of transferrin-gallium in the mouse ferric citrate system. Thus, under very different culture conditions, gallium and iron appear to compete via the transferrin-transferrin receptor pathway for cellu

In [11]:
descedants = list(abstract.iterdescendants())

In [12]:
descedants

[<Element AbstractText at 0x7faebfe52908>]

In [13]:
# Collect *all* text below <Abstract>. itertext() yields every element's text
# and tail in document order, so words that follow inline markup inside the
# abstract are kept; joining only `node.text` silently drops those tails.
abstract_text = "".join(abstract.itertext()).strip()
print(abstract_text)

The growth inhibitory effects of gallium on a murine and human B-cell line were studied using two different serum-free culture systems: (a) ferric citrate medium containing 500 microM iron and (b) transferrin medium containing 5 micrograms/ml of iron-saturated transferrin (0.125 microM iron). For the human cell line in ferric citrate medium, 50% growth inhibition achieved in the presence of transferrin-gallium represented a gallium concentration 80-fold lower than the concentration required when gallium nitrate was added. In the transferrin system, significantly higher transferrin-gallium concentrations were required to achieve the same inhibitory effects. Monoclonal antibody to the transferrin receptor significantly decreased the growth inhibiting effect of transferrin-gallium in the mouse ferric citrate system. Thus, under very different culture conditions, gallium and iron appear to compete via the transferrin-transferrin receptor pathway for cellular uptake. The growth inhibitory e

## Processing prototyping

Prototype and test the regular expressions at https://www.regex101.com before using them in the cells below.

In [14]:
# Prototype sentence splitter. A boundary is one of . ? ! that is followed
# either by optional whitespace and an uppercase letter (lookahead only, the
# letter is not consumed) or by the end of the text.
# NOTE: text after the last matched boundary is never printed by this loop.
last = 0  # start offset of the sentence currently being collected
for m in re.finditer(r"(?>[\.\?\!])(?:\s*(?=\p{Lu})|$)", abstract_text):
    # m.start() is the terminator itself, so +1 keeps it in the sentence.
    print("SENT:", abstract_text[last:m.start()+1])
    # m.end() lies past any whitespace consumed after the terminator.
    last = m.end()

SENT: The growth inhibitory effects of gallium on a murine and human B-cell line were studied using two different serum-free culture systems: (a) ferric citrate medium containing 500 microM iron and (b) transferrin medium containing 5 micrograms/ml of iron-saturated transferrin (0.125 microM iron).
SENT: For the human cell line in ferric citrate medium, 50% growth inhibition achieved in the presence of transferrin-gallium represented a gallium concentration 80-fold lower than the concentration required when gallium nitrate was added.
SENT: In the transferrin system, significantly higher transferrin-gallium concentrations were required to achieve the same inhibitory effects.
SENT: Monoclonal antibody to the transferrin receptor significantly decreased the growth inhibiting effect of transferrin-gallium in the mouse ferric citrate system.
SENT: Thus, under very different culture conditions, gallium and iron appear to compete via the transferrin-transferrin receptor pathway for cellular u

In [15]:
set_large_screen()

In [16]:
doc = Document()

In [17]:
doc.props["pmid"] = article.find(".//PMID").text

In [18]:
doc.maintext = abstract_text

In [19]:
doc

Document(0 layers, 1 texts, pmid=2393846)

In [20]:
doc.add_layer("token", text=T.span(), partOfSpeech=T.string, namedEntity=T.string("O"), indx=T.int32)
doc.add_layer("sentence", text=T.span(), tokens=T.nodespan("token"))

Layer 'sentence' with 0 nodes.,Layer 'sentence' with 0 nodes.,Layer 'sentence' with 0 nodes.
#,text,tokens


In [21]:
# Fill the 'sentence' and 'token' layers of the prototype document.
sentence_layer = doc["sentence"]
token_layer = doc["token"]
last = 0  # start offset (in the main text) of the sentence being collected
# Sentence boundary: . ? or ! followed by optional whitespace and an uppercase
# letter (lookahead), or by the end of the text.
for m in re.finditer(r"(?>[\.\?\!])(?:\s*(?=\p{Lu})|$)", abstract_text):
    tokens = []
    # Span of the sentence, including its terminator (hence the +1).
    text = doc.maintext[last:m.start()+1]
    # Token matches are relative to the sentence; `offset` maps them back to
    # positions in the main text.
    offset = last
    # Token: a run of letters, a number with an optional decimal part, or a
    # single punctuation character from the listed set.
    for tok_i, tok_m in enumerate(re.finditer(r"\p{L}+|\p{N}+(\.\p{N}+)?|[\-\/%():,\.;+&#=!?@_<>]", str(text))):
        # indx is the 1-based position of the token within its sentence.
        tokens.append(token_layer.add(indx=tok_i+1, text=doc.maintext[tok_m.start()+offset:tok_m.end()+offset]))
    
    # Only sentences that produced at least one token are recorded.
    if len(tokens) > 0:
        sentence = sentence_layer.add(text=text)
        sentence["tokens"] = NodeSpan(tokens[0], tokens[-1])

    # Continue after the terminator and any whitespace the match consumed.
    last = m.end()

In [22]:
doc["sentence"]

Layer 'sentence' with 6 nodes.,Layer 'sentence' with 6 nodes.,Layer 'sentence' with 6 nodes.
#,text,tokens
0,span(main[0:293]) = 'The growth inhibitory effects of gallium on a murine and human B-cell line w ... transferrin medium containing 5 micrograms/ml of iron-saturated transferrin (0.125 microM iron).',NodeSpan[token: 0 to incl. 57] = 'The growth inhibitory effects of gallium on a murine and human ... transferrin medium containing 5 micrograms/ml of iron-saturated transferrin (0.125 microM iron).'
1,"span(main[294:527]) = 'For the human cell line in ferric citrate medium, 50% growth inhibition ac ... lium concentration 80-fold lower than the concentration required when gallium nitrate was added.'","NodeSpan[token: 58 to incl. 97] = 'For the human cell line in ferric citrate medium, 50% growth i ... lium concentration 80-fold lower than the concentration required when gallium nitrate was added.'"
2,"span(main[528:664]) = 'In the transferrin system, significantly higher transferrin-gallium concentrations were required to achieve the same inhibitory effects.'","NodeSpan[token: 98 to incl. 117] = 'In the transferrin system, significantly higher transferrin-gallium concentrations were required to achieve the same inhibitory effects.'"
3,span(main[665:824]) = 'Monoclonal antibody to the transferrin receptor significantly decreased the growth inhibiting effect of transferrin-gallium in the mouse ferric citrate system.',NodeSpan[token: 118 to incl. 140] = 'Monoclonal antibody to the transferrin receptor significantl ... ecreased the growth inhibiting effect of transferrin-gallium in the mouse ferric citrate system.'
4,"span(main[825:976]) = 'Thus, under very different culture conditions, gallium and iron appear to compete via the transferrin-transferrin receptor pathway for cellular uptake.'","NodeSpan[token: 141 to incl. 165] = 'Thus, under very different culture conditions, gallium and iron appear to compete via the transferrin-transferrin receptor pathway for cellular uptake.'"
5,span(main[977:1169]) = 'The growth inhibitory effects of gallium are markedly potentiated when th ... functional transferrin receptors even in cells continuously cultured in transferrin-free medium.',NodeSpan[token: 166 to incl. 195] = 'The growth inhibitory effects of gallium are markedly potent ... functional transferrin receptors even in cells continuously cultured in transferrin-free medium.'


In [23]:
doc["sentence"]["tokens"][0]

Node span with N=58 elements,Node span with N=58 elements,Node span with N=58 elements,Node span with N=58 elements,Node span with N=58 elements
#,indx,namedEntity,partOfSpeech,text
0,1,'O','',span(main[0:3]) = 'The'
1,2,'O','',span(main[4:10]) = 'growth'
2,3,'O','',span(main[11:21]) = 'inhibitory'
3,4,'O','',span(main[22:29]) = 'effects'
4,5,'O','',span(main[30:32]) = 'of'
5,6,'O','',span(main[33:40]) = 'gallium'
6,7,'O','',span(main[41:43]) = 'on'
7,8,'O','',span(main[44:45]) = 'a'
8,9,'O','',span(main[46:52]) = 'murine'
9,10,'O','',span(main[53:56]) = 'and'


In [24]:
doc["token"][doc["token"]["text"] == "of"]

Query with 5 nodes.,Query with 5 nodes.,Query with 5 nodes.,Query with 5 nodes.,Query with 5 nodes.
#,indx,namedEntity,partOfSpeech,text
0,5,'O','',span(main[30:32]) = 'of'
1,48,'O','',span(main[243:245]) = 'of'
2,19,'O','',span(main[391:393]) = 'of'
3,13,'O','',span(main[766:768]) = 'of'
4,5,'O','',span(main[1007:1009]) = 'of'


In [25]:
from docria.algorithm import group_by_span

In [26]:
group_by_span?

[0;31mSignature:[0m
[0mgroup_by_span[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m[[0m[0;34m'group_nodes:List[docria.model.Node]'[0m[0;34m,[0m [0;34m'layer_nodes:Dict[str, Iterable[docria.model.Node]]'[0m[0;34m,[0m [0;34m"resolution='intersect'"[0m[0;34m,[0m [0;34m"group_span_field='text'"[0m[0;34m,[0m [0;34m'layer_span_field:Union[Dict[str, str], NoneType]=None'[0m[0;34m,[0m [0;34m'include_empty_groups=True'[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mList[0m[0;34m[[0m[0mTuple[0m[0;34m[[0m[0mdocria[0m[0;34m.[0m[0mmodel[0m[0;34m.[0m[0mNode[0m[0;34m,[0m [0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mdocria[0m[0;34m.[0m[0mmodel[0m[0;34m.[0m[0mNode[0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Groups all nodes in layer_nodes into the corresponding bucket_node

Nodes with textspans that equals to NIL/None are ignored

Hint: The statement below works because, 'text' is the default name of a span, if it is not called 'text', either group_span_field or layer_span_field = {'token': name} must be set. 

In [27]:
of_tokens = doc["token"][doc["token"]["text"] == "of"]

In [28]:
group_by_span(group_nodes=doc["sentence"], layer_nodes={"token": of_tokens})

[(Node<sentence#0>, {'token': [Node<token#4>, Node<token#47>]}),
 (Node<sentence#1>, {'token': [Node<token#76>]}),
 (Node<sentence#2>, {'token': []}),
 (Node<sentence#3>, {'token': [Node<token#130>]}),
 (Node<sentence#4>, {'token': []}),
 (Node<sentence#5>, {'token': [Node<token#170>]})]

## Scale it up, processing many documents

In [29]:
def segment(doc):
    """Add regex-based ``token`` and ``sentence`` layers to ``doc``.

    The document's ``"main"`` text is split into sentences at ``.``, ``?`` or
    ``!`` when the terminator is followed by optional whitespace and an
    uppercase letter, or by the end of the text. Each sentence is tokenized
    into letter runs, numbers (with an optional decimal part) and single
    punctuation characters.

    Text remaining after the last matched terminator (for example an abstract
    that does not end with punctuation, or that has trailing whitespace) is
    kept as a final sentence instead of being dropped.

    Args:
        doc: docria ``Document`` whose ``"main"`` text is already set. The
            ``token`` and ``sentence`` layers are added to it in place.

    Returns:
        None. ``doc`` is modified in place.
    """
    # add_layer returns the new layer, so no second lookup is needed.
    token_layer = doc.add_layer("token", text=T.span(), partOfSpeech=T.string, namedEntity=T.string("O"), indx=T.int32)
    sentence_layer = doc.add_layer("sentence", text=T.span(), tokens=T.nodespan("token"))

    pubmed_abstract = doc.texts["main"]
    raw_text = str(pubmed_abstract)

    def add_sentence(start, stop):
        """Tokenize raw_text[start:stop] and record it as one sentence."""
        tokens = []
        for tok_i, tok_m in enumerate(re.finditer(r"\p{L}+|\p{N}+(\.\p{N}+)?|[\-\/%():,\.;+&#=!?@_<>]", raw_text[start:stop]), start=1):
            # Match offsets are relative to the sentence; shift by `start`
            # to obtain spans on the main text. indx is 1-based per sentence.
            tokens.append(token_layer.add(indx=tok_i, text=pubmed_abstract[tok_m.start() + start:tok_m.end() + start]))

        # Sentences without any token (e.g. whitespace only) are not stored.
        if tokens:
            sentence = sentence_layer.add(text=pubmed_abstract[start:stop])
            sentence["tokens"] = NodeSpan(tokens[0], tokens[-1])

    last = 0
    for m in re.finditer(r"(?>[\.\?\!])(?:\s*(?=\p{Lu})|$)", raw_text):
        # m.start() is the terminator itself; +1 keeps it in the sentence.
        add_sentence(last, m.start() + 1)
        # Resume after the terminator and any whitespace the match consumed.
        last = m.end()

    # Keep whatever follows the last matched terminator.
    if last < len(raw_text):
        add_sentence(last, len(raw_text))

In [30]:
def process_pubmed(articles):
    """Yield one segmented docria ``Document`` per article that has an abstract.

    Args:
        articles: iterable of ``PubmedArticle`` lxml elements.

    Yields:
        ``Document`` with properties ``pmid`` and ``title``, the abstract as
        main text, and the layers added by ``segment``.

    Raises:
        ValueError: if an article with an abstract has no PMID or no
            ``ArticleTitle`` element.

    Articles without an ``Abstract`` element are skipped. Title and abstract
    text are gathered with ``itertext()`` so that text following inline markup
    inside those elements is kept; ``.text`` alone returns only the text
    before the first child element (or ``None``).
    """
    for article in articles:
        abstract = article.find(".//Abstract")
        if abstract is None:
            # Nothing to segment for this article.
            continue

        pmid_node = article.find(".//PMID")
        if pmid_node is None or pmid_node.text is None:
            raise ValueError("PubmedArticle with an abstract has no PMID")
        pmid = pmid_node.text

        title_node = article.find(".//Article/ArticleTitle")
        if title_node is None:
            raise ValueError("PubmedArticle %s has no ArticleTitle" % pmid)

        # itertext() always produces a string, also for titles that start
        # with an inline element.
        title = "".join(title_node.itertext())
        abstract_text = "".join(abstract.itertext()).strip()

        doc = Document()
        doc.props["pmid"] = pmid
        doc.props["title"] = title
        doc.maintext = abstract_text

        segment(doc)
        yield doc

In [31]:
def process(inputfile, outputfile):
    """Convert one PubMed XML file into a docria msgpack file.

    Args:
        inputfile: path of the PubMed XML file handed to ``etree.parse``.
        outputfile: path of the docria file to write (opened in binary mode).
    """
    tree = etree.parse(inputfile)
    with open(outputfile, "wb") as sink:
        with MsgpackDocumentWriter(sink) as doc_writer:
            article_nodes = tree.iterfind("PubmedArticle")
            for segmented in process_pubmed(article_nodes):
                doc_writer.write(segmented)

In [32]:
process("pubmed_mini/pubmed19n0080.xml.gz", "pubmed00080.docria")

In [33]:
from tqdm import tqdm

In [34]:
reader = MsgpackDocumentReader(open("pubmed00080.docria", "rb"))

In [35]:
pmids = []
titles = []

In [36]:
for doc in tqdm(reader):
    props = doc.properties()
    pmids.append(props["pmid"])
    titles.append(props["title"])

22477it [00:00, 50456.68it/s]


In [37]:
pmids[0:100]

['2393846',
 '2393847',
 '2393848',
 '2393849',
 '2393850',
 '2393851',
 '2393852',
 '2393853',
 '2393854',
 '2393855',
 '2393856',
 '2393857',
 '2393858',
 '2393859',
 '2393860',
 '2393861',
 '2393862',
 '2393863',
 '2393864',
 '2393865',
 '2393866',
 '2393867',
 '2393868',
 '2393869',
 '2393870',
 '2393871',
 '2393872',
 '2393873',
 '2393874',
 '2393875',
 '2393876',
 '2393877',
 '2393878',
 '2393879',
 '2393880',
 '2393881',
 '2393882',
 '2393883',
 '2393884',
 '2393885',
 '2393886',
 '2393887',
 '2393888',
 '2393889',
 '2393890',
 '2393891',
 '2393892',
 '2393894',
 '2393895',
 '2393896',
 '2393897',
 '2393898',
 '2393899',
 '2393900',
 '2393901',
 '2393902',
 '2393904',
 '2393905',
 '2393906',
 '2393907',
 '2393908',
 '2393909',
 '2393911',
 '2393913',
 '2393914',
 '2393916',
 '2393917',
 '2393918',
 '2393919',
 '2393920',
 '2393921',
 '2393922',
 '2393934',
 '2393935',
 '2393936',
 '2393937',
 '2393938',
 '2393939',
 '2393940',
 '2393941',
 '2393942',
 '2393943',
 '2393944',
 '23

In [38]:
titles[0:100]

['Differential growth-inhibitory effects of gallium on B-lymphocyte lines in high versus low iron concentrations.',
 'Inhibition of growth of established N-methyl-N-nitrosourea-induced mammary cancer in rats by retinoic acid and ovariectomy.',
 'Differential down-regulation of epidermal protein kinase C by 12-O-tetradecanoylphorbol-13-acetate and diacylglycerol: association with epidermal hyperplasia and tumor promotion.',
 'Clearance and tissue distribution of recombinant human interleukin 1 beta in rats.',
 'Effect of different levels of calorie restriction on azoxymethane-induced colon carcinogenesis in male F344 rats.',
 'Uveal melanoma in relation to ultraviolet light exposure and host factors.',
 'Lorglumide and loxiglumide inhibit gastrin-stimulated DNA synthesis in a rat tumoral acinar pancreatic cell line (AR42J).',
 'Opposite effects of tamoxifen on in vitro protein kinase C activity and endogenous protein phosphorylation in intact MCF-7 cells.',
 'Decreased serum concentrati

In [39]:
sentences = []

In [40]:
# Second pass over the file: decode every document and collect its sentences.
# The context manager closes the file handle once reading is done; the bare
# open() call left it open.
with open("pubmed00080.docria", "rb") as docria_file:
    reader = MsgpackDocumentReader(docria_file)
    for mdoc in tqdm(reader):
        doc = mdoc.document()
        sentences.extend(str(sent["text"]) for sent in doc["sentence"])

22477it [00:21, 1023.74it/s]


In [41]:
sentences[0:100]

['The growth inhibitory effects of gallium on a murine and human B-cell line were studied using two different serum-free culture systems: (a) ferric citrate medium containing 500 microM iron and (b) transferrin medium containing 5 micrograms/ml of iron-saturated transferrin (0.125 microM iron).',
 'For the human cell line in ferric citrate medium, 50% growth inhibition achieved in the presence of transferrin-gallium represented a gallium concentration 80-fold lower than the concentration required when gallium nitrate was added.',
 'In the transferrin system, significantly higher transferrin-gallium concentrations were required to achieve the same inhibitory effects.',
 'Monoclonal antibody to the transferrin receptor significantly decreased the growth inhibiting effect of transferrin-gallium in the mouse ferric citrate system.',
 'Thus, under very different culture conditions, gallium and iron appear to compete via the transferrin-transferrin receptor pathway for cellular uptake.',
 'T

In [42]:
from multiprocessing import Pool, cpu_count

In [43]:
import os

In [44]:
# Every gzipped XML file directly inside pubmed_mini/, with the directory prefix.
inputfiles = [
    os.path.join("pubmed_mini", fname)
    for fname in os.listdir("pubmed_mini")
    if fname.endswith(".xml.gz")
]

In [45]:
inputfiles

['pubmed_mini/pubmed19n0080.xml.gz']

In [46]:
outputfiles = [os.path.join("pubmed_mini", os.path.basename(fname) + ".docria") for fname in inputfiles]

In [47]:
outputfiles

['pubmed_mini/pubmed19n0080.xml.gz.docria']

In [48]:
def genwork(inputfiles, outputfiles):
    """Lazily pair input and output paths into work items.

    Args:
        inputfiles: iterable of input paths.
        outputfiles: iterable of output paths, matched to inputs by position.

    Yields:
        dict with keys ``"inputfile"`` and ``"outputfile"``. Pairing stops at
        the shorter of the two iterables.
    """
    pairs = zip(inputfiles, outputfiles)
    for source_path, target_path in pairs:
        yield dict(inputfile=source_path, outputfile=target_path)

In [49]:
def work(args):
    """Process one work item and report which output file was written.

    Args:
        args: dict with keys ``"inputfile"`` and ``"outputfile"``.

    Returns:
        The ``"outputfile"`` value of ``args``.
    """
    source, target = args["inputfile"], args["outputfile"]
    process(source, target)
    return target

In [50]:
pool = Pool(cpu_count())

In [51]:
pool.imap_unordered?

[0;31mSignature:[0m [0mpool[0m[0;34m.[0m[0mimap_unordered[0m[0;34m([0m[0mfunc[0m[0;34m,[0m [0miterable[0m[0;34m,[0m [0mchunksize[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Like `imap()` method but ordering of results is arbitrary.
[0;31mFile:[0m      ~/anaconda3/lib/python3.6/multiprocessing/pool.py
[0;31mType:[0m      method


In [52]:
# Convert all files in parallel; results arrive in completion order.
try:
    for completed in tqdm(pool.imap_unordered(work, genwork(inputfiles, outputfiles))):
        pass
finally:
    # Shut the pool down so its worker processes do not linger; it was
    # previously left open. The pool cannot accept new work after close().
    pool.close()
    pool.join()

1it [01:17, 77.35s/it]
