In [6]:
#install docria
!pip install docria 
#install regular expressions
!pip install regex

Collecting regex
  Downloading https://files.pythonhosted.org/packages/f1/2f/f586e982712ffee5681ca149d54480dbb04ff533e9e4638c5e28ae76bdb5/regex-2019.08.19-cp37-none-win_amd64.whl (325kB)
Installing collected packages: regex
Successfully installed regex-2019.8.19


In [8]:
# import relevant libraries 
from docria import Document, DataTypes as T, NodeSpan, set_large_screen, MsgpackCodec
from docria.storage import MsgpackDocumentIO, MsgpackDocumentReader, MsgpackDocumentWriter
from docria.codec import MsgpackDocument #MsgpackDocument must be imported from docria.codec
from lxml import etree #approx the same thing but better than import xml.etree.ElementTree as ET
import regex as re 

## Import

In [9]:
#IPython magic commands are prefixed by "%" -aim: succinctly solve various common problems in standard data analysis.
#cell magics are denoted by a double %% prefix

%%sh #invoke system shell (shell = user interface)
zcat pubmed_mini/pubmed19n0080.xml.gz | head -n 100 
#zcat prints whole document, pipe head - n 100 prints first 100 lines of document 

Couldn't find program: 'sh'


In [10]:
pubmed0080 = etree.parse("pubmed_mini/pubmed19n0080.xml.gz") #parses document, stored in memory (doesn't print it)

OSError: Error reading file 'pubmed_mini/pubmed19n0080.xml.gz': failed to load external entity "pubmed_mini/pubmed19n0080.xml.gz"

In [None]:
articles = pubmed0080.iterfind("PubmedArticle") #finds PubmedArticle in document, stores it as variable articles 

In [None]:
article = next(articles) #finds next I assume

In [None]:
print(etree.tounicode(article)) 
# "Serialize an element to the Python unicode representation of its XML tree." - https://lxml.de/api/lxml.etree-module.html
#"Serialization is the process of converting an object into a form that can be readily transported. 
# For example, you can serialize an object and transport it over the Internet using HTTP between a client and a server. 
# On the other end, deserialization reconstructs the object from the stream." - https://lxml.de/api/lxml.etree-module.html

In [None]:
article.find(".//Article/ArticleTitle").text #finds first ArticleTitle I think

In [None]:
article.find(".//PMID").text #finds first PubMed ID I think

In [None]:
abstract = article.find(".//Abstract") #finds first abstract 

In [None]:
print(etree.tounicode(abstract))

In [None]:
descedants = list(abstract.iterdescendants())  

In [None]:
descedants

In [None]:
abstract_text = "".join(node.text for node in descedants if node.text is not None).strip()
print(abstract_text)

## Processing prototyping

Test the regex at https://www.regex101.com

In [None]:
#regex: \symbol finds that symbol, otherwise it is interpreted as a command
#"?" makes the preceding token in the regular expression optional. colou?r matches colour or color.
#">" 

last = 0
for m in re.finditer(r"(?>[\.\?\!])(?:\s*(?=\p{Lu})|$)", abstract_text): #regex finds new sentence (doesn't find end of last sentence)
    print("SENT:", abstract_text[last:m.start()+1])
    last = m.end()

In [None]:
set_large_screen() 

In [None]:
doc = Document()

In [None]:
doc.props["pmid"] = article.find(".//PMID").text #sets PubMed ID 

In [None]:
doc.maintext = abstract_text

In [None]:
doc

In [None]:
doc.add_layer("token", text=T.span(), partOfSpeech=T.string, namedEntity=T.string("O"), indx=T.int32) 
doc.add_layer("sentence", text=T.span(), tokens=T.nodespan("token")) 

In [None]:
sentence_layer = doc["sentence"]
token_layer = doc["token"]
last = 0
for m in re.finditer(r"(?>[\.\?\!])(?:\s*(?=\p{Lu})|$)", abstract_text): #for each new sentence in abstract?
    tokens = []
    text = doc.maintext[last:m.start()+1]
    offset = last
    for tok_i, tok_m in enumerate(re.finditer(r"\p{L}+|\p{N}+(\.\p{N}+)?|[\-\/%():,\.;+&#=!?@_<>]", str(text))):
        tokens.append(token_layer.add(indx=tok_i+1, text=doc.maintext[tok_m.start()+offset:tok_m.end()+offset]))
    
    if len(tokens) > 0:
        sentence = sentence_layer.add(text=text)
        sentence["tokens"] = NodeSpan(tokens[0], tokens[-1])

    last = m.end()

In [None]:
doc["sentence"]

In [None]:
doc["sentence"]["tokens"][0]

In [11]:
doc["token"][doc["token"]["text"] == "of"]

NameError: name 'doc' is not defined

In [None]:
from docria.algorithm import group_by_span

In [None]:
group_by_span?

Hint: The statement below works because, 'text' is the default name of a span, if it is not called 'text', either group_span_field or layer_span_field = {'token': name} must be set. 

In [None]:
of_tokens = doc["token"][doc["token"]["text"] == "of"]

In [None]:
group_by_span(group_nodes=doc["sentence"], layer_nodes={"token": of_tokens})

## Scale it up, processing many documents

In [None]:
def segment(doc):
    token_layer = doc.add_layer("token", text=T.span(), partOfSpeech=T.string, namedEntity=T.string("O"), indx=T.int32)
    sentence_layer = doc.add_layer("sentence", text=T.span(), tokens=T.nodespan("token"))
    
    pubmed_abstract = doc.texts["main"]
    sentence_layer = doc["sentence"]
    token_layer = doc["token"]
    
    last = 0
    for m in re.finditer(r"(?>[\.\?\!])(?:\s*(?=\p{Lu})|$)", str(pubmed_abstract)):
        tokens = []
        text = pubmed_abstract[last:m.start()+1]
        
        offset = last
        for tok_i, tok_m in enumerate(re.finditer(r"\p{L}+|\p{N}+(\.\p{N}+)?|[\-\/%():,\.;+&#=!?@_<>]", str(text))):
            tokens.append(token_layer.add(indx=tok_i+1, text=doc.maintext[tok_m.start()+offset:tok_m.end()+offset]))

        if len(tokens) > 0:
            sentence = sentence_layer.add(text=text)
            sentence["tokens"] = NodeSpan(tokens[0], tokens[-1])

        last = m.end()

In [None]:
def process_pubmed(articles):
    for article in articles:
        title = article.find(".//Article/ArticleTitle").text
        pmid = article.find(".//PMID").text
        abstract = article.find(".//Abstract")
        if abstract is None: #because all articles don't have abstract
            continue
        
        abstract_text = "".join(node.text for node in abstract.iterdescendants() if node.text is not None).strip()
        
        assert pmid is not None
        assert title is not None
        assert abstract is not None
    
        doc = Document()
        doc.props["pmid"] = pmid
        doc.props["title"] = title
        doc.maintext = abstract_text
        
        segment(doc)
        yield doc

In [None]:
def process(inputfile, outputfile):
    pubmedxml = etree.parse(inputfile)
    with open(outputfile, "wb") as fout, MsgpackDocumentWriter(fout) as writer:
        for doc in process_pubmed(pubmedxml.iterfind("PubmedArticle")):
            writer.write(doc)

In [None]:
process("pubmed_mini/pubmed19n0080.xml.gz", "pubmed00080.docria")

In [None]:
from tqdm import tqdm

In [None]:
reader = MsgpackDocumentReader(open("pubmed00080.docria", "rb"))

In [None]:
pmids = []
titles = []

In [None]:
for doc in tqdm(reader):
    props = doc.properties()
    pmids.append(props["pmid"])
    titles.append(props["title"])

In [None]:
pmids[0:100]

In [None]:
titles[0:100]

In [None]:
sentences = []

In [None]:
reader = MsgpackDocumentReader(open("pubmed00080.docria", "rb"))
for mdoc in tqdm(reader):
    doc = mdoc.document()
    sentences.extend([str(sent["text"]) for sent in doc["sentence"]])

In [None]:
sentences[0:100]

In [None]:
from multiprocessing import Pool, cpu_count #library lets you run many steps in parallel

In [None]:
import os #allows you to interface with the underlying operating system that Python is running on 

In [None]:
inputfiles = list(
    map(lambda fname: os.path.join("pubmed_mini", fname),
        filter(lambda fname: fname.endswith(".xml.gz"), os.listdir("pubmed_mini"))
    )
)

In [None]:
inputfiles

In [None]:
outputfiles = [os.path.join("pubmed_mini", os.path.basename(fname) + ".docria") for fname in inputfiles]

In [None]:
outputfiles

In [None]:
def genwork(inputfiles, outputfiles):
    for i, o in zip(inputfiles, outputfiles):
        yield {"inputfile": i , "outputfile": o}

In [None]:
def work(args):
    inputfile = args["inputfile"]
    outputfile = args["outputfile"]
    process(inputfile, outputfile)
    return outputfile

In [None]:
pool = Pool(cpu_count()) #use all kernels on computer 

In [None]:
pool.imap_unordered?

In [None]:
for completed in tqdm(pool.imap_unordered(work, genwork(inputfiles, outputfiles))):
    pass