# Indra experiments

The aim here is to evaluate Indra as a basis for identifying claims in medical literature outside of molecular biology, especially in clinical-trial reports.


## Document ETL

We take open-access documents as the point of departure. Below is the abstract, stripped of references:




In [None]:
from lxml import etree
import requests
import json
from indra.literature import get_full_text


In [3]:
# Identifiers of the article used throughout this notebook.
doi = "10.12688/f1000research.16369.1"
pmid = "30631430"
pmcid = "PMC6281014"
# Absolute URL of the article PDF on PMC.
pdflink = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6281014/pdf/"
    "f1000research-7-17879.pdf"
)

In [7]:
# Fetch the article text; `ttype` names the kind of content that came back.
# Only 'pmc_oa_xml' and 'abstract' are handled by this cell.
(t, ttype) = get_full_text(str(pmid), 'pmid')  # or (doi, 'doi')

# Namespace prefixes used by the XPath queries below.
nsmap = {
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    'ali': 'http://www.niso.org/schemas/ali/1.0/',
    'mml': 'http://www.w3.org/1998/Math/MathML',
    'xlink': 'http://www.w3.org/1999/xlink',
    'j': 'https://jats.nlm.nih.gov/ns/archiving/1.2/',
}

if ttype == 'pmc_oa_xml':
    tx = etree.fromstring(t.encode('utf-8'))
    # Keep a local copy of the XML. The encoding is explicit so the file does
    # not depend on the platform default.
    with open(f"{pmcid}.xml", 'w', encoding='utf-8') as f:
        f.write(t)

    # Fail with a clear message instead of an IndexError when the document
    # carries no abstract element.
    abstracts = tx.xpath('//j:abstract', namespaces=nsmap)
    if not abstracts:
        raise ValueError(f"no abstract element found in the XML for {pmcid}")
    ax = abstracts[0]
    abstract = ''.join(ax.itertext()).strip()

    if pmcid:
        if not pdflink:
            # Fall back to the PDF reference embedded in the article XML.
            pdflinks = tx.xpath(
                "//j:self-uri[@content-type='pdf']/@xlink:href",
                namespaces=nsmap,
            )
            if pdflinks:
                pdflink = pdflinks[0]
        if pdflink:
            # `pdflink` may already be an absolute URL (as configured above)
            # or just a file name; only the latter needs the PMC prefix.
            if pdflink.startswith(('http://', 'https://')):
                source = pdflink
            else:
                source = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/{pdflink}"
            # A timeout keeps the notebook from hanging on a stalled connection.
            r = requests.get(source, timeout=30)
            if r.ok:  # actually get a 403 here...
                with open(f"{pmcid}.pdf", 'wb') as f:
                    # `r.content` is the body as bytes; `r.raw` is the
                    # underlying stream object and cannot be written directly.
                    f.write(r.content)
            else:
                print(f"open {source} and save it as {pmcid}.pdf")
elif ttype == 'abstract':
    abstract = t
else:
    # Without this, `abstract` would be undefined (or stale from an earlier
    # run of the notebook) and the reader cells would fail or process the
    # wrong text.
    raise ValueError(f"unsupported content type {ttype!r} returned for pmid {pmid}")


In [None]:
from indra.sources import reach

# Run the REACH reader on the abstract using the offline reading mode.
rp = reach.process_text(abstract, offline=True)
if rp is None:
    # process_text returns None when reading fails (e.g. offline reading is
    # not available); report it rather than failing on `rp.tree` below.
    print("REACH reading failed: process_text returned no processor")
else:
    # Raw reader output.
    with open(f"{pmcid}.reach.json", 'w') as f:
        json.dump(rp.tree.data, f)
    # Extracted statements, written only when there are any.
    if rp.statements:
        with open(f"{pmcid}.reach.indra.json", 'w') as f:
            json.dump([s.to_json() for s in rp.statements], f)


In [None]:
from indra.sources import eidos

# Read the abstract with Eidos through the web service at the given URL.
rp = eidos.process_text(abstract, webservice="http://localhost:9000")

# Save the raw reader output taken from the processor's document tree.
with open(f"{pmcid}.eidos.json", 'w') as raw_out:
    json.dump(rp.doc.tree.data, raw_out)

# Save the extracted statements, but only when there are any.
if rp.statements:
    with open(f"{pmcid}.eidos.indra.json", 'w') as stmts_out:
        stmt_dicts = [stmt.to_json() for stmt in rp.statements]
        json.dump(stmt_dicts, stmts_out)


In [None]:
from indra.sources import sparser

# Run the Sparser reader on the abstract.
rp = sparser.process_text(abstract)
if rp:
    # The Sparser JSON processor keeps the raw reader output in `json_stmts`;
    # it has no `doc` attribute (that belongs to the Eidos processor), so
    # `rp.doc.tree.data` raised AttributeError here.
    # NOTE(review): confirm the attribute name against the installed INDRA version.
    with open(f"{pmcid}.sparser.json", 'w') as f:
        json.dump(rp.json_stmts, f)
    # Extracted statements, written only when there are any.
    if rp.statements:
        with open(f"{pmcid}.sparser.indra.json", 'w') as f:
            json.dump([s.to_json() for s in rp.statements], f)
else:
    # A falsy result means no processor came back; say so instead of
    # silently producing no files.
    print("Sparser reading failed: process_text returned no processor")

