In [45]:
import time
from collections import Counter
from indra_db import get_primary_db
from indra_db.util import distill_stmts
from indra.literature import pubmed_client
from indra.tools import assemble_corpus as ac


The first step in creating the EMMAA model for Neurofibromatosis is to find publications broadly relevant to neurofibromatosis. We do this by querying on both the term "neurofibromatosis" and the top-level Medical Subject Headings (MeSH) term for Neurofibromatoses, D017253 (see https://www.ncbi.nlm.nih.gov/mesh/68017253).

In [16]:
def get_nf_pmids():
    """Return the PMIDs of publications broadly relevant to neurofibromatosis.

    Combines the results of a PubMed text search for "neurofibromatosis"
    with the results of a search on the MeSH term D017253, and removes
    duplicates.

    Returns
    -------
    list
        The unique PMIDs in sorted order. Sorting makes the result
        independent of set iteration order, so positional slices of the
        list are reproducible across kernel restarts.
    """
    nf_text_pmids = pubmed_client.get_ids('neurofibromatosis')  # PMIDs from text search for "neurofibromatosis"
    nf_mesh_ids = pubmed_client.get_ids_for_mesh('D017253')  # MeSH term for neurofibromatoses
    return sorted(set(nf_text_pmids) | set(nf_mesh_ids))
nf_pmids = get_nf_pmids()

Here's how many articles we found:

In [18]:
len(nf_pmids)

17850

The next step is to get the INDRA Statements that have been extracted from these, which will give us a sense of which molecular entities (proteins, drugs, etc.) are relevant to NF and should be included in the model.

In [37]:
# Get the full raw Statements for the NF PMIDs, either by querying the DB or
# by loading a previously pickled copy. Set `reload = True` to re-query the
# DB and overwrite the pickle file.
reload = False
pmid_stmts_file = 'nf_raw_stmts.pkl'
if reload:
    db = get_primary_db()
    # Get statements for the given text refs
    print(f"Distilling statements for {len(nf_pmids)} TextRefs")
    start = time.time()
    # Join TextRef -> TextContent -> Reading -> RawStatements, restricted to
    # the text refs whose PMID is in nf_pmids.
    clauses = [
        db.TextRef.pmid.in_(nf_pmids),
        db.TextContent.text_ref_id == db.TextRef.id,
        db.Reading.text_content_id == db.TextContent.id,
        db.RawStatements.reading_id == db.Reading.id]
    # Materialise as a list so pmid_stmts has the same container type in
    # both branches of this cell.
    pmid_stmts = list(distill_stmts(db, get_full_stmts=True, clauses=clauses))
    end = time.time()
    elapsed = end - start
    print(elapsed)
    # dump_statements is called only for its side effect (writing the pickle
    # file). Its return value is not needed here, and wrapping the call in
    # list() would raise a TypeError if it returns None.
    ac.dump_statements(pmid_stmts, pmid_stmts_file)
else:
    pmid_stmts = list(ac.load_statements(pmid_stmts_file))


INFO: [2020-10-27 10:41:44] indra.tools.assemble_corpus - Loading nf_raw_stmts.pkl...
INFO: [2020-10-27 10:41:45] indra.tools.assemble_corpus - Loaded 20209 statements


In [47]:
# Collect the name of every agent that has an 'HGNC' entry in its db_refs,
# then rank the names by how often they occur (most frequent first).
proteins = [
    agent.name
    for statement in pmid_stmts
    for agent in statement.agent_list()
    if agent is not None and 'HGNC' in agent.db_refs
]
protein_ctr = Counter(proteins).most_common()

In [68]:
# The db_refs 'TEXT' value of every agent whose name is 'CAMP'.
camp = [
    agent.db_refs.get('TEXT')
    for statement in pmid_stmts
    for agent in statement.agent_list()
    if agent and agent.name == 'CAMP'
]

In [69]:
# Tally how often each distinct 'TEXT' value occurs among the 'CAMP' agents.
camp_ctr = Counter(camp)

In [70]:
camp_ctr

Counter({'cAMP': 131, 'c-AMP': 2})

In [49]:
len(protein_ctr)

1396

In [19]:
from indra.sources import indra_db_rest as idr

In [33]:
# Fetch statements for the first 100 entries of nf_pmids only.
idrp = idr.get_statements_for_paper(
    [('pmid', pmid) for pmid in nf_pmids[:100]]
)

INFO: [2020-10-27 10:28:54] indra.sources.indra_db_rest.util - query: https://db.indra.bio/statements/from_papers
INFO: [2020-10-27 10:28:54] indra.sources.indra_db_rest.util - params: {'ev_limit': 10, 'best_first': True, 'max_stmts': None, 'api_key': '[api-key]'}


Get drugs that:
1. Have already been tested against NF (are in the screening datasets)
2. Are known to target one or more genes relevant to NF.

In [10]:
# nf_pmids is already the de-duplicated union of the text-search and MeSH
# results. The per-source lists are local to get_nf_pmids() and do not exist
# at notebook scope, so referencing them here fails on a fresh kernel.
both_pmids = set(nf_pmids)

In [11]:
len(both_pmids)

17850