In [32]:
import numpy as np
import pandas as pd
from collections import Counter
from datetime import datetime
import json
import pickle
import re
from collections import Counter
import random
from itertools import groupby

# nlp
import spacy
from spacy import displacy
from spacy.tokenizer import Tokenizer
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher

The goal of this notebook is to explore the keywords used in the NF literature data and to build an efficient custom NER model for the NF domain.

# Read data

In [2]:
# Load the PMC paper metadata (one row per paper) from the project data folder.
pmc_data = pd.read_csv(filepath_or_buffer="../data/pmc_papers.csv")

In [3]:
pmc_data.head()

Unnamed: 0,pmc_id,title,authors,affliations,keywords,abstract
0,7643456,Discernment between candidate mechanisms for K...,"Stites Edward C., Rossman Kent L., McFall Thom...","grid.. Integrative Biology Laboratory, Salk ...","KRAS, GTPase, EGFR, Cancer, Targeted therapy",Phase three clinical trial evidence suggests t...
1,7643332,Psychometric properties of satisfaction with t...,"Mirghafourvand Mojgan, Mohammad-Alizadeh-Chara...","grid..f Students’ Research Committee, Tabriz...","Satisfaction, Childbirth education, Validity, ...",Childbirth preparation classes can reduce preg...
2,7641497,A Rare Incidence of Metachronous Neurovascular...,"Chandrasekaran Deepak, Azariah Emmanuel D, Chi...","Oral and Maxillofacial Surgery, Sri Ramachandr...","mesenchymal tumor, solitary, intraosseous, juv...",Neurofibroma is an uncommon benign tumor arisi...
3,7640792,A Child with Enlarged Extremities – A Case of ...,"Sundareswaran N, Gopinath G, Gunasekaran K","From the Department of Neurology, Government M...","Enlarged extremities, focal gigantism, macroda...","Macrodystrophia lipomatosa (ML) is a rare, non..."
4,7609672,Primary pancreatic glomus tumor invading into ...,"Sasaki Yu, Taki Yoshiro, Tamaki Ichiro, Sasano...","grid..cDepartment of Surgery, Kansai Electric ...","Glomus tumor, Glomangiomyoma, Pancreas, Immuno...",Glomus tumors are subcutaneous tumors arising ...


In [4]:
pmc_data.shape

(10000, 6)

In [13]:
# Flatten the comma-separated `keywords` column of the PMC table into a single
# list of normalised (stripped, lower-cased) keywords. Rows without keywords are
# skipped via dropna(). Fragments that are empty after stripping (e.g. produced
# by trailing or doubled commas) are dropped so that blank strings do not end up
# in the keyword counts or in the vocabulary built from this list.
pmc_kerwords = [
    kw.strip().lower()
    for kw in ','.join(pmc_data.keywords.dropna().tolist()).split(',')
    if kw.strip()
]

In [14]:
len(pmc_kerwords)

42247

In [15]:
# Load the Elsevier records; the file is JSON Lines (one JSON object per line).
elsevier_data = pd.read_json(path_or_buf='../data/ctf-hackathon-upload.json', lines=True)

In [16]:
elsevier_data.head()

Unnamed: 0,Eid,abstract,affiliation_organization,title,year,sourcetitle,issn_print,doi,openaccess,meta_language,References,pmid,publishername,keywords,funding_text
0,77249160236,Introduction. Neurofibromatosis type 1 is an a...,"[[Department of Dermatology, University of Pal...",Neurofibromatosis of the nipple-areolar area: ...,2010,Journal of Medical Case Reports,[17521947],10.1186/1752-1947-4-22,Full,English,"[Fink, D., Schneider, C., Wight, E., Perucchin...",,,,
1,79952114987,To introduce operation skill of the spinal wed...,"[[Department of Spinal Surgery, First Affiliat...",[Spinal wedge osteotomy by posterior approach ...,2010,Zhongguo xiu fu chong jian wai ke za zhi = Zho...,[10021892],,,Chinese,,20695392.0,,,
2,79955067942,,"[[Department of Cardiology, AZ Sint Jan Hospit...",Multimodality imaging of cardiac involvement i...,2011,Journal of the American College of Cardiology,[07351097],10.1016/j.jacc.2010.08.651,Full,English,,21511107.0,Elsevier USA,,
3,80051905985,Malignant peripheral nerve sheath tumors accou...,"[[Maxillo-Facial Surgery Division, Head and Ne...",Malignant peripheral nerve sheath tumor of the...,2011,Journal of Pediatric Surgery,[00223468],10.1016/j.jpedsurg.2011.04.058,,English,"[Leroy, K., Dumas, V., Martin-Garcia, N., Falz...",,W.B. Saunders,"[MPNST vagus, Malignant peripheral nerve sheat...",
4,80052211641,We report a 20-year-old man with cauda equina ...,[[Department of Physical Medicine and Rehabili...,Peripheral nerve involvement in a neurofibroma...,2011,Archives of Physical Medicine and Rehabilitation,[00039993],10.1016/j.apmr.2011.04.011,,English,"[, , Neurofibromatosis: Conference Statement (...",21878222.0,,"[Case report, Cauda equina, Neurofibromatosis,...",


In [24]:
def _keywords_to_csv(value):
    """Return one record's keywords as a single comma-separated string.

    Lists/tuples of keywords are joined with ','. A value that is already a
    string is returned unchanged, which keeps this cell idempotent: joining an
    already-joined string would otherwise insert a comma between every
    character when the cell is re-run. Anything else (missing values such as
    NaN/None) becomes the empty string.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return ','.join(value)
    return ''


# Normalise the `keywords` column from list-of-strings to comma-separated text.
elsevier_data['keywords'] = elsevier_data['keywords'].map(_keywords_to_csv)

In [27]:
# Flatten the comma-separated `keywords` column of the Elsevier table into a
# single list of normalised (stripped, lower-cased) keywords. The emptiness
# check is applied to the stripped fragment, so whitespace-only fragments are
# dropped too instead of surviving as '' entries.
elsevier_keywords = [
    kw.strip().lower()
    for kw in ','.join(elsevier_data.keywords.dropna().tolist()).split(',')
    if kw.strip()
]

In [28]:
len(elsevier_keywords)

19269

In [33]:
# Count keyword occurrences across both sources: PMC first, then Elsevier,
# so ties keep the same first-seen ordering as counting the concatenated list.
keywords_cnt = Counter(pmc_kerwords)
keywords_cnt.update(elsevier_keywords)

In [35]:
keywords_cnt.most_common(20)

[('neurofibromatosis type 1', 922),
 ('neurofibromatosis', 830),
 ('glioblastoma', 437),
 ('cancer', 420),
 ('nf1', 353),
 ('glioma', 325),
 ('schwannoma', 287),
 ('neurofibroma', 286),
 ('vestibular schwannoma', 246),
 ('neurofibromatosis type 2', 239),
 ('meningioma', 217),
 ('melanoma', 215),
 ('neurofibromatosis 1', 209),
 ('prognosis', 203),
 ('malignant peripheral nerve sheath tumor', 201),
 ('pheochromocytoma', 175),
 ('magnetic resonance imaging', 175),
 ('breast cancer', 173),
 ('children', 161),
 ('nf2', 149)]

Another good source of named entities is the mutation information in the ClinVar dataset.

In [37]:
# Load the NF mutation table from the project data folder.
clinvar_data = pd.read_csv(filepath_or_buffer="../data/nf_mutation_info.csv")

In [38]:
clinvar_data.head()

Unnamed: 0,Name,Gene(s),Protein change,diagnosis,Clinical significance,Last_review_date,GRCh37Chromosome,GRCh38Chromosome,VariationID
0,NM_001042492.3(NF1):c.58C>G (p.Gln20Glu),NF1,Q20E,"Neurofibromatosis, type 1",Uncertain significance,"Nov 8, 2019",17.0,17.0,955226
1,NM_000267.3(NF1):c.58C>T (p.Gln20Ter),NF1,Q20*,"Neurofibromatosis, type 1",Pathogenic,"Jan 7, 2019",17.0,17.0,576465
2,NM_000267.3(NF1):c.59A>C (p.Gln20Pro),NF1,Q20P,"Neurofibromatosis, type 1",Uncertain significance,"Aug 2, 2018",17.0,17.0,650986
3,NM_001042492.3(NF1):c.60G>C (p.Gln20His),NF1,Q20H,"Neurofibromatosis, type 1",Uncertain significance,"Oct 7, 2019",17.0,17.0,965727
4,NM_000267.3(NF1):c.60+1G>C,NF1,,"Neurofibromatosis, type 1",Likely pathogenic,"Aug 28, 2018",17.0,17.0,663201


In [47]:
# Distinct, non-missing values of the 'Protein change' column, kept in order of
# first appearance.
nf_mutation_gene = (
    clinvar_data.loc[:, 'Protein change']
    .dropna()
    .drop_duplicates()
    .tolist()
)

## Curate an NF-specific vocabulary

We want to curate a simple NF-domain vocabulary in preparation for training a custom NER model for the NF domain.

In [43]:
# Sorted list of the distinct keywords from both literature sources.
# A plain set + sorted() is used instead of np.unique: it gives the same
# ordered unique values, but as native `str` objects and without first building
# a fixed-width numpy unicode array sized to the longest keyword.
nf_paper_ners = sorted(set(pmc_kerwords + elsevier_keywords))

In [44]:
len(nf_paper_ners)

24628

In [48]:
len(nf_mutation_gene)

5435

In [53]:
nf_mutation_gene[:6]

['Q20E', 'Q20*', 'Q20P', 'Q20H', 'L21P', 'P22A']

For future work, along with a deeper analysis of the corpus, we also want to incorporate feedback from subject-matter experts (SMEs) to construct a better NER schema.

## Customized NF NER model

As a starting point, building a rule-based NER model from a domain-specific vocabulary is the most efficient way to tackle the cold-start problem while still achieving reliable precision and fast inference.

Here we simply augment the pretrained `spacy` NER model with a rule-based `EntityRuler` built from the NF vocabulary.

In [50]:
# Load the small English spaCy pipeline that the rule-based component is added to.
nlp = spacy.load(name="en_core_web_sm")

In [51]:
# Create EntityRuler instance.
# The curated keyword vocabulary is lower-cased, whereas phrase patterns are
# matched against the exact token text by default, so capitalised mentions such
# as "NF1" or "Neurofibromatosis" would never match. Matching on the LOWER token
# attribute makes the phrase patterns case-insensitive.
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")

In [None]:
# Define patterns for the new entities.
# The complete pattern list is built first and registered with a single
# add_patterns() call; calling add_patterns() once per term means tens of
# thousands of separate calls for a vocabulary of this size.
# Order is preserved: literature keywords (NF_TERM) first, then ClinVar protein
# changes (NF_GENE).
nf_patterns = [{"label": "NF_TERM", "pattern": str(term)} for term in nf_paper_ners]
nf_patterns += [{"label": "NF_GENE", "pattern": str(change)} for change in nf_mutation_gene]
ruler.add_patterns(nf_patterns)

In [None]:
# Update existing pipeline.
# If a ruler with the same component name was already added (e.g. this cell is
# being re-run), remove it first: adding a second component under an existing
# name raises an error, and the freshly built ruler should replace the old one.
if ruler.name in nlp.pipe_names:
    nlp.remove_pipe(ruler.name)
# Run the ruler before the statistical NER component.
nlp.add_pipe(ruler, before="ner")

In [None]:
# Validate: run one example sentence through the pipeline (tagger and parser
# switched off for this block) and print each detected entity with its label.
sample_sentence = 'The R2429X nonsense variant in the NF1 gene has been reported previously in association with neurofibromatosis type 1'
with nlp.disable_pipes("tagger", "parser"):
    doc = nlp(sample_sentence)
    for entity in doc.ents:
        print(entity.text, entity.label_)