In [None]:
import spacy

from scispacy.abbreviation import AbbreviationDetector

# Load the scispaCy pipeline and append the abbreviation detector to it.
nlp = spacy.load("en_core_sci_md")
nlp.add_pipe("abbreviation_detector")

# Adjacent string literals are joined at compile time, so the sentence can be
# wrapped across source lines without the indentation ending up in the text.
# (A backslash continuation *inside* a literal keeps the next line's leading
# spaces as part of the string.)
text = (
    "Spinal and bulbar muscular atrophy (SBMA) is an "
    "inherited motor neuron disease caused by the expansion "
    "of a polyglutamine tract within the androgen receptor (AR). "
    "SBMA can be caused by this easily."
)

doc = nlp(text)

# One row per detected abbreviation: the short form, its (start,end) values,
# and the long form stored on the span by the detector.
print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
    print(f"{abrv} \t ({abrv.start},{abrv.end}) {abrv._.long_form}")

The entity linker is a knowledge base linker. There are 5 supported linkers which are:

* umls: links to the Unified Medical Language System (UMLS), which has ~3M concepts.
* mesh: links to the Medical Subject Headings (MeSH), which has ~30K entities. The knowledge base is derived from MeSH itself. The entities are used for indexing in PubMed.
* rxnorm: Links to the RxNorm ontology. RxNorm contains ~100K concepts.
* go: links to the Gene Ontology which has ~67k concepts.
* hpo: Links to the Human Phenotype Ontology. The HPO contains ~16K concepts on phenotypic abnormalities encountered in human disease. 

In [None]:
from scispacy.linking import EntityLinker

In [None]:
import spacy
# Attach the entity linker (MeSH knowledge base) to `nlp`.
# The guard keeps this cell re-runnable: adding a component under a name that
# is already present in the pipeline raises an error.
if not nlp.has_pipe("scispacy_linker"):
    nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "mesh"})

In [None]:
import spacy
from scispacy.linking import EntityLinker

# A second, separately loaded pipeline whose linker is configured for UMLS.
# NOTE(review): no cell visible below reads `nlp2`; the following cells query
# `nlp` -- confirm which pipeline is meant to be used.
nlp2 = spacy.load("en_core_sci_md")
nlp2.add_pipe(
    "scispacy_linker",
    config={
        "resolve_abbreviations": True,
        "linker_name": "umls",
    },
)

In [None]:
# Sample passage wrapped in a one-element list; only the first item is run
# through `nlp`. Both `text` and `doc` rebind names already assigned in the
# first cell of the notebook.
text = ["Some types of polyps can change into cancer over time (usually many years),but not all polyps become cancer. If dysplasia is seen in the polyp after it's removed. Dysplasia is another pre-cancerous condition.It means there's an area in a polyp or in the lining of the colonor rectum where the cells look abnormal,but they haven't become cancer."]
doc = nlp(text[0])

In [None]:
# Take the second entity (index 1) found in `doc` for closer inspection.
entity = doc.ents[1]

In [None]:
# Show the selected entity.
print("Name: ", entity)

In [None]:
# Fetch the component registered as "scispacy_linker" on `nlp`; that pipeline's
# linker was added with linker_name "mesh".
linker = nlp.get_pipe("scispacy_linker")

In [None]:
# Print the knowledge-base record behind every candidate attached to the entity.
for candidate in entity._.kb_ents:
    concept_id = candidate[0]  # first element is the key into cui_to_entity
    print(linker.kb.cui_to_entity[concept_id])

In [None]:
# Display the raw candidate list stored on the entity (the value iterated over
# in the previous cell).
entity._.kb_ents

In [4]:
import spacy
from spacy import displacy
import pandas as pd

from scispacy.umls_linking import UmlsEntityLinker
from scispacy.abbreviation import AbbreviationDetector

In [8]:
# General-purpose pipelines offered in the first menu.
SPACY_MODEL_NAMES = [
    "en_core_sci_sm",
    "en_core_sci_md",
    "en_core_sci_lg",
    "en_core_sci_scibert",
]

# Pipelines offered in the NER menu.
NER_MODEL_NAMES = [
    "en_ner_craft_md",
    "en_ner_jnlpba_md",
    "en_ner_bc5cdr_md",
    "en_ner_bionlp13cg_md",
]

# Sentence processed by the demo cells below.
DEFAULT_TEXT = (
    "Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron "
    "disease caused by the expansion of a polyglutamine tract within the "
    "androgen receptor (AR). SBMA can be caused by this easily."
)

In [5]:
def load_model(name):
    """Load the pipeline called ``name`` and append the abbreviation detector.

    Args:
        name: Model name handed to ``spacy.load``.

    Returns:
        The loaded pipeline with the ``"abbreviation_detector"`` component added.
    """
    nlp = spacy.load(name)
    # The component is created from its registered name, so no separately
    # constructed detector object is needed here.
    nlp.add_pipe("abbreviation_detector")
    return nlp

def process_text(model_name, text):
    """Run ``text`` through the pipeline named ``model_name`` and return the result.

    The pipeline is obtained from ``load_model`` on every call.
    """
    return load_model(model_name)(text)


def load_linker():
    """Return a new ``UmlsEntityLinker`` built with ``resolve_abbreviations=True``.

    NOTE(review): ``UmlsEntityLinker`` is imported from ``scispacy.umls_linking``,
    whereas other cells in this notebook import ``EntityLinker`` from
    ``scispacy.linking`` -- confirm both are available in the installed version.
    """
    return UmlsEntityLinker(resolve_abbreviations=True)


In [9]:
def _read_choice(options):
    """Show ``options`` as a 1-based menu, read a number from stdin and validate it.

    Returns:
        The chosen 1-based position.

    Raises:
        ValueError: if the input is not an integer or is outside
            ``1..len(options)``. Without the range check, 0 or a negative
            number would silently index from the end of the list.
    """
    for position, option in enumerate(options, start=1):
        print(f"  {position}. {option}")
    choice = int(input())
    if not 1 <= choice <= len(options):
        raise ValueError(
            f"Choice must be between 1 and {len(options)}, got {choice}"
        )
    return choice


print("Choose the spacy model:\n")
i = _read_choice(SPACY_MODEL_NAMES)
spacy_model = SPACY_MODEL_NAMES[i - 1]
print("\nThis spacy model is chosen: ", spacy_model)
print("\nChoose the NER model:\n")
j = _read_choice(NER_MODEL_NAMES)
ner_model = NER_MODEL_NAMES[j - 1]
print("This NER model is chosen: ", ner_model)

Choose the spacy model:

4

This spacy model is chosen:  en_core_sci_scibert

Choose the NER model:

4
This NER model is chosen:  en_ner_bionlp13cg_md


In [10]:
# Run the chosen general-purpose pipeline over the sample sentence and render
# its entities.
doc = process_text(model_name=spacy_model, text=DEFAULT_TEXT)
displacy.render(
    doc,
    style="ent",
    options={"distance": 50},
)



In [11]:
# Same sentence, this time through the chosen NER pipeline.
ner_doc = process_text(model_name=ner_model, text=DEFAULT_TEXT)
displacy.render(
    ner_doc,
    style="ent",
    options={"distance": 50},
)

In [None]:
# Value assigned to the linker's `threshold` attribute below.
threshold = 0.0

# Build the linker, then apply the threshold. This rebinds `linker`, which an
# earlier cell set to the "scispacy_linker" component of `nlp`.
linker = load_linker()
linker.threshold = threshold

In [None]:
# Build one row per (entity, candidate concept) pair found by the linker in `doc`.
# NOTE(review): `_.umls_ents` and `linker.umls` differ from the `_.kb_ents` /
# `linker.kb` names used in the earlier cells -- confirm they match the
# installed scispaCy version.
data = []
for mention in linker(doc).ents:
    for concept_id, confidence in mention._.umls_ents:
        concept = linker.umls.cui_to_entity[concept_id]
        row = [
            mention.text,
            concept.canonical_name,
            concept_id,
            ",".join(concept.types),
            confidence,
            mention.start,
            mention.end,
        ]
        data.append(row)


In [None]:
# Column labels, listed in the same order as the values of each row in `data`.
attrs = [
    "text",
    "Canonical Name",
    "Concept ID",
    "TUI(s)",
    "Score",
    "start",
    "end",
]
df = pd.DataFrame(data=data, columns=attrs)

In [8]:
from spacy import displacy 


# Render the entities of `ner_doc` again (same call as in the NER cell above).
displacy.render(ner_doc, style="ent", options={"distance": 50})