# spaCy PhraseMatcher

In [1]:
import os

import spacy
import scispacy

In [2]:
from spacy.matcher import PhraseMatcher

In [3]:
# Location of the scispaCy model passed to spacy.load(): either the name of an
# installed model package or a path to a model directory.
# Defaults to the installed "en_core_sci_lg" package so the notebook does not
# depend on one user's local Anaconda directory layout; set the SCISPACY_MODEL
# environment variable to load a specific model directory instead.
path = os.environ.get("SCISPACY_MODEL", "en_core_sci_lg")

In [4]:
# Load the spaCy pipeline identified by `path`. The resulting `nlp` object is used
# further down both to tokenize the term lists and to process the abstract.
nlp = spacy.load(path)

In [5]:
# Example sentence used as matcher input; it refers to the virus and to the
# disease under several different names.
abstract = """2019nCOV, also named SARS COV 2, causes COVID 19, also referred to as Wuhan pneumonia."""

In [6]:
# Read the disease terms from 'corona_disease.txt', one term per line.
# - The encoding is stated explicitly so the result does not depend on the
#   platform's default encoding.
# - Surrounding whitespace is stripped and blank lines are skipped, so no empty
#   string ends up in the term list.
with open('corona_disease.txt', 'r', encoding='utf-8') as disease:
    covid19_terms = [term for term in map(str.strip, disease) if term]

# Printed after the file has been closed.
print(covid19_terms)

['covid19', 'covid 19', 'ncp', 'coronavirus disease 2019', 'corona virus disease 2019', 'coronavirus disease19', 'corona virus disease19', '2019 novel coronavirus respiratory syndrome', '2019 novel corona virus respiratory syndrome', 'wurs', 'seafood market pneumonia', 'severe acute respiratory syndrome type 2', 'sars 2', 'sars2', 'sars 2019', '2019 sars', 'severe acute respiratory syndrome 2019', 'wuhan infection', 'hubei infection', 'seafood market infection', 'covid19 virus infection', 'covid 19 virus infection', 'sarscov2 infection', 'sarscov2 2019 infection', '2019 sarscov2 infection', '2019novel sarscov2 infection', '2019new sarscov2 infection', '2019 novel sarscov2 infection', '2019 new sarscov2 infection', 'hcov 19 infection', 'hcov19 infection', 'sars cov 2 infection', 'sars cov 2 2019 infection', '2019 sars cov 2 infection', '2019novel sars cov 2 infection', '2019new sars cov 2 infection', '2019 novel sars cov 2 infection', '2019 new sars cov 2 infection', 'new coronavirus in

In [7]:
# Read the virus terms from 'corona_virus.txt', one term per line.
# - The encoding is stated explicitly so the result does not depend on the
#   platform's default encoding.
# - Surrounding whitespace is stripped and blank lines are skipped, so no empty
#   string ends up in the term list.
with open('corona_virus.txt', 'r', encoding='utf-8') as virus:
    sarscov2_terms = [term for term in map(str.strip, virus) if term]

# Printed after the file has been closed.
print(sarscov2_terms)

['covid19 virus', 'covid 19 virus', 'sarscov2', 'sarscov2 2019', '2019 sarscov2', '2019novel sarscov2', '2019new sarscov2', '2019 novel sarscov2', '2019 new sarscov2', 'hcov 19', 'hcov19', 'sars cov 2', 'sars cov 2 2019', '2019 sars cov 2', '2019novel sars cov 2', '2019new sars cov 2', '2019 novel sars cov 2', '2019 new sars cov 2', 'new coronavirus', 'new coronavirus 2019', '2019 new coronavirus', 'new corona virus', 'new corona virus 2019', '2019 new corona virus', 'novel coronavirus', 'novel coronavirus 2019', '2019 novel coronavirus', 'novel corona virus', 'novel corona virus 2019', '2019 novel corona virus', 'wuhan virus', 'wuhan virus 2019', '2019 wuhan virus', '2019novel wuhan virus', '2019new wuhan virus', '2019 novel wuhan virus', '2019 new wuhan virus', 'wuhan pneumonia virus', 'wuhan pneumonia virus 2019', '2019 wuhan pneumonia virus', '2019novel wuhan pneumonia virus', '2019new wuhan pneumonia virus', '2019 novel wuhan pneumonia virus', '2019 new wuhan pneumonia virus', 'wu

In [8]:
# Patterns are compared on the LOWER token attribute.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# nlp.make_doc is used for the term lists instead of calling nlp on each term.
sarscov2patterns = list(map(nlp.make_doc, sarscov2_terms))
covid19patterns = list(map(nlp.make_doc, covid19_terms))

# Register each pattern list under its class label (first argument of add);
# that label is what a match is reported under later.
for label, patterns in (("covid19", covid19patterns),
                        ("sarscov2", sarscov2patterns)):
    matcher.add(label, None, *patterns)

In [9]:
# Run the pipeline on the abstract and collect every phrase match.
doc = nlp(abstract)
matches = matcher(doc)

# Each match is (label hash, start token index, end token index); convert it to
# character offsets plus the readable label.
entities = []
for label_hash, token_start, token_end in matches:
    hit = doc[token_start:token_end]
    label = nlp.vocab.strings[label_hash]  # hash -> label string
    print(hit.text, hit.start_char, hit.end_char, label)
    entities.append({"start": hit.start_char, "end": hit.end_char, "label": label})

print(entities)

2019nCOV 0 8 sarscov2
SARS COV 2 21 31 sarscov2
COVID 19 40 48 covid19
Wuhan pneumonia 70 85 covid19
[{'start': 0, 'end': 8, 'label': 'sarscov2'}, {'start': 21, 'end': 31, 'label': 'sarscov2'}, {'start': 40, 'end': 48, 'label': 'covid19'}, {'start': 70, 'end': 85, 'label': 'covid19'}]


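Overlapping entities need to be removed before visualization: displaCy's entity renderer draws the spans one after another, so overlapping spans produce garbled output.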

In [10]:
from spacy import displacy

In [11]:
def _drop_overlapping(ents):
    """Return the entries of `ents` that do not overlap, ordered by start offset.

    Each entry is a dict holding character offsets under "start" and "end".
    When two spans overlap, the longer one is kept; for equal lengths the span
    that starts earlier is kept. The input list is not modified.
    """
    kept = []
    # Longest spans first, then earliest start, so the preferred span is seen first.
    by_priority = sorted(ents, key=lambda e: (e["start"] - e["end"], e["start"]))
    for ent in by_priority:
        if all(ent["end"] <= other["start"] or ent["start"] >= other["end"]
               for other in kept):
            kept.append(ent)
    return sorted(kept, key=lambda e: e["start"])


# Input for displaCy's manual mode: the raw text plus the entity offsets.
# Overlapping matches are removed first and the rest sorted by start offset.
test = [{"text": abstract,
         "ents": _drop_overlapping(entities),
         "title": None}]
# NOTE(review): inside Jupyter, displacy.render may display the markup directly
# and return None — confirm before relying on the value of `html`.
html = displacy.render(test, style="ent", manual=True)