# Spacy PhraseMatcher

In [1]:
import spacy
import scispacy

In [2]:
from spacy.matcher import PhraseMatcher

In [3]:
path = 'C:\\Users\\Sonja\\Anaconda3\\envs\\scispacy\\Lib\\site-packages\\en_core_sci_lg\\en_core_sci_lg-0.2.4'

In [4]:
nlp = spacy.load(path)

In [37]:
abstract = """2019nCOV, also named SARS COV 2, causes COVID 19, also referred to as Wuhan pneumonia."""

In [6]:
covid19_terms = []
with open('corona_disease.txt','r') as disease:
    for line in disease:
        x = disease.readline().strip()
        covid19_terms.append(x)
    print(covid19_terms)
disease.close()

['covid 19', 'coronavirus disease 2019', 'coronavirus disease19', '2019 novel coronavirus respiratory syndrome', 'wurs', 'severe acute respiratory syndrome type 2', 'sars2', '2019 sars', 'wuhan infection', 'seafood market infection', 'covid 19 virus infection', 'sarscov2 2019 infection', '2019novel sarscov2 infection', '2019 novel sarscov2 infection', 'hcov 19 infection', 'sars cov 2 infection', '2019 sars cov 2 infection', '2019new sars cov 2 infection', '2019 new sars cov 2 infection', 'new coronavirus 2019 infection', 'new corona virus infection', '2019 new corona virus infection', 'novel coronavirus 2019 infection', 'novel corona virus infection', '2019 novel corona virus infection', 'wuhan virus 2019 infection', '2019novel wuhan virus infection', '2019 novel wuhan virus infection', 'wuhan pneumonia virus infection', '2019 wuhan pneumonia virus infection', '2019new wuhan pneumonia virus infection', '2019 new wuhan pneumonia virus infection', 'wuhan coronavirus 2019 infection', '201

In [7]:
sarscov2_terms = []
with open('corona_virus.txt','r') as virus:
    for line in virus:
        x = virus.readline().strip()
        sarscov2_terms.append(x)
    print(sarscov2_terms)
virus.close()

['covid 19 virus', 'sarscov2 2019', '2019novel sarscov2', '2019 novel sarscov2', 'hcov 19', 'sars cov 2', '2019 sars cov 2', '2019new sars cov 2', '2019 new sars cov 2', 'new coronavirus 2019', 'new corona virus', '2019 new corona virus', 'novel coronavirus 2019', 'novel corona virus', '2019 novel corona virus', 'wuhan virus 2019', '2019novel wuhan virus', '2019 novel wuhan virus', 'wuhan pneumonia virus', '2019 wuhan pneumonia virus', '2019new wuhan pneumonia virus', '2019 new wuhan pneumonia virus', 'wuhan coronavirus 2019', '2019novel wuhan coronavirus', '2019 novel wuhan coronavirus', 'wuhan corona virus', '2019 wuhan corona virus', '2019new wuhan corona virus', '2019 new wuhan corona virus', 'wuhancoronavirus 2019', '2019novel wuhancoronavirus', '2019 novel wuhancoronavirus', 'wuhancorona virus', '2019 wuhancorona virus', '2019new wuhancorona virus', '2019 new wuhancorona virus', 'wuhan seafood market pneumonia virus 2019', '2019novel wuhan seafood market pneumonia virus', '2019 n

In [8]:
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Only run nlp.make_doc to speed things up
sarscov2patterns = [nlp.make_doc(text) for text in sarscov2_terms]
covid19patterns = [nlp.make_doc(text) for text in covid19_terms]
matcher.add("covid19", None, *covid19patterns)
matcher.add("sarscov2", None, *sarscov2patterns) #the first argument is the class


In [38]:
doc = nlp(abstract)
matches = matcher(doc)
entities = []
for match_id, start, end in matches:
    span = doc[start:end]
    class_id = nlp.vocab.strings[match_id]
    print(span.text, span.start_char, span.end_char, class_id)
    ent = {"start": span.start_char, "end": span.end_char, "label": class_id}
    entities.append(ent)
print(entities)

2019nCOV 0 8 sarscov2
SARS COV 2 21 31 sarscov2
COVID 19 40 48 covid19
Wuhan pneumonia 70 85 covid19
[{'start': 0, 'end': 8, 'label': 'sarscov2'}, {'start': 21, 'end': 31, 'label': 'sarscov2'}, {'start': 40, 'end': 48, 'label': 'covid19'}, {'start': 70, 'end': 85, 'label': 'covid19'}]


Overlapping entities need to be removed before visualization

In [28]:
from spacy import displacy

In [39]:
test = [{"text": abstract,
       "ents": entities,
       "title": None}]
html = displacy.render(test, style="ent", manual=True)