# Testing Procure Wikidata entity linking

In [1]:
import spacy
from spacy import displacy
import wd_search as wd
from importlib import reload # while developing

Loading config file wd_search_config.yml
spacy_entity_linker


### Load Spacy's medium model

In [2]:
# Load the spaCy pipeline once; `nlp` is reused by later cells to find
# noun chunks and named entities.
nlp = spacy.load("en_core_web_md")

In [71]:
# uncomment and reload if debugging
# reload(wd)

Loading config file wd_search_config.yml


<module 'wd_search' from '/Users/finin/code/wdtools/wd_search.py'>

In [89]:
def test(string, qid='?', label='?', context='', MAX=20):
    """Link `string` with wd.link() and print a short report.

    string:  the mention to link.
    qid:     ground-truth QID; the default '?' means no ground truth is given.
    label:   ground-truth label, echoed only on the GTRUTH line.
    context: passed through to wd.link() as its `context` argument.
    MAX:     descriptions longer than MAX characters are cut to MAX characters
             and suffixed with '...'.

    The LINK line shows the initial string, the final search string (the
    second value returned by wd.summary()), and the linked item's QID, label
    and description. It starts with '-' when no link is found or when the
    linked QID differs from a supplied ground truth, and with '+' otherwise,
    including when no ground truth was supplied. Nothing is returned.
    """
    has_truth = qid != '?'
    if has_truth:
        print(f"GTRUTH: '{string}' ==> {qid}; '{label}'")

    link = wd.link(string, context=context)
    if link:
        found_qid, search, found_label, desc = wd.summary(link)
        if len(desc) > MAX:
            desc = desc[:MAX] + '...'
        # Only a supplied ground truth can turn the marker into '-'.
        mismatch = has_truth and found_qid != qid
        result = "-" if mismatch else "+"
        print(f"{result} LINK: '{string}'=>'{search}' ==> {found_qid}; '{found_label}'; '{desc}'")
    else:
        print(f"- LINK: '{string}' ==> None None")
    # Blank separator line between consecutive reports.
    print()

### Some simple tests

In [77]:
# Simple example: an acronym with its ground-truth QID and label, so both a
# GTRUTH line and a +/- LINK line are printed.
test('NIH', qid='Q390551', label='National Institutes of Health')

GTRUTH: 'NIH' ==> Q390551; 'National Institutes of Health'
+ LINK: 'NIH'=>'NIH' ==> Q390551; 'National Institutes of Health'; 'medical research organ..'



In [78]:
# This one fails because we prefer a biomedical item: 'Obama' is also the name
# of a genus of worms (Q18355807), which is chosen over Barack Obama (Q76).
test('Obama', qid='Q76', label='Barack Obama')

GTRUTH: 'Obama' ==> Q76; 'Barack Obama'
- LINK: 'Obama'=>'Obama' ==> Q18355807; 'Obama'; 'genus of worms'



In [79]:
# We remove any non-ASCII characters from the search string.
# '\xcf\x80' is the UTF-8 encoding of 'π' read as two separate characters.
# No label is passed, so the GTRUTH line shows '?' for it.
test('Mortality rate \xcf\x80', qid="Q58702")

GTRUTH: 'Mortality rate Ï' ==> Q58702; '?'
+ LINK: 'Mortality rate Ï'=>'mortality rate' ==> Q58702; 'mortality rate'; 'measure of the number ..'



In [80]:
# We reduce tokens to their lemmas, so 'authors' becomes 'author'.
# No ground truth is given, so only the LINK line is printed and its '+'
# marker is not a verdict.
test('authors')

+ LINK: 'authors'=>'author' ==> Q482980; 'author'; 'author or intellectual..'



In [81]:
# We remove the non-ASCII characters (here '\xe2\x80\x99', the UTF-8 bytes of a
# right single quotation mark read as separate characters) and also a small
# set of characters like parentheses and %.
test('HCP\xe2\x80\x99s death toll %')

+ LINK: 'HCPâs death toll %'=>'death toll' ==> Q65096341; 'death toll'; 'number of deaths follo..'



### Load test data from a tab-separated text file
 * the examples are from the initial ones Vijay sent us
 * each line in the file has a string, qid & label from a human annotator
 * some of the 'wrong' examples may be ok alternatives
 * other wrong ones may need some sample cell values to guide the linking

In [82]:
test_file = 'linking_tests.tsv'

# Each non-blank line holds three tab-separated fields: string, qid, label.
# Reading inside a `with` block closes the file handle promptly. Blank lines
# (e.g. a trailing empty line at the end of the file) are skipped because they
# would produce a one-element row and break the three-way unpacking below.
with open(test_file) as tsv_file:
    tests = [line.strip().split('\t') for line in tsv_file if line.strip()]

# run all of the tests
for string, qid, label in tests:
    test(string, qid=qid, label=label)


GTRUTH: 'Types' ==> Q161524; 'mask'
- LINK: 'Types'=>'type' ==> Q3707858; 'type'; 'anchoring point (of a name) in taxonomy'

GTRUTH: 'Death rate %' ==> Q58702; 'mortality rate'
+ LINK: 'Death rate %'=>'death rate' ==> Q58702; 'mortality rate'; 'measure of the number ..'

GTRUTH: 'Sample size' ==> Q70443487; 'sample size'
+ LINK: 'Sample size'=>'sample size' ==> Q70443487; 'sample size'; 'number of units (e.g. ..'

GTRUTH: 'Authors' ==> Q11801904; 'PMCI'
- LINK: 'Authors'=>'author' ==> Q482980; 'author'; 'author or intellectual..'

GTRUTH: 'Virus' ==> Q855769; 'strain'
- LINK: 'Virus'=>'virus' ==> Q808; 'virus'; 'non-cellular, submicro..'

GTRUTH: 'Vaccine Platform' ==> Q105967696; 'vaccine type'
- LINK: 'Vaccine Platform'=>'Vaccine platform' ==> Q108028785; 'Vaccine Platform'; 'an underlying mechanis..'

GTRUTH: 'HCP\xe2\x80\x99s death toll %' ==> Q58702; 'mortality rate'
- LINK: 'HCP\xe2\x80\x99s death toll %'=>'death toll' ==> Q65096341; 'death toll'; 'number of deaths follo..'

GTRU

### We use SpaCy to find noun-chunks in a sentence and link each one, using the sentence as context

In [83]:
# Parse one sentence with spaCy. The backslash at the end of the first line
# continues the string literal without inserting a newline.
sent = nlp("Trials involving vaccines, antiviral drugs, immunotherapies, \
monoclonal antibodies, stem cells, and nitric oxide are summarized in Table 1.")

# Link every noun chunk found in the sentence, passing the parsed sentence
# itself as the linking context. No ground truth is given, so the '+' marker
# on each LINK line is not a verdict.
for chunk in sent.noun_chunks:
    test(chunk.text, context=sent)

+ LINK: 'Trials'=>'trial' ==> Q1436668; 'randomized controlled trial'; 'experimental method de..'

+ LINK: 'vaccines'=>'vaccine' ==> Q134808; 'vaccine'; 'substance used to stim..'

+ LINK: 'antiviral drugs'=>'antiviral drug' ==> Q846227; 'antiviral drug'; 'class of medications'

+ LINK: 'immunotherapies'=>'immunotherapy' ==> Q72138716; 'adoptive immunotherapy'; 'transfer immunotherapy..'

+ LINK: 'monoclonal antibodies'=>'monoclonal antibody' ==> Q422248; 'monoclonal antibody'; 'monospecific antibody ..'

+ LINK: 'stem cells'=>'stem cell' ==> Q48196; 'stem cell'; 'undifferentiated biolo..'

+ LINK: 'nitric oxide'=>'nitric oxide' ==> Q14916164; 'nitric oxide biosynthetic process'; 'The chemical reactions..'

+ LINK: 'Table'=>'table' ==> Q278425; 'table'; 'set of data elements in databases'



### Here we use the previous sentence as context for prevalence
  * but the context is not needed here because, in general, we prefer MeSH terms
  * MeSH terms come from the NLM's controlled vocabulary of Medical Subject Headings

In [92]:
# Link the same term without and then with a context; `sent` is the parsed
# sentence from the noun-chunk cell, so that cell must be run first.
test("prevalence")
test("prevalence", context=sent)

+ LINK: 'prevalence'=>'prevalence' ==> Q719602; 'prevalence'; 'Number of disease case...'

+ LINK: 'prevalence'=>'prevalence' ==> Q719602; 'prevalence'; 'Number of disease case...'



In [95]:
# A multi-word phrase with no ground truth: only the LINK line is printed.
test("COVID-19 variants")

+ LINK: 'COVID-19 variants'=>'covid-19 variant' ==> Q106581308; 'SARS-CoV-2 lineage B.1.617'; 'variant of SARS-CoV-2'



### In this example, we use SpaCy to find named entities and their types and link them to Wikidata

In [42]:
# doc is a complex data structure with all of the information spaCy extracts from a text.
# The backslashes join the three source lines into a single string literal.
# NOTE(review): "purchsed" is misspelled in the sample text; it is left as-is
# because changing the input could change the entities spaCy finds.
doc = nlp("Anthony Fauci is an American doctor who got his medical degree from \
Cornell University. He is the director of the NIAID based in Bethesda, Maryland. \
He has a summer home in the Allegheny Mountains that he purchsed in 1985 for $65,000.")

# This displays the entities found and their types
displacy.render(doc, style="ent")

In [43]:
def link_ent(ent_mention, ent_type, MAX=30):
    """Link a named-entity mention to a Wikidata item and print the result.

    ent_mention: the entity text.
    ent_type:    its OntoNotes entity type; handed to wd.link() as the only
                 element of `target_types`.
    MAX:         descriptions longer than MAX characters are cut to MAX
                 characters and suffixed with '...'.

    Prints the mention and type followed by the linked item's QID, label and
    description, or "No Link" when wd.link() returns nothing usable.
    Nothing is returned.
    """
    link = wd.link(ent_mention, target_types=[ent_type])
    if not link:
        print(f"'{ent_mention}'/{ent_type} ==> No Link\n")
        return

    # The second value from wd.summary() is not shown for entities.
    qid, _, label, desc = wd.summary(link)
    if len(desc) > MAX:
        desc = desc[:MAX] + '...'
    print(f"'{ent_mention}'/{ent_type} ==> {qid}; '{label}'; {desc}\n")

In [44]:
# doc.ents holds the entities spaCy found in the doc. We can easily get each entity's
# text and type and use these to find a Wikidata link.
for ent in doc.ents:
    link_ent(ent.text, ent.label_)

'Anthony Fauci'/PERSON ==> Q573246; 'Anthony Fauci'; American immunologist and head...

'American'/NORP ==> Q49085; 'African Americans'; racial or ethnic group in the ...

'Cornell University'/ORG ==> Q49115; 'Cornell University'; private university in Ithaca (...

'NIAID'/ORG ==> Q3519875; 'National Institute of Allergy and Infectious Diseases'; US research institute for the ...

'Bethesda'/GPE ==> Q1883524; 'Bethesda'; town and community in Gwynedd

'Maryland'/GPE ==> Q1391; 'Maryland'; state of the United States of ...

'summer'/DATE ==> Q1313; 'summer'; warmest of the four temperate ...

'the Allegheny Mountains'/LOC ==> Q4731302; 'Allegheny Front'; Major escarpment in the Allegh...

'1985'/DATE ==> Q2431; '1985'; year

'65,000'/MONEY ==> No Link

