## Finding and linking nominal compounds and possessives

In [1]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
from collections import Counter
import en_core_web_md  #medium en pipeline
from importlib import reload # while developing

### Load one of Spacy's language models. This is a medium sized one for English

In [2]:
# Build the pipeline from the en_core_web_md package imported in the first cell.
nlp = en_core_web_md.load()

spacy_entity_linker


### Input a short document and process it 

In [3]:
# Read the sample document; the context manager closes the file handle
# as soon as the text has been read instead of leaving it to the garbage collector.
with open(file='data/doc1.txt', mode='r', encoding='utf-8') as doc1_file:
    text1 = doc1_file.read()
# Run the loaded spaCy pipeline over the raw text.
doc1 = nlp(text1)

### Display the text marking its entities and their types.  The default types are the 18 types from [Ontonotes](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf)

In [4]:
# Render doc1 inline with its named entities highlighted (displaCy "ent" style).
displacy.render(doc1, style="ent")

### Add patterns to find nominal compounds and possessives

In [5]:
# Matcher is already imported in the notebook's first (imports) cell, so it is not re-imported here.
# Create a Matcher that shares the loaded pipeline's vocabulary.
matcher = Matcher(vocab=nlp.vocab)
# Each pattern is a list of per-token attribute dictionaries.
# One or more nouns followed by a noun, i.e. a run of at least two consecutive nouns.
nominal_compound = [{'POS': 'NOUN', 'OP': '+'}, {'POS': 'NOUN'}]
# Adjective followed by a noun; defined for experimentation but not registered below.
adj_noun = [{'POS': 'ADJ'}, {'POS': 'NOUN'}]  # might not be useful
# Noun + particle + noun, aimed at possessives such as "earth’s crust".
# NOTE(review): 'PART' may also cover non-possessive particles — confirm this is specific enough.
noun_part_noun = [{'POS': 'NOUN'}, {'POS': 'PART'}, {'POS': 'NOUN'}]
# Register both patterns under the single rule name "possible concepts".
matcher.add("possible concepts", patterns=[nominal_compound, noun_part_noun])

### Run the patterns over the document

In [24]:
# Apply the patterns registered on `matcher` to doc1, asking for the matches as spans.
nominal_descriptions = matcher(doc1, as_spans=True)
print(nominal_descriptions)

[country’s volcanology, volcanology centre, earth’s crust, crust meet]


In [7]:
import wd_search as wds

### For these, only the possessive links to a WD concept 

In [8]:
# Look up the possessive phrase as a Wikidata concept and summarize the result.
# Note: the query is typed with a straight apostrophe, whereas the span matched in doc1 has a curly one (’).
wds.summary(wds.link("earth's crust", category='concept'))

('Q15316',
 "Earth's crust",
 "Earth's outer layer",
 ['Q35120:entity'],
 'https://wikidata.org/wiki/Q15316')

In [9]:
# Same concept lookup for one of the nominal compounds found by the matcher.
wds.summary(wds.link("volcanology centre", category='concept'))

'No match'

### Link named entities found by spaCy

In [10]:
# Gather the distinct (text, label) pairs of doc1's entities,
# skipping the QUANTITY, DATE, ORDINAL and CARDINAL labels.
ent_set = {
    (ent.text, ent.label_)
    for ent in doc1.ents
    if ent.label_ not in ('QUANTITY', 'DATE', 'ORDINAL', 'CARDINAL')
}
print(ent_set)

{('Volcanology', 'ORG'), ('Geological Hazard Mitigation Centre', 'ORG'), ('Pacific Ring of Fire', 'WORK_OF_ART'), ('Sinabung', 'PERSON'), ('Indonesia', 'GPE'), ('Mount Sinabung', 'GPE'), ('Wirda Br Sitepu', 'PERSON'), ('North Sumatra', 'GPE'), ('Mount Sinabung’s', 'PERSON'), ('Reuters', 'ORG')}


In [11]:
import time

# Try to link every (text, label) entity pair collected in ent_set and time the whole pass.
# perf_counter is a monotonic clock, so the measurement is not affected by system clock adjustments.
start = time.perf_counter()
for etext, etype in ent_set:
    # The entity label is passed as the only candidate type for the lookup.
    wd = wds.link(etext, types=[etype], category='instance')
    if wd:
        print(f"{etext}/{etype} ==> {wds.summary(wd)}")
    else:
        # A falsy result is reported as an unresolved entity.
        print(f"{etext}/{etype} ==> ?")
    print()
elapsed = time.perf_counter() - start
# Guard against an empty entity set, which would otherwise raise ZeroDivisionError.
avg = elapsed / len(ent_set) if ent_set else 0.0
print(f"linked {len(ent_set)} entities in {elapsed:.2f} seconds, (avg = {avg:.2f})")
    

Volcanology/ORG ==> ('Q1674866', 'National Institute of Geophysics and Volcanology', 'research institute in Italy', ['Q43229:organization'], 'https://wikidata.org/wiki/Q1674866')

Geological Hazard Mitigation Centre/ORG ==> ?

Pacific Ring of Fire/WORK_OF_ART ==> ('Q7058035', 'Northeastern Japan arc', 'island arc on the Pacific Ring of Fire', ['Q2221906:geographic location', 'Q35120:entity'], 'https://wikidata.org/wiki/Q7058035')

Sinabung/PERSON ==> ('Q207678', 'Sinabung Mount', 'Active volcano in North Sumatra, Indonesia', ['Q2221906:geographic location', 'Q35120:entity'], 'https://wikidata.org/wiki/Q207678')

Indonesia/GPE ==> ('Q252', 'Indonesia', 'sovereign state in Southeast Asia situated on more than 17,000 islands', ['Q56061:administrative territorial entity', 'Q43229:organization', 'Q7210356:political organization', 'Q2221906:geographic location', 'Q16334295:group of humans', 'Q35120:entity', 'Q1190554:occurrence'], 'https://wikidata.org/wiki/Q252')

Mount Sinabung/GPE ==> ('Q

### We can find nominal compounds with more than two nouns, but spaCy's pipeline may not recognize them (the patterns only match when every word in the compound is tagged as a noun)

In [35]:
# A test sentence containing multi-noun phrases ("metal garbage can", "office desk") to probe the matcher.
textx = "John keeps his coins in a metal garbage can on his office desk"
docx = nlp(textx)

In [36]:
# Render docx with its named entities highlighted (displaCy "ent" style).
displacy.render(docx, style="ent")

In [32]:
# Render docx in displaCy's "dep" style with the compact option, to inspect how the sentence was analyzed.
displacy.render(docx, style="dep", options={'compact':True})

In [37]:
# Reuse the matcher built earlier on the test sentence and print the matched spans.
compounds = matcher(docx, as_spans=True)
print(compounds)

[metal garbage, office desk]


### Here's an example that Spacy gets right

In [34]:
# Process a sentence with a three-noun compound, show its "dep" rendering,
# and print every span the matcher finds in it.
docx2 = nlp("a bamboo garden rake is a good tool")
displacy.render(docx2, style="dep", options={'compact':True})
print(matcher(docx2, as_spans=True))

[bamboo garden, garden rake, bamboo garden rake]


fin