## Finding and linking nominal compounds and possessives in scale reports

In [1]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import en_core_web_md  #medium en pipeline

import wd_search as wds

from collections import Counter
import textwrap
import time
from importlib import reload # while developing

### Load one of spaCy's language models. This is a medium-sized one for English

In [5]:
# Load the medium English pipeline by name and list the components it runs.
model_name = "en_core_web_md"
nlp = spacy.load(model_name)
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


### Input a short document and process it 

In [3]:
# Read the report for one topic and run it through the spaCy pipeline.
topic_number = "1001"
# Open the file in a context manager so the handle is closed as soon as the
# text has been read, instead of leaking until garbage collection.
with open(f'report_data/{topic_number}_report.txt', mode='r', encoding='utf-8') as report_file:
    text = report_file.read()
doc = nlp(text)

### Display the text, marking its entities and their types. The default types are the 18 types from [OntoNotes](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf)

In [4]:
# Render the entity-annotated text; when the document has no entities,
# fall back to printing the plain text wrapped at 80 columns.
if not doc.ents:
    print("No entities were found in the document\n")
    print(textwrap.fill(doc.text, width=80))
else:
    displacy.render(doc, style="ent")

### Add patterns to find nominal compounds and possessives

In [None]:
# Build a token-pattern matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

# Each pattern is a list of per-token attribute dictionaries.
# Nominal compound: one or more nouns followed by a final noun.
nominal_compound = [{"POS": "NOUN", "OP": "+"}, {"POS": "NOUN"}]
# Adjective + noun; defined for experimentation, might not be useful.
adj_noun = [{"POS": "ADJ"}, {"POS": "NOUN"}]
# Noun + particle + noun; presumably aimed at possessives ("X 's Y") -- TODO confirm.
noun_part_noun = [{"POS": "NOUN"}, {"POS": "PART"}, {"POS": "NOUN"}]

# Register patterns under the key 'possible concepts'. Only the nominal
# compound pattern is active; to include the noun-particle-noun pattern use
# [nominal_compound, noun_part_noun] instead.
matcher.add("possible concepts", [nominal_compound])

### Nominals found in the report

In [None]:
# Apply the matcher to the document, asking for the matches as spans,
# and report how many were found.
nominals = matcher(doc, as_spans=True)
nfound = len(nominals)
print(
    f"Found {nfound} nominals: {nominals}"
    if nfound
    else "No nominals were found in the document"
)
# Debugging aid (disabled): show each match with its token length and noun chunks.
#for n in nominals: print(n, n.end-n.start, [x for x in n.noun_chunks])

### Named entities found in the report

In [None]:
# Collect the distinct (text, label) pairs of the document's entities,
# leaving out the numeric and date-like labels listed here.
ignored_labels = ('QUANTITY', 'DATE', 'ORDINAL', 'CARDINAL', 'MONEY', 'PERCENT')
ent_set = {
    (ent.text, ent.label_)
    for ent in doc.ents
    if ent.label_ not in ignored_labels
}
efound = len(ent_set)
print(f"Found {efound} entities: {ent_set}")

### Link the entities and nominals found in the report

In [None]:
# Try to link every entity and every nominal through wd_search.
# elinked / nlinked count the successful links; efailed / nfailed keep the
# items for which wds.link returned a falsy result.
elinked = 0
nlinked = 0
efailed = []
nfailed = []
# Wall-clock start of the linking pass; the summary cell reads it.
start = time.time()

# Entities are linked as instances, constrained by their entity label.
for entity in ent_set:
    name, label = entity
    match = wds.link(name, types=[label], category='instance')
    if not match:
        efailed.append(entity)
        continue
    print(f"{entity} ==> {wds.summary(match)}\n")
    elinked += 1

# Nominals are linked as concepts, using only their text.
for span in nominals:
    phrase = span.text
    match = wds.link(phrase, category='concept')
    if not match:
        nfailed.append(phrase)
        continue
    print(f"{phrase} ==> {wds.summary(match)}\n")
    nlinked += 1

### Summarize the results

In [None]:
# Report link success rates and timing for the linking pass.
# Reads start, ent_set, nominals, elinked, nlinked, efailed and nfailed,
# which the preceding cells define.
elapsed = time.time() - start
n_entities = len(ent_set)
n_nominals = len(nominals)
attempts = n_entities + n_nominals

# A report may contain no entities and/or no nominals; guard every ratio so
# the summary prints 0.0 instead of raising ZeroDivisionError.
avg = elapsed / attempts if attempts else 0.0
entity_pct = 100 * elinked / n_entities if n_entities else 0.0
nominal_pct = 100 * nlinked / n_nominals if n_nominals else 0.0

# The percent sign belongs inside the parentheses: "(50.0%)", not "(50.0)%".
print(
    f"Linked {elinked} of {n_entities} entities ({entity_pct:.1f}%) "
    f"and {nlinked} of {n_nominals} nominals ({nominal_pct:.1f}%)"
)
print(f"Total time {elapsed:.1f} seconds, Average time {avg:.1f} seconds per attempt")
if efailed:
    print(f"\nEntities not linked: {efailed}")
if nfailed:
    print(f"\nNominals not linked: {nfailed}")

fin