# Information extraction examples

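This notebook demonstrates information extraction with spaCy: tokenisation, sentence segmentation, part-of-speech tagging, named entity recognition, noun chunks, dependency parsing and a simple relation extraction example.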

In [None]:
import re
from IPython.display import display, HTML
import spacy
from spacy import displacy

# Load the small English pipeline.
# Try the installed model package first (the normal way); if spaCy cannot
# resolve that name it raises OSError, in which case fall back to the explicit
# on-disk location used on the JupyterHub.
JUPYTERHUB_MODEL_PATH = './.local/lib/python3.7/site-packages/en_core_web_sm/en_core_web_sm-2.2.5'

try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    nlp = spacy.load(JUPYTERHUB_MODEL_PATH) # Jupyterhub location

In [None]:
# Raw example sentence. Note: despite its name, `doc` is a plain str here;
# it is passed to nlp() in the next cell.
doc = "Barack Obama was born in Hawaii on 4th of August, 1961."

In [None]:
# Run the loaded pipeline over the raw sentence; `text` is the processed
# result that the following cells iterate over and query.
text = nlp(doc)

In [None]:
# Tokenised output: print every token together with its index (token.i).
# `text` is already the processed document, so it is iterated directly,
# exactly as the other cells in this notebook do.
for token in text:
    print(token.text, token.i)

In [None]:
# Sentence segmentation: show each sentence with its start and end indices
for sent in text.sents:
    print(sent, sent.start, sent.end)

In [None]:
# Part-of-speech tagging: print each token next to its Penn Treebank tag
for tok in text:
    print(tok, tok.tag_)

In [None]:
# Keep only the tokens whose tag is 'NNP' and skip everything else
for tok in text:
    if tok.tag_ != 'NNP':
        continue
    # tok.i = token index; tok.idx = character index
    print(tok, tok.i, tok.idx)

In [None]:
# Extract the named entities of the example sentence.
# As the last expression in the cell, the value is shown as the cell output.
text.ents

In [None]:
# For every entity print its text and label, then its start & end indices
for entity in text.ents:
    print(entity.text, entity.label_, entity.start, entity.end)

In [None]:
# NZ-specific locations - Māori place names
# NOTE(review): the string is triple-quoted, so the line breaks and the leading
# indentation of the continuation lines are part of the text given to the model.

location_example = '''Porritt Park, an old loop of the Avon River, lies within Wainoni. Going clockwise from there, 
                    boundary roads of the suburb are Wainoni, Breezes, Pages, and Kerrs Roads. 
                    Wainoni is approximately 7 kilometres (4.3 mi) from the central city. 
                    Wainoni Park is located in the adjacent suburb of Aranui.
                    Wainoni and its neighbouring suburb of Aranui are often considered together and intermixed. 
                    For example, Wainoni School and Wainoni Park are located in Aranui, and Aranui High School 
                    is located in Wainoni. Christchurch City Council publishes a combined community profile for 
                    the two suburbs. '''

# Process the passage, then list each recognised entity with its label
loc_text = nlp(location_example)

for entity in loc_text.ents:
    print(entity.text, entity.label_)

In [None]:
# Another example text: one longer sentence, kept as a raw str

doc_with_nouns = '''The UK will be hit with a three-month meltdown at its ports, a hard Irish border and shortages of food and medicine if it leaves the EU without a deal, according to government documents on Operation Yellowhammer.'''

# Process it once; the cells below reuse `text2` for entities, displacy and noun chunks
text2 = nlp(doc_with_nouns)

In [None]:
# Entity label glossary:
#   GPE  = Geopolitical Entities
#   NORP = Nationality, Religious or Political organisations
#   ORG  = Organisation
for entity in text2.ents:
    print(entity.text, entity.label_)

In [None]:
# The displacy module visualises these entities nicely:
# style='ent' selects the entity visualiser for the processed text
displacy.render(text2, style='ent')

In [None]:
# Noun phrases (spaCy calls them "noun chunks"), printed one per line
for noun_phrase in text2.noun_chunks:
    print(noun_phrase)

In [None]:
# Dependency parsing: for each token print its dependency label, its head,
# the head's part of speech and the list of the token's children.
# For interest: syntactic dependencies are described here - https://universaldependencies.org/docs/en/dep/
for tok in text:
    head = tok.head
    children = list(tok.children)
    print(tok.text, tok.dep_, head.text, head.pos_, children)

In [None]:
# Visualise the dependency parse of the first example sentence (`text`)
displacy.render(text, style='dep')

### Relation Extraction

Below is an example script taken directly from [https://spacy.io/usage/examples](https://spacy.io/usage/examples)

It finds the relationship between MONEY entities and the noun phrases they relate to in these two sentences:

```"Net income was $9.4 million compared to the prior year of $2.7 million. Revenue exceeded twelve billion dollars, with a loss of $1b."```

In [None]:
#!/usr/bin/env python
# coding: utf8
"""A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.

Compatible with: spaCy v2.0.0+
Last tested with: v2.2.1
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]


@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str)
)
def main(model="en_core_web_sm"):
    """Load a spaCy model and print the MONEY relations found in TEXTS.

    model: value handed to spacy.load(); the pipeline needs a parser and NER.

    For every text, one tab-separated line is printed per relation: the
    related phrase (left-aligned, padded to 10 characters), the entity type
    of the money token, and the money token's text.
    """
    pipeline = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))

    row_format = "{:<10}\t{}\t{}"
    for sentence in TEXTS:
        parsed = pipeline(sentence)
        for related, money in extract_currency_relations(parsed):
            print(row_format.format(related.text, money.ent_type_, money.text))


def filter_spans(spans):
    """Filter a sequence of spans so that the result contains no overlaps.

    Longer spans take priority over shorter ones; among spans of equal
    length, the one that starts first wins.

    For spaCy 2.1.4+ a function of the same name is available as
    spacy.util.filter_spans().

    spans: iterable of objects with integer ``start`` and ``end`` attributes
        (``end`` is exclusive).
    Returns: list of the kept spans, sorted by ``start``.
    """
    def sort_key(span):
        # (length, -start): with reverse=True this visits the longest spans
        # first and, for equal lengths, the earliest start first.
        return (span.end - span.start, -span.start)

    result = []
    seen_tokens = set()
    for span in sorted(spans, key=sort_key, reverse=True):
        # Candidates are never longer than spans already accepted, so testing
        # the first and the last token (end - 1, because end is exclusive) is
        # enough to detect an overlap.
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
            # Only an accepted span may claim its tokens. Recording the tokens
            # of a rejected span as well would wrongly block later spans that
            # overlap nothing in the result.
            seen_tokens.update(range(span.start, span.end))
    return sorted(result, key=lambda span: span.start)


def extract_currency_relations(doc):
    """Pair each MONEY token in ``doc`` with the token it refers to.

    The document is retokenized in place first: every entity and noun chunk
    that survives filter_spans() is merged into a single token.

    Returns a list of (related_token, money_token) tuples:
      * money token is an "attr" or "dobj": paired with the first "nsubj"
        found to the left of its head (skipped when there is none);
      * money token is a "pobj" whose head is a "prep": paired with the
        head of that preposition.
    """
    # Collapse entities and noun chunks into one token each
    merge_candidates = filter_spans(list(doc.ents) + list(doc.noun_chunks))
    with doc.retokenize() as retokenizer:
        for candidate in merge_candidates:
            retokenizer.merge(candidate)

    relations = []
    for money in doc:
        if money.ent_type_ != "MONEY":
            continue
        if money.dep_ in ("attr", "dobj"):
            subjects = [w for w in money.head.lefts if w.dep_ == "nsubj"]
            if subjects:
                relations.append((subjects[0], money))
        elif money.dep_ == "pobj" and money.head.dep_ == "prep":
            relations.append((money.head.head, money))
    return relations

# Run the example with main()'s default model name
main()

# Expected output (as listed in the spaCy example this script is taken from):
# Net income      MONEY   $9.4 million
# the prior year  MONEY   $2.7 million
# Revenue         MONEY   twelve billion dollars
# a loss          MONEY   1b