In [1]:
# see https://spacy.io and https://spacy.io/api
# now has bert style pretraining, but much faster https://spacy.io/usage/v2-1
# pip install spacy
# python -m spacy download en_core_web_lg   (the model this notebook loads)
import spacy
from spacy import displacy

In [2]:
# Load the 'en_core_web_lg' model package by name; it has to be installed in this environment first.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Show the (name, component) pairs that make up the loaded pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fa23b5220b8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fa239055b88>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fa239055be8>)]

In [5]:
# Sample input: the BERT paper abstract. A trailing backslash inside the
# triple-quoted string suppresses that line break, so the value is a leading
# newline followed by two paragraphs separated by a single newline.
text = """
We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. \
Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly \
conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional \
output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial \
task-specific architecture modifications.
BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, \
including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 \
question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement). \
"""

In [6]:
# Run the loaded pipeline over the sample text; `doc` is reused by the cells that follow.
doc = nlp(text)

In [7]:
# Display the processed document; the notebook shows it as the original text.
doc


We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications.
BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point

In [8]:
# Sentence segmentation: materialize the sentence spans yielded by `doc.sents`.
# In the recorded output the splitter breaks after "SQuAD", ahead of the version numbers.
list(doc.sents)

[
 We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers.,
 Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers.,
 As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications.,
 BERT is conceptually simple and empirically powerful.,
 It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD,
 v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD,
 v2.0 Test F1 to 8

In [10]:
# Word tokenization: print every token of the document as one list
print(list(doc))

[
, We, introduce, a, new, language, representation, model, called, BERT, ,, which, stands, for, Bidirectional, Encoder, Representations, from, Transformers, ., Unlike, recent, language, representation, models, ,, BERT, is, designed, to, pre, -, train, deep, bidirectional, representations, from, unlabeled, text, by, jointly, conditioning, on, both, left, and, right, context, in, all, layers, ., As, a, result, ,, the, pre, -, trained, BERT, model, can, be, fine, -, tuned, with, just, one, additional, output, layer, to, create, state, -, of, -, the, -, art, models, for, a, wide, range, of, tasks, ,, such, as, question, answering, and, language, inference, ,, without, substantial, task, -, specific, architecture, modifications, ., 
, BERT, is, conceptually, simple, and, empirically, powerful, ., It, obtains, new, state, -, of, -, the, -, art, results, on, eleven, natural, language, processing, tasks, ,, including, pushing, the, GLUE, score, to, 80.5, %, (, 7.7, %, point, absolute, improve

In [11]:
# Named entity extraction: print each entity's label followed by its text
for ent in doc.ents:
    print(ent.label_, ent)

PRODUCT BERT
WORK_OF_ART Bidirectional Encoder Representations
PERSON BERT
PERSON BERT
CARDINAL just one
PERSON BERT
CARDINAL eleven
ORG GLUE
PERCENT 80.5%
PERCENT 7.7%
PERCENT 86.7%
PERCENT 4.6%
CARDINAL 93.2
CARDINAL 1.5
CARDINAL 83.1
CARDINAL 5.1


In [12]:
# Visualizing entities: render the document with its entity spans inline in the notebook
displacy.render(doc, style="ent", jupyter=True)

In [13]:
# Dependency parsing: draw the parse of the first sentence only
displacy.render(list(doc.sents)[0], style="dep", jupyter=True)

In [14]:
# Relation extraction: link MONEY entities to the phrases they relate to
from __future__ import unicode_literals, print_function

def filter_spans(spans):
    """Return a subset of ``spans`` in which no two spans share a token.

    Candidates are visited longest first; among spans of equal length the
    one with the larger ``start`` is visited first. A candidate is kept only
    when neither its first nor its last token is already covered by a span
    kept earlier. The result is in visiting order, not document order.
    """
    def _priority(candidate):
        # (length, start): sorted in reverse, so longer spans come first.
        return (candidate.end - candidate.start, candidate.start)

    kept = []
    covered = set()
    for candidate in sorted(spans, key=_priority, reverse=True):
        first, last = candidate.start, candidate.end - 1
        # Any earlier span is at least as long, so an overlap always
        # touches one of this candidate's two end tokens.
        if first in covered or last in covered:
            continue
        kept.append(candidate)
        covered.update(range(first, last + 1))
    return kept

def extract_currency_relations(doc):
    """Pair each MONEY token in ``doc`` with the token it relates to.

    Entities and noun chunks are first merged into single tokens, which
    retokenizes ``doc`` in place, so that every phrase is one node of the
    dependency parse.

    Args:
        doc: A parsed document exposing ``ents``, ``noun_chunks`` and
            ``retokenize()``.

    Returns:
        A list of ``(related_token, money_token)`` tuples, ordered by the
        position of the money token in the document.
    """
    # Entities and noun chunks can overlap; reduce them to a
    # non-overlapping set before merging.
    spans = filter_spans(list(doc.ents) + list(doc.noun_chunks))
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)

    relations = []
    for money in doc:
        if money.ent_type_ != "MONEY":
            continue
        if money.dep_ in ("attr", "dobj"):
            # e.g. "Net income was $9.4 million": relate the amount to the
            # first nominal subject on the left of the governing verb.
            subject = next((w for w in money.head.lefts if w.dep_ == "nsubj"), None)
            if subject is not None:
                relations.append((subject, money))
        elif money.dep_ == "pobj" and money.head.dep_ == "prep":
            # e.g. "a loss of $1b": relate the amount to the token that the
            # preposition attaches to.
            relations.append((money.head.head, money))
    return relations


# Sample sentences containing monetary amounts.
TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]

# Run the extractor on every sample and print one tab-separated row per relation.
# NOTE(review): this loop rebinds the notebook-level names `text` and `doc`
# that the earlier cells use.
for text in TEXTS:
    doc = nlp(text)
    relations = extract_currency_relations(doc)
    for r1, r2 in relations:
        print("{:<10}\t{}\t{}".format(r1.text, r2.ent_type_, r2.text))


Net income	MONEY	$9.4 million
the prior year	MONEY	$2.7 million
Revenue   	MONEY	twelve billion dollars
a loss    	MONEY	1b
