In [19]:
# Initialization boilerplate
from typing import Iterable, Any, Optional

import re
import os
import time

import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu
import ibm_cloud_sdk_core
import pandas as pd
import text_extensions_for_pandas as tp
import spacy
import matplotlib.pyplot as plt

import textwrap
# Import from the public `IPython.display` module. Importing `display` from
# `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML


# Read the Watson credentials from the environment so that no secrets are
# stored in the notebook. `os.environ.get()` yields None for an unset variable.
api_key = os.environ.get("IBM_API_KEY")
service_url = os.environ.get("IBM_SERVICE_URL")

# Shared client for the Watson Natural Language Understanding service.
natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(
    version="2021-01-01",
    authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)
)
natural_language_understanding.set_service_url(service_url)

In [2]:
# Copy of the contents of the Github gist from part 1
# See https://gist.github.com/frreiss/038ac63ef20eed323a5637f9ddb2de8d

def find_persons_quoted_by_name(doc_url, api_key, service_url) -> pd.DataFrame:
    """
    Find the places where a document quotes a person by name.

    :param doc_url: URL of the document; passed to Watson Natural Language
     Understanding as the ``url`` argument of ``analyze()``.
    :param api_key: API key used to build the ``IAMAuthenticator``.
    :param service_url: Service URL of the Natural Language Understanding
     instance.

    :returns: A DataFrame with two columns: "person", holding the span of each
     person mention contained in the subject of a "say" action, and "url",
     holding ``doc_url``.
    :raises ValueError: If the text of a subject cannot be located in the
     analyzed document text (raised by ``str.index()``).
    """
    # Ask Watson Natural Language Understanding to run its "semantic_roles"
    # and "entities" models.
    natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(
        version="2021-01-01",
        authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)
    )
    natural_language_understanding.set_service_url(service_url)
    nlu_results = natural_language_understanding.analyze(
        url=doc_url,
        return_analyzed_text=True,
        features=nlu.Features(
            entities=nlu.EntitiesOptions(mentions=True),
            semantic_roles=nlu.SemanticRolesOptions())).get_result()

    # Convert the output of Watson Natural Language Understanding to DataFrames.
    dataframes = tp.io.watson.nlu.parse_response(nlu_results)
    entity_mentions_df = dataframes["entity_mentions"]
    semantic_roles_df = dataframes["semantic_roles"]

    # Extract mentions of person names
    person_mentions_df = entity_mentions_df[entity_mentions_df["type"] == "Person"]

    # Extract instances of subjects that made statements, and filter down to
    # just the column we're interested in. (Computed once; the original code
    # built this same DataFrame twice.)
    quotes_df = semantic_roles_df[semantic_roles_df["action.normalized"] == "say"]
    subjects_df = quotes_df[["subject.text"]].copy().reset_index(drop=True)

    # Retrieve the full document text from the entity mentions output.
    doc_text = entity_mentions_df["span"].array.document_text

    # Use String.index() to find where the strings in "subject.text" begin.
    # NOTE: str.index() returns the offset of the FIRST occurrence, so a subject
    # string that appears more than once always maps to its first location.
    subjects_df["begin"] = pd.Series(
        [doc_text.index(s) for s in subjects_df["subject.text"]], dtype=int)

    # Compute end offsets and wrap the <begin, end, text> triples in a SpanArray column
    subjects_df["end"] = subjects_df["begin"] + subjects_df["subject.text"].str.len()
    subjects_df["span"] = tp.SpanArray(doc_text, subjects_df["begin"], subjects_df["end"])

    # Align subjects with person names
    execs_df = tp.spanner.contain_join(subjects_df["span"], 
                                       person_mentions_df["span"],
                                       "subject", "person")
    # Add on the document URL.
    execs_df["url"] = doc_url
    return execs_df[["person", "url"]]

In [3]:
# Contents of the gist at
# https://gist.github.com/frreiss/a731438dda4ac948beca85d3fe167ff3
import pandas as pd
import text_extensions_for_pandas as tp

def find_titles_of_persons(persons: pd.DataFrame,
                           spacy_language_model) -> pd.DataFrame:
    """
    Match person names with titles by following edges of the dependency parse.

    :param persons: DataFrame containing information about person names. Must
     have a "person" column holding spans over the document text.
    :param spacy_language_model: Loaded SpaCy language model with dependency 
     parsing support.

    :returns: A DataFrame with a row for every title identified and two columns,
     "person" and "title".
    """
    def traverse_edges_once(start_nodes: pd.DataFrame,
                            edges: pd.DataFrame) -> pd.DataFrame:
        """
        Follow each edge in ``edges`` one hop, from the nodes in ``start_nodes``
        (matched on the edge's "head") to the node at the other end of the edge.

        Reads ``nodes`` from the enclosing scope, so it must only be called
        after ``nodes`` has been assigned below.

        :returns: One row per reached node, carrying the originating "person"
         span along with the node's columns from ``nodes``.
        """
        return (
            start_nodes[["person", "id"]]  # Propagate original "person" span
            .merge(edges, left_on="id", right_on="head", 
                   suffixes=["_head", ""])[["person", "id"]]
            .merge(nodes)
        )

    if len(persons.index) == 0:
        # Special case: Empty input --> empty output
        return pd.DataFrame({
            "person": pd.Series([], dtype=tp.SpanDtype()),
            "title": pd.Series([], dtype=tp.SpanDtype()),
        })

    # Retrieve the document text from the person spans.
    doc_text = persons["person"].array.document_text

    # Run dependency parsing on the text and convert the parse to a DataFrame.
    all_token_features = tp.io.spacy.make_tokens_and_features(doc_text, spacy_language_model)

    # Drop the columns we won't need for this analysis.
    tokens = all_token_features[["id", "span", "tag", "dep", "head", "sentence"]]

    # Split the parse tree into nodes and edges and filter the edges.
    nodes = tokens[["id", "span", "tag"]].reset_index(drop=True)
    edges = tokens[["id", "head", "dep"]].reset_index(drop=True)

    # Start with the nodes that are inside person names.
    person_nodes = (
        tp.spanner.overlap_join(persons["person"], nodes["span"],
                                "person", "span")
        .merge(nodes)
    )

    # Step 1: Follow `appos` edges from the person names
    appos_targets = traverse_edges_once(person_nodes, 
                                        edges[edges["dep"] == "appos"])

    # Step 2: Transitive closure to find all tokens in the titles
    selected_nodes = appos_targets.copy()
    previous_num_nodes = 0
    # Iterate until a pass adds no new rows to the selected set.
    while len(selected_nodes.index) > previous_num_nodes:

        # Find all the nodes that are directly reachable from our selected set.
        addl_nodes = traverse_edges_once(selected_nodes, edges)

        # Merge the new nodes into the selected set
        previous_num_nodes = len(selected_nodes.index)
        selected_nodes = (pd.concat([selected_nodes, addl_nodes])
                          .drop_duplicates())

    # Aggregate the nodes of each title to find the span of the entire title.
    titles = (
        selected_nodes
        .groupby("person")
        .aggregate({"span": "sum"})
        .reset_index()
        .rename(columns={"span": "title"})
    )

    # As of Pandas 1.2.1, groupby() over extension types downgrades them to object 
    # dtype. Cast back up to the extension type.
    titles["person"] = titles["person"].astype(tp.SpanDtype())

    return titles

In [4]:
import market_intelligence as mi

# Load the press-release corpus as a DataFrame with `url` and `html` columns.
articles = mi.maybe_download_articles()
articles

Unnamed: 0,url,html
0,https://newsroom.ibm.com/2020-02-04-The-Avril-...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."
1,https://newsroom.ibm.com/2020-02-11-IBM-X-Forc...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."
2,https://newsroom.ibm.com/2020-02-18-IBM-Study-...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."
3,https://newsroom.ibm.com/2020-02-19-IBM-Power-...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."
4,https://newsroom.ibm.com/2020-02-20-Centotrent...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."
...,...,...
186,https://newsroom.ibm.com/2021-01-25-OVHcloud-t...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."
187,https://newsroom.ibm.com/2021-01-26-Luminor-Ba...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."
188,https://newsroom.ibm.com/2021-01-26-DIA-Levera...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."
189,https://newsroom.ibm.com/2021-01-26-IBM-Board-...,"<!DOCTYPE html public ""-//W3C//DTD HTML 4.01 T..."


# Part 3: Making an NLP Model 9x Faster with the Semijoin Trick

Over the last few years, machine learning models for natural language processing have become much more accurate, but they've also become much more resource-intensive. In this article, we'll show you a simple trick that can cut those resources by an order of magnitude or more. And we'll show you how to implement this trick using our open-source library, [Text Extensions for Pandas](https://ibm.biz/text-extensions-for-pandas).

## Some Background

Many natural language processing applications involve combining the results of multiple models. A particularly common pattern is looking for **places in a document where the outputs of two models overlap**. For example, if you cross-reference a [document layout analysis](https://en.wikipedia.org/wiki/Document_layout_analysis) model and a [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) model, you can find who is the author of a document. Cross-reference a [part of speech](https://en.wikipedia.org/wiki/Part-of-speech_tagging) model with a model for [resolving anaphora](https://en.wikipedia.org/wiki/Coreference), and you can find which company the pronoun "it" refers to. There are many different pairs of models whose outputs you can combine to produce something more valuable than its parts.

Any time that you see this pattern, it's an opportunity to deploy an old database performance trick called a *semijoin*.

## What's a Semijoin?

In the early 1980s, engineers started looking into how to find matching records from two databases that are physically far apart. Back then, wide-area network bandwidth was measured in *bits* per second.

Say that you worked in your firm's New York office, and you wanted to check for duplicate invoices between the accounts payable database in New York and the database in Los Angeles. You could download the entire contents of the L.A. database, but at 300 bits per second, that would take a long time. So instead, you gather up a list of dates and dollar amounts from your New York database, upload that list to the L.A. database, and ask the L.A. database to send back just the invoices that match your list. That way you send much less data.

That's the basic formula for a semijoin: Instead of asking the remote system for all its data and then matching it with your local data, you send the remote system a description of what data *could* match with your local data. Then you ask the data source for just the data that matches your description. The description, plus the data that meets that description, are much smaller than the entire remote data set.


## The Semijoin Trick for NLP

What does this old database trick have to do with natural language processing?

If your application is one of those "find places where two models overlap" applications we talked about earlier, chances are one of those models is much more expensive than the other. Running that expensive NLP model is a lot like downloading the results of the model over a very slow modem --- it takes a lot of time. 

But what if you could tell the expensive model to only "download" the results that are going to match with the outputs of the less-expensive model? For many NLP models, there's a way to do just that: Run the model only over certain parts of the document. Then you'll get back a fraction of the results in a fraction of the time.

In more concrete terms, we can apply the semijoin trick to a "find places where two models overlap" task by breaking the task into four steps:

1. Divide the document into pieces.
2. Find all the pieces that contain a match of the less-expensive model.
3. Run the expensive model over just those pieces.
4. Cross-reference the results of the two models.

Now, there are some tricky parts to making these steps work in practice.

The first problem is that **NLP models are sensitive to context**. If you don't give the expensive model enough context from before and after a given place in the document, you can change the results at the place you're interested in.

What constitutes "enough" context varies, but many models treat each sentence in the document as a separate unit of analysis. The trouble is, **finding sentence boundaries is hard**.

## Finding sentence boundaries is hard...

Finding sentence boundaries with high accuracy is surprisingly hard. You might think that finding sentence boundaries is easy --- just look for punctuation. But there are many exceptions to that rule. There could be a quotation like "This!" in the middle of a sentence. There could be an abbreviation like Mr. or colloq. that doesn't end a sentence. Your document may use punctuation to emphasize. certain. words. Or the author may have forgotten the punctuation entirely

Even the best sentence boundary detection model gets about one sentence boundary in 10 wrong. For example, the [SpaCy library's most sophisticated sentence boundary detector](https://github.com/explosion/spacy-models/releases/tag/en_core_web_trf-3.2.0) reports an F1 score of 0.91 on the OntoNotes corpus.

Worse, each incorrect sentence boundary results in multiple incorrect sentences. If your model misses a sentence boundary, it's actually making three mistakes, because the sentences on either side of the boundary will be replaced with a single larger, incorrect sentence.

![Each incorrect sentence boundary results in multiple incorrect sentences](images/sentence_accuracy.png)

## ...but that's ok

Fortunately, we don't need to find *all* the sentence boundaries to make the semijoin trick work. We just need to find *some* of them with high accuracy. As it happens, some sentence boundaries are much clearer than others. 

For news articles, a good place to start is paragraph boundaries. Newspapers have traditionally cut articles on paragraph boundaries to meet space constraints, so news content is usually divided into small, clearly-delineated paragraphs.


So splitting on paragraph boundaries helps with the context problem.

But there's a second problem: **Bookkeeping**.

When you run your expensive model over a fragment of the document, the model only sees the fragment. It doesn't know about the original document and will return its matches relative to the current document fragment.

You'll need to map these piecewise results back to the original document. So your code needs to keep track of all the fragments and the relationship between offsets into a given fragment and offsets into the document.

Fortunately, our [Text Extensions for Pandas](https://ibm.biz/text-extensions-for-pandas) library gives you all the tools you need to track this information.



In [5]:
# Two sentences, each described by a [begin, end) character range into `text`.
text = "This is a sentence. This is also a sentence."
begins = [0, 20]
ends = [19, 44]

# Wrap the offsets and their target text in a SpanArray and display it.
span_array = tp.SpanArray(text, begins, ends)
span_array

Unnamed: 0,begin,end,context
0,0,19,This is a sentence.
1,20,44,This is also a sentence.


In [37]:
# Pick the second span (the second sentence) as the region to examine.
selected_span = span_array[1]
selected_span

[20, 44): 'This is also a sentence.'

In [39]:
# Run a regular expression over just the text covered by the selected span.
# The resulting offsets are relative to `text_to_examine`, not to `text`.
text_to_examine = selected_span.covered_text
# NOTE(review): the name `regex` is rebound by a later `import regex` cell.
regex = re.compile("sentence")
results = pd.DataFrame({"word": tp.spanner.extract_regex(text_to_examine,
                                                         regex)})
results["word"].array

Unnamed: 0,begin,end,context
0,15,23,sentence


In [61]:
def unpack_semijoin(target_region: tp.Span,
                    model_results: pd.DataFrame) -> pd.DataFrame:
    """
    Unpack the results of evaluating an extraction model, such as
    dependency parsing or named entity recognition, using a semijoin
    strategy to reduce the amount of text over which the model is
    applied.

    To use :func:`unpack_semijoin`, first identify the regions of the text
    over which you wish to run the model. Then run the model over the text 
    of those regions to produce spans whose begin and end offsets are 
    relative to the text of each distinct target region. Then you can
    pass each region and its model results to this function to produce
    result spans whose begin and end offsets are relative to the original
    document text.

    :param target_region: Span indicating a section of the original
     document text over which the model was applied.
    :param model_results: Results from running your model over
     ``target_region``, as a :class:`pd.DataFrame`. Not modified.
    :returns: A :class:`pd.DataFrame` with the same schema as 
      ``model_results``, but with all spans converted from spans over
      the target text of ``target_region`` to spans over the original
      document text.
    """
    doc_text = target_region.target_text
    region_offset = target_region.begin

    # Make a copy of the DataFrame, then replace the span columns of the copy.
    result = model_results.copy()

    # Iterate over (column name, dtype) pairs instead of indexing
    # `model_results.dtypes[i]`: integer keys on a label-indexed Series are
    # deprecated in recent versions of Pandas.
    for column_name, column_dtype in model_results.dtypes.items():
        if isinstance(column_dtype, tp.SpanDtype):
            raw_spans = result[column_name].array
            # Shift the region-relative offsets by the region's begin offset
            # and retarget the spans at the full document text.
            result[column_name] = tp.SpanArray(
                doc_text, raw_spans.begin + region_offset,
                raw_spans.end + region_offset)
    return result

In [62]:
# Translate the region-relative match back to offsets into the original text.
remapped_results = unpack_semijoin(selected_span, results)
remapped_results["word"].array

Unnamed: 0,begin,end,context
0,35,43,sentence


In [53]:
# NOTE(review): looks like a leftover scratch cell; its output is not used
# elsewhere in the visible notebook. Consider deleting before publishing.
len(results.columns)

1

In [None]:
import numpy as np
# NOTE(review): looks like a leftover scratch cell (list concatenation check);
# consider deleting before publishing.
np.array([1, 2]).tolist() + [4]


Let's tie all of this discussion together with a concrete example. The example that we will use here comes from some of our previous articles. It involves finding the names and titles of executives in corporate press releases by looking for the pattern: *the article quotes a person by name and job title*.

For example:

![Example of the name and title of an executive in context](images/quote_name_title.png)

If a press release quotes someone by name and job title, that person is probably someone who's authorized to speak to the press on behalf of her employer --- in other words, an executive.

If you're interested in the details of this application, check out our previous articles, where we break down the use case into two parts:

[Part 1](https://medium.com/ibm-data-ai/market-intelligence-with-pandas-and-ibm-watson-a939323a31ea): cross-reference the `semantic_roles` and `entities` models from Watson Natural Language Understanding to find places where the article quotes a person by name.

[Part 2](https://medium.com/ibm-data-ai/using-pandas-dataframes-to-analyze-sentence-structure-53539ffbdf06): cross-reference the names from Part 1 with the output of the SpaCy dependency parser to match job titles with each of the names.


The code from part 1 is available in [this Github gist](https://gist.github.com/frreiss/038ac63ef20eed323a5637f9ddb2de8d), which defines a single Python function, `find_persons_quoted_by_name()`. As its name suggests, the function finds all the places where the document quotes a person by name. It returns a Pandas DataFrame containing all those locations. Here's what this return value looks like for an [example IBM press release](https://newsroom.ibm.com/2020-12-02-IBM-Named-a-Leader-in-the-2020-IDC-MarketScape-For-Worldwide-Advanced-Machine-Learning-Software-Platform):

In [None]:
# Run the Part 1 function on the example press release and display the result.
doc_url = "https://newsroom.ibm.com/2020-12-02-IBM-Named-a-Leader-in-the-2020-IDC-MarketScape-For-Worldwide-Advanced-Machine-Learning-Software-Platform"
persons_quoted_by_name = find_persons_quoted_by_name(doc_url, api_key, service_url)
persons_quoted_by_name

The first column in this DataFrame holds the location of each person name. We store these locations as *spans*. "Span" is an NLP term that refers to a region of a document, usually defined by a begin and end offset. For example, the span `[1288, 1304): 'Daniel Hernandez'` covers the region from offsets 1288 through 1304 of the press release.

Pandas does not have a built-in type for storing spans, so we use a Pandas extension type from our open source library, [Text Extensions for Pandas](https://ibm.biz/text-extensions-for-pandas), to store the spans in the first column of this DataFrame.

The code from part 2 is available in [this Github gist](https://gist.github.com/frreiss/a731438dda4ac948beca85d3fe167ff3) which defines a second Python function, `find_titles_of_persons()`. This function takes in the output from `find_persons_quoted_by_name()` and a SpaCy language model. It uses the language model's dependency parser to connect person names to job titles. Here's what the output of that function looks like on the same example document:

In [None]:
# Load the SpaCy language model, then run the Part 2 function on the output of
# `find_persons_quoted_by_name()` for the same example document.
spacy_language_model = spacy.load("en_core_web_trf")
persons_and_titles = find_titles_of_persons(persons_quoted_by_name, 
                                            spacy_language_model)
persons_and_titles

This function returns a DataFrame with a row for every title identified and two columns, `person` and `title`.


The low-level details of these two functions aren't too important for the present discussion.

What matters is that we are combining the results of three models: two Watson models in `find_persons_quoted_by_name` and a SpaCy language model in `find_titles_of_persons`. And one of those models is much more expensive than the other two.

We added some timing code to these two functions and fed 191 IBM press releases through them. Here's a high-level breakdown of where the running time went:

In [None]:
timings_df = pd.read_csv("ibm_press_release_timings.csv")  # See below for code to regenerate this CSV file

# Total seconds spent in each step, summed over all rows of the timings file.
sums = timings_df[["step_1_sec", "step_2_sec", "step_3_sec", "step_3a_sec", "step_4_sec"]].sum()

# Seconds spent combining results across documents.
# Manually pasted from output later on.
COMBINE_RESULTS_SEC = 0.062

# Version of timings that break things down into 5 parts
timings = {
    "Extract entities and semantic roles": sums["step_1_sec"],
    "Identify persons quoted by name": sums["step_2_sec"],
    "Perform dependency parsing": sums["step_3_sec"],
    "Extract titles of persons": sums["step_4_sec"],
    "Combine results across documents": COMBINE_RESULTS_SEC
}

# Simplified version
timings_short = {
    "entities and semantic_roles models": sums["step_1_sec"],
    "Dependency parser": sums["step_3_sec"],
    "Everything else": sums["step_2_sec"] + sums["step_4_sec"] + COMBINE_RESULTS_SEC
}

# Choose which breakdown to plot.
data = timings_short

# Pass concrete lists rather than dict views to matplotlib.
plt.pie(list(data.values()), labels=list(data.keys()), shadow=True)
plt.show()

We can break the running time into three parts:
* calling the Watson Natural Language Understanding web service to run its `entities` and `semantic_roles` models
* calling the SpaCy language model to run its dependency parser.
* everything else, including reading in the documents, cross-referencing the results of the three models, and merging results across documents

As you can see in the chart, the dependency parser is by far the most expensive part of this application, accounting for nearly 3/4 of the total running time. Dependency parsing has always been an expensive operation, and the latest generation of dependency parsers is even more CPU-intensive.

Let's 

takes a negligible amount of time


# Part 3: Make it fast with Pandas

In the first two parts of this series, we explained how to use Text Extensions for Pandas and IBM Watson Natural Language Understanding to implement an example Market Intelligence application of natural language processing. Our example task involved identifying names and titles of executives in corporate press releases.

Parts 1 and 2 of this series laid out this process in great detail, but we can summarize the high-level flow in four steps:

1. Use IBM Watson Natural Language Understanding to extract semantic roles and person mentions from the press release.
2. Use Text Extensions for Pandas to convert those model outputs to Pandas DataFrames. Then cross-reference the data in those DataFrames to find the places where the press release quoted a person by name.
3. Use SpaCy's dependency parser to extract information about the relationships between the words of the press release.
4. Use Text Extensions for Pandas to process the parser's output and identify the titles of the persons we had identified earlier.

In this part of the series, we're going to turn this four-step approach for identifying executives in a given press release into a program that mines this information from many press releases.

Conceptually, our processing pipeline will look like this:

![First version of our processing pipeline](images/pipeline_v1.png)


Press releases go in on the left, and each document goes through the four steps of processing we've described so far. Then a fifth step combines all the results of that processing into a table of names and titles of executives. 

Let's quickly review what the outputs of the first four steps of processing look like and how we surface these steps in our companion Python code.



*TODO: Insert here a description of how we share the Python file `market_intelligence.py`*

In [None]:
import market_intelligence as mi

We'll use one year's worth of press releases from [the IBM news room](https://newsroom.ibm.com/announcements) --- 191 press releases, to be exact.
This data set is just big enough to illustrate the performance differences between the different approaches we're about to explore.
A typical real-world market intelligence application would use thousands or millions of articles.

`market_intelligence.py` contains code to download and cache this small corpus.

In [None]:
# Fetch the corpus of press releases through the companion module.
articles = mi.maybe_download_articles()
articles

Let's recap the analysis steps we laid out in detail in parts 1 and 2, using the same example document as before. This document is one of the documents in our DataFrame `articles`:

In [None]:
# Look up the HTML of the example document by its URL, then render a version
# shortened with textwrap.shorten() (width 5000) inline.
example_doc_url = "https://newsroom.ibm.com/2021-01-04-IBM-Study-Majority-of-Surveyed-Companies-are-Not-Prepared-for-IT-Needs-of-the-Future-Say-U-S-and-U-K-Tech-Leaders"
example_doc_html = articles.loc[articles["url"] == example_doc_url, "html"].values[0]
display(HTML(textwrap.shorten(example_doc_html, 5000)))

The first processing step extracts named entities and semantic roles with IBM Watson Natural Language Understanding.


In [None]:
# Step 1: send the example document's HTML to Watson Natural Language
# Understanding through the shared client created in the initialization cell.
step_1_results = (
    mi.extract_named_entities_and_semantic_roles(example_doc_html, 
                                                 natural_language_understanding)
)
# Show only a shortened preview of the response.
textwrap.shorten(str(step_1_results), 1000)

The second processing step uses Text Extensions for Pandas to convert these model outputs into DataFrames, then uses these DataFrames to identify persons that the document quotes by name:

In [None]:
# Step 2: identify the persons that the document quotes by name.
step_2_results = mi.identify_persons_quoted_by_name(step_1_results)
step_2_results

The third processing step uses SpaCy to perform dependency parsing over the document, then uses Text Extensions for Pandas to convert the dependency parse into a DataFrame:

In [None]:
# Retrieve the detagged document text that Watson Natural Language Understanding
# produced from the original HTML document.
doc_text = step_1_results["analyzed_text"]

# Fire up SpaCy's deep learning-based dependency parser.
spacy_language_model = spacy.load("en_core_web_trf")

# Step 3: parse the document and display the first three rows of the result.
step_3_results = mi.perform_dependency_parsing(doc_text, spacy_language_model)
step_3_results.head(3)

The fourth processing step uses Text Extensions for Pandas to process the parser's output and identify the titles of the persons we had identified back in step 2.

In [None]:
# Step 4: combine the persons from step 2 with the parse from step 3 to find
# each person's title.
step_4_results = mi.extract_titles_of_persons(step_2_results, step_3_results)
step_4_results

We can roll up these four steps into a single Python function:

In [None]:
def steps_1_through_4(doc_html: str) -> pd.DataFrame:
    """
    Run processing steps 1 through 4 over a single document.

    Uses the notebook-level names ``nlu_api`` and ``spacy_language_model``,
    which must be defined before this function is called.

    :param doc_html: HTML of the document to process.
    :returns: The output of ``mi.extract_titles_of_persons()`` for the document.
    """
    # Step 1: named entities and semantic roles
    nlu_response = mi.extract_named_entities_and_semantic_roles(doc_html, nlu_api)

    # Step 2: persons quoted by name
    quoted_persons = mi.identify_persons_quoted_by_name(nlu_response)

    # Step 3: dependency parse of the text that step 1 analyzed
    parse_features = mi.perform_dependency_parsing(
        nlu_response["analyzed_text"], spacy_language_model)

    # Step 4: titles of the persons found in step 2
    return mi.extract_titles_of_persons(quoted_persons, parse_features)

Now we've summarized the four steps that make up the processing from Part 1 and Part 2. Let's talk about the fifth step: **combining results across documents**.

We can implement our whole pipeline by running steps 1-4 in a for loop over the `html` column of our DataFrame of articles, building up a list of DataFrames, then stacking all the DataFrames into a single table of executives and titles:

In [None]:
# Replicate initialization code in a separate cell so other code can use it when we
# don't run the 15-minute cell that follows.
# Defines the notebook-level names `nlu_api` and `spacy_language_model`.
nlu_api = ibm_watson.NaturalLanguageUnderstandingV1(version="2021-01-01", 
            authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key))
nlu_api.set_service_url(service_url)
spacy_language_model = spacy.load("en_core_web_trf")

In [None]:
%%time

# Initialization is repeated here, so model loading is included in the
# measured time of this cell.
nlu_api = ibm_watson.NaturalLanguageUnderstandingV1(version="2021-01-01", 
            authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key))
nlu_api.set_service_url(service_url)
spacy_language_model = spacy.load("en_core_web_trf")

# Repeat steps 1-4 on every document
dataframes_to_stack = [
    steps_1_through_4(doc_html) for doc_html in articles["html"]
]

# Step 5: Merge the results across documents
step_5_results = pd.concat(dataframes_to_stack)
step_5_results

This loop takes about 15 minutes. That's a tolerable amount of time for this small collection of documents, but we will need better performance to work at more realistic scales.

In [None]:
# Temporary
# NOTE(review): development aid that re-imports `market_intelligence` so that
# edits to the module are picked up. Remove before publishing.
import importlib
mi = importlib.reload(mi)

We can use this for loop 

If we insert a few timing measurements into this code, we can determine the amount of time the code spent in each step across these 190 articles. 

TODO: Insert graph of timings (see previous cell)

the dominant cost is running the deep parser

obviously we can make that step run faster with parallelism

but it's a good idea to exhaust the easy options for good single-threaded performance first

so let's make the parsing step go faster using Pandas

## Using Pandas to make dependency parsing faster
Observation: We only use the portion of the dependency parse that is reachable from the person mentions.

Observation: Each sentence of the document produces a disjoint parse tree.

Observation: Each sentence is an independent unit of work for the parser.

This means that we can parse just the sentences containing the person mentions and get the same answer as if we parsed every sentence. And parsing just one sentence is much faster.

Finding sentence boundaries without parsing can be tricky, but fortunately we don't need to find every sentence boundary with 100 percent accuracy to make this approach work. We just need to find enough accurate sentence boundaries to eliminate most of the document.

For this data set, we have reliable paragraph metadata. The source documents are HTML files with paragraphs denoted by HTML `<p>` tags. Watson Natural Language Understanding's HTML to text conversion reliably turns the paragraph tags into newline characters.

With a few lines of Python, we can generate a `SpanArray` with one span for each paragraph in the document:

In [None]:
import regex
# One or more consecutive newlines mark a paragraph break. Compiled with the
# standard library's `re` module, which the initialization cell imports.
paragraph_break_re = re.compile(r"\n+")

def find_paragraph_spans(doc_text: str):
    """
    Split a document into paragraphs on runs of newline characters.

    :param doc_text: Text of the document.
    :returns: A ``tp.SpanArray`` over ``doc_text`` with one span per non-empty
     paragraph. The newline characters themselves are not covered by any span.
    """
    # Find paragraph boundaries. Calling finditer() on the compiled pattern
    # avoids depending on whatever the global name `regex` is bound to.
    break_locs = [(a.start(), a.end()) 
                  for a in paragraph_break_re.finditer(doc_text)]
    # A zero-length boundary at the end of the document closes the last paragraph.
    boundaries = break_locs + [(len(doc_text), len(doc_text))]
    
    # Split the document on paragraph boundaries
    begins = []
    ends = []
    begin = 0
    for b in boundaries:
        end = b[0]
        if end > begin:  # Ignore zero-length paragraphs
            begins.append(begin)
            ends.append(end)
        begin = b[1]
    return tp.SpanArray(doc_text, begins, ends)

The output of `find_paragraph_spans()` is a `SpanArray` that is ready for processing with Pandas.
It can even render itself as HTML.

In [None]:
# Small example with a leading newline, a single newline, and a double newline
# between paragraphs.
example_text = """
This is a the first paragraph.
Second paragraph here. 

Third paragraph comes after two newlines."""

find_paragraph_spans(example_text)


We'll use [document number 3](https://newsroom.ibm.com/2020-02-19-IBM-Power-Systems-Certified-for-SAP-HANA-R-Enterprise-Cloud-as-a-provider-for-large-SAP-HANA-systems) as an example here, because it contains quotes from more than one person.

In [None]:
# Pick a document that quotes more than one executive.
doc_html = articles.iloc[3]["html"]
# Re-run steps 1 and 2 on this document; this overwrites the earlier
# `step_1_results` and `step_2_results` variables.
step_1_results = mi.extract_named_entities_and_semantic_roles(doc_html, nlu_api)
step_2_results = mi.identify_persons_quoted_by_name(step_1_results)   
step_2_results

The `mi.perform_dependency_parsing()` function that we've been using so far is actually only one line of Python code:
```python
def perform_dependency_parsing(doc_text: str, spacy_language_model):
    """
    First phase of processing from the second part of the series.
    
    Parses a document using SpaCy's dependency parser, then converts the
    outputs of the parser into a Pandas DataFrame using Text Extensions for Pandas.
    """
    return (
        tp.io.spacy.make_tokens_and_features(doc_text, spacy_language_model)
        [["id", "span", "tag", "dep", "head"]])
```

Let's pull out that one line and run it on our example document:

In [None]:
# Text of the document, as returned by step 1 under "analyzed_text".
doc_text = step_1_results["analyzed_text"]
spacy_language_model = spacy.load("en_core_web_trf")

# Parse the document.
# Keep only the columns that the later pipeline steps use.
parse_features = tp.io.spacy.make_tokens_and_features(
            doc_text, spacy_language_model)[["id", "span", "tag", "dep", "head"]]
parse_features

To restrict the regions of the document that we parse, we need to identify the paragraphs containing potential person names. The `contain_join` span operation in Text Extensions for Pandas handles the heavy lifting by finding all pairs of paragraph and person spans where the paragraph contains the person.

In [None]:
# Find the paragraphs of the document that contain at least one quoted person.
paragraph_spans = find_paragraph_spans(doc_text)
covered_paragraphs = (
    # Pair up every paragraph span with each person span that it contains.
    tp.spanner.contain_join(pd.Series(paragraph_spans), step_2_results["person"],
                            "paragraph", "person")
    # Only the paragraph side of each pair is needed.
    ["paragraph"]
    .drop_duplicates()  # In case 2 persons are in the same paragraph
    .array
)
covered_paragraphs

Now we can tokenize and parse one paragraph at a time and convert the parse tree to a DataFrame by calling the Text Extensions for Pandas function `make_tokens_and_features()`:

In [None]:
# Use the first covered paragraph as a worked example.
paragraph_span = covered_paragraphs[0]

# Parse only the text of this paragraph. The resulting token spans are
# relative to the paragraph text, not to the full document.
paragraph_text = paragraph_span.covered_text
paragraph_tokens = tp.io.spacy.make_tokens_and_features(
    paragraph_text, spacy_language_model
    )[["id", "span", "tag", "dep", "head"]]
paragraph_tokens

Of course, the tokens in the above DataFrame are in the context of the paragraph, not the document. We need to translate all the spans of the tokens so that they map to the full document. This operation is easy to do with Pandas: just add a fixed offset and change the target text back to the original document's text:

In [None]:
# Re-target the token spans from the paragraph text to the full document text
# by shifting every begin/end offset by the paragraph's begin offset.
# NOTE(review): this cell overwrites paragraph_tokens["span"] in place, so it is
# not idempotent -- running it a second time without re-running the previous
# cell would add the paragraph offset again.
span_array_before = paragraph_tokens["span"].array
paragraph_tokens["span"] = tp.SpanArray(paragraph_span.target_text,
                                        paragraph_span.begin + span_array_before.begin,
                                        paragraph_span.begin + span_array_before.end) 
paragraph_tokens

We'll also want to add a fixed offset to all the integer IDs so that we can stack multiple paragraphs' parse tree nodes together. Again, Pandas makes this easy:

In [None]:
# Arbitrary offset, used here only to illustrate the technique.
example_offset = 100

# Work on a copy so that paragraph_tokens itself is left unchanged.
to_stack = paragraph_tokens.copy()
# Shift the token IDs, the head pointers that reference those IDs, and the
# DataFrame index by the same amount so that they stay consistent.
to_stack["id"] += example_offset
to_stack["head"] += example_offset
to_stack.index += example_offset
to_stack

`market_intelligence.py` contains a Python function, `perform_targeted_dependency_parsing()`, that rolls up the code from the previous few paragraphs into a single step:

In [None]:
# Development aid: re-import the `mi` module so that edits to its source file
# are picked up without restarting the kernel. Requires that `mi` has already
# been imported by an earlier cell.
import importlib
mi = importlib.reload(mi)

In [None]:
# Single-call version of the preceding cells: dependency parsing restricted to
# the parts of the document around the quoted persons' spans.
targeted_parse_features = mi.perform_targeted_dependency_parsing(step_2_results["person"], 
                                                                 spacy_language_model)
targeted_parse_features

The output looks the same as what the original `perform_dependency_parsing` function produced, except that there is only output covering the relevant paragraphs of the document -- 190 tokens versus the original 680.

In [None]:
# For comparison: parse the entire document with the original function.
parse_features = mi.perform_dependency_parsing(doc_text,
                                               spacy_language_model)
parse_features

If we feed this smaller DataFrame into the remaining processing steps in our pipeline, we get the same result as before:

In [None]:
# Before
results_before = mi.extract_titles_of_persons(step_2_results, parse_features)
results_before

In [None]:
# After
results_with_targeted_parsing = mi.extract_titles_of_persons(step_2_results, 
                                                             targeted_parse_features)
results_with_targeted_parsing

Now we can create a faster version of our `steps_1_through_4()` function by replacing the call to `mi.perform_dependency_parsing` with `mi.perform_targeted_dependency_parsing`:

In [None]:
# NOTE: The blog version of this should show the before and after versions of
#  just the line that changes.

def steps_1_through_4(doc_html: str) -> pd.DataFrame:
    """
    Run the first four steps of the pipeline on a single document, using
    targeted dependency parsing for step 3.

    Relies on the notebook-level variables `nlu_api` and
    `spacy_language_model`.

    :param doc_html: HTML of the document to analyze.
    :returns: The output of `mi.extract_titles_of_persons()` for the document.
    """
    # Steps 1 and 2: find the persons that the document quotes by name.
    nlu_output = mi.extract_named_entities_and_semantic_roles(doc_html, nlu_api)
    quoted_persons = mi.identify_persons_quoted_by_name(nlu_output)

    # Step 3: targeted parsing, driven by the spans of the quoted persons.
    targeted_parse = mi.perform_targeted_dependency_parsing(
        quoted_persons["person"], spacy_language_model)

    # Step 4: use the parse to extract each quoted person's title.
    return mi.extract_titles_of_persons(quoted_persons, targeted_parse)

Then we can rerun our original `for` loop.

In [None]:
%%time

# Probably don't want to show this code in the blog version, as it doesn't change

# (Re)create the Watson NLU client and the SpaCy language model that
# steps_1_through_4() reads from the notebook's global namespace.
nlu_api = ibm_watson.NaturalLanguageUnderstandingV1(version="2021-01-01", 
            authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key))
nlu_api.set_service_url(service_url)
spacy_language_model = spacy.load("en_core_web_trf")


# Repeat the improved version of steps 1-4 on every document
dataframes_to_stack = [
    steps_1_through_4(doc_html) for doc_html in articles["html"]
]

# Step 5: Merge the results across documents
step_5_results = pd.concat(dataframes_to_stack)
step_5_results

In [None]:
# Stacked bar chart of the total running time of each pipeline step, before
# (full-document parsing) and after (targeted parsing) the optimization.
# TODO: Pretty up this plot
import numpy as np

# Total seconds per step across all documents. numeric_only=True skips
# non-numeric columns (such as document URLs) instead of trying to sum them.
sums = timings_df.sum(numeric_only=True)

# Each entry is [seconds before, seconds after]. Only the dependency parsing
# step differs between the two versions of the pipeline.
timings = {
    "Extract entities and semantic roles": [sums["step_1_sec"], sums["step_1_sec"]],
    "Identify persons quoted by name": [sums["step_2_sec"],sums["step_2_sec"]],
    "Perform dependency parsing": [sums["step_3_sec"], sums["step_3a_sec"]],
    "Extract titles of persons": [sums["step_4_sec"],sums["step_4_sec"]],
    # NOTE(review): hard-coded; presumably copied from the "Step 5 took ..."
    # message printed by the benchmark cell -- confirm.
    "Combine results across documents": [0.062, 0.062]
}

print(f"Total time before: {sum([v[0] for v in timings.values()])}")
print(f"Total time after: {sum([v[1] for v in timings.values()])}")

fig, ax = plt.subplots()
cur_sum = np.zeros(2)  # Running height of the "before" and "after" stacks
for k, v in timings.items():
    ax.bar([0, 1], v, bottom=cur_sum, label=k, width=0.7)
    cur_sum += np.array(v)

ax.set_xlim([-1, 4])  # Leave empty space on the right for the legend
ax.set_xticks([0, 1])
ax.set_xticklabels(["Before", "After"])
ax.set_ylabel("Total time (sec)")

# Bars are stacked bottom-to-top in insertion order, but a legend lists its
# entries top-to-bottom. Reverse the entries so the legend matches the bars.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1])
plt.show()

After that one-line change, the time spent on dependency parsing across our 190 documents drops from 641 seconds to 72 seconds -- a 9x performance increase!

Overall processing time has gone from 852 seconds to 283 seconds for an end-to-end improvement of 3x.

TODO: Conclusion goes here.

In [None]:
# TODO: Don't include this cell in the main flow, but do include the graph that the next cell 
# produces.


# Benchmark: time each pipeline step on every document and cache the timings in
# "ibm_press_release_timings.csv". The whole benchmark is skipped when that
# file already exists.
# NOTE(review): this cell never defines `timings_df`; the plotting cell that
# reads `timings_df` must get it from somewhere else (presumably by loading
# this CSV file) -- confirm.
if not os.path.exists("ibm_press_release_timings.csv"):
    # Cached timing data not present. Regenerate it.
    benchmark_api_key = api_key
    benchmark_service_url = service_url
    #benchmark_api_key = os.environ.get("STANDARD_API_KEY")
    #benchmark_service_url = os.environ.get("STANDARD_SERVICE_URL")  

    nlu_api = ibm_watson.NaturalLanguageUnderstandingV1(version="2021-01-01", 
                authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(benchmark_api_key))
    nlu_api.set_service_url(benchmark_service_url)
    spacy_language_model = spacy.load("en_core_web_trf")

    dataframes_to_stack = []
    timings = []  # One record of per-step wall-clock times per document

    article_subset = articles

    for url, html in zip(article_subset["url"], article_subset["html"]):

        # Take a timestamp after every step; each step's duration is the
        # difference between its timestamp and the preceding one.
        start_time = time.time()

        step_1_results = mi.extract_named_entities_and_semantic_roles(html, nlu_api)
        step_1_time = time.time()

        step_2_results = mi.identify_persons_quoted_by_name(step_1_results)
        step_2_time = time.time()

        # Step 3: original approach, parsing the entire document.
        step_3_results = mi.perform_dependency_parsing(step_1_results["analyzed_text"],
                                                       spacy_language_model)
        step_3_time = time.time()
        
        # Step 3a: targeted parsing. Run here only to measure its running
        # time; its output is not passed on to step 4.
        step_3a_results = mi.perform_targeted_dependency_parsing(step_2_results["person"],
                                                       spacy_language_model)
        step_3a_time = time.time()

        # Step 4 consumes the full-document parse from step 3.
        step_4_results = mi.extract_titles_of_persons(step_2_results, step_3_results)
        step_4_time = time.time()

        dataframes_to_stack.append(step_4_results)
        timings.append({
            "url": url,
            "step_1_sec": step_1_time - start_time,
            "step_2_sec": step_2_time - step_1_time,
            "step_3_sec": step_3_time - step_2_time,
            "step_3a_sec": step_3a_time - step_3_time,
            # Step 4 starts when step 3a finishes, so measure from step_3a_time.
            "step_4_sec": step_4_time - step_3a_time
        })

    # Step 5 runs once for the whole corpus. Its time is only printed; it is
    # not stored in the CSV file.
    step_5_start_time = time.time()
    step_5_results = pd.concat(dataframes_to_stack)
    step_5_time = time.time() - step_5_start_time
    print(f"Step 5 took {step_5_time:1.3f} sec.")
    
    pd.DataFrame.from_records(timings).to_csv("ibm_press_release_timings.csv")


In [None]:
# alt_timings = []
# #for doc_num in range(len(article_htmls)):
# for doc_num in range(3):    
#     doc_html = article_htmls[doc_num]
    
#     start_time = time.time()
    
#     step_1a_html_results = mi.extract_named_entities(doc_html, nlu_api)
#     step_1a_html_time = time.time()
    
#     doc_text = step_1a_html_results["analyzed_text"]
    
#     step_1a_results = mi.extract_named_entities(doc_text, nlu_api)
#     step_1a_time = time.time()
    
#     step_1b_results = mi.extract_semantic_roles(doc_html, nlu_api)
#     step_1b_time = time.time()
    
#     alt_timings.append({
#         "doc": doc_num,
#         "step_1a_html_sec": step_1a_html_time - start_time,
#         "step_1a_sec": step_1a_time - step_1a_html_time,
#         "step_1b_sec": step_1b_time - step_1a_time,
#     })

# alt_timings_df = pd.DataFrame.from_records(alt_timings)
# alt_timings_df

In [None]:
# Another pass to time the NLU models separately
# timings = []

# for doc_num in range(len(article_htmls)):
# #for doc_num in range(3):
#     doc_html = article_htmls[doc_num]
    
#     start_time = time.time()
    
#     step_1a_results = mi.extract_named_entities(doc_html, nlu_api)
#     step_1a_time = time.time()
    
#     step_1b_results = mi.extract_semantic_roles(doc_html, nlu_api)
#     step_1b_time = time.time()
    
#     timings.append({
#         "doc": doc_num,
#         "step_1a_sec": step_1a_time - start_time,
#         "step_1b_sec": step_1b_time - step_1a_time,
#     })

# timings_df = timings_df.merge(pd.DataFrame.from_records(timings))
# timings_df["step_1_diff"] = (timings_df["step_1a_sec"] + timings_df["step_1b_sec"]
#                              - timings_df["step_1_sec"])
# step_5_results