In [1]:
# Import Python libraries
from typing import *
import os
#import ibm_watson
#import ibm_watson.natural_language_understanding_v1 as nlu
#import ibm_cloud_sdk_core
import pandas as pd
import spacy
import sys
from IPython.core.display import display, HTML
import textwrap

# And of course we need the text_extensions_for_pandas library itself.
_PROJECT_ROOT = "../.."
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError:
    # If we're running from within the project source tree and the parent Python
    # environment doesn't have the text_extensions_for_pandas package, use the
    # version in the local source tree.
    if not os.getcwd().endswith("market"):
        raise
    if _PROJECT_ROOT not in sys.path:
        sys.path.insert(0, _PROJECT_ROOT)
    import text_extensions_for_pandas as tp

# Check that the SpaCy model is installed. The loaded model is discarded here;
# later cells load it again when they need it.
try:
    spacy.load("en_core_web_trf")
except IOError:
    raise IOError("SpaCy dependency parser not found. Please run "
                  "'python -m spacy download en_core_web_trf', then "
                  "restart JupyterLab.")


# Fail fast if the Watson credentials are missing, instead of failing later
# in the middle of a call to the service.
if "IBM_API_KEY" not in os.environ:
    raise ValueError("IBM_API_KEY environment variable not set. Please create "
                     "a free instance of IBM Watson Natural Language Understanding "
                     "(see https://www.ibm.com/cloud/watson-natural-language-understanding) "
                     "and set the IBM_API_KEY environment variable to your instance's "
                     "API key value.")
if "IBM_SERVICE_URL" not in os.environ:
    raise ValueError("IBM_SERVICE_URL environment variable not set. Please set "
                     "the IBM_SERVICE_URL environment variable to the service URL "
                     "of your IBM Watson Natural Language Understanding instance.")

# Both values are passed to find_persons_quoted_by_name(), which creates the
# Watson Natural Language Understanding client.
api_key = os.environ["IBM_API_KEY"]
service_url = os.environ["IBM_SERVICE_URL"]
# Screenshots of this notebook should be this wide: ----------------------------->

In [2]:
# Code from the Github gist at https://gist.github.com/frreiss/038ac63ef20eed323a5637f9ddb2de8d
# Be sure to update this cell if the gist changes!

import text_extensions_for_pandas as tp
import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu
import ibm_cloud_sdk_core

def find_persons_quoted_by_name(doc_url, api_key, service_url) -> pd.DataFrame:
    """
    Find the persons that a web document quotes by name.

    Sends the document at `doc_url` to Watson Natural Language Understanding,
    then keeps the person mentions that are contained within the subject of a
    "say" action.

    :param doc_url: URL of the document to analyze. Passed to the service,
     which fetches the document itself.
    :param api_key: API key for the Watson Natural Language Understanding
     instance.
    :param service_url: Service URL of the Watson Natural Language
     Understanding instance.

    :returns: A DataFrame with one row per matched person mention and two
     columns, "person" (span of the person's name) and "url" (the value of
     `doc_url`).
    """
    # Ask Watson Natural Language Understanding to run its "semantic_roles"
    # and "entities" models.
    natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(
        version="2021-01-01",
        authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)
    )
    natural_language_understanding.set_service_url(service_url)
    nlu_results = natural_language_understanding.analyze(
        url=doc_url,
        return_analyzed_text=True,
        features=nlu.Features(
            entities=nlu.EntitiesOptions(mentions=True),
            semantic_roles=nlu.SemanticRolesOptions())).get_result()

    # Convert the output of Watson Natural Language Understanding to DataFrames.
    dataframes = tp.io.watson.nlu.parse_response(nlu_results)
    entity_mentions_df = dataframes["entity_mentions"]
    semantic_roles_df = dataframes["semantic_roles"]

    # Extract mentions of person names
    person_mentions_df = entity_mentions_df[entity_mentions_df["type"] == "Person"]

    # Extract instances of subjects that made statements
    quotes_df = semantic_roles_df[semantic_roles_df["action.normalized"] == "say"]

    # Retrieve the full document text from the entity mentions output.
    doc_text = entity_mentions_df["span"].array.document_text

    # Filter down to just the rows and columns we're interested in
    subjects_df = quotes_df[["subject.text"]].copy().reset_index(drop=True)

    # Use String.index() to find where the strings in "subject.text" begin.
    # NOTE(review): index() returns the FIRST occurrence of each subject string
    # and raises ValueError if the string is absent, so a subject whose text is
    # repeated in the document is always mapped to its first location.
    subjects_df["begin"] = pd.Series(
        [doc_text.index(s) for s in subjects_df["subject.text"]], dtype=int)

    # Compute end offsets and wrap the <begin, end, text> triples in a SpanArray column
    subjects_df["end"] = subjects_df["begin"] + subjects_df["subject.text"].str.len()
    subjects_df["span"] = tp.SpanArray(doc_text, subjects_df["begin"], subjects_df["end"])

    # Align subjects with person names
    execs_df = tp.spanner.contain_join(subjects_df["span"],
                                       person_mentions_df["span"],
                                       "subject", "person")
    # Add on the document URL.
    execs_df["url"] = doc_url
    return execs_df[["person", "url"]]

# Part 2: Analyzing SpaCy parse trees with Text Extensions for Pandas

*Dependency parsing* is a natural language processing technique that identifies the relationships between the words that make up a sentence. We can treat these relationships between a sentence's words as the edges of a graph. 

For example, here's the graph that a dependency parser produces for the sentence, "I like natural language processing":
![Parse tree for the sentence "I like natural language processing"](images/parse_tree.png)

In [3]:
# Do not include this cell in the blog post.
# Code to generate the above image
import spacy

# Load the SpaCy pipeline. The next code cell reuses `spacy_language_model`.
spacy_language_model = spacy.load("en_core_web_trf")
# Convert the parse of the example sentence into a DataFrame of token features.
token_features = tp.io.spacy.make_tokens_and_features(
    "I like natural language processing.", spacy_language_model)
# Draw the parse tree as the output of this cell.
tp.io.spacy.render_parse_tree(token_features)

This graph is always a tree, so we refer to it as the *dependency-based parse tree* of the sentence. We often shorten the phrase "dependency-based parse tree" to **dependency parse** or **parse tree**.

Every word in the sentence (including the period at the end) becomes a node of the parse tree:
![Parse tree for the sentence "I like natural language processing". Each word of the sentence becomes a node of the tree.](images/parse_tree_nodes.png)

The most important verb in the sentence
becomes the root of the tree. We call this root node the *head* node. In this example, the head node is the verb "like".

Edges in the tree connect pairs of related words:
![Parse tree for the sentence "I like natural language processing". Relationships between words form the edges of the tree.](images/parse_tree_edges.png)

Each edge is tagged with information about why the words are related. For example, the first two words in the sentence, "I" and "like", have a `nsubj` relationship. That means that the pronoun "I" is the subject for the verb "like".

Dependency parsing is useful because it lets you solve many business problems with very little code. The parser acts as a sort of universal machine learning model. The output of the parser is much easier to filter and manipulate with code, compared with the original text.

# An enterprise application of dependency parsing

In a [previous post](https://medium.com/@fred.reiss/market-intelligence-with-pandas-and-ibm-watson-natural-language-understanding-a939323a31ea), we showed how to use Watson Natural Language Understanding to find places where a corporate press release quotes an executive by name. In this article, we'll use dependency parsing to associate those names with **job titles**.

A person's title is a valuable piece of context. The title can tell you whether the person is an important decision maker. Titles can tell you the relationships between different employees at a company. By looking at how titles change over time, you can reconstruct a person's job history. There are many applications of job title information.

In [4]:
# Don't include this cell in the blog

# Code to generate parse tree of entire sentence
# Take a screenshot at 25% to create the png version.
# The trailing backslashes join the lines below into a single-line string.
quote_text = '''\
"Equal access to skills and jobs is the key to unlocking economic \
opportunity and prosperity for diverse populations," said Valinda Scarbro \
Kennedy, HBCU Program Lead, IBM Global University Programs.'''

# Reuses the `spacy_language_model` that an earlier cell loaded.
tp.io.spacy.render_parse_tree(tp.io.spacy.make_tokens_and_features(
    quote_text, spacy_language_model))

Here's an example snippet from an [IBM press release](https://newsroom.ibm.com/2021-01-05-IBM-Provides-Harris-Stowe-State-University-with-2-Million-in-Artificial-Intelligence-and-Open-Hybrid-Cloud-Technology-Resources-to-Help-Students-Build-Modern-Skills) from January 2021:

!["Equal access to skills and jobs is the key to unlocking economic 
opportunity and prosperity for diverse populations," said Valinda Scarbro 
Kennedy, HBCU Program Lead, IBM Global University Programs.](images/quote.png)

This sentence is 29 words long, so the entire parse tree is a bit daunting...

![Dependency parse of the example sentence](images/quote_parse_tree.png)

But if we zoom in on just the phrase, "Valinda Scarbro Kennedy, HBCU Program Lead," some structure becomes clear:

![Parse tree nodes for Valinda Scarbro Kennedy's name and her job title are tied together by relationships of type `compound` and `appos`](images/name_title_relationship.png)


All the nodes that make up Valinda Kennedy's name and her job title are connected together by edges of two specific types, `appos` and `compound`. These edge types come from a standard called the [Universal Dependencies](https://universaldependencies.org/) framework.

[`compound`](https://universaldependencies.org/docs/en/dep/compound.html) edges connect together nouns that are part of a [compound noun](https://learningenglish.voanews.com/a/compound-nouns/4706032.html). In this example, both the name "Valinda Scarbro Kennedy" and the title "HBCU Program Lead" are compound nouns.

`appos` is short for "[appositional modifier](https://universaldependencies.org/docs/en/dep/appos.html)", or [appositive](https://owl.purdue.edu/owl/general_writing/grammar/appositives.html). This label connects pairs of nouns where one noun modifies the other. In the example above, Valinda Kennedy's job title is an appositive for her name. The dependency parse includes an `appos` edge between the head words of these two noun phrases.

What we see here is a general pattern that happens whenever a person's job title occurs as an appositive for that person's name. The name and title form a subgraph of the parse tree that is connected only by edges of the two types `appos` and `compound`.

To put it another way, the nodes of the job title are *exactly* those nodes that are reachable from nodes of the name by following `appos` or `compound` links. So we can find the job title by [**transitive closure**](https://en.wikipedia.org/wiki/Transitive_closure#In_graph_theory). "Transitive closure" is a complicated term, but the algorithm is simple:

1. Start with the set of nodes that make up the name
2. Look for nodes that are connected to nodes of your set via an `appos` or `compound` link. Add those nodes to your set.
3. Repeat step 2 until your set of nodes stops growing.

We can implement this algorithm with Pandas DataFrames.

# Transitive closure with Pandas

We're going to use Pandas to match person names with job titles. The first thing we'll need is the locations of the person names. In our previous post, we created a function `find_persons_quoted_by_name()` that finds all the people that a news article quotes by name. If you're curious, you can find the source code [here](https://gist.github.com/frreiss/038ac63ef20eed323a5637f9ddb2de8d). The function produces a DataFrame with the location of each person name. Here's the output when you run the function over an example press release:

In [5]:
# Example press release to analyze.
doc_url = "https://newsroom.ibm.com/2020-02-19-IBM-Power-Systems-Certified-for-SAP-HANA-R-Enterprise-Cloud-as-a-provider-for-large-SAP-HANA-systems"
# `api_key` and `service_url` come from the environment variables read in the
# first cell of this notebook.
persons_df = find_persons_quoted_by_name(doc_url, api_key, service_url)
persons_df

Unnamed: 0,person,url
0,"[1213, 1229): 'Christoph Herman'",https://newsroom.ibm.com/2020-02-19-IBM-Power-...
1,"[2227, 2242): 'Stephen Leonard'",https://newsroom.ibm.com/2020-02-19-IBM-Power-...


The second thing we will need is a parse tree.  We'll use the dependency parser from the [SpaCy](https://spacy.io) NLP library. Our open source library [Text Extensions for Pandas](https://ibm.biz/text-extensions-for-pandas) can convert the output of this parser into a DataFrame:

In [6]:
import spacy

# The original document had HTML tags. Get the detagged document text.
# The text is carried by the span array that backs the "person" column.
doc_text = persons_df["person"].array.document_text

# Run dependency parsing on the text and convert the parse to a DataFrame
spacy_language_model = spacy.load("en_core_web_trf")
all_token_features = tp.io.spacy.make_tokens_and_features(doc_text, spacy_language_model)

# Drop the columns we won't need for this analysis
tokens_df = all_token_features[["id", "span", "tag", "dep", "head", "sentence"]]
tokens_df

Unnamed: 0,id,span,tag,dep,head,sentence
0,0,"[0, 6): 'ARMONK'",NNP,ROOT,0,"[0, 43): 'ARMONK, N.Y., Feb. 19, 2020 /PRNewsw..."
1,1,"[6, 7): ','",",",punct,0,"[0, 43): 'ARMONK, N.Y., Feb. 19, 2020 /PRNewsw..."
2,2,"[8, 12): 'N.Y.'",NNP,appos,0,"[0, 43): 'ARMONK, N.Y., Feb. 19, 2020 /PRNewsw..."
3,3,"[12, 13): ','",",",punct,0,"[0, 43): 'ARMONK, N.Y., Feb. 19, 2020 /PRNewsw..."
4,4,"[14, 18): 'Feb.'",NNP,npadvmod,0,"[0, 43): 'ARMONK, N.Y., Feb. 19, 2020 /PRNewsw..."
...,...,...,...,...,...,...
675,675,"[3787, 3798): 'explanation'",NN,pobj,672,"[3751, 3810): 'ii See footnote 1 for more deta..."
676,676,"[3798, 3799): ''",_SP,punct,669,"[3751, 3810): 'ii See footnote 1 for more deta..."
677,677,"[3799, 3805): 'SOURCE'",NN,dep,669,"[3751, 3810): 'ii See footnote 1 for more deta..."
678,678,"[3806, 3809): 'IBM'",NNP,appos,677,"[3751, 3810): 'ii See footnote 1 for more deta..."


This `tokens_df` DataFrame contains one row for every *token* in the document. The term "token" here refers to a part of the document that is a word, an abbreviation, or a piece of punctuation. The columns "id", "dep" and "head" encode the edges of the parse tree. Since we're going to be analyzing the parse tree, it's more convenient to have the nodes and edges in separate DataFrames. So let's split `tokens_df` into DataFrames of nodes and edges:

In [7]:
# Nodes: one row per token, with the token's id, span, and tag.
nodes_df = tokens_df[["id", "span", "tag"]].reset_index(drop=True)
# Edges: one row per token, linking the token's "id" to its "head", labeled
# with the dependency type "dep".
edges_df = tokens_df[["id", "head", "dep"]].reset_index(drop=True)

In [8]:
# Display the nodes of the parse tree.
nodes_df

Unnamed: 0,id,span,tag
0,0,"[0, 6): 'ARMONK'",NNP
1,1,"[6, 7): ','",","
2,2,"[8, 12): 'N.Y.'",NNP
3,3,"[12, 13): ','",","
4,4,"[14, 18): 'Feb.'",NNP
...,...,...,...
675,675,"[3787, 3798): 'explanation'",NN
676,676,"[3798, 3799): ''",_SP
677,677,"[3799, 3805): 'SOURCE'",NN
678,678,"[3806, 3809): 'IBM'",NNP


In [9]:
# Display the edges of the parse tree.
edges_df

Unnamed: 0,id,head,dep
0,0,0,ROOT
1,1,0,punct
2,2,0,appos
3,3,0,punct
4,4,0,npadvmod
...,...,...,...
675,675,672,pobj
676,676,669,punct
677,677,669,dep
678,678,677,appos


Our transitive closure will start with the nodes that are parts of target person names. To find these nodes, we need to match the person names in `persons_df` with tokens in `nodes_df`.

The "person" column of `persons_df` and the "span" column in `nodes_df` both hold *span* data. Spans are a common concept in natural language processing. A span represents a region of the document, usually as begin and end offsets and a reference to the document's text. The span data in these two DataFrames is stored using the `SpanDtype` extension type from Text Extensions for Pandas. Text Extensions for Pandas also includes functions for manipulating span data. We can use one of these functions, `overlap_join()`, to find all the places where a token from `nodes_df` overlaps with a person name from `persons_df`:

In [11]:
# Pair each person name with the tokens that overlap it, then attach the
# matching rows of `nodes_df`. merge() is called without `on`, so it joins on
# the column names the two DataFrames share.
person_nodes_df = (
    tp.spanner.overlap_join(persons_df["person"], nodes_df["span"],
                            "person", "span")
    .merge(nodes_df)
)
person_nodes_df

Unnamed: 0,person,span,id,tag
0,"[1213, 1229): 'Christoph Herman'","[1213, 1222): 'Christoph'",224,NNP
1,"[1213, 1229): 'Christoph Herman'","[1223, 1229): 'Herman'",225,NNP
2,"[2227, 2242): 'Stephen Leonard'","[2227, 2234): 'Stephen'",399,NNP
3,"[2227, 2242): 'Stephen Leonard'","[2235, 2242): 'Leonard'",400,NNP


This set of nodes is the starting set for the transitive closure. Now we need to expand this set by following `appos` and `compound` edges. We can use Pandas to filter the edges of the graph down to just those edge types:

In [12]:
# Keep only the edges whose dependency type is `appos` or `compound`.
filtered_edges_df = edges_df[edges_df["dep"].isin(["appos", "compound"])]
filtered_edges_df

Unnamed: 0,id,head,dep
2,2,0,appos
8,8,0,appos
13,13,11,appos
15,15,13,appos
20,20,22,compound
...,...,...,...
631,631,621,appos
638,638,639,compound
658,658,660,compound
660,660,655,appos


Each step of the transitive closure involves following all the edges from our selected set of nodes, then adding the nodes we reached to the set.

We can implement a single step with a few basic Pandas operations. First we use `DataFrame.merge()` to follow edges and find some reachable nodes. Then we use `pandas.concat()` to add the new nodes to our selected set of nodes. Then we use `DataFrame.drop_duplicates()` to remove duplicates from the set:

In [13]:
# Start with the nodes that are inside person names.
selected_nodes_df = person_nodes_df.copy()

# Find all the nodes that are directly reachable from our selected set.
# The first merge matches each selected node's "id" against the "head" column
# of the filtered edges. The `suffixes` argument leaves the edge's own "id"
# column unsuffixed, so the projection keeps the id of the node at the other
# end of the edge. The second merge attaches that node's row from `nodes_df`.
reachable_nodes_df = (
    selected_nodes_df[["person", "id"]]
    .merge(filtered_edges_df, left_on="id", right_on="head", 
           suffixes=["_head", ""])[["person", "id"]]
    .merge(nodes_df)
)

# Merge the reachable nodes into the selected set
# The result is only displayed here; it is not assigned to a variable.
pd.concat([selected_nodes_df, reachable_nodes_df]).drop_duplicates()

Unnamed: 0,person,span,id,tag
0,"[1213, 1229): 'Christoph Herman'","[1213, 1222): 'Christoph'",224,NNP
1,"[1213, 1229): 'Christoph Herman'","[1223, 1229): 'Herman'",225,NNP
2,"[2227, 2242): 'Stephen Leonard'","[2227, 2234): 'Stephen'",399,NNP
3,"[2227, 2242): 'Stephen Leonard'","[2235, 2242): 'Leonard'",400,NNP
1,"[1213, 1229): 'Christoph Herman'","[1231, 1234): 'SVP'",227,NNP
3,"[2227, 2242): 'Stephen Leonard'","[2252, 2259): 'Manager'",403,NNP


Now we just need to repeat this step for as long as the set of selected nodes keeps growing:

In [14]:
# Start with the nodes that are inside person names.
selected_nodes_df = person_nodes_df.copy()

# Transitive closure. 
# Keep going as long as the previous round added nodes to our set.
# Starting the counter at 0 makes the loop body run at least once whenever the
# starting set is non-empty.
previous_num_nodes = 0
while len(selected_nodes_df.index) > previous_num_nodes:

    # Find all the nodes that are directly reachable from our selected set.
    reachable_nodes_df = (
        selected_nodes_df[["person", "id"]]
        .merge(filtered_edges_df, left_on="id", right_on="head", 
               suffixes=["_head", ""])[["person", "id"]]
        .merge(nodes_df)
    )

    # Merge the reachable nodes into the selected set
    # Record the size before merging so that the loop condition can tell
    # whether this round added any rows.
    previous_num_nodes = len(selected_nodes_df.index)
    selected_nodes_df = (pd.concat([selected_nodes_df, reachable_nodes_df])
                      .drop_duplicates())

selected_nodes_df

Unnamed: 0,person,span,id,tag
0,"[1213, 1229): 'Christoph Herman'","[1213, 1222): 'Christoph'",224,NNP
1,"[1213, 1229): 'Christoph Herman'","[1223, 1229): 'Herman'",225,NNP
2,"[2227, 2242): 'Stephen Leonard'","[2227, 2234): 'Stephen'",399,NNP
3,"[2227, 2242): 'Stephen Leonard'","[2235, 2242): 'Leonard'",400,NNP
1,"[1213, 1229): 'Christoph Herman'","[1231, 1234): 'SVP'",227,NNP
3,"[2227, 2242): 'Stephen Leonard'","[2252, 2259): 'Manager'",403,NNP
4,"[2227, 2242): 'Stephen Leonard'","[2244, 2251): 'General'",402,NNP
5,"[2227, 2242): 'Stephen Leonard'","[2275, 2282): 'Systems'",407,NNP
6,"[2227, 2242): 'Stephen Leonard'","[2261, 2264): 'IBM'",405,NNP
7,"[2227, 2242): 'Stephen Leonard'","[2265, 2274): 'Cognitive'",406,NNP


Now we have the set of all nodes that are reachable from one of our selected person names by traversing `appos` and `compound` edges. If we filter out the nodes we started with, we should get the nodes for the tokens of the job titles:

In [15]:
# Drop the rows whose token id belongs to one of the person names we started
# from; what remains are the nodes reached by the transitive closure.
title_nodes_df = \
    selected_nodes_df[~selected_nodes_df["id"].isin(person_nodes_df["id"])]
title_nodes_df

Unnamed: 0,person,span,id,tag
1,"[1213, 1229): 'Christoph Herman'","[1231, 1234): 'SVP'",227,NNP
3,"[2227, 2242): 'Stephen Leonard'","[2252, 2259): 'Manager'",403,NNP
4,"[2227, 2242): 'Stephen Leonard'","[2244, 2251): 'General'",402,NNP
5,"[2227, 2242): 'Stephen Leonard'","[2275, 2282): 'Systems'",407,NNP
6,"[2227, 2242): 'Stephen Leonard'","[2261, 2264): 'IBM'",405,NNP
7,"[2227, 2242): 'Stephen Leonard'","[2265, 2274): 'Cognitive'",406,NNP


Now we just need to turn these sets of nodes into spans. We can use Pandas' grouping
and aggregation to do so, taking advantage of the fact that the "addition" operation 
for spans is defined as:
```
span1 + span2 = smallest span that contains both span1 and span2
```

In [16]:
# For each person, add up the spans of that person's title tokens to get one
# span per title, then rename the aggregated column to "title".
titles_df = (
    title_nodes_df
    .groupby("person")
    .aggregate({"span": "sum"})
    .reset_index()
    .rename(columns={"span": "title"})
)

# As of Pandas 1.2.1, groupby() over extension types downgrades them to object 
# dtype. Cast back up to the extension type.
titles_df["person"] = titles_df["person"].astype(tp.SpanDtype())

titles_df

Unnamed: 0,person,title
0,"[1213, 1229): 'Christoph Herman'","[1231, 1234): 'SVP'"
1,"[2227, 2242): 'Stephen Leonard'","[2244, 2282): 'General Manager, IBM Cognitive ..."


Now we have found a job title for each of the executive names in this document.

## Tying it all together

Let's put all of the code we've presented so far into a single function.

In [17]:
def find_titles_of_persons(persons_df: pd.DataFrame,
                           spacy_language_model,
                           edge_types=("appos", "compound")) -> pd.DataFrame:
    """
    Find the job titles attached to person names in a document.

    Parses the document with SpaCy, then computes a transitive closure over
    the parse tree: starting from the tokens of each person name, repeatedly
    follow edges of the selected types from a head token to its children. The
    tokens reached that are not part of any person name are combined into one
    title span per person.

    :param persons_df: DataFrame containing information about person names.
     Must have a "person" column of span data. All spans are expected to be
     over the same document, because the document text is read from the
     column's span array.
    :param spacy_language_model: Loaded SpaCy language model with dependency 
     parsing support.
    :param edge_types: Dependency labels of the parse tree edges to follow.
     Defaults to `appos` and `compound` edges.
    
    :returns: A DataFrame with a row for every title identified and two columns,
     "person" and "title".
    """
    if len(persons_df.index) == 0:
        # Special case: Empty input --> empty output
        return pd.DataFrame({
            "person": pd.Series([], dtype=tp.SpanDtype()),
            "title": pd.Series([], dtype=tp.SpanDtype()),
        })

    # Retrieve the document text from the person spans.
    doc_text = persons_df["person"].array.document_text

    # Run dependency parsing on the text and convert the parse to a DataFrame.
    all_token_features = tp.io.spacy.make_tokens_and_features(doc_text, spacy_language_model)

    # Split the parse tree into nodes and edges and filter the edges.
    nodes_df = all_token_features[["id", "span", "tag"]].reset_index(drop=True)
    edges_df = all_token_features[["id", "head", "dep"]].reset_index(drop=True)
    filtered_edges_df = edges_df[edges_df["dep"].isin(list(edge_types))]

    # Transitive closure.
    # Start with the nodes that are inside person names.
    person_nodes_df = (
        tp.spanner.overlap_join(persons_df["person"], nodes_df["span"],
                                "person", "span")
        .merge(nodes_df)
    )
    selected_nodes_df = person_nodes_df.copy()

    # Keep going as long as the previous round added nodes to our set.
    previous_num_nodes = 0
    while len(selected_nodes_df.index) > previous_num_nodes:
        # Find all the nodes that are directly reachable from our selected set.
        # The edge's own "id" column stays unsuffixed, so the projection keeps
        # the id of the child node at the other end of the edge.
        reachable_nodes_df = (
            selected_nodes_df[["person", "id"]]
            .merge(filtered_edges_df, left_on="id", right_on="head",
                   suffixes=["_head", ""])[["person", "id"]]
            .merge(nodes_df)
        )

        # Merge the reachable nodes into the selected set
        previous_num_nodes = len(selected_nodes_df.index)
        selected_nodes_df = (pd.concat([selected_nodes_df, reachable_nodes_df])
                             .drop_duplicates())

    # Find nodes that are reachable from the person names but aren't part of a name.
    title_nodes_df = \
        selected_nodes_df[~selected_nodes_df["id"].isin(person_nodes_df["id"])]

    # Aggregate the nodes of each title to find the span of the entire title.
    titles_df = (
        title_nodes_df
        .groupby("person")
        .aggregate({"span": "sum"})
        .reset_index()
        .rename(columns={"span": "title"})
    )

    # As of Pandas 1.2.1, groupby() over extension types downgrades them to object 
    # dtype. Cast back up to the extension type.
    titles_df["person"] = titles_df["person"].astype(tp.SpanDtype())

    return titles_df
    

If we combine this `find_titles_of_persons()` function with the `find_persons_quoted_by_name()` function we created in our previous post, we can build a data mining pipeline. This pipeline finds the names and titles of executives in corporate press releases. Here's the output that we get if we pass a year's worth of IBM press releases through this pipeline:

In [18]:
# Don't include this cell in the blog post.

# Read the list of press release URLs from a file, skipping blank lines and
# lines that start with "#".
with open("ibm_press_releases.txt", "r") as f:
    lines = [raw_line.strip() for raw_line in f]
    ibm_press_release_urls = [
        line for line in lines
        if line and not line.startswith("#")
    ]

In [22]:
# Run both stages of the pipeline over every press release and collect the
# per-document results.
to_concat = []
for url in ibm_press_release_urls:
    # Use loop-local names here so that this cell does not overwrite the
    # single-document `persons_df` and `titles_df` that earlier cells created
    # and displayed. That keeps the earlier cells re-runnable.
    url_persons_df = find_persons_quoted_by_name(url, api_key, service_url)
    url_titles_df = find_titles_of_persons(url_persons_df, spacy_language_model)
    url_titles_df["url"] = url
    to_concat.append(url_titles_df)

# Stack the per-document results and renumber the rows from zero.
all_titles_df = pd.concat(to_concat).reset_index(drop=True)
all_titles_df

Unnamed: 0,person,title,url
0,"[1977, 1991): 'Wendi Whitmore'","[1993, 2040): 'Vice President, IBM X-Force Thr...",https://newsroom.ibm.com/2020-02-11-IBM-X-Forc...
1,"[1281, 1292): 'Rob DiCicco'","[1294, 1348): 'PharmD, Deputy Chief Health Off...",https://newsroom.ibm.com/2020-02-18-IBM-Study-...
2,"[1213, 1229): 'Christoph Herman'","[1231, 1234): 'SVP'",https://newsroom.ibm.com/2020-02-19-IBM-Power-...
3,"[2227, 2242): 'Stephen Leonard'","[2244, 2282): 'General Manager, IBM Cognitive ...",https://newsroom.ibm.com/2020-02-19-IBM-Power-...
4,"[2289, 2297): 'Bob Lord'","[2303, 2324): 'Senior Vice President'",https://newsroom.ibm.com/2020-02-26-2020-Call-...
...,...,...,...
267,"[3114, 3124): 'Mike Doran'","[3126, 3150): 'Worldwide Sales Director'",https://newsroom.ibm.com/2021-01-25-OVHcloud-t...
268,"[3155, 3169): 'Howard Boville'","[3171, 3210): 'Senior Vice President, IBM Hybr...",https://newsroom.ibm.com/2021-01-26-Luminor-Ba...
269,"[3114, 3126): 'Samuel Brack'","[3127, 3137): 'Co-Founder'",https://newsroom.ibm.com/2021-01-26-DIA-Levera...
270,"[3509, 3523): 'Hillery Hunter'","[3525, 3556): 'IBM Fellow, VP & CTO, IBM Cloud'",https://newsroom.ibm.com/2021-01-26-DIA-Levera...


Our pipeline has processed 191 IBM press releases, and it found the names and titles of 272 executives!

To find out more about the extensions to Pandas that made this possible, check out Text Extensions for Pandas [here](https://ibm.biz/text-extensions-for-pandas).


In [25]:
# Display the last 50 rows of the results.
all_titles_df[-50:]

Unnamed: 0,person,title,url
222,"[2035, 2048): 'Vitaly Tsivin'","[2050, 2086): 'Executive Vice President of Bus...",https://newsroom.ibm.com/2020-12-02-IBM-Positi...
223,"[1288, 1304): 'Daniel Hernandez'","[1314, 1339): 'manager, Data and AI, IBM'",https://newsroom.ibm.com/2020-12-02-IBM-Named-...
224,"[1838, 1849): 'Curren Katz'","[1851, 1896): 'Director of Data Science R&D, H...",https://newsroom.ibm.com/2020-12-02-IBM-Named-...
225,"[2476, 2486): 'Ritu Jyoti'","[2488, 2523): 'program vice president, AI rese...",https://newsroom.ibm.com/2020-12-02-IBM-Named-...
226,"[813, 825): 'Daniel Stumm'","[833, 837): 'Head'",https://newsroom.ibm.com/2020-12-03-IBM-Helps-...
227,"[2802, 2816): 'Neil McCormack'","[2818, 2866): 'managing partner - Geo Leader, ...",https://newsroom.ibm.com/2020-12-03-IBM-Helps-...
228,"[3453, 3465): 'Luigi Menzio'","[3467, 3505): 'Services Executive Partner, IBM...",https://newsroom.ibm.com/2020-12-03-Piaggio-Gr...
229,"[1703, 1714): 'Gene Seroka'","[1684, 1702): 'Executive Director'",https://newsroom.ibm.com/2020-12-07-IBM-Works-...
230,"[2164, 2180): 'Daniel Hernandez'","[2182, 2217): 'General Manager of Data and AI,...",https://newsroom.ibm.com/2020-12-09-IBM-Launch...
231,"[2933, 2945): 'André Tamers'","[2947, 2952): 'owner'",https://newsroom.ibm.com/2020-12-10-eProvenanc...
