# Demo for PyData Montreal 2021, part 2

Based on `Market_Intelligence_Part1.ipynb`.

In [1]:
# Make sure the API credentials are present.
# Both values are needed further down: the key is used to authenticate the
# Watson client and the URL is handed to the client's set_service_url().
import os
if "IBM_API_KEY" not in os.environ:
    raise ValueError("IBM_API_KEY environment variable not set. Please create "
                     "a free instance of IBM Watson Natural Language Understanding "
                     "(see https://www.ibm.com/cloud/watson-natural-language-understanding) "
                     "and set the IBM_API_KEY environment variable to your instance's "
                     "API key value.")
# Fail here, with a clear message, instead of passing None as the service URL
# to the client later on.
if "IBM_SERVICE_URL" not in os.environ:
    raise ValueError("IBM_SERVICE_URL environment variable not set. Please set "
                     "the IBM_SERVICE_URL environment variable to the URL of your "
                     "IBM Watson Natural Language Understanding instance.")
api_key = os.environ["IBM_API_KEY"]
service_url = os.environ["IBM_SERVICE_URL"]

# GitHub notebook gists will be this wide: ------------------>
# Screenshots of this notebook should be this wide: ----------------------------->

In [2]:
# In the blog post, this will be a GitHub gist.
# See https://gist.github.com/frreiss/038ac63ef20eed323a5637f9ddb2de8d

import pandas as pd
import text_extensions_for_pandas as tp
import ibm_watson
import ibm_watson.natural_language_understanding_v1 as nlu
import ibm_cloud_sdk_core

def find_persons_quoted_by_name(doc_url, api_key, service_url) -> pd.DataFrame:
    """
    Find the people who are quoted by name in the document at a URL.

    Runs the "entities" and "semantic_roles" models of Watson Natural Language
    Understanding over the document, then keeps the person mentions that fall
    inside the subject of a "say" action.

    :param doc_url: URL of the document to analyze; passed to the service as
        the ``url`` argument of ``analyze()``.
    :param api_key: API key used to build the ``IAMAuthenticator``.
    :param service_url: Service endpoint, passed to ``set_service_url()``.

    :returns: A DataFrame with two columns: "person" (the span of the person
        mention) and "url" (always ``doc_url``). One row per matching
        <subject, person> pair; the same person may appear more than once.
    """
    # Ask Watson Natural Language Understanding to run its "semantic_roles"
    # and "entities" models.
    natural_language_understanding = ibm_watson.NaturalLanguageUnderstandingV1(
        version="2021-01-01",
        authenticator=ibm_cloud_sdk_core.authenticators.IAMAuthenticator(api_key)
    )
    natural_language_understanding.set_service_url(service_url)
    nlu_results = natural_language_understanding.analyze(
        url=doc_url,
        return_analyzed_text=True,
        features=nlu.Features(
            entities=nlu.EntitiesOptions(mentions=True),
            semantic_roles=nlu.SemanticRolesOptions())).get_result()

    # Convert the output of Watson Natural Language Understanding to DataFrames.
    dataframes = tp.io.watson.nlu.parse_response(nlu_results)
    entity_mentions_df = dataframes["entity_mentions"]
    semantic_roles_df = dataframes["semantic_roles"]

    # Extract mentions of person names
    person_mentions_df = entity_mentions_df[entity_mentions_df["type"] == "Person"]

    # Extract instances of subjects that made statements
    quotes_df = semantic_roles_df[semantic_roles_df["action.normalized"] == "say"]

    # Retrieve the full document text from the entity mentions output.
    doc_text = entity_mentions_df["span"].array.document_text

    # Filter down to just the column we're interested in. Rows without a
    # subject string are dropped because they cannot be located in the text.
    subjects_df = quotes_df[["subject.text"]].dropna().reset_index(drop=True)

    # Use str.find() to locate where the strings in "subject.text" begin.
    # find() returns -1 instead of raising when a string is not present, so a
    # single unmatched subject no longer aborts the whole document.
    # NOTE: only the first occurrence is used, so a subject string that is
    # repeated in the document always maps to its first position.
    subjects_df["begin"] = pd.Series(
        [doc_text.find(s) for s in subjects_df["subject.text"]], dtype=int)
    subjects_df = subjects_df[subjects_df["begin"] >= 0].reset_index(drop=True)

    # Compute end offsets and wrap the <begin, end, text> triples in a SpanArray column
    subjects_df["end"] = subjects_df["begin"] + subjects_df["subject.text"].str.len()
    subjects_df["span"] = tp.SpanArray(doc_text, subjects_df["begin"], subjects_df["end"])

    # Align subjects with person names
    execs_df = tp.spanner.contain_join(subjects_df["span"],
                                       person_mentions_df["span"],
                                       "subject", "person")
    # Add on the document URL.
    execs_df["url"] = doc_url
    return execs_df[["person", "url"]]
    

In [3]:
# Don't include this cell in the blog post.

# Read the list of press release URLs, one per line. Blank lines and lines
# starting with "#" are ignored.
with open("ibm_press_releases.txt", "r") as f:
    lines = list(map(str.strip, f))
    ibm_press_release_urls = [
        entry for entry in lines
        if entry and not entry.startswith("#")
    ]

In [4]:
# Analyze every press release and stack the per-document results into a
# single DataFrame (each document's rows keep their own index).
per_document_results = []
for url in ibm_press_release_urls:
    per_document_results.append(
        find_persons_quoted_by_name(url, api_key, service_url))
executive_names = pd.concat(per_document_results)
executive_names

Unnamed: 0,person,url
0,"[1977, 1991): 'Wendi Whitmore'",https://newsroom.ibm.com/2020-02-11-IBM-X-Forc...
0,"[1281, 1292): 'Rob DiCicco'",https://newsroom.ibm.com/2020-02-18-IBM-Study-...
0,"[1213, 1229): 'Christoph Herman'",https://newsroom.ibm.com/2020-02-19-IBM-Power-...
1,"[2227, 2242): 'Stephen Leonard'",https://newsroom.ibm.com/2020-02-19-IBM-Power-...
0,"[2289, 2297): 'Bob Lord'",https://newsroom.ibm.com/2020-02-26-2020-Call-...
...,...,...
0,"[3114, 3124): 'Mike Doran'",https://newsroom.ibm.com/2021-01-25-OVHcloud-t...
0,"[3155, 3169): 'Howard Boville'",https://newsroom.ibm.com/2021-01-26-Luminor-Ba...
0,"[3114, 3126): 'Samuel Brack'",https://newsroom.ibm.com/2021-01-26-DIA-Levera...
1,"[3509, 3523): 'Hillery Hunter'",https://newsroom.ibm.com/2021-01-26-DIA-Levera...


In [8]:
# Show the extension array behind the first row's "person" value.
executive_names["person"].iloc[[0]].array

Unnamed: 0,begin,end,covered_text
0,1977,1991,Wendi Whitmore
