# Implementing Information Extraction

## 1. Installing packages 


In [1]:
!pip install smart_open
!pip install networkx
!pip install pandas
!pip install pyvis
!pip install spacy
!pip install ipywidgets

Collecting smart_open
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Downloading smart_open-7.1.0-py3-none-any.whl (61 kB)
Installing collected packages: smart_open
Successfully installed smart_open-7.1.0
Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-4.0.1-py3-none-any.whl.metadata (8.2 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jsonpickle-4.0.1-py3-none-any.whl (46 kB)
Installing collected packages: jsonpickle, pyvis
Successfully installed jsonpickle-4.0.1 pyvis-0.3.2
Collecting spacy
  Downloading spacy-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy

In [2]:
%matplotlib inline

import json
import requests
import uuid

import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import boto3
import smart_open

from time import sleep
from matplotlib import cm, colors
from spacy import displacy
from collections import Counter
from pyvis.network import Network

Matplotlib is building the font cache; this may take a moment.


## 2. Writing the documents to Amazon S3

In [3]:
# S3 bucket used for both the job's input file and its output location.
# NOTE(review): hardcoded bucket name — presumably specific to this lab environment; confirm before reuse.
bucket = "c144486a3735941l8877079t1w611433154768-labbucket-5oyy6ooqq6sl"

In [4]:
# Client and session information
session = boto3.Session()
s3_client = session.client(service_name="s3")

# Constants for the S3 bucket and the input data file
filename = "sample_finance_dataset.txt"
local_data_path = "../data/" + filename
input_data_s3_path = f's3://{bucket}/' + filename
output_data_s3_path = f's3://{bucket}/'

# Upload the local file to S3
s3_client.upload_file(local_data_path, bucket, filename)

# Load the documents locally for later analysis (one document per line).
# The encoding is given explicitly because the text contains non-ASCII
# characters (curly quotes, dashes); relying on the platform default could
# mis-decode them.
with open(local_data_path, "r", encoding="utf-8") as fi:
    raw_texts = [line.strip() for line in fi]

## 3. Starting an asynchronous events detection job using the SDK

In [5]:
# Amazon Comprehend client, created from the same boto3 session as the S3 client
comprehend_client = session.client(service_name="comprehend")

# IAM role with access to Amazon Comprehend and the specified S3 bucket
# NOTE(review): the role ARN (including the account ID) is hardcoded —
# presumably specific to this lab environment; confirm before reuse.
job_data_access_role = 'arn:aws:iam::611433154768:role/service-role/c144486a3735941l8877079t1w-ComprehendDataAccessRole-7rjprp7Y1J33'

# Other job parameters
# The input format is passed to the job as 'InputFormat'; the local copy of the
# data is likewise read one document per line.
input_data_format = 'ONE_DOC_PER_LINE'
# A fresh UUID is embedded in the job name so each run gets its own name
job_uuid = uuid.uuid1()
job_name = f"events-job-{job_uuid}"

In [6]:
# Event types the detection job is asked to look for (passed as TargetEventTypes)
event_types = [
    "BANKRUPTCY",
    "EMPLOYMENT",
    "CORPORATE_ACQUISITION",
    "INVESTMENT_GENERAL",
    "CORPORATE_MERGER",
    "IPO",
    "RIGHTS_ISSUE",
    "SECONDARY_OFFERING",
    "SHELF_OFFERING",
    "TENDER_OFFERING",
    "STOCK_SPLIT",
]

In [7]:
# Where the job reads its documents from, and where it writes its results
job_input_config = {
    'S3Uri': input_data_s3_path,
    'InputFormat': input_data_format,
}
job_output_config = {'S3Uri': output_data_s3_path}

# Submit the asynchronous events detection job
response = comprehend_client.start_events_detection_job(
    InputDataConfig=job_input_config,
    OutputDataConfig=job_output_config,
    DataAccessRoleArn=job_data_access_role,
    JobName=job_name,
    LanguageCode='en',
    TargetEventTypes=event_types,
)

# Keep the job ID so the job's status can be queried afterwards
events_job_id = response['JobId']

In [8]:
# Poll the job until it completes, ends unsuccessfully, or the timeout elapses
poll_interval_seconds = 60
timeout_minutes = 30
waited = 0

# Get the current job status
job = comprehend_client.describe_events_detection_job(JobId=events_job_id)
job_properties = job['EventsDetectionJobProperties']

while job_properties['JobStatus'] != 'COMPLETED':
    # A failed or stopped job never reaches COMPLETED; stop immediately
    # instead of polling until the timeout.
    if job_properties['JobStatus'] in ('FAILED', 'STOP_REQUESTED', 'STOPPED'):
        raise RuntimeError(
            "Events detection job %s ended with status %s: %s"
            % (events_job_id,
               job_properties['JobStatus'],
               job_properties.get('Message', ''))
        )
    # Raise explicitly rather than assert: assertions are stripped under -O
    if waited >= timeout_minutes * 60:
        raise TimeoutError("Job timed out after %d seconds." % waited)

    sleep(poll_interval_seconds)
    waited += poll_interval_seconds
    print('.', end='')
    job = comprehend_client.describe_events_detection_job(JobId=events_job_id)
    job_properties = job['EventsDetectionJobProperties']

print('Ready')

.........Ready


In [9]:
# The output filename is the input filename + ".out"
output_data_s3_file = job['EventsDetectionJobProperties']['OutputDataConfig']['S3Uri'] + filename + '.out'

# Load the output into a list with one parsed JSON result per non-blank line.
# Lines read from a file keep their trailing newline, so a plain truthiness
# check never filters anything; strip first so blank lines are skipped
# instead of crashing json.loads.
with smart_open.open(output_data_s3_file) as fi:
    results = [json.loads(line) for line in fi if line.strip()]

## 4. Analyzing the Amazon Comprehend Events output

> Amazon.com, Inc. (NASDAQ: AMZN) today announced financial results for its third quarter ended September 30, 2017.

> Operating cash flow increased 14% to \\$17.1 billion for the trailing twelve months, compared with \\$15.0 billion for the trailing twelve months ended September 30, 2016. Free cash flow decreased to \\$8.1 billion for the trailing twelve months, compared with \\$9.0 billion for the trailing twelve months ended September 30, 2016. Free cash flow less lease principal repayments decreased to \\$3.5 billion for the trailing twelve months, compared with \\$5.3 billion for the trailing twelve months ended September 30, 2016. Free cash flow less finance lease principal repayments and assets acquired under capital leases decreased to an outflow of \\$1.0 billion for the trailing twelve months, compared with an inflow of \\$3.8 billion for the trailing twelve months ended September 30, 2016.

> Common shares outstanding plus shares underlying stock-based awards totaled 503 million on September 30, 2017, compared with 496 million one year ago.

> Net sales increased 34% to \\$43.7 billion in the third quarter, compared with \\$32.7 billion in third quarter 2016. Net sales includes \\$1.3 billion from Whole Foods Market, which Amazon acquired on August 28, 2017. Excluding Whole Foods Market and the \\$124 million favorable impact from year-over-year changes in foreign exchange rates throughout the quarter, net sales increased 29% compared with third quarter 2016.

> Operating income decreased 40% to \\$347 million in the third quarter, compared with operating income of \\$575 million in third quarter 2016. Operating income includes income of \\$21 million from Whole Foods Market.

> Net income was \\$256 million in the third quarter, or \\$0.52 per diluted share, compared with net income of \\$252 million, or \\$0.52 per diluted share, in third quarter 2016.

> “In the last month alone, we’ve launched five new Alexa-enabled devices, introduced Alexa in India, announced integration with BMW, surpassed 25,000 skills, integrated Alexa with Sonos speakers, taught Alexa to distinguish between two voices, and more. Because Alexa’s brain is in the AWS cloud, her new abilities are available to all Echo customers, not just those who buy a new device,” said Jeff Bezos, Amazon founder and CEO. “And it’s working — customers have purchased tens of millions of Alexa-enabled devices, given Echo devices over 100,000 5-star reviews, and active customers are up more than 5x since the same time last year. With thousands of developers and hardware makers building new Alexa skills and devices, the Alexa experience will continue to get even better.”

In [10]:
# Use the first results document for analysis
result = results[0]
raw_text = raw_texts[0]

In [11]:
# Display the raw text of the selected document
raw_text

"Amazon (NASDAQ:AMZN) and Whole Foods Market, Inc. (NASDAQ:WFM) today announced that they have entered into a definitive merger agreement under which Amazon will acquire Whole Foods Market for $42 per share in an all-cash transaction valued at approximately $13.7 billion, including Whole Foods Market’s net debt.  “Millions of people love Whole Foods Market because they offer the best natural and organic foods, and they make it fun to eat healthy,” said Jeff Bezos, Amazon founder and CEO. “Whole Foods Market has been satisfying, delighting and nourishing customers for nearly four decades – they’re doing an amazing job and we want that to continue.”  “This partnership presents an opportunity to maximize value for Whole Foods Market’s shareholders, while at the same time extending our mission and bringing the highest quality, experience, convenience and innovation to our customers,” said John Mackey, Whole Foods Market co-founder and CEO.  Whole Foods Market will continue to operate store

In [12]:
# Display the full parsed job output for the selected document (large output)
result

{'Entities': [{'Mentions': [{'BeginOffset': 0,
     'EndOffset': 6,
     'Score': 0.999501,
     'Text': 'Amazon',
     'Type': 'ORGANIZATION',
     'GroupScore': 1.0},
    {'BeginOffset': 149,
     'EndOffset': 155,
     'Score': 0.999615,
     'Text': 'Amazon',
     'Type': 'ORGANIZATION',
     'GroupScore': 0.9936},
    {'BeginOffset': 468,
     'EndOffset': 474,
     'Score': 0.998912,
     'Text': 'Amazon',
     'Type': 'ORGANIZATION',
     'GroupScore': 0.584697}]},
  {'Mentions': [{'BeginOffset': 8,
     'EndOffset': 19,
     'Score': 0.990119,
     'Text': 'NASDAQ:AMZN',
     'Type': 'STOCK_CODE',
     'GroupScore': 1.0}]},
  {'Mentions': [{'BeginOffset': 25,
     'EndOffset': 49,
     'Score': 0.999654,
     'Text': 'Whole Foods Market, Inc.',
     'Type': 'ORGANIZATION',
     'GroupScore': 1.0},
    {'BeginOffset': 169,
     'EndOffset': 187,
     'Score': 0.999668,
     'Text': 'Whole Foods Market',
     'Type': 'ORGANIZATION',
     'GroupScore': 0.990907},
    {'BeginOffset

In [13]:
# Triggers of the event at index 1: text spans with offsets, type and scores
result['Events'][1]['Triggers']

[{'BeginOffset': 161,
  'EndOffset': 168,
  'Score': 0.999958,
  'Text': 'acquire',
  'Type': 'CORPORATE_ACQUISITION',
  'GroupScore': 1.0},
 {'BeginOffset': 221,
  'EndOffset': 232,
  'Score': 0.931136,
  'Text': 'transaction',
  'Type': 'CORPORATE_ACQUISITION',
  'GroupScore': 0.999985}]

In [14]:
# Arguments of the same event: each refers to an entity by EntityIndex and assigns it a Role
result['Events'][1]['Arguments']

[{'EntityIndex': 5, 'Role': 'AMOUNT', 'Score': 0.99873},
 {'EntityIndex': 4, 'Role': 'DATE', 'Score': 0.994578},
 {'EntityIndex': 2, 'Role': 'INVESTEE', 'Score': 0.999668},
 {'EntityIndex': 0, 'Role': 'INVESTOR', 'Score': 0.999615}]

In [15]:
# All mentions grouped under the entity at index 0
result['Entities'][0]['Mentions']

[{'BeginOffset': 0,
  'EndOffset': 6,
  'Score': 0.999501,
  'Text': 'Amazon',
  'Type': 'ORGANIZATION',
  'GroupScore': 1.0},
 {'BeginOffset': 149,
  'EndOffset': 155,
  'Score': 0.999615,
  'Text': 'Amazon',
  'Type': 'ORGANIZATION',
  'GroupScore': 0.9936},
 {'BeginOffset': 468,
  'EndOffset': 474,
  'Score': 0.998912,
  'Text': 'Amazon',
  'Type': 'ORGANIZATION',
  'GroupScore': 0.584697}]

In [16]:
def _to_displacy_span(annotation):
    """Map a mention or trigger dict onto the start/end/label keys used by displaCy."""
    return {
        'start': annotation['BeginOffset'],
        'end': annotation['EndOffset'],
        'label': annotation['Type'],
    }


# Convert the output to the displaCy format: one span per entity mention ...
entities = [
    _to_displacy_span(mention)
    for entity in result['Entities']
    for mention in entity['Mentions']
]

# ... and one span per event trigger
triggers = [
    _to_displacy_span(trigger)
    for event in result['Events']
    for trigger in event['Triggers']
]

# Spans need to be sorted for displaCy to process them correctly.
# list.sort is stable, so spans sharing a start offset keep their
# entities-before-triggers order.
spans = entities + triggers
spans.sort(key=lambda span: span['start'])
tags = [span['label'] for span in spans]

output = [{"text": raw_text, "ents": spans, "title": None, "settings": {}}]

In [17]:
# Miscellaneous objects for presentation purposes.
# `tags` holds one label per span, so it contains many duplicates; sample the
# colormap once per *distinct* label (in first-seen order) so the labels are
# spread evenly across the palette.
unique_tags = list(dict.fromkeys(tags))

# plt.get_cmap replaces the deprecated cm.get_cmap; request at least one colour
# so an empty tag list does not ask for a zero-entry colormap.
spectral = plt.get_cmap("Spectral", max(len(unique_tags), 1))
tag_colors = [colors.rgb2hex(spectral(i)) for i in range(len(unique_tags))]
color_map = dict(zip(unique_tags, tag_colors))

  spectral = cm.get_cmap("Spectral", len(tags))


In [18]:
# Note that only entities participating in events are shown
# manual=True because `output` is already a list of plain dicts
# (text / ents / title) built above, coloured per label via color_map.
displacy.render(output, style="ent", options={"colors": color_map}, manual=True)

In [19]:
def _mention_records(doc_result):
    """Yield one row per entity mention, tagged with the position of its entity."""
    for entity_idx, entity in enumerate(doc_result['Entities']):
        for mention in entity['Mentions']:
            yield {"EntityIndex": entity_idx, **mention}


def _event_records(doc_result):
    """Yield one row per (argument, trigger) pair of every event.

    Trigger keys are unpacked last, so they override same-named argument keys
    (e.g. 'Score').
    """
    for event_idx, event in enumerate(doc_result['Events']):
        for argument in event['Arguments']:
            for trigger in event['Triggers']:
                yield {"EventIndex": event_idx, **argument, **trigger}


# Create the entity DataFrame. Entity indices must be explicitly created.
entities_df = pd.DataFrame(list(_mention_records(result)))

# Create the events DataFrame (event indices are created explicitly as well)
# and join it with the entity table into one flat data structure.
events_df = pd.DataFrame(list(_event_records(result))).merge(
    entities_df,
    on="EntityIndex",
    suffixes=('Event', 'Entity'),
)

In [20]:
# Display the flattened events/entities table
events_df

Unnamed: 0,EventIndex,EntityIndex,Role,ScoreEvent,BeginOffsetEvent,EndOffsetEvent,TextEvent,TypeEvent,GroupScoreEvent,BeginOffsetEntity,EndOffsetEntity,ScoreEntity,TextEntity,TypeEntity,GroupScoreEntity
0,0,4,DATE,0.999611,120,126,merger,CORPORATE_MERGER,1.000000,63,68,0.994578,today,DATE,1.000000
1,0,4,DATE,0.999829,662,673,partnership,CORPORATE_MERGER,0.999969,63,68,0.994578,today,DATE,1.000000
2,0,4,DATE,0.992193,1237,1248,transaction,CORPORATE_MERGER,0.509699,63,68,0.994578,today,DATE,1.000000
3,0,4,DATE,0.998367,1403,1414,transaction,CORPORATE_MERGER,0.336708,63,68,0.994578,today,DATE,1.000000
4,1,4,DATE,0.999958,161,168,acquire,CORPORATE_ACQUISITION,1.000000,63,68,0.994578,today,DATE,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,1,0,INVESTOR,0.931136,221,232,transaction,CORPORATE_ACQUISITION,0.999985,468,474,0.998912,Amazon,ORGANIZATION,0.584697
133,2,6,EMPLOYEE,0.999938,1116,1122,remain,EMPLOYMENT,1.000000,897,908,0.999606,John Mackey,PERSON,1.000000
134,2,6,EMPLOYEE,0.999938,1116,1122,remain,EMPLOYMENT,1.000000,1099,1110,0.999699,John Mackey,PERSON,0.977111
135,2,7,EMPLOYEE_TITLE,0.999938,1116,1122,remain,EMPLOYMENT,1.000000,944,947,0.997071,CEO,PERSON_TITLE,1.000000


In [21]:
def format_compact_events(x):
    """Collapse groups of mentions and triggers into a single set.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows belonging to one event. Must provide the columns 'TypeEvent',
        'TextEvent', 'Role' and 'TextEntity'.

    Returns
    -------
    dict
        'EventType' (the most frequent value of 'TypeEvent'), 'Triggers'
        (set of 'TextEvent' values), plus one key per distinct role mapping
        to the set of 'TextEntity' values carrying that role. Role keys
        appear in order of first occurrence.
    """
    # Take the most commonly occurring EventType and the set of triggers
    d = {"EventType": Counter(x['TypeEvent']).most_common(1)[0][0],
         "Triggers": set(x['TextEvent'])}
    # For each argument Role, collect the set of mentions in the group.
    # Iterate over distinct roles only: looping over every row would rebuild
    # the same mask and set once per row.
    for role in x['Role'].unique():
        d[role] = set(x.loc[x['Role'] == role, 'TextEntity'])
    return d

# Group data by EventIndex and format: one compact record per event,
# visited in sorted EventIndex order
compact_event_records = [
    format_compact_events(event_rows)
    for _, event_rows in events_df.groupby("EventIndex")
]

# Roles missing from an event become empty strings instead of NaN
event_analysis_df = pd.DataFrame(compact_event_records).fillna('')

In [22]:
# Display the per-event summary: event type, trigger set and one column per role
event_analysis_df

Unnamed: 0,EventType,Triggers,DATE,PARTICIPANT,INVESTEE,AMOUNT,INVESTOR,EMPLOYER,EMPLOYEE,EMPLOYEE_TITLE
0,CORPORATE_MERGER,"{merger, transaction, partnership}","{today, during the second half of 2017}","{we, NASDAQ:AMZN, NASDAQ:WFM, Whole Foods Mark...",,,,,,
1,CORPORATE_ACQUISITION,"{acquire, transaction}",{today},,"{they, Whole Foods Market, we, Whole Foods Mar...","{$13.7 billion, $42}",{Amazon},,,
2,EMPLOYMENT,{remain},,,,,,"{they, Whole Foods Market, we, Whole Foods Mar...",{John Mackey},{CEO}


## 5. Graphing event semantics

In [23]:
# Entities are associated with events by group, not individual mention
# For simplicity, assume the canonical mention is the longest one
def get_canonical_mention(mentions):
    """Return the longest mention of an entity group, wrapped in a list.

    Parameters
    ----------
    mentions : list of dict
        Mention dicts, each with a 'Text' key.

    Returns
    -------
    list of dict
        A one-element list holding the mention with the longest 'Text', or
        an empty list when `mentions` is empty. When several mentions share
        the maximum length, the last one in `mentions` is returned.
    """
    if not mentions:
        return []
    # max() keeps the first maximum it sees; scanning in reverse therefore
    # selects the last of any equally long mentions.
    return [max(reversed(mentions), key=lambda m: len(m['Text']))]

# Set a global confidence threshold
thr = 0.5

# Nodes are (id, type, tag, score, mention_type) tuples
trigger_nodes = []
for event_idx, event in enumerate(result['Events']):
    # Only the first trigger of each event is considered
    for trigger in event['Triggers'][:1]:
        if trigger['GroupScore'] > thr:
            trigger_nodes.append(
                ("tr%d" % event_idx, trigger['Type'], trigger['Text'], trigger['Score'], "trigger")
            )

entity_nodes = []
for entity_idx, entity in enumerate(result['Entities']):
    # Each entity group is represented by its canonical mention
    for mention in get_canonical_mention(entity['Mentions']):
        if mention['GroupScore'] > thr:
            entity_nodes.append(
                ("en%d" % entity_idx, mention['Type'], mention['Text'], mention['Score'], "entity")
            )

# Edges are (trigger_id, node_id, role, score) tuples
argument_edges = []
for event_idx, event in enumerate(result['Events']):
    for argument in event['Arguments']:
        if argument['Score'] > thr:
            argument_edges.append(
                ("tr%d" % event_idx, "en%d" % argument['EntityIndex'], argument['Role'], argument['Score'])
            )

In [24]:
G = nx.Graph()

# Iterate over triggers and entity mentions
for mention_id, tag, extent, score, mtype in trigger_nodes + entity_nodes:
    # Entities are labelled with their text, triggers with their event type
    label = extent if mtype.startswith("entity") else tag
    G.add_node(mention_id, label=label, size=score*10, color=color_map[tag], tag=tag, group=mtype)

# Iterate over argument role assignments
for event_id, entity_id, role, score in argument_edges:
    # Adding an edge implicitly creates any missing endpoint as a node without
    # label/size/color attributes. An endpoint is missing only when it was
    # filtered out by the confidence threshold, so skip the edge rather than
    # re-introducing the node.
    if event_id not in G or entity_id not in G:
        continue
    G.add_edge(event_id, entity_id, label=role, weight=score*100, color="grey")

# Drop mentions that don't participate in events
G.remove_nodes_from(list(nx.isolates(G)))

In [25]:
# Load the networkx graph into a pyvis Network and render it to "compact_nx.html"
nt = Network("600px", "800px", notebook=True, heading="")
nt.from_nx(G)
nt.show("compact_nx.html")

compact_nx.html


In [26]:
# This function in `events_graph.py` plots a complete graph of the document
# The graph shows all events, triggers, entities, and their groups
# NOTE(review): `events_graph` is a local module imported mid-notebook; it is
# presumably expected to sit next to this notebook — confirm.

import events_graph as evg

# thr=0.5 repeats the value of the global `thr` threshold set earlier
evg.plot(result, node_types=['event', 'trigger', 'entity_group', 'entity'], thr=0.5)

nx.html
