## Demo for SciPy 2021, part 1: 
# "Without Pandas"

Based on `Market_Intelligence_Part1.ipynb`.

In [1]:
# Load some model outputs we've precomputed offline
import json

# Directory that holds the precomputed model outputs read below
data_dir = "./scipy_demo_data"

# Decode explicitly as UTF-8 (the standard JSON interchange encoding) rather
# than relying on open()'s platform-dependent default text encoding.
with open(f"{data_dir}/person_mentions.json", encoding="utf-8") as f:
    person_mentions = json.load(f)
with open(f"{data_dir}/someone_said_something.json", encoding="utf-8") as f:
    someone_said_something = json.load(f)
with open(f"{data_dir}/person_mentions_watson.json", encoding="utf-8") as f:
    person_mentions_watson = json.load(f)

In [2]:
# Display the records loaded from someone_said_something.json; each one has
# "subject", "sentence", "object" and "action" entries (see the output below).
someone_said_something

[{'subject': {'text': 'Christoph Herman, SVP and Head of SAP HANA Enterprise Cloud Delivery',
   'begin': 1213,
   'end': 1281},
  'sentence': ' "SAP HANA Enterprise Cloud on IBM Power Systems will help clients unlock the full value of SAP HANA in the cloud, with the possibility of enhancing the scalability and availability of mission critical SAP applications while moving workloads to SAP HANA and lowering TCO," said Christoph Herman, SVP and Head of SAP HANA Enterprise Cloud Delivery.',
  'object': {'text': 'SAP HANA Enterprise Cloud on IBM Power Systems will help clients unlock the full value of SAP HANA in the cloud, with the possibility of enhancing the scalability and availability of mission critical SAP applications while moving workloads to SAP HANA and lowering TCO'},
  'action': {'verb': {'text': 'say', 'tense': 'past'},
   'text': 'said',
   'normalized': 'say'}},
 {'subject': {'text': 'Stephen Leonard, General Manager, IBM Cognitive Systems, "With the addition of IBM Power 

In [3]:
# Display the records loaded from person_mentions.json; character offsets are
# stored under the top-level "start" and "end" keys (see the output below).
person_mentions

[{'entity_group': 'PER',
  'score': 0.9996308088302612,
  'word': 'Christoph Herman',
  'start': 1213,
  'end': 1229}]

In [4]:
def persons_in_subjects(person_mentions, someone_said_something):
    """
    Find the person mentions whose span lies entirely inside the subject span
    of a "someone said something" record.

    Subject spans are bucketed into fixed-size chunks of character offsets so
    that each person mention is only compared against subjects that share a
    chunk with it, instead of against every subject.

    Args:
        person_mentions: List of dicts, each with integer "start" and "end"
            keys giving the mention's character offsets.
        someone_said_something: List of dicts, each with a "subject" dict
            that has integer "begin" and "end" keys.

    Returns:
        A list of ``{"person": <mention record>, "subject": <subject dict>}``
        dicts, one per unique (mention, subject) pair where the subject span
        contains the mention span. Results are ordered by the mention's index
        and then by the subject record's index. Returns an empty list when
        either input is empty.

    Raises:
        KeyError: If a record is missing one of the keys listed above.
    """
    # With no subjects nothing can match; returning early also avoids calling
    # max() on an empty sequence below.
    if not someone_said_something:
        return []

    subjects = [s["subject"] for s in someone_said_something]

    # Chunk length is one more than the longest subject span, so every subject
    # touches at most two chunks: the one holding its begin offset and the one
    # holding its end offset.
    chunk_len = max(s["end"] - s["begin"] for s in subjects) + 1

    # Lookup table: chunk index (offset // chunk_len) -> indices into
    # someone_said_something of the subjects that touch that chunk.
    chunk_to_srl_ix = {}
    for srl_ix, subject in enumerate(subjects):
        for chunk_ix in {subject["begin"] // chunk_len, subject["end"] // chunk_len}:
            chunk_to_srl_ix.setdefault(chunk_ix, []).append(srl_ix)

    # Probe the table with each mention. A set is used because a mention that
    # straddles two chunks can meet the same subject twice.
    ix_pairs = set()
    for person_ix, person in enumerate(person_mentions):
        start, end = person["start"], person["end"]
        for chunk_ix in {start // chunk_len, end // chunk_len}:
            # A chunk that holds no subject simply has no candidates.
            for srl_ix in chunk_to_srl_ix.get(chunk_ix, ()):
                subject = subjects[srl_ix]
                if subject["begin"] <= start and subject["end"] >= end:
                    ix_pairs.add((person_ix, srl_ix))

    # Sort the index pairs so the result order is stable from run to run.
    return [
        {"person": person_mentions[person_ix], "subject": subjects[srl_ix]}
        for person_ix, srl_ix in sorted(ix_pairs)
    ]

# Join the loaded person mentions against the subjects of the loaded
# "someone said something" records and display the matches.
persons_in_subjects(person_mentions, someone_said_something)

[{'person': {'entity_group': 'PER',
   'score': 0.9996308088302612,
   'word': 'Christoph Herman',
   'start': 1213,
   'end': 1229},
  'subject': {'text': 'Christoph Herman, SVP and Head of SAP HANA Enterprise Cloud Delivery',
   'begin': 1213,
   'end': 1281}}]

In [5]:
# Display the records loaded from person_mentions_watson.json. Unlike
# person_mentions, offsets here sit under mentions[i]["location"] (see output).
person_mentions_watson

[{'type': 'Person',
  'text': 'Christoph Herman',
  'relevance': 0.217154,
  'mentions': [{'text': 'Christoph Herman',
    'location': [1213, 1229],
    'confidence': 0.94435}],
  'count': 1,
  'confidence': 0.94435},
 {'type': 'Person',
  'text': 'Stephen Leonard',
  'relevance': 0.136166,
  'mentions': [{'text': 'Stephen Leonard',
    'location': [2227, 2242],
    'confidence': 0.989177}],
  'disambiguation': {'name': 'Steve_Leonard',
   'dbpedia_resource': 'http://dbpedia.org/resource/Steve_Leonard'},
  'count': 1,
  'confidence': 0.989177},
 {'type': 'Person',
  'text': 'Sam Ponedal',
  'relevance': 0.020711,
  'mentions': [{'text': 'Sam Ponedal',
    'location': [3574, 3585],
    'confidence': 0.894298}],
  'count': 1,
  'confidence': 0.894298}]

In [6]:
# NOTE(review): this call raises KeyError: 'start'. persons_in_subjects reads
# top-level "start"/"end" keys, but the Watson records displayed above keep
# their offsets under mentions[i]["location"] instead.
# Presumably left failing on purpose as part of the demo; confirm.
persons_in_subjects(person_mentions_watson, someone_said_something)

KeyError: 'start'