## Set up

Load the Python dependencies.

In [1]:
from dataclasses import dataclass, field
import json
import os
import pathlib
import sys
import typing

from graphdatascience import GraphDataScience
from icecream import ic
from tqdm import tqdm
import dotenv
import neo4j
import pandas as pd
import watermark

%load_ext watermark

In [2]:
%watermark
%watermark --iversions

Last updated: 2024-06-28T10:13:17.004352-07:00

Python implementation: CPython
Python version       : 3.11.9
IPython version      : 8.25.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 23.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

watermark: 2.4.3
pandas   : 2.2.2
sys      : 3.11.9 (v3.11.9:de54cf5be3, Apr  2 2024, 07:12:50) [Clang 13.0.0 (clang-1300.0.29.30)]
json     : 2.0.9
neo4j    : 5.22.0



Establish a GDS connection to Neo4j.

In [3]:
dotenv.load_dotenv(dotenv.find_dotenv())

bolt_uri: str = os.environ.get("NEO4J_BOLT")
database: str = os.environ.get("NEO4J_DBMS")
username: str = os.environ.get("NEO4J_USER")
password: str = os.environ.get("NEO4J_PASS")

gds:GraphDataScience = GraphDataScience(
    bolt_uri,
    auth = ( username, password, ),
    database = database,
    aura_ds = False,
)



## Load the Senzing overlay

In [4]:
@dataclass(order=False, frozen=False)
class Entity:  # pylint: disable=R0902
    """
A data class representing a resolved entity.
    """
    entity_uid: id
    name: str
    num_recs: int
    records: typing.Dict[ str, str ] = field(default_factory = lambda: {})
    related: typing.Dict[ int, dict ] = field(default_factory = lambda: {})
    has_ref: bool = False

In [5]:
export_path: pathlib.Path = pathlib.Path("ICIJ-entity-report-2024-06-21_12-04-57-std.json")
entities: dict = {}
count : int = 0

with export_path.open() as fp:
    for line in tqdm(fp.readlines(), desc = "read JSON"):
        count += 1
        entity_dat: dict = json.loads(line)
        entity_uid: int = entity_dat["RESOLVED_ENTITY"]["ENTITY_ID"]
        entity_name: str = entity_dat["RESOLVED_ENTITY"]["ENTITY_NAME"]
        records: dict = {}

        for rec in entity_dat["RESOLVED_ENTITY"]["RECORDS"]:
            record_uid: str = rec["RECORD_ID"]

            if not record_uid.startswith("#"):
                match_key: str = rec["MATCH_KEY"]

                if match_key.strip() == "":
                    match_key = "INITIAL"

                records[record_uid] = match_key

            if entity_name == "" and rec["ENTITY_DESC"] != "":
                entity_name = rec["ENTITY_DESC"]

        if entity_name == "":
            entity_name = entity_uid

        entities[entity_uid] = Entity(
            entity_uid = entity_uid,
            name = entity_name,
            records = records,
            num_recs = len(records),
            related = {
                r["ENTITY_ID"]: r
                for r in entity_dat["RELATED_ENTITIES"]
            },
        )

count

read JSON: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1547418/1547418 [00:18<00:00, 85286.05it/s]


1547418

In [6]:
entity_dat

{'RESOLVED_ENTITY': {'ENTITY_ID': 1809179,
  'ENTITY_NAME': 'Vistamax Kenya Ltd',
  'FEATURES': {'ADDRESS': [{'FEAT_DESC': 'Kay Construction Complex   Mombasa Road Nairobi KENYA',
     'LIB_FEAT_ID': 15469571,
     'FEAT_DESC_VALUES': [{'FEAT_DESC': 'Kay Construction Complex   Mombasa Road Nairobi KENYA',
       'LIB_FEAT_ID': 15469571}]}],
   'DUNS_NUMBER': [{'FEAT_DESC': '499289270',
     'LIB_FEAT_ID': 15469585,
     'FEAT_DESC_VALUES': [{'FEAT_DESC': '499289270',
       'LIB_FEAT_ID': 15469585}]}],
   'NAME': [{'FEAT_DESC': 'Vistamax Kenya Ltd',
     'LIB_FEAT_ID': 15469564,
     'USAGE_TYPE': 'COMPANY',
     'FEAT_DESC_VALUES': [{'FEAT_DESC': 'Vistamax Kenya Ltd',
       'LIB_FEAT_ID': 15469564}]},
    {'FEAT_DESC': 'Zuresh Said',
     'LIB_FEAT_ID': 15469570,
     'USAGE_TYPE': 'EXECUTIVE',
     'FEAT_DESC_VALUES': [{'FEAT_DESC': 'Zuresh Said',
       'LIB_FEAT_ID': 15469570}]}],
   'PHONE': [{'FEAT_DESC': '254-282-4805',
     'LIB_FEAT_ID': 15469572,
     'USAGE_TYPE': 'CONTACT'

In [7]:
for entity in entities.values():
    if entity.num_recs > 0:
        entity.has_ref = True

    for rel_ent_id in entity.related:
        entities[rel_ent_id].has_ref = True

How many entities link to known records or other entities?

In [8]:
count: int = 0

for ent in entities.values():
    if len(ent.records) > 0 or len(ent.related) > 0:
        count += 1

count

1526801

In [9]:
gds.run_cypher("""
DROP CONSTRAINT `sz_entity_node_key` IF EXISTS
""")

gds.run_cypher("""
CREATE CONSTRAINT `sz_entity_node_key` IF NOT EXISTS
  FOR (ent:SzEntity)
  REQUIRE ent.uid IS NODE KEY
""")

Load the Senzing entities.

In [10]:
df_ent: pd.DataFrame = pd.DataFrame([
    {
        "uid": entity.entity_uid,
        "name": entity.name,
        "has_ref": entity.has_ref,
    }
    for entity in entities.values()
])

unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MERGE (ent:SzEntity {uid: row.uid, name: row.name, has_ref: row.has_ref})
} IN TRANSACTIONS OF 10000 ROWS
    """

gds.run_cypher(
    unwind_query,
    {"rows": df_ent.to_dict(orient = "records")},
)

In [11]:
df_rec: pd.DataFrame = gds.run_cypher(
  """
MATCH (rec)
WHERE rec.node_id IS NOT NULL
RETURN rec.node_id AS node_id, head(labels(rec)) AS label, rec.name AS name
  """
)

df_rec

Unnamed: 0,node_id,label,name
0,10000001,Entity,"TIANSHENG INDUSTRY AND TRADING CO., LTD."
1,10000002,Entity,"NINGBO SUNRISE ENTERPRISES UNITED CO., LTD."
2,10000003,Entity,"HOTFOCUS CO., LTD."
3,10000004,Entity,"SKY-BLUE GIFTS & TOYS CO., LTD."
4,10000005,Entity,FORTUNEMAKER INVESTMENTS CORPORATION
...,...,...,...
2016518,240554205,Entity,The Giscarl Trust
2016519,240554206,Entity,Woodland Irrevocable Trust
2016520,240554207,Entity,Kapecod Trust
2016521,240554208,Entity,The Schijman Trust


In [12]:
ent_kind: dict = pd.Series(df_rec.label.values,index=df_rec.node_id.astype(str)).to_dict()

In [13]:
df_load: pd.DataFrame = pd.DataFrame([
    {
        "entity_uid": entity.entity_uid,
        "record_uid": record_uid,
        "label": ent_kind[record_uid],
        "match_key": match_key,
    }
    for entity in entities.values()
    for record_uid, match_key in entity.records.items()
    if record_uid in ent_kind
])

len(df_load)

1614277

In [14]:
df_load

Unnamed: 0,entity_uid,record_uid,label,match_key
0,463,240130486,Officer,+NAME+ADDRESS
1,1650,56096328,Officer,+NAME+ADDRESS
2,1756,56051487,Officer,+NAME+ADDRESS
3,100003,56008866,Officer,INITIAL
4,100028,80060589,Officer,INITIAL
...,...,...,...,...
1614272,1734124,12125912,Officer,INITIAL
1614273,1734125,200128234,Entity,INITIAL
1614274,1734127,10127496,Entity,INITIAL
1614275,1734128,10009033,Entity,INITIAL


Connect the Senzing entities with records already loaded in Neo4j.

In [15]:
for _, row in tqdm(df_load.iterrows(), desc = "load rows"):
    template: str = f"""
  MATCH
    (ent:SzEntity {{uid: {row.entity_uid}}}),
    (rec:{row.label} {{node_id: {row.record_uid}}})
  MERGE (ent)-[rel:RESOLVES {{match_key: "{row.match_key}"}}]->(rec)
"""

    gds.run_cypher(template)

load rows: 1614277it [5:28:32, 81.89it/s] 


Test the results

In [16]:
df_test: pd.DataFrame = gds.run_cypher(
  """
MATCH (ent)-[rel:RESOLVES]->(rec)
RETURN labels(ent), ent.name, rec.node_id
  """
)

df_test

Unnamed: 0,labels(ent),ent.name,rec.node_id
0,[SzEntity],HSBC PRIVATE BANK SUISSE S A AXEL STERN,11000398
1,[SzEntity],AKSANA PALTARZHYTSKAYA,240130486
2,[SzEntity],FAIDRA THEOFANOUS,56096328
3,[SzEntity],HANNAH GERTSEN,56051487
4,[SzEntity],KPMG HOLDINGS LIMITED,56008866
...,...,...,...
1614272,[SzEntity],NORTHLAKE NOMINEES SERVICES LIMITED,12125912
1614273,[SzEntity],Westbond Corporate Services Limited,200128234
1614274,[SzEntity],Scout Holdings Limited,10127496
1614275,[SzEntity],SAFAT INVESTMENTS LTD S A,10009033
