# Senzing + Neo4j: Build a knowledge graph

## Set up the Python environment

First, we need to import the Python library dependencies which are required for the code we'll be running.

In [1]:
from dataclasses import dataclass, field
import json
import os
import pathlib
import sys
import typing

from graphdatascience import GraphDataScience 
from icecream import ic
from tqdm import tqdm
import dotenv
import matplotlib.pyplot as plt
import pandas as pd
import pyvis
import seaborn as sns
import watermark

%load_ext watermark

Show a "watermark" of which versions are being used for system componenents and library dependencies. This may help in case you need to troubleshoot the dependencies on your system, e.g., if there's some conflict during installation.

In [2]:
%watermark
%watermark --iversions

Last updated: 2024-03-28T09:03:09.595341-07:00

Python implementation: CPython
Python version       : 3.11.0
IPython version      : 8.22.2

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 21.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

json      : 2.0.9
pandas    : 2.2.1
pyvis     : 0.3.2
watermark : 2.4.3
sys       : 3.11.0 (v3.11.0:deaf509e8f, Oct 24 2022, 14:43:23) [Clang 13.0.0 (clang-1300.0.29.30)]
matplotlib: 3.8.3
seaborn   : 0.13.2



## Parse the results from Senzing

Let's define a `dataclass` to represent the parsed results from Senzing entity resolution.

In [3]:
@dataclass(order=False, frozen=False)
class Entity:  # pylint: disable=R0902
    """
A data class representing a resolved entity.
    """
    entity_id: id
    num_recs: int
    records: typing.Set[ str ] = field(default_factory = lambda: set([]))
    related: typing.Dict[ int, dict ] = field(default_factory = lambda: {})
    has_ref: bool = False

Parse the JSON data from the export, to build a dictionary of entities indexed by their unique identifiers. Also keep track of both the "resolved" and "related" records for each entity, to use for constructing the knowledge graph from these results.

In [6]:
export_path: pathlib.Path = pathlib.Path("export.json")
entities: dict = {}

with export_path.open() as fp:
    for line in tqdm(fp.readlines(), desc = "read JSON"):
        entity_dat: dict = json.loads(line)
        entity_id: int = entity_dat["RESOLVED_ENTITY"]["ENTITY_ID"]

        records: set = set([
            ".".join([ r["DATA_SOURCE"].upper(), str(r["RECORD_ID"]) ])
            for r in entity_dat["RESOLVED_ENTITY"]["RECORDS"]
        ])

        entities[entity_id] = Entity(
            entity_id = entity_id,
            records = records,
            num_recs = len(records),
            related = {
                r["ENTITY_ID"]: r
                for r in entity_dat["RELATED_ENTITIES"]
            },
        )

read JSON: 100%|█████████████████████████████████████████████████████████████████████████████████████| 99156/99156 [00:10<00:00, 9902.76it/s]


To finish preparing the input data for resolved entities, let's make a quick traversal of the record linkage and set a flag for "interesting" entities which will have relations in the graph to visualize.

In [16]:
for entity in entities.values():
    if entity.num_recs > 0:
        entity.has_ref = True

    for rel_ent_id in entity.related:
        entities[rel_ent_id].has_ref = True

Let's examine one of the resolved entity objects, to see which fields are available

In [7]:
entity_dat

{'RESOLVED_ENTITY': {'ENTITY_ID': 438737,
  'RECORDS': [{'DATA_SOURCE': 'SAFEGRAPH',
    'RECORD_ID': 'zzw-222@5yv-c8t-t7q',
    'ENTITY_TYPE': 'GENERIC',
    'INTERNAL_ID': 438737,
    'ENTITY_KEY': '7A2952039A2EDAE86C89FF025284618BB47F5B0E',
    'ENTITY_DESC': 'Royalty Renee Salez',
    'MATCH_KEY': '',
    'MATCH_LEVEL': 0,
    'MATCH_LEVEL_CODE': '',
    'ERRULE_CODE': '',
    'LAST_SEEN_DT': '2024-03-12 18:54:15.638'}]},
 'RELATED_ENTITIES': []}

In [17]:
ic(list(entities.values())[-1]);

ic| list(entities.values())[-1]: Entity(entity_id=438737,
                                        num_recs=1,
                                        records={'SAFEGRAPH.zzw-222@5yv-c8t-t7q'},
                                        related={},
                                        has_ref=True)


## Connect the GDS library to Neo4j Desktop

Set up a GDS connection using our credentials for Neo4j Desktop

In [18]:
dotenv.load_dotenv(dotenv.find_dotenv())

bolt_uri: str = os.environ.get("NEO4J_BOLT")
database: str = os.environ.get("NEO4J_DBMS")
username: str = os.environ.get("NEO4J_USER")
password: str = os.environ.get("NEO4J_PASS")

gds:GraphDataScience = GraphDataScience(
    bolt_uri,
    auth = ( username, password, ),
    database = database,
    aura_ds = False,
)

## Build the KG in Neo4j

### Populate nodes from the Senzing entities

In [None]:
for entity in tqdm(entities.values(), desc = "merge entity nodes"):
    params = {
        "uid": entity.entity_id,
        "has_ref": entity.has_ref,
    }

    gds.run_cypher(
        "MERGE (ent:Entity {uid: $uid, has_ref: $has_ref})",
        params,
    )

merge entity nodes:   0%|                                                                               | 30/99156 [00:03<1:06:14, 24.94it/s]

### Connect the resolved records and related entities

In [None]:
query = """
MATCH
    (ent:Entity {uid: $entity_uid}),
    (rec:Record {uid: $record_uid})       
MERGE (ent)-[:RESOLVES]->(rec)
"""

for entity in tqdm(entities.values(), desc = "merge ent->rec"):
    for record_uid in entity.records:
        params = {
            "entity_uid": entity.entity_id,
            "record_uid": record_uid,
        }

        gds.run_cypher(query, params)

In [None]:
query = """
MATCH
    (ent:Entity {uid: $params.entity_uid}),
    (rel_ent:Entity {uid: $params.rel_ent})       
MERGE (ent)-[:RELATED {ambiguous: $params.ambiguous, disclosed: $params.disclosed, match_key: $params.match_key, match_level: $params.match_level, match_level_code: $params.match_level_code}]->(rel_ent)
"""

for entity in tqdm(entities.values(), desc = "merge ent->rel"):
    for rel_key, rel_ent in entity.related.items():
        params = {
            "entity_uid": entity.entity_id,
            "rel_ent": rel_ent["ENTITY_ID"],
            "ambiguous": (rel_ent["IS_AMBIGUOUS"] == 0),
            "disclosed": (rel_ent["IS_DISCLOSED"] == 0),
            "match_key": rel_ent["MATCH_KEY"],
            "match_level": rel_ent["MATCH_LEVEL"],
            "match_level_code": rel_ent["MATCH_LEVEL_CODE"],
        }

        gds.run_cypher(query, params)

In [None]:
query = """
MATCH (ent:Entity)
RETURN
    ent.uid, COUNT { (ent)-[:RESOLVES]->(:Record) } AS num_recs
ORDER BY num_recs DESC
LIMIT 20
    """
    
gds.run_cypher(query, params)

## Analyze the impact of ER

Now let's analyze the Senzing results, measuring how much the process of _entity resolution_ has consolidated records among the input datasets.

In [None]:
for entity in entities.values():
    if entity.num_recs > 1:
        entity.has_ref = True

        for inf_ent in entity.related.keys():
            entities[inf_ent].has_ref = True

has_ref_ents: int = len([
    e for e in entities.values()
    if e.has_ref
])

In particular, it's helpful for planning about our eventual knowledge graph to understand the:

  - total number of entities
  - number of entities which have references (i.e., these will be linked within the knowledge graph)

In [None]:
ic(len(entities))
ic(has_ref_ents);

Now visualize this as a histogram of the resolved entities versus their related records in the input datasets.

From this analysis, more than 14K entities were linked to records through _entity resolution_.
These can be used to construct _nodes_, _properties_, and _relations_ in a knowledge graph.

In [None]:
num_rel_2: int = 3437
has_ref_ents - num_rel_2

Of the linked entities, more than 10K have three or more records linked.
This is interesting since we're trying to link records across three datasets.
We'll get more specific stats later through Cypher graph queries in Neo4j.

In [None]:
df = gds.run_cypher(
  """
MATCH (ent:Entity)
RETURN COUNT(ent.uid) as count_ent, COUNT { (ent)-[:RELATED]->(:Entity) } as num_rel
ORDER BY num_rel DESC
  """
)

In [None]:
fig, ax = plt.subplots()
plt.rcParams["font.family"] = "sans-serif"

y = sns.lineplot(df, y = "count_ent", x = "num_rel")
y.tick_params(axis = "y", size = 9, colors = "gray")

plt.xlabel("related entities per entity", size = 10, fontstyle = "italic")
plt.ylabel("entity count", size = 10, fontstyle = "italic")

sns.despine(bottom = True, left = True)
plt.yscale("log")

fig.savefig("article/img/graphs.plot.ent_rel.png")

In [None]:
df = gds.run_cypher(
  """
MATCH (ent:Entity)
RETURN COUNT(ent.uid) as count_ent, COUNT { (ent)-[:RESOLVES]->(:Record) } as num_rec
ORDER BY num_rec DESC
  """
)

In [None]:
fig, ax = plt.subplots()
plt.rcParams["font.family"] = "sans-serif"

y = sns.barplot(df, y = "count_ent", x = "num_rec")
y.tick_params(axis = "y", size = 9, colors = "gray")
y.bar_label(y.containers[0], padding = 3, color = "black", fontsize = 11)

plt.xlabel("records per entity", size = 10, fontstyle = "italic")
plt.ylabel("entity count", size = 10, fontstyle = "italic")

sns.despine(bottom = True, left = True)
plt.yscale("log")

fig.savefig("article/img/graphs.plot.ent_rec.png")

What's the average number of records per entity, for entities that have records linked?

In [None]:
df = gds.run_cypher(
  """
MATCH (ent:Entity)
RETURN ent.uid, COUNT { (ent)-[:RESOLVES]->(:Record) } as num_rec
  """
)

df_linked = df[df["num_rec"] > 1]
df_linked["num_rec"].mean()

## Graph visualizations

In [None]:
df = gds.run_cypher(
  """
MATCH (ent:Entity)-[:RESOLVES]->(rec:Record)
RETURN ent.uid AS ent_uid, rec.uid AS rec_uid, COUNT { (ent)-[:RESOLVES]->(:Record) } as num_rec
  """
)

df_linked = df[df["num_rec"] > 1]
df_linked.head()

In [None]:
net: pyvis.network.Network = pyvis.network.Network(notebook = True)

for _, row in df_linked.iterrows():
    net.add_node(row.rec_uid, title = row.rec_uid, color = "blue", shape = "square")

net.toggle_physics(True)
net.show("vegas.1.html")

In [None]:
net: pyvis.network.Network = pyvis.network.Network(notebook = True)

for _, row in df_linked.iterrows():
    net.add_node(row.rec_uid, title = row.rec_uid, color = "blue", shape = "square")
    net.add_node(row.ent_uid, title = str(row.ent_uid), color = "red", shape = "star", size = row.num_rec)
    net.add_edge(row.rec_uid, row.ent_uid, weight = 1.0)

net.toggle_physics(True)
net.show("vegas.2.html")

## Add more structure to the KG

In [None]:
df = gds.run_cypher(
  """
MATCH (rec:Record)
WHERE (rec.sub_category) IS NOT NULL 
RETURN DISTINCT rec.sub_category AS sub_category
  """
)

df

In [None]:
for _, row in df.iterrows():
    gds.run_cypher(
        "MERGE (cat:Category {name: $name })",
        params = { "name": row.sub_category },
    )

In [None]:
df = gds.run_cypher(
    """
MATCH (ent:Entity)-[:RESOLVES]->(rec:Record)
RETURN rec.uid
    """
)

df