# Senzing + Neo4j: The datasets

## Set up the Python environment

First, we need to import the Python library dependencies which are required for the code we'll be running.

In [1]:
import json
import os
import pathlib
import sys
import typing

from graphdatascience import GraphDataScience
from icecream import ic
import dotenv
import pandas as pd
import watermark

%load_ext watermark

Show a "watermark" of which versions are being used for system components and library dependencies. This may help in case you need to troubleshoot the dependencies on your system, e.g., if there's some conflict during installation.

In [2]:
%watermark
%watermark --iversions

Last updated: 2024-04-11T20:52:33.221585-07:00

Python implementation: CPython
Python version       : 3.11.0
IPython version      : 8.23.0

Compiler    : Clang 13.0.0 (clang-1300.0.29.30)
OS          : Darwin
Release     : 21.6.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit

watermark: 2.4.3
json     : 2.0.9
pandas   : 2.2.1
sys      : 3.11.0 (v3.11.0:deaf509e8f, Oct 24 2022, 14:43:23) [Clang 13.0.0 (clang-1300.0.29.30)]



## Examine the input datasets

We will use three datasets which describe businesses (names, addresses, etc.) within the Las Vegas metropolitan area:

  - SafeGraph: `Places of Interest` (POI)
  - US Dept of Labor: `Wage and Hour Compliance Action Data` (WHISARD)
  - US Small Business Admin: `PPP Loans over $150K` (PPP)

Snapshots of these datasets are available to download from: 
<https://senzing.com/get-erkg-tutorial-evaluation-data/>

For the sake of brevity, we've shortened their file names to: `poi.json`, `dol.json`, `ppp.json`, respectively.

We'll keep all of the columns for each dataset even though we're only using a few in this tutorial. Let's define a utility function to show a subset of columns in a Pandas `DataFrame` object.

In [3]:
def sample_df (
    df: pd.DataFrame,
    cols_keep: typing.Set[ typing.Any ],
    ) -> pd.DataFrame:
    """
Return a new Pandas dataframe which retains only the columns named in
`cols_keep`; the dataframe passed in is left unmodified.
https://stackoverflow.com/a/51285940/1698443
    """
    unwanted: typing.List[ typing.Any ] = list(set(df.columns) - cols_keep)

    return df.drop(columns = unwanted)

### Load the SafeGraph Places dataset

Load the `Places of Interest` (POI) dataset for Las Vegas, from SafeGraph:

In [4]:
poi_path: pathlib.Path = pathlib.Path("../lv_data") / "poi.json"

# Each line of the input file is parsed as one JSON record. The `with` block
# closes the file handle as soon as every line has been read, instead of
# leaving it open until the garbage collector gets to it.
with poi_path.open(encoding = "utf-8") as fp:
    df_poi: pd.DataFrame = pd.DataFrame.from_dict(
        [ json.loads(line) for line in fp ],
    )

# represent every value as a string
df_poi = df_poi.astype(str)

# add a `name` column, copied from the organization name
df_poi["name"] = df_poi["LOCATION_NAME_ORG"]

In [5]:
# preview the first few rows of the full POI dataframe
df_poi.head()

Unnamed: 0,DATA_SOURCE,RECORD_ID,RECORD_TYPE,PLACEKEY,REL_ANCHOR_DOMAIN,REL_ANCHOR_KEY,LOCATION_NAME_ORG,BRANDS,TOP_CATEGORY,SUB_CATEGORY,...,PHONE_NUMBER,BUSINESS_ADDR_COUNTRY,BUSINESS_ADDR_FULL,MAILING_VERIFIED_STATUS,REL_POINTER_DOMAIN,REL_POINTER_KEY,REL_POINTER_ROLE,OPENED_ON,IS_INTERSECTION,name
0,SAFEGRAPH,225-222@5yv-j92-tn5,ORGANIZATION,225-222@5yv-j92-tn5,PLACEKEY,225-222@5yv-j92-tn5,Cantwell Michelle L Atty,[],Legal Services,Offices of Lawyers,...,17023627800,US,3320 W Sahara Ave Las Vegas NV 89102-3223,VERIFIED_PREMISE,,,,,,Cantwell Michelle L Atty
1,SAFEGRAPH,226-222@5yv-hmm-whq,ORGANIZATION,226-222@5yv-hmm-whq,PLACEKEY,226-222@5yv-hmm-whq,Pieology Pizzeria,"[ { ""safegraph_brand_id"": ""SG_BRAND_f372f9969f...",Restaurants and Other Eating Places,Limited-Service Restaurants,...,17023314454,US,10965 Lavender Hill Dr Ste 130 Las Vegas NV 89...,VERIFIED_DELIVERY_POINT,PLACEKEY,zzw-223@5yv-hkm-rc5,PARENT,,,Pieology Pizzeria
2,SAFEGRAPH,22s-222@5yv-jbz-jgk,ORGANIZATION,22s-222@5yv-jbz-jgk,PLACEKEY,22s-222@5yv-jbz-jgk,Ellen Peneyra,[],Offices of Other Health Practitioners,Offices of All Other Miscellaneous Health Prac...,...,17027397716,US,2275 Renaissance Dr Ste D Las Vegas NV 89119-6797,VERIFIED_DELIVERY_POINT,PLACEKEY,22g-222@5yv-jbz-h89,PARENT,,,Ellen Peneyra
3,SAFEGRAPH,28m-222@5yv-hmp-v4v,ORGANIZATION,28m-222@5yv-hmp-v4v,PLACEKEY,28m-222@5yv-hmp-v4v,Stanford Jackson,[],Offices of Other Health Practitioners,Offices of All Other Miscellaneous Health Prac...,...,17025623569,US,8321 W Sahara Ave Apt 2007 Las Vegas NV 89117-...,VERIFIED_DELIVERY_POINT,,,,,,Stanford Jackson
4,SAFEGRAPH,228-232@5yv-hts-bp9,ORGANIZATION,228-232@5yv-hts-bp9,PLACEKEY,228-232@5yv-hts-bp9,MS. Jazlyn James,[],Offices of Physicians,"Offices of Physicians, Mental Health Specialists",...,17029008666,US,5550 Painted Mirage Rd Ste 320 Las Vegas NV 89...,VERIFIED_DELIVERY_POINT,PLACEKEY,223-23b@5yv-hts-bp9,PARENT,,,MS. Jazlyn James


Take a look at the column names. The `"DATA_SOURCE"`, `"RECORD_ID"`, `"RECORD_TYPE"` columns are needed by Senzing to identify unique records; any columns related to names or addresses then get used during _entity resolution_.

In [6]:
# list every column name in the POI dataframe
df_poi.columns

Index(['DATA_SOURCE', 'RECORD_ID', 'RECORD_TYPE', 'PLACEKEY',
       'REL_ANCHOR_DOMAIN', 'REL_ANCHOR_KEY', 'LOCATION_NAME_ORG', 'BRANDS',
       'TOP_CATEGORY', 'SUB_CATEGORY', 'NAICS_CODE', 'BUSINESS_GEO_LATITUDE',
       'BUSINESS_GEO_LONGITUDE', 'CATEGORY_TAGS', 'CLOSED_ON',
       'TRACKING_CLOSED_SINCE', 'PHONE_NUMBER', 'BUSINESS_ADDR_COUNTRY',
       'BUSINESS_ADDR_FULL', 'MAILING_VERIFIED_STATUS', 'REL_POINTER_DOMAIN',
       'REL_POINTER_KEY', 'REL_POINTER_ROLE', 'OPENED_ON', 'IS_INTERSECTION',
       'name'],
      dtype='object')

In [7]:
# keep only a handful of the POI columns for a quick look at the data
df = sample_df(
    df_poi,
    {
        "RECORD_ID",
        "name",
        "SUB_CATEGORY",
        "BUSINESS_GEO_LATITUDE",
        "BUSINESS_GEO_LONGITUDE",
        "BUSINESS_ADDR_FULL",
    },
)

In [8]:
# preview the first few rows of the reduced POI dataframe
df.head()

Unnamed: 0,RECORD_ID,SUB_CATEGORY,BUSINESS_GEO_LATITUDE,BUSINESS_GEO_LONGITUDE,BUSINESS_ADDR_FULL,name
0,225-222@5yv-j92-tn5,Offices of Lawyers,36.145647,-115.186399,3320 W Sahara Ave Las Vegas NV 89102-3223,Cantwell Michelle L Atty
1,226-222@5yv-hmm-whq,Limited-Service Restaurants,36.144906,-115.332644,10965 Lavender Hill Dr Ste 130 Las Vegas NV 89...,Pieology Pizzeria
2,22s-222@5yv-jbz-jgk,Offices of All Other Miscellaneous Health Prac...,36.10289,-115.121807,2275 Renaissance Dr Ste D Las Vegas NV 89119-6797,Ellen Peneyra
3,28m-222@5yv-hmp-v4v,Offices of All Other Miscellaneous Health Prac...,36.143386,-115.272694,8321 W Sahara Ave Apt 2007 Las Vegas NV 89117-...,Stanford Jackson
4,228-232@5yv-hts-bp9,"Offices of Physicians, Mental Health Specialists",36.261833,-115.255012,5550 Painted Mirage Rd Ste 320 Las Vegas NV 89...,MS. Jazlyn James


In [9]:
# per-column summary: number of values (`count`), occurrences of the most
# common value (`freq`), and number of distinct values (`unique`)
df.describe(include = "all").loc[[ "count", "freq", "unique", ]]

Unnamed: 0,RECORD_ID,SUB_CATEGORY,BUSINESS_GEO_LATITUDE,BUSINESS_GEO_LONGITUDE,BUSINESS_ADDR_FULL,name
count,79946,79946,79946,79946,79946,79946
freq,1,12145,11,15,2692,435
unique,79946,388,57686,60708,31831,67555


### Load the DoL WHISARD dataset

Load the `Wage and Hour Compliance Action Data` (WHISARD) from the US Department of Labor:

In [10]:
dol_path: pathlib.Path = pathlib.Path("../lv_data") / "dol.json"

# Each line of the input file is parsed as one JSON record. The `with` block
# closes the file handle as soon as every line has been read.
with dol_path.open(encoding = "utf-8") as fp:
    df_dol: pd.DataFrame = pd.DataFrame.from_dict(
        [ json.loads(line) for line in fp ],
    )

# Default any missing violation count to "0" BEFORE casting to strings:
# `astype(str)` turns a missing value into literal text (e.g., "nan"),
# which `fillna()` would no longer recognize as missing.
df_dol = df_dol.fillna({ "case_violtn_cnt": "0" })
df_dol = df_dol.astype(str)

# add a `name` column, copied from the business name
df_dol["name"] = df_dol["BUSINESS_NAME_ORG"]

In [11]:
# preview the first few rows of the full WHISARD dataframe
df_dol.head()

Unnamed: 0,RECORD_TYPE,DATA_SOURCE,RECORD_ID,case_id,BUSINESS_NAME_ORG,LEGAL_NAME_ORG,BUSINESS_ADDR_LINE1,BUSINESS_ADDR_CITY,BUSINESS_ADDR_STATE,BUSINESS_ADDR_POSTAL_CODE,...,flsa_smwsl_ee_atp_cnt,eev_violtn_cnt,h2b_violtn_cnt,h2b_bw_atp_amt,h2b_ee_atp_cnt,sraw_violtn_cnt,sraw_bw_atp_amt,sraw_ee_atp_cnt,ld_dt,name
0,ORGANIZATION,DoL_WHISARD,53,1658108,Fabulous Freddy's (Trailwood),Fabulous LLC,9611 Trail Wood Drive,Las Vegas,NV,89134,...,0,0,0,0.0,0,0,0.0,0,2015-04-01 01:00:03 EDT,Fabulous Freddy's (Trailwood)
1,ORGANIZATION,DoL_WHISARD,165,1419689,Boulder Station Hotel & Casino,"Boulder Station, Inc.",4111 Boulder Hwy,Las Vegas,NV,89121,...,0,0,0,0.0,0,0,0.0,0,2015-02-20 01:00:06 EST,Boulder Station Hotel & Casino
2,ORGANIZATION,DoL_WHISARD,178,1424856,MSI Landscaping,"MIST Systems International, Inc.","4820 Quality Court, #B",Las Vegas,NV,89103,...,0,0,0,0.0,0,0,0.0,0,2015-02-20 01:00:06 EST,MSI Landscaping
3,ORGANIZATION,DoL_WHISARD,196,1668849,Pastime Pools,Pastime Pools,731 Memory Ln,Las Vegas,NV,89110,...,0,0,0,0.0,0,0,0.0,0,2015-02-20 01:00:06 EST,Pastime Pools
4,ORGANIZATION,DoL_WHISARD,491,1602023,Steven Michaels,Steven Michaels,4108 Autum St,Las Vegas,NV,89120,...,0,0,0,0.0,0,0,0.0,0,2015-02-20 01:00:06 EST,Steven Michaels


In [12]:
# list the column names in the WHISARD dataframe
df_dol.columns

Index(['RECORD_TYPE', 'DATA_SOURCE', 'RECORD_ID', 'case_id',
       'BUSINESS_NAME_ORG', 'LEGAL_NAME_ORG', 'BUSINESS_ADDR_LINE1',
       'BUSINESS_ADDR_CITY', 'BUSINESS_ADDR_STATE',
       'BUSINESS_ADDR_POSTAL_CODE',
       ...
       'flsa_smwsl_ee_atp_cnt', 'eev_violtn_cnt', 'h2b_violtn_cnt',
       'h2b_bw_atp_amt', 'h2b_ee_atp_cnt', 'sraw_violtn_cnt',
       'sraw_bw_atp_amt', 'sraw_ee_atp_cnt', 'ld_dt', 'name'],
      dtype='object', length=114)

In [13]:
# keep only a handful of the WHISARD columns for a quick look at the data
df = sample_df(
    df_dol,
    {
        "RECORD_ID",
        "name",
        "BUSINESS_ADDR_LINE1",
        "naics_code_description",
        "case_violtn_cnt",
    },
)

In [14]:
# preview the first few rows of the reduced WHISARD dataframe
df.head()

Unnamed: 0,RECORD_ID,BUSINESS_ADDR_LINE1,naics_code_description,case_violtn_cnt,name
0,53,9611 Trail Wood Drive,Car Washes,0,Fabulous Freddy's (Trailwood)
1,165,4111 Boulder Hwy,Casino Hotels,1,Boulder Station Hotel & Casino
2,178,"4820 Quality Court, #B",Landscaping Services,14,MSI Landscaping
3,196,731 Memory Ln,All Other Specialty Trade Contractors,0,Pastime Pools
4,491,4108 Autum St,Private Households,2,Steven Michaels


In [15]:
# per-column summary: number of values (`count`), occurrences of the most
# common value (`freq`), and number of distinct values (`unique`)
df.describe(include = "all").loc[[ "count", "freq", "unique", ]]

Unnamed: 0,RECORD_ID,BUSINESS_ADDR_LINE1,naics_code_description,case_violtn_cnt,name
count,1554,1554,1554,1554,1554
freq,1,4,143,413,10
unique,1554,1476,388,160,1435


### Load the PPP Loans dataset

Load the `PPP Loans over $150K` (PPP) from the US Small Business Administration:

In [16]:
ppp_path: pathlib.Path = pathlib.Path("../lv_data") / "ppp.json"

# Each line of the input file is parsed as one JSON record. The `with` block
# closes the file handle as soon as every line has been read, instead of
# leaving it open until the garbage collector gets to it.
with ppp_path.open(encoding = "utf-8") as fp:
    df_ppp: pd.DataFrame = pd.DataFrame.from_dict(
        [ json.loads(line) for line in fp ],
    )

# represent every value as a string
df_ppp = df_ppp.astype(str)

# add a `name` column, copied from the business name
df_ppp["name"] = df_ppp["BUSINESS_NAME_ORG"]

In [17]:
# preview the first few rows of the full PPP dataframe
df_ppp.head()

Unnamed: 0,RECORD_TYPE,DATA_SOURCE,RECORD_ID,Loan_Range,BUSINESS_NAME_ORG,BUSINESS_ADDR_LINE1,BUSINESS_ADDR_CITY,BUSINESS_ADDR_STATE,BUSINESS_ADDR_POSTAL_CODE,NAICS_Code,Business_Type,OwnedByRaceEthnicity,OwnedBy,OwnedByVeteran,NonProfit,JobsReported,DateApproved,Lender,CD,name
0,ORGANIZATION,PPP_LOANS,7017,c $1-2 million,"INFINITY HOSPICE CARE OF LAS VEGAS, LLC",5110 N 40TH ST STE 107,PHOENIX,AZ,85018,623110.0,Limited Liability Company(LLC),Unanswered,Male Owned,Unanswered,,137,05/01/2020,"JPMorgan Chase Bank, National Association",AZ-09,"INFINITY HOSPICE CARE OF LAS VEGAS, LLC"
1,ORGANIZATION,PPP_LOANS,7018,"d $350,000-1 million",CLUB TATTOO LAS VEGAS LLC,1839 s. almaschool rd. ste 230,MESA,AZ,85210,812199.0,Limited Liability Company(LLC),Unanswered,Male Owned,Non-Veteran,,15,05/01/2020,"JPMorgan Chase Bank, National Association",AZ-09,CLUB TATTOO LAS VEGAS LLC
2,ORGANIZATION,PPP_LOANS,7021,"d $350,000-1 million","LAS VEGAS LABOR, LLC","10265 W Camelback Rd, Ste 104",PHOENIX,AZ,85037,111421.0,Limited Liability Company(LLC),Unanswered,Unanswered,Unanswered,,0,04/07/2020,"UMB Bank, National Association",AZ-03,"LAS VEGAS LABOR, LLC"
3,ORGANIZATION,PPP_LOANS,7022,"e $150,000-350,000","FIRST CUP PARTNERS LAS VEGAS II, LLC","106 S Kyrene Rd, Ste 2",CHANDLER,AZ,85226,722511.0,Limited Liability Company(LLC),Unanswered,Male Owned,Non-Veteran,,105,04/16/2020,"JPMorgan Chase Bank, National Association",AZ-09,"FIRST CUP PARTNERS LAS VEGAS II, LLC"
4,ORGANIZATION,PPP_LOANS,7023,"e $150,000-350,000","FIRST CUP PARTNERS LAS VEGAS, LLC",2121 W Chandler Blvd Ste 106,Chandler,AZ,85224,,Limited Liability Company(LLC),Unanswered,Male Owned,Non-Veteran,,124,05/01/2020,"JPMorgan Chase Bank, National Association",AZ-09,"FIRST CUP PARTNERS LAS VEGAS, LLC"


In [18]:
# list every column name in the PPP dataframe
df_ppp.columns

Index(['RECORD_TYPE', 'DATA_SOURCE', 'RECORD_ID', 'Loan_Range',
       'BUSINESS_NAME_ORG', 'BUSINESS_ADDR_LINE1', 'BUSINESS_ADDR_CITY',
       'BUSINESS_ADDR_STATE', 'BUSINESS_ADDR_POSTAL_CODE', 'NAICS_Code',
       'Business_Type', 'OwnedByRaceEthnicity', 'OwnedBy', 'OwnedByVeteran',
       'NonProfit', 'JobsReported', 'DateApproved', 'Lender', 'CD', 'name'],
      dtype='object')

In [19]:
# keep only a handful of the PPP columns for a quick look at the data
df = sample_df(
    df_ppp,
    {
        "RECORD_ID",
        "name",
        "BUSINESS_ADDR_LINE1",
        "Business_Type",
        "JobsReported",
    },
)

In [20]:
# preview the first few rows of the reduced PPP dataframe
df.head()

Unnamed: 0,RECORD_ID,BUSINESS_ADDR_LINE1,Business_Type,JobsReported,name
0,7017,5110 N 40TH ST STE 107,Limited Liability Company(LLC),137,"INFINITY HOSPICE CARE OF LAS VEGAS, LLC"
1,7018,1839 s. almaschool rd. ste 230,Limited Liability Company(LLC),15,CLUB TATTOO LAS VEGAS LLC
2,7021,"10265 W Camelback Rd, Ste 104",Limited Liability Company(LLC),0,"LAS VEGAS LABOR, LLC"
3,7022,"106 S Kyrene Rd, Ste 2",Limited Liability Company(LLC),105,"FIRST CUP PARTNERS LAS VEGAS II, LLC"
4,7023,2121 W Chandler Blvd Ste 106,Limited Liability Company(LLC),124,"FIRST CUP PARTNERS LAS VEGAS, LLC"


In [21]:
# per-column summary: number of values (`count`), occurrences of the most
# common value (`freq`), and number of distinct values (`unique`)
df.describe(include = "all").loc[[ "count", "freq", "unique", ]]

Unnamed: 0,RECORD_ID,BUSINESS_ADDR_LINE1,Business_Type,JobsReported,name
count,3488,3488,3488,3488,3488
freq,1,20,1532,241,2
unique,3488,3293,14,288,3481


## Connect the GDS library to Neo4j Desktop

Set up a GDS connection using our credentials for Neo4j Desktop.

In [22]:
# load environment variables through `python-dotenv` before reading them
dotenv.load_dotenv(dotenv.find_dotenv())

# each lookup yields `None` when its environment variable has not been set
bolt_uri: typing.Optional[ str ] = os.getenv("NEO4J_BOLT")
database: typing.Optional[ str ] = os.getenv("NEO4J_DBMS")
username: typing.Optional[ str ] = os.getenv("NEO4J_USER")
password: typing.Optional[ str ] = os.getenv("NEO4J_PASS")

gds: GraphDataScience = GraphDataScience(
    bolt_uri,
    auth = ( username, password ),
    database = database,
    aura_ds = False,
)

Before we begin adding structure to our graph, we'll create a [_constraint_](https://neo4j.com/docs/cypher-manual/current/constraints/) to ensure the uniqueness of records.

In [23]:
# remove any earlier definition of the constraint, so that re-running
# this cell starts from a clean state
gds.run_cypher("""
DROP CONSTRAINT `record_node_key` IF EXISTS
""")

# declare the `uid` property as the node key for `Record` nodes
# NOTE(review): availability of node key constraints may depend on the
# Neo4j edition in use -- confirm for your installation
gds.run_cypher("""
CREATE CONSTRAINT `record_node_key` IF NOT EXISTS
  FOR (rec:Record)
  REQUIRE rec.uid IS NODE KEY
""")

If you get a `Failed to write data to connection...` error, you can ignore it.
These errors are spurious.

## Load records into the KG

Define utility functions used for loading the graph data.

In [24]:
def get_property_keys (
    df: pd.DataFrame,
    ) -> typing.List[ str ]:
    """
Derive Cypher property names from the column names of the given Pandas
dataframe: each name is lower-cased, with spaces replaced by underscores.
    """
    keys: typing.List[ str ] = []

    for column in df.columns.values.tolist():
        keys.append("_".join(column.lower().split(" ")))

    return keys


def safe_value (
    obj: typing.Any,
    ) -> typing.Any:
    """
Sanitize one value prior to loading it into the graph.

Strings have each double quote replaced by a single quote.
Missing scalar values (e.g., `None`, `NaN`) become `None`.
Any other value, including containers such as lists, is returned unchanged.
    """
    if isinstance(obj, str):
        return obj.replace('"', "'")

    # `pd.isna()` returns an element-wise array for list-like input, which
    # cannot serve as a condition, so only scalar values get tested
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None

    return obj


def convert_row (
    row: pd.Series,
    keys: typing.List[ str ],
    ) -> dict:
    """
Build the node data for one record: pair each property key with the
sanitized value from the same position in the given row, then add a
`uid` which joins the upper-cased data source and the record ID.
    """
    cleaned: list = list(map(safe_value, row.tolist()))
    params: dict = dict(zip(keys, cleaned))

    source: str = params["data_source"].upper()
    record_id: str = str(params["record_id"])
    params["uid"] = ".".join([ source, record_id ])

    return params


def convert_df (
    df: pd.DataFrame,
    ) -> pd.DataFrame:
    """
Convert each row of a Pandas DataFrame of input data into node data,
returning the results collected as a new DataFrame.
    """
    keys: typing.List[ str ] = get_property_keys(df)
    nodes: typing.List[ dict ] = []

    for _, row in df.iterrows():
        nodes.append(convert_row(row, keys))

    return pd.DataFrame(nodes)

The Cypher clause [`UNWIND`](https://neo4j.com/docs/cypher-cheat-sheet/5/neo4j-community/#_unwind) turns a converted `DataFrame` (e.g., the `$rows` parameter) into individual rows.
Combined with a sub-query this functions like a batch load, and is much faster than loading rows by iterating over them.

In [25]:
# Cypher statement used for batch loading. Each element of the `$rows`
# parameter is merged into a `Record` node matched on its `uid`; the row's
# properties are copied onto the node only when the node is newly created.
# The sub-query is run in transactions of 10000 rows.
unwind_query: str = """
UNWIND $rows AS row
CALL {
  WITH row
  MERGE (rec:Record {uid: row.uid})
  ON CREATE 
    SET rec += row
} IN TRANSACTIONS OF 10000 ROWS
    """

def load_records (
    gds: GraphDataScience,
    df: pd.DataFrame,
    ) -> None:
    """
Convert one Pandas DataFrame of input records into node data, then
pass it to Neo4j as the `rows` parameter of the batch-load query.
    """
    rows: typing.List[ dict ] = convert_df(df).to_dict(orient = "records")

    gds.run_cypher(
        unwind_query,
        { "rows": rows },
    )

In [26]:
# load the POI records into the graph
load_records(gds, df_poi)

In [27]:
# load the WHISARD records into the graph
load_records(gds, df_dol)

In [28]:
# load the PPP records into the graph
load_records(gds, df_ppp)

In [29]:
# sanity check: count the `Record` nodes which have a `uid` property
# NOTE(review): after a clean load this is presumably the combined number
# of rows across the three input datasets -- confirm against the counts above
gds.run_cypher(
    """
MATCH (rec:Record)
RETURN COUNT(rec.uid)
    """
)

Unnamed: 0,COUNT(rec.uid)
0,84988
