### analyzeLinkageAlgorithms
This notebook performs a comparative analysis of three record linkage algorithms: a Python implementation of LAC's current algorithm (without post-processing heuristics), the DIBBs Basic algorithm, and the DIBBs Log-Odds enhanced algorithm. The notebook is divided into four parts.

Part 1 accesses the MPI database associated with the running environment and reads all records there into memory using Spark. From this parallel read, two smaller subsets are generated: the ground-truth "sub MPI" and the testing evaluation set.

Part 2 performs labeling of the ground-truth subset. In order to accurately compare linkage algorithms, we need to have a set of match and non-match labels for our dataset, even if the labels aren't perfect. This part of the notebook uses a variety of heuristics, as well as the Expectation Maximization algorithm, to assign labels to candidate pairs based on iterative training of the sub-sampled MPI.

Part 3 performs parallelized record linkage on the evaluation set. For each of the three algorithms being compared, the notebook generates fresh labels from the algorithms and compares them to the labels generated in the ground-truth assignment of Part 2. These comparisons allow us to make comparative assessments about the performance of each algorithm.

Part 4 generates the final result statistics of the performance of each algorithm on the testing set.

In [None]:
pip install psycopg2-binary azure-identity phdi recordlinkage azure-keyvault-secrets

In [None]:
# IMPORTS AND CONSTANTS

from azure.identity import DefaultAzureCredential

# Ground-truth labeling imports
import time
import copy
import pandas as pd
import recordlinkage as rl
from recordlinkage.base import BaseCompareFeature
import numpy as np
from phdi.harmonization import compare_strings

# Set your Key Vault information
# NOTE(review): the "$..." values below look like template placeholders that are
# presumably substituted before the notebook runs -- confirm against the deployment tooling.
vault_name = "$KEY_VAULT"
KEY_VAULT_URL = f"https://{vault_name}.vault.azure.net"
vault_linked_service = "$KEY_VAULT_LINKED_SERVICE"

# Set up db_client
# Connection parameters for the MPI PostgreSQL database; they are combined into a
# JDBC URL in the connection cell below.
DB_NAME = "DibbsMpiDB"
DB_USER = "postgres"
DB_HOST = "$MPI_DB_HOST"
DB_PORT = "5432"

# MPI table names
DB_TABLE_PATIENT = "patient"
DB_TABLE_PERSON = "person"
DB_TABLE_NAME = "name"
DB_TABLE_GIVEN_NAME = "given_name"
DB_TABLE_ADDRESS = "address"
DB_TABLE_IDENTIFIER = "identifier"

# Adjust data volume for scaling
# Make sure evaluation size is less than labeling size!
# NOTE(review): the two sizes are currently equal, which does not satisfy the
# strict "less than" stated above -- confirm whether "no greater than" is intended.
LABELING_SIZE = 150000
EVALUATION_SIZE = 150000

# Ground-truth labeling parameters
# WINDOW_INDEX_SIZE is the window passed to the sorted-neighbourhood indexers.
# JARO_THRESHOLD and BIRTHDAY_THRESHOLD are the similarity cut-offs used by the
# SEER and NHS labeling rules.
WINDOW_INDEX_SIZE = 5
JARO_THRESHOLD = 0.9
BIRTHDAY_THRESHOLD = 0.95

In [None]:
# OPEN PARALLEL CONNECTION TO MPI, READ DB INTO COMPRESSED MEMORY

from pyspark.sql import SparkSession, Row
import json

# Access the MPI Database
# NOTE(review): `credential` is not used anywhere in the visible code, and
# `TokenLibrary` is not imported in the visible cells -- presumably it is provided
# by the notebook runtime; confirm.
credential = DefaultAzureCredential()
db_password =  TokenLibrary.getSecret(vault_name,"mpi-db-password",vault_linked_service)

# JDBC connection string and properties shared by every table read below
url = f"jdbc:postgresql://{DB_HOST}:{DB_PORT}/{DB_NAME}"
db_props = {
    "user": DB_USER,
    "password": db_password,
    "driver": "org.postgresql.Driver"
}

# Obtain (or reuse) a Spark session
spark = (
    SparkSession.builder.master("local[*]")
    .appName("Build sub-sampled MPI")
    .getOrCreate()
)

# Create views into all tables in the MPI so we can extract them in parallel
# Each table is read over JDBC and registered as a temp view named "<table>_view";
# the extraction cell queries the views by building that same name from the
# DB_TABLE_* constants, so the two naming schemes must stay in sync.
patient_view = "patient_view"
mpi_patient_data = spark.read.jdbc(url, DB_TABLE_PATIENT, properties=db_props)
mpi_patient_data.createOrReplaceTempView(patient_view)

name_view = "name_view"
mpi_name_data = spark.read.jdbc(url, DB_TABLE_NAME, properties=db_props)
mpi_name_data.createOrReplaceTempView(name_view)

given_name_view = "given_name_view"
mpi_given_name_data = spark.read.jdbc(url, DB_TABLE_GIVEN_NAME, properties=db_props)
mpi_given_name_data.createOrReplaceTempView(given_name_view)

address_view = "address_view"
mpi_address_data = spark.read.jdbc(url, DB_TABLE_ADDRESS, properties=db_props)
mpi_address_data.createOrReplaceTempView(address_view)

identifier_view = "identifier_view"
mpi_identifier_data = spark.read.jdbc(url, DB_TABLE_IDENTIFIER, properties=db_props)
mpi_identifier_data.createOrReplaceTempView(identifier_view)

In [None]:
# MOUNT THE FILE SYSTEM SO WE CAN WRITE THE OUTPUTS TO FILES

# Set paths
# LINKAGE_OUTPUTS_FILESYSTEM is the abfss:// prefix under which linkage results
# are written and read as "<fname>.json".
# NOTE(review): the "$..." values look like template placeholders presumably
# substituted before the notebook runs -- confirm.
STORAGE_ACCOUNT = "$STORAGE_ACCOUNT"
LINKAGE_OUTPUTS_FILESYSTEM = f"abfss://linkage-notebook-outputs@{STORAGE_ACCOUNT}.dfs.core.windows.net/"
BLOB_STORAGE_LINKED_SERVICE = "$BLOB_STORAGE_LINKED_SERVICE"

from notebookutils import mssparkutils

# Functions for writing and reading results
def write_linkage_results(fname, results):
    """
    Persist the output of a linkage algorithm as a JSON file.

    :param fname: File name (without the ".json" suffix) to write beneath
      LINKAGE_OUTPUTS_FILESYSTEM.
    :param results: Mapping of record keys to collections of linked record
      keys (a dict of ints to sets, per the callers in this notebook).
    """
    # JSON cannot represent sets or int keys, so every key and member is
    # stringified and each collection is stored as a list.
    serializable = {}
    for record_key, linked_records in results.items():
        serializable[str(record_key)] = [str(member) for member in linked_records]

    destination = LINKAGE_OUTPUTS_FILESYSTEM + fname + ".json"
    # Third positional argument is presumably the overwrite flag -- confirm
    # against the mssparkutils.fs.put documentation.
    mssparkutils.fs.put(destination, json.dumps(serializable), True)


def load_linkage_results(fname):
    """
    Load previously written linkage results from a JSON file using spark.read
    and convert them back into the results dict format (ints to sets of ints).

    :param fname: File name (without the ".json" suffix) beneath
      LINKAGE_OUTPUTS_FILESYSTEM.
    :return: A dict mapping int keys to sets of ints, or None if the file
      could not be read.
    """
    path = LINKAGE_OUTPUTS_FILESYSTEM + fname + ".json"
    try:
        res = spark.read.json(path)
    except Exception:
        # Best-effort lookup: any read failure is treated as "no saved results".
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the cell can still be interrupted.
        print("Existing results not found.")
        return None

    print("Existing results found!")
    # After toPandas().to_dict() each column maps to {row_index: value}; the
    # file holds one JSON object, so row 0 carries the list of linked ids.
    columns = res.toPandas().to_dict()
    return {int(k): {int(x) for x in v[0]} for (k, v) in columns.items()}
    

# Set up for writing to blob storage
linkage_bucket_name = "linkage-notebook-outputs"
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(BLOB_STORAGE_LINKED_SERVICE)
wasb_path = 'wasbs://%s@%s.blob.core.windows.net/' % (linkage_bucket_name, STORAGE_ACCOUNT)
# Register the SAS token for this container so Spark can authenticate against it
spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (linkage_bucket_name, STORAGE_ACCOUNT), blob_sas_token)
# Try mounting the remote storage directory at the mount point
# NOTE(review): the f-string below evaluates to "$" followed by the value of
# BLOB_STORAGE_LINKED_SERVICE unless a templating step rewrites it first -- confirm
# that this is the intended linked-service name.
try:
    mssparkutils.fs.mount(
        wasb_path,
        "/",
        {"LinkedService": f"${BLOB_STORAGE_LINKED_SERVICE}"}
    )
except Exception as mount_error:
    # A failure here is usually a repeat mount, but surface the actual error so a
    # genuine credential or path problem is not mistaken for "already mounted".
    print(f"Already mounted (or mount failed): {mount_error}")

In [None]:
# MPI EXTRACTION
# Pull all the various sources of patient information out of the MPI and
# massage them into a single table. This allows us to format the data for
# training and testing easily off the same spark DF.

import pyspark.sql.functions as F
from pyspark.sql.functions import struct

def construct_full_given_name(row):
    """
    Build the complete given-name string for one name row pulled from the MPI.

    The structs in ``row["given_name_list"]`` are ordered by their
    ``given_name_index`` and their ``given_name`` values are joined with single
    spaces. A missing (None) list yields an empty string.

    :param row: Row-like object indexable by "given_name_list", "name_id",
      "patient_id" and "last_name".
    :return: Tuple of (name_id, patient_id, last_name, joined given names).
    """
    name_parts = row["given_name_list"]
    if name_parts is None:
        joined = ""
    else:
        ordered = sorted(name_parts, key=lambda part: part.given_name_index)
        joined = " ".join([part.given_name for part in ordered])
    return row["name_id"], row["patient_id"], row["last_name"], joined


# Start with table with 1 row per patient so that when we left join, we preserve that
# The view names below are built as "<table>_view", matching the temp views
# registered in the connection cell.
extracted_patient_data = spark.sql(f"SELECT * from {DB_TABLE_PATIENT}_view")

# Given names don't have a patient_id field, so compile the given names
# associated with each name_id entry in preparation to join to last names
extracted_given_names = spark.sql(f"SELECT * from {DB_TABLE_GIVEN_NAME}_view")
# Pair each given name with its index so the original ordering can be restored
# after collect_list
extracted_given_names = extracted_given_names.withColumn(
    "name_structs",
    struct(extracted_given_names.given_name, extracted_given_names.given_name_index)
)
extracted_given_names = extracted_given_names.groupBy("name_id").agg(F.collect_list("name_structs").alias("given_name_list"))
extracted_given_names.cache()

# Last names are 1:1 with a name_id representing all associated given names
# Last names also map back to patient_ids in the patient_table, so use left
# joins to preserve all present info and the 1 row per patient structure
extracted_name_data = spark.sql(f"SELECT * from {DB_TABLE_NAME}_view")
full_name_table = extracted_name_data.join(extracted_given_names, "name_id", "left")
# Collapse each name row's given-name structs into one space-joined string
full_name_table = full_name_table.rdd.map(construct_full_given_name).toDF(["name_id", "patient_id", "last_name", "given_name"])
full_name_table = full_name_table.withColumn("full_name_structs", struct(full_name_table.given_name, full_name_table.last_name))
# One row per patient holding a list of (given_name, last_name) structs
full_name_table = full_name_table.groupBy("patient_id").agg(F.collect_list("full_name_structs").alias("full_name_list"))
# Left join: patients without any name rows get a null full_name_list
extracted_patient_data = extracted_patient_data.join(full_name_table, "patient_id", "left")
extracted_patient_data.cache()

# Identifier table needs compiled and can left join back to patients
extracted_identifier_data = spark.sql(f"SELECT * from {DB_TABLE_IDENTIFIER}_view")
extracted_identifier_data = extracted_identifier_data.withColumn("id_structs", struct(
    extracted_identifier_data.patient_identifier, extracted_identifier_data.type_code
))
extracted_identifier_data = extracted_identifier_data.groupBy("patient_id").agg(F.collect_list("id_structs").alias("ids_list"))
# Left join: patients without identifiers get a null ids_list
extracted_patient_data = extracted_patient_data.join(extracted_identifier_data, "patient_id", "left")

# Address fields can be massively collapsed into the traditional string representation
# Then we join this back on patient table
extracted_address_data = spark.sql(f"SELECT * from {DB_TABLE_ADDRESS}_view")
extracted_address_data = extracted_address_data.withColumn("address_structs", struct(
    extracted_address_data.line_1,
    extracted_address_data.line_2,
    extracted_address_data.city,
    extracted_address_data.state,
    extracted_address_data.zip_code
))
extracted_address_data = extracted_address_data.groupBy("patient_id").agg(F.collect_list("address_structs").alias("address_list"))
# Left join: patients without addresses get a null address_list
extracted_patient_data = extracted_patient_data.join(extracted_address_data, "patient_id", "left")


In [None]:
# TRAIN/TEST CREATION
# Use the extracted information from the MPI to create two sets of data, one
# in flattened array form in a pandas DF for labeling (training) and one as a 
# list of FHIR bundles for evaluation (testing).

from phdi.linkage.link import _flatten_patient_resource
from pyspark.sql.types import StructType, StructField, StringType


def create_patient_resource_from_spark_row(row):
    """
    Convert one extracted MPI patient row into a FHIR-style Patient resource.

    :param row: Row-like object indexable by "patient_id", "person_id", "dob",
      "sex", "full_name_list", "ids_list" and "address_list". The three list
      fields hold structs exposing, respectively, (given_name, last_name),
      (patient_identifier, type_code) and (line_1, line_2, city, state,
      zip_code) attributes.
    :return: Tuple of (person_id, patient resource dict).
    """

    def _blank_if_none(value):
        # Missing scalar fields are represented as empty strings in the resource
        return "" if value is None else value

    # Pull out the various fields from the passed-in row
    extracted_pid = row["patient_id"]
    extracted_birthdate = _blank_if_none(row["dob"])
    extracted_gender = _blank_if_none(row["sex"])
    # The list columns are produced by left joins in the extraction cell, so a
    # patient with no names, identifiers or addresses carries None here;
    # treat that as "no entries" instead of failing on iteration.
    extracted_names = row["full_name_list"] or []
    extracted_identifiers = row["ids_list"] or []
    extracted_addresses = row["address_list"] or []

    # Initialize a patient resource to append fields into
    patient_resource = {
        "resourceType": "Patient",
        "id": f"{extracted_pid}",
        "identifier": [],
        "name": [],
        "gender": f"{extracted_gender}",
        "birthDate": f"{extracted_birthdate}",
        "address": [],
    }

    for en in extracted_names:
        # Given names were joined with spaces upstream; split them back apart
        givens = str(_blank_if_none(en.given_name)).split()
        last = _blank_if_none(en.last_name)
        patient_resource["name"].append({
            "family": f"{last}",
            "given": givens
        })

    for ident in extracted_identifiers:
        patient_resource["identifier"].append({
            "type": {
                "coding": [
                    {
                        "system": "http://terminology.hl7.org/CodeSystem/v2-0203",
                        "code": ident.type_code
                    }
                ]
            },
            "value": ident.patient_identifier
        })

    for addr in extracted_addresses:
        # Keep only the address lines that are actually populated
        lines = [
            line
            for line in (_blank_if_none(addr.line_1), _blank_if_none(addr.line_2))
            if line != ""
        ]
        city = _blank_if_none(addr.city)
        state = _blank_if_none(addr.state)
        zipcode = _blank_if_none(addr.zip_code)
        patient_resource["address"].append({
            # An address with no populated lines still carries one empty line
            "line": lines if lines else [""],
            "city": f"{city}",
            "state": f"{state}",
            "postalCode": f"{zipcode}"
        })

    return (row["person_id"], patient_resource)


def yield_patient_resource(row):
    """Return the patient resource, i.e. the second element of a (person_id, resource) pair."""
    resource = row[1]
    return resource


def yield_flattened_patient_with_person_id(row):
    """
    Flatten the patient resource of a (person_id, resource) pair and store the
    person_id at position 1 of the flattened record.

    :param row: Pair of (person_id, patient resource).
    :return: The flattened record returned by _flatten_patient_resource, with
      index 1 overwritten by the person_id.
    """
    person_id, resource = row[0], row[1]
    flattened = _flatten_patient_resource(resource)
    flattened[1] = person_id
    return flattened


# Build the base FHIR and flattened row groups of data
# The (person_id, resource) RDD is cached because two downstream maps derive from it
fhir_mapped_data = extracted_patient_data.rdd.map(create_patient_resource_from_spark_row)
fhir_mapped_data.cache()
flattened_patient_data = fhir_mapped_data.map(yield_flattened_patient_with_person_id)
fhir_mapped_data = fhir_mapped_data.map(yield_patient_resource)

# Construct the labeling set
# Every column is declared as a nullable string in the Spark schema
formatted_cols = ["patient_id", "person_id", "address", "birthdate", "city", "first_name", "last_name", "mrn", "sex", "state", "zip"]
pyspark_schema = StructType([
    StructField(x, StringType(), True) for x in formatted_cols
])
flattened_patient_data = flattened_patient_data.toDF(pyspark_schema)
# All rows are collected to the driver first, then truncated to LABELING_SIZE
labeling_set = [list(x) for x in flattened_patient_data.collect()]
if LABELING_SIZE is not None and LABELING_SIZE < len(labeling_set):
    labeling_set = labeling_set[:LABELING_SIZE]
labeling_set = pd.DataFrame(labeling_set, columns=formatted_cols)
del flattened_patient_data

# Construct the evaluation set
# Same pattern: collect everything, truncate to EVALUATION_SIZE, then
# redistribute the kept resources across 512 partitions
evaluation_set = [x for x in fhir_mapped_data.collect()]
if EVALUATION_SIZE is not None and EVALUATION_SIZE < len(evaluation_set):
    evaluation_set = evaluation_set[:EVALUATION_SIZE]
evaluation_set = spark.sparkContext.parallelize(evaluation_set, numSlices=512)
del fhir_mapped_data

In [None]:
# CANDIDATE INDEXING
# Generates tuples of all possible candidate pairs that the labeler will compute
# match likelihoods for.

from recordlinkage.index import SortedNeighbourhood, BaseIndexAlgorithm
from recordlinkage.utils import listify

class FirstNameSortedNeighborhood(BaseIndexAlgorithm):
    """
    A custom indexing algorithm built to operate on the first_name column
    returned from the MPI. Because a patient can have multiple given names, the
    entries of that column are space-joined on copies of the input frames, and
    a sorted-neighbourhood window over the resulting keys is then used to find
    fuzzy blocking candidates, in the same way a regular string column would
    be windowed.

    :param left_on: Column(s) of the left frame used as the sorting key.
    :param right_on: Column(s) of the right frame used as the sorting key;
      defaults to ``left_on`` when None.
    :param window: Neighbourhood window size.
    :param sorting_key_values: Optional precomputed, ordered sorting key values.
    :param block_on: Optional column(s) both frames must agree on exactly.
    :param block_left_on: Optional exact-blocking column(s) for the left frame.
    :param block_right_on: Optional exact-blocking column(s) for the right frame.
    """

    def __init__(
        self,
        left_on=None,
        right_on=None,
        window=3,
        sorting_key_values=None,
        block_on=None,
        block_left_on=None,
        block_right_on=None,
        **kwargs
    ):
        super(FirstNameSortedNeighborhood, self).__init__(**kwargs)

        # variables to block on
        self.left_on = left_on
        self.right_on = right_on
        self.window = window
        self.sorting_key_values = sorting_key_values
        # None means "no extra blocking keys"; a fresh list per instance avoids
        # sharing one mutable default object between indexers.
        self.block_on = [] if block_on is None else block_on
        self.block_left_on = [] if block_left_on is None else block_left_on
        self.block_right_on = [] if block_right_on is None else block_right_on

    def _get_left_and_right_on(self):
        """
        We only care about the de-dupe case which involves no self.right, but this
        still needs to be implemented for super compatibility.
        """
        if self.right_on is None:
            return (self.left_on, self.left_on)
        else:
            return (self.left_on, self.right_on)

    def _get_sorting_key_values(self, array1, array2):
        """
        Return the sorted unique sorting key values found across both arrays.
        This function is required by the package for multi-index neighborhood
        filtering according to some papers it's built on.
        """

        concat_arrays = np.concatenate([array1, array2])
        return np.unique(concat_arrays)

    def _link_index(self, df_a, df_b):
        """
        Build the MultiIndex of candidate pairs between ``df_a`` and ``df_b``.

        :param df_a: Left frame; must contain a "first_name" column.
        :param df_b: Right frame; must contain a "first_name" column.
        :return: A pandas MultiIndex whose levels are the two frames' indices
          and whose codes are the positional row numbers of each candidate pair.
        """
        # Work on copies so the callers' frames are left untouched
        df_a = df_a.copy()
        df_b = df_b.copy()
        # Each frame's key is derived from its OWN first_name column. (Deriving
        # the right-hand key from the already-joined left column applied the
        # join twice and produced keys that could not line up with the left.)
        # NOTE(review): Series.str.join joins the elements of list entries; if
        # the entries are plain strings it spaces out their characters instead.
        # Confirm which representation first_name has at this point.
        df_a["first_name"] = df_a["first_name"].str.join(" ")
        df_b["first_name"] = df_b["first_name"].str.join(" ")
        left_on, right_on = self._get_left_and_right_on()
        left_on = listify(left_on)
        right_on = listify(right_on)

        window = self.window

        # Correctly generate blocking keys
        block_left_on = listify(self.block_left_on)
        block_right_on = listify(self.block_right_on)

        if self.block_on:
            block_left_on = listify(self.block_on)
            block_right_on = listify(self.block_on)

        blocking_keys = ["sorting_key"] + [
            "blocking_key_%d" % i for i, v in enumerate(block_left_on)
        ]

        # Format the data to thread with index pairs
        data_left = pd.DataFrame(df_a[listify(left_on) + block_left_on], copy=False)
        data_left.columns = blocking_keys
        data_left["index_x"] = np.arange(len(df_a))
        data_left.dropna(axis=0, how="any", subset=blocking_keys, inplace=True)

        data_right = pd.DataFrame(df_b[listify(right_on) + block_right_on], copy=False)
        data_right.columns = blocking_keys
        data_right["index_y"] = np.arange(len(df_b))
        data_right.dropna(axis=0, how="any", subset=blocking_keys, inplace=True)

        # sorting_key_values is the terminology in Data Matching [Christen,
        # 2012]
        if self.sorting_key_values is None:
            self.sorting_key_values = self._get_sorting_key_values(
                data_left["sorting_key"].values, data_right["sorting_key"].values
            )

        # Map every distinct key to its rank in the sorted key list
        sorting_key_factors = pd.Series(
            np.arange(len(self.sorting_key_values)), index=self.sorting_key_values
        )

        data_left["sorting_key"] = data_left["sorting_key"].map(sorting_key_factors)
        data_right["sorting_key"] = data_right["sorting_key"].map(sorting_key_factors)

        # Internal window size
        _window = int((window - 1) / 2)

        def merge_lagged(x, y, w):
            """Merge two dataframes with a lag on in the sorting key."""

            y = y.copy()
            y["sorting_key"] = y["sorting_key"] + w

            return x.merge(y, how="inner")

        # One inner merge per rank offset inside the window
        pairs_concat = [
            merge_lagged(data_left, data_right, w) for w in range(-_window, _window + 1)
        ]

        pairs_df = pd.concat(pairs_concat, axis=0)

        return pd.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            codes=[pairs_df["index_x"].values, pairs_df["index_y"].values],
            verify_integrity=False,
        )


def find_candidate_links(data):
    """
    Identify candidate record pairs in ``data`` for the labelers to score.

    A full pairwise index is too expensive, so windowed sorted-neighbourhood
    indexers are registered on last_name, birthdate, mrn and first_name.
    Adding multiple different neighborhoods takes their union so we don't
    over-block.

    :param data: DataFrame of flattened patient records.
    :return: The candidate pair index produced by the recordlinkage indexer.
    """
    started_at = time.time()

    indexer = rl.Index()
    for column in ('last_name', 'birthdate', 'mrn'):
        indexer.add(SortedNeighbourhood(column, window=WINDOW_INDEX_SIZE))
    indexer.add(FirstNameSortedNeighborhood('first_name', window=WINDOW_INDEX_SIZE))

    candidate_links = indexer.index(data)
    print(len(candidate_links), "candidate pairs identified")

    # Note: using a multi-indexer treats the row number as the index, so
    # results will automatically be in acceptable eval format
    elapsed = round(time.time() - started_at, 2)
    print("Identifying possible candidate pairs took ", str(elapsed), "seconds")
    return candidate_links


def get_pred_match_dict_from_multi_idx(mltidx, n_rows):
    """
    Transform a recordlinkage toolkit multi-index into a dict of match sets.

    :param mltidx: MultiIndex whose entries are pairs of row numbers.
    :param n_rows: Number of rows in the data; one (possibly empty) set is
      created for every row number in range(n_rows).
    :return: Dict mapping each row number to the set of higher-numbered rows
      it is paired with; each pair is stored under its smaller member.
    """
    pred_matches = {}
    for row_number in range(n_rows):
        pred_matches[row_number] = set()

    for pair in mltidx.to_list():
        lower, upper = min(pair), max(pair)
        pred_matches[lower].add(upper)
    return pred_matches

# Convert mrn col into actual Nones rather than string Nones
# We'll need to reverse this in the later spark-block stages to parallel mask
# Note: the replacement is applied to the whole frame, so a literal "None"
# string in any column becomes None, not only in mrn.
labeling_set = labeling_set.replace({"None": None})

# Index the labeling set once; the resulting candidate pairs are reused by the
# labelers and the feature comparator below.
candidate_links = find_candidate_links(labeling_set)

In [None]:
# GROUND TRUTH LABELING: VIRGINIA FUNCTIONS

class CompareNestedString(BaseCompareFeature):
    """
    Exact-agreement comparator for columns holding bracketed, comma-separated
    string representations of lists (e.g. given names).

    Only the FIRST comma-separated element of each value is compared; the
    result is 1.0 where those leading elements are equal and 0.0 otherwise.
    """

    def _compute_vectorized(self, s1, s2):
        def leading_element(series):
            # Drop the surrounding brackets, then keep the text before the first comma
            unwrapped = series.str.lstrip('[').str.rstrip(']')
            return unwrapped.str.split(',').str[0]

        agreement = leading_element(s1) == leading_element(s2)
        return agreement.astype(float)


def get_va_labels(data, candidate_links):
    """
    Label candidate pairs as matches when first name, last name, birthdate and
    address all agree exactly.

    :param data: DataFrame of flattened patient records.
    :param candidate_links: Candidate pair index to evaluate.
    :return: Dict mapping each row number to the set of rows it matches.
    """
    started_at = time.time()

    # Apply feature comparisons on each supported field from the MPI
    comparison = rl.Compare()
    comparison.add(CompareNestedString("first_name", "first_name", label="first_name"))
    for exact_column in ("last_name", "birthdate"):
        comparison.exact(exact_column, exact_column, label=exact_column)
    comparison.add(CompareNestedString("address", "address", label="address"))
    features = comparison.compute(candidate_links, data)

    # A pair is a match only if all four comparisons agree (each contributes 1.0)
    agreement_totals = features.sum(axis=1)
    matches = features[agreement_totals == 4]

    elapsed = round(time.time() - started_at, 2)
    print("Comparing candidates took", str(elapsed), "seconds")

    return get_pred_match_dict_from_multi_idx(matches.index, len(data))


# Run the exact-agreement labeler over the candidate pairs of the labeling set
va_labels = get_va_labels(labeling_set, candidate_links)

In [None]:
# FEATURE COMPARATOR
# Generate string similarity scores for all features in all candidate pairs.

class CompareFirstName(BaseCompareFeature):
    """
    Fuzzy comparator for the bracketed, comma-separated first_name column.

    The brackets are stripped, the value is split on commas and the pieces are
    re-joined with spaces, so all given names are scored together by
    compare_strings.
    """

    def _compute_vectorized(self, s1, s2):
        def as_joined_names(series):
            pieces = series.str.lstrip('[').str.rstrip(']').str.split(',')
            return pieces.str.join(" ")

        # Element-wise application of the string comparator across both series
        score_pairwise = np.vectorize(compare_strings)
        return score_pairwise(as_joined_names(s1), as_joined_names(s2))


class CompareAddress(BaseCompareFeature):
    """
    Fuzzy comparator for the bracketed, comma-separated address column.

    Every address line of one record is scored against every line of the
    other, and the best score is kept, which accounts for patients who moved.
    """

    def _compute_vectorized(self, s1, s2):

        def to_line_lists(series):
            # Strip the surrounding brackets and split into individual lines
            return series.str.lstrip('[').str.rstrip(']').str.split(',')

        def best_line_similarity(lines_a, lines_b):
            # Start from 0.0 so records with no lines score zero
            top = 0.0
            for candidate in (compare_strings(x, y) for x in lines_a for y in lines_b):
                if candidate >= top:
                    top = candidate
            return top

        return np.vectorize(best_line_similarity)(to_line_lists(s1), to_line_lists(s2))

def compute_comparator_matrix(data, candidate_links):
    """
    Produce a dataframe with a multi-index, in which each tuple of row indices
    denotes one potential candidate match. The value in each column of the DF
    is the fuzzy match similarity score between the two records given by the
    multi-index.

    :param data: DataFrame of flattened patient records.
    :param candidate_links: Candidate pair index to score.
    :return: DataFrame of per-feature similarity scores, with columns in the
      order first_name, last_name, mrn, birthdate, address, city, zip, sex.
    """
    started_at = time.time()

    def add_jarowinkler(comparer, columns):
        # Register a Jaro-Winkler string comparison for each plain string column
        for column in columns:
            comparer.string(column, column, method="jarowinkler", label=column)

    # Apply feature comparisons on each supported field from the MPI;
    # registration order determines the column order of the result.
    comp = rl.Compare()
    comp.add(CompareFirstName("first_name", "first_name", label="first_name"))
    add_jarowinkler(comp, ("last_name", "mrn", "birthdate"))
    comp.add(CompareAddress("address", "address", label="address"))
    add_jarowinkler(comp, ("city", "zip", "sex"))
    features = comp.compute(candidate_links, data)

    elapsed = round(time.time() - started_at, 2)
    print("Computation took", str(elapsed), "seconds")
    return features

# Score every candidate pair once; the SEER and NHS labelers below both read
# from this similarity matrix
features = compute_comparator_matrix(labeling_set, candidate_links)

In [None]:
# GROUND-TRUTH LABELING: INTELLIGENT EXTERNAL SYSTEMS
# Leverages the published heuristic rules of both the NCI SEER labeling algorithm
# and the UK's National Health Service deterministic labeler to generate labels
# that we can treat as true for comparative match purposes.

def combine_match_dicts(m1, m2):
    """
    Combine two match dictionaries, each of which has already been formatted
    in the requisite stats indexing fashion (key -> collection of matches).

    The result holds, for every key present in either input, the union of the
    matches recorded under that key. Neither input is modified.

    :param m1: First match dictionary.
    :param m2: Second match dictionary.
    :return: New dict mapping each key to a new set with the combined matches.
    """
    combined = {}
    for key in m1:
        # A key missing from m2 simply contributes nothing to the union
        combined[key] = set(m1[key]).union(m2.get(key, ()))
    # Keep keys that only the second dictionary knows about
    for key in m2:
        if key not in combined:
            combined[key] = set(m2[key])
    return combined


def get_seer_labels(data, features):
    """
    Generate the NCI's SEER Labels using two types of matches, based on whether
    or not the candidate pair has a perfectly agreeing MRN.

    :param data: DataFrame of records; only its length is used.
    :param features: Similarity matrix with 'mrn', 'first_name', 'last_name',
      'birthdate' and 'sex' columns, indexed by candidate pair.
    :return: Dict mapping each row number to the set of rows it matches.
    """
    n_rows = len(data)

    # Type 1: MRN agrees perfectly plus any two-field corroboration
    exact_mrn = features.loc[features['mrn'] == 1.0]
    first_ok = exact_mrn['first_name'] >= JARO_THRESHOLD
    last_ok = exact_mrn['last_name'] >= JARO_THRESHOLD
    # NOTE(review): this rule tests birthdate against JARO_THRESHOLD while the
    # other birthdate checks use BIRTHDAY_THRESHOLD -- confirm this is intended.
    dob_loose = exact_mrn['birthdate'] >= JARO_THRESHOLD
    dob_strict = exact_mrn['birthdate'] >= BIRTHDAY_THRESHOLD
    matches_type_1 = exact_mrn.loc[
        (first_ok & last_ok) | (first_ok & dob_loose) | (dob_strict & last_ok)
    ]

    # Type 2: names and sex agree, backed by either a close MRN or birthdate
    names_and_sex = (
        (features['first_name'] >= JARO_THRESHOLD)
        & (features['last_name'] >= JARO_THRESHOLD)
        & (features['sex'] >= JARO_THRESHOLD)
    )
    mrn_or_dob = (
        (features['mrn'] >= JARO_THRESHOLD)
        | (features['birthdate'] >= BIRTHDAY_THRESHOLD)
    )
    matches_type_2 = features.loc[names_and_sex & mrn_or_dob]

    type_1_dict = get_pred_match_dict_from_multi_idx(matches_type_1.index, n_rows)
    type_2_dict = get_pred_match_dict_from_multi_idx(matches_type_2.index, n_rows)
    return combine_match_dicts(type_1_dict, type_2_dict)


def get_uk_nhs_labels(data, features):
    """
    Generate the UK's National Health Service labels using three match
    conditions, depending on the available field information and whether
    constraints are progressively relaxed.

    :param data: DataFrame of records; only its length is used.
    :param features: Similarity matrix with 'birthdate', 'sex', 'mrn' and
      'zip' columns, indexed by candidate pair.
    :return: Dict mapping each row number to the set of rows it matches.
    """
    n_rows = len(data)

    dob_exact = features['birthdate'] == 1.0
    sex_exact = features['sex'] == 1.0
    zip_exact = features['zip'] == 1.0

    rule_masks = (
        # Rule 1: birthdate, sex and MRN all agree perfectly
        dob_exact & sex_exact & (features['mrn'] == 1.0),
        # Rule 2: near birthdate and MRN, with exact sex and zip
        (features['birthdate'] >= BIRTHDAY_THRESHOLD)
        & sex_exact
        & zip_exact
        & (features['mrn'] >= JARO_THRESHOLD),
        # Rule 3: birthdate, sex and zip agree perfectly (MRN not considered)
        dob_exact & sex_exact & zip_exact,
    )
    rule_dicts = [
        get_pred_match_dict_from_multi_idx(features.loc[mask].index, n_rows)
        for mask in rule_masks
    ]

    pred_matches = combine_match_dicts(rule_dicts[0], rule_dicts[1])
    pred_matches = combine_match_dicts(pred_matches, rule_dicts[2])
    return pred_matches

# Apply both external rule sets to the same similarity matrix
seer_labels = get_seer_labels(labeling_set, features)
uk_labels = get_uk_nhs_labels(labeling_set, features)

In [None]:
# LINKAGE DRIVER FUNCTIONS

from phdi.linkage.link import _flatten_patient_resource, extract_blocking_values_from_record
from typing import List
import re


def spark_block(block_vals: dict, labeling_set: pd.DataFrame):
    """
    Filter a pandas DataFrame construct of an extracted MPI down to the rows
    that satisfy every blocking criterion. While the filtering itself is not
    parallelized, it occurs on the worker nodes, where each executor performs
    linkage for one or more test records simultaneously, so pandas .loc
    retrieval provides an appropriate level of speed.

    :param block_vals: Dict mapping a column name to a dict with a "value" key
      and an optional "transformation" key ("first4" or "last4"). Without a
      transformation the column must equal the value; "first4" keeps rows that
      start with it and "last4" keeps rows that end with it. Any other
      transformation applies no filter for that column.
    :param labeling_set: DataFrame of flattened MPI records to filter.
    :return: The rows of ``labeling_set`` satisfying all criteria. Rows whose
      blocking column is missing never satisfy a criterion.
    """
    # first_name and address are stored as stringified lists, so their values
    # are wrapped in '[' and ']' and the blocking value must account for that
    bracketed_columns = ("first_name", "address")

    # We'll sequentially apply each blocking filter, since that's equivalent to
    # finding their intersection all at once
    result = labeling_set
    for blocking_criterion, props in block_vals.items():
        is_bracketed = blocking_criterion in bracketed_columns

        if "transformation" not in props:
            target = "[" + props["value"] + "]" if is_bracketed else props["value"]
            mask = result[blocking_criterion] == target
        elif props["transformation"] == "first4":
            prefix = "[" + props["value"] if is_bracketed else props["value"]
            # na=False: a missing value is a non-match. Without it the mask
            # contains NaN and .loc refuses to filter with it.
            mask = result[blocking_criterion].str.startswith(prefix, na=False)
        elif props["transformation"] == "last4":
            suffix = props["value"] + "]" if is_bracketed else props["value"]
            mask = result[blocking_criterion].str.endswith(suffix, na=False)
        else:
            # Unrecognized transformation: leave the current block unchanged
            continue

        result = result.loc[mask]
    return result


def spark_compare_df_helper(row, flattened_record, funcs, col_to_idx, matching_rule, **kwargs):
    """
    Compare a single blocked candidate from the MPI with the incoming, now
    flattened, record. Feature-wise comparisons are applied one column at a
    time and a net score is accumulated giving the total strength of the
    linkage match. This function is applied to each candidate in the block.

    :param row: One candidate row from the blocked MPI data.
    :param flattened_record: The incoming record as a flat, indexable sequence.
    :param funcs: Dict mapping column names to their comparison rule; only
      rules containing "fuzzy" contribute to the score.
    :param col_to_idx: Dict mapping column names to positions in
      ``flattened_record``.
    :param matching_rule: Name of the matching rule, forwarded to the score
      accumulator.
    :param kwargs: Optional settings forwarded to the helper functions.
    :return: A two-element pandas Series of (patient_id, match score).
    """
    running_score = 0.0
    for col, func in funcs.items():
        incoming_value = flattened_record[col_to_idx[col]]

        # Non-fuzzy rules are skipped and contribute nothing
        if "fuzzy" not in func:
            continue

        similarity_measure, fuzzy_threshold = _get_fuzzy_comp_params(col, **kwargs)

        if col == "first_name":
            # Given name is a list (possibly including middle name), so our logic
            # says concatenate all the values together and then fuzzy compare
            incoming_value = " ".join(incoming_value)
            stored_names = re.sub(r'\[|\]', "", row[col]).split(", ")
            feature_score = compare_strings(" ".join(stored_names), incoming_value, similarity_measure)

        elif col == "address":
            # Address is also a list, but rather than concatenate, we check if
            # each line of an incoming address matches any line of an MPI
            # address; this accounts for a patient's change of residence history
            stored_lines = re.sub(r'\[|\]', "", row[col]).split(", ")
            feature_score = 0.0
            for incoming_line in incoming_value:
                for stored_line in stored_lines:
                    line_score = compare_strings(incoming_line, stored_line, similarity_measure)
                    if line_score > feature_score:
                        feature_score = line_score

        else:
            # Regular case: straight string comparison on the fields
            feature_score = compare_strings(row[col], incoming_value, similarity_measure)

        running_score = _apply_score_contribution(
            feature_score, col, fuzzy_threshold, running_score, matching_rule, **kwargs
        )

    return pd.Series([row['patient_id'], running_score])


def _get_fuzzy_comp_params(col, **kwargs):
    """
    Extract the similarity metric and threshold used in fuzzy string
    comparisons, kept separate so the main analytic function stays uncluttered.

    :param col: Column whose threshold is being looked up.
    :param kwargs: May contain "similarity_measure" (defaults to
      "JaroWinkler"), "thresholds" (per-column dict) and/or "threshold" (one
      universal value). When "thresholds" is given, "threshold" is ignored,
      and a column absent from "thresholds" falls back to the default of 0.7.
    :return: Tuple of (similarity measure, threshold).
    """
    default_threshold = 0.7
    similarity_measure = kwargs.get("similarity_measure", "JaroWinkler")

    if "thresholds" in kwargs:
        # Optional unique threshold per column in the data
        per_column = kwargs["thresholds"]
        threshold = per_column[col] if col in per_column else default_threshold
    elif "threshold" in kwargs:
        # Single universal threshold for all fields
        threshold = kwargs["threshold"]
    else:
        threshold = default_threshold

    return similarity_measure, threshold


"""
Helper to apply the result of a feature-wise comparison between an incoming record and a 
candidate from the MPI to the accumulated 'match score' of the two. In a 'normal' case
where we're not using log-odds, this is just a count of the number of feature comparisons
that satisfy the fuzzy string threshold. In the log-odds case, this is an accumulation of
the weighted probability score that the two records are a match.
"""
def _apply_score_contribution(feature_score, col, fuzzy_threshold, match_score, match_rule, **kwargs):
    if "log" in match_rule:
        col_odds = kwargs["log_odds"][col]
        match_score += (feature_score * col_odds)
    else:
        if feature_score >= fuzzy_threshold:
            match_score += 1.0
    return match_score


"""
Orchestrator function that scores every candidate in a block against the incoming record
by applying the row-wise comparison helper across the candidate dataframe. Once the
candidates are scored, we filter down to only those candidates who satisfy the provided
matching rule (be that "all feature-wise comparisons are true" or "total probability score
exceeds log-odds cutoff").
"""
def spark_compare(data_block: pd.DataFrame, record: List, funcs: dict, col_to_idx: dict, matching_rule, **kwargs):
    """
    Score every candidate row of a block against an incoming record and return
    the patient ids of the candidates that satisfy the matching rule.

    :param data_block: Candidate rows drawn from the MPI extract.
    :param record: Flattened incoming record.
    :param funcs: Mapping of column name -> comparison function name; its size
      is the score a candidate needs under a non-log-odds rule (every
      comparison must pass).
    :param col_to_idx: Mapping of column name -> position in `record`.
    :param matching_rule: Name of the matching rule; rules containing "log"
      use kwargs["true_match_threshold"] as the score cutoff.
    :param kwargs: Extra comparison parameters forwarded to the row helper.
    :return: List of patient ids of the matching candidates.
    """
    # DataFrame.apply on an empty frame hands back the (empty) frame itself,
    # which has no score column to filter on; an empty block has no matches.
    if data_block.empty:
        return []

    # Each row maps to a two-element Series: column 0 is the candidate's
    # patient id, column 1 is its accumulated match score.
    scored = data_block.apply(
        lambda row: spark_compare_df_helper(row, record, funcs, col_to_idx, matching_rule, **kwargs),
        axis=1,
    )

    if "log" in matching_rule:
        match_cutoff = kwargs["true_match_threshold"]
    else:
        match_cutoff = len(funcs)

    return list(scored.loc[scored[1] >= match_cutoff, 0])


'''
Main driver function that's applied in parallel to each record of the incoming
evaluation set. The procedure is much the same as if the record were being
processed in real time, except that a pandas dataframe (rather than a networked
DB) is used to retrieve the candidate block for speed purposes.
'''
def parallel_eval(record, algo_config: List[dict], labeling_set: pd.DataFrame, testing_field=None, testing_vals=None):
    """
    Run every pass of a linkage algorithm for one incoming record against the
    in-memory MPI extract and collect the patient ids it links to.

    :param record: Incoming patient resource.
    :param algo_config: List of linkage passes; each pass provides "blocks",
      "funcs", "matching_rule" and optionally "kwargs".
    :param labeling_set: Dataframe extract of the MPI used to draw candidate blocks.
    :param testing_field: Optional column whose fuzzy threshold is being swept.
    :param testing_vals: Threshold values to try for `testing_field`.
    :return: Tuple of (first element of the flattened record, matches). Without
      a testing field, matches is a flat list of patient ids accumulated over
      all passes; with one, it is a list holding one such list per value in
      `testing_vals`, in the same order.
    """
    # Flatten incoming resource and remove any lurking None's.
    # NOTE(review): positions 2 and 5 are presumably the list-valued fields of
    # the flattened record -- confirm against _flatten_patient_resource.
    flattened_record = _flatten_patient_resource(record)
    for list_field_idx in (2, 5):
        if flattened_record[list_field_idx] is None:
            flattened_record[list_field_idx] = [""]

    if testing_field:
        matches = {str(x): [] for x in testing_vals}
    else:
        matches = []

    # The column layout is the same for every pass, so build the lookup once.
    # `formatted_cols` is read from the notebook's global scope.
    col_to_idx = {v: k for k, v in enumerate(formatted_cols)}

    for linkage_pass in algo_config:
        field_blocks = extract_blocking_values_from_record(record, linkage_pass["blocks"])
        if len(field_blocks) == 0:
            continue

        # Use the extract of the MPI to quickly filter down a block of candidates
        data_block = spark_block(field_blocks, labeling_set)
        kwargs = linkage_pass.get("kwargs", {})

        if testing_field:
            for tv in testing_vals:
                # Work on a copy so one threshold trial never leaks into the
                # next one or into the shared algorithm configuration.
                vkwargs = copy.deepcopy(kwargs)
                # A pass may carry no per-column thresholds at all (or no
                # kwargs), so create the mapping on demand instead of assuming it.
                vkwargs.setdefault("thresholds", {})[testing_field] = tv
                matching_records = spark_compare(
                    data_block, flattened_record, linkage_pass["funcs"], col_to_idx, linkage_pass["matching_rule"], **vkwargs
                )
                matches[str(tv)] += matching_records
        else:
            matching_records = spark_compare(
                data_block, flattened_record, linkage_pass["funcs"], col_to_idx, linkage_pass["matching_rule"], **kwargs
            )
            matches += matching_records

    if testing_field:
        return flattened_record[0], [matches[str(tv)] for tv in testing_vals]
    return flattened_record[0], matches


'''
Turn the patient_ids of identified "found matches" into the threaded multi-row-indices
that the ground truth labeler can understand. This way, all indices are expressed in
the same scheme for statistical comparison.
'''
def map_patient_ids_to_idxs(pids: List, data: pd.DataFrame):
    """
    Translate patient ids into the index labels of their rows in `data`.

    Ids that do not occur in the 'patient_id' column are dropped; when an id
    occurs on several rows only the first row's index label is reported.

    :param pids: Iterable of patient ids to look up.
    :param data: Dataframe with a 'patient_id' column.
    :return: List of index labels, in the iteration order of `pids`.
    """
    id_column = data['patient_id']
    index_labels = data.index.values

    located = []
    for patient in pids:
        hits = index_labels[(id_column == patient).values]
        if len(hits) == 0:
            continue
        located.append(hits[0])
    return located


'''
Find existing patient records in a dataset that map to each incoming record in a block 
of FHIR data. Since the FHIR data itself is pulled from the MPI, we can freely use it
for querying for blocks without risk of finding unrecognized data.
'''
def link_all_fhir_records_block_dataset(records, algo_config: List[dict], label_set: pd.DataFrame, formatted_cols, testing_field=None, testing_vals=None):
    """
    Link every record of `records` against `label_set` and express the results
    as row indices of `label_set`.

    :param records: Collection exposing `map`; the mapped result must expose
      `cache` and `collect` (a Spark RDD in this notebook).
    :param algo_config: Linkage passes handed through to `parallel_eval`.
    :param label_set: Dataframe used both for blocking and for translating
      patient ids into row indices.
    :param formatted_cols: Accepted for interface compatibility; this function
      body does not read it.
    :param testing_field: Optional column whose fuzzy threshold is being swept.
    :param testing_vals: Threshold values to try; required with `testing_field`.
    :return: Without a testing field, a dict of row index -> set of linked row
      indices. With one, a dict of str(threshold) -> such a dict. Returns None
      (after printing a message) when a testing field is given without values.
    """
    if testing_field:
        if not testing_vals or len(testing_vals) == 0:
            print("Must supply list of threshold values to test")
            return
        found_matches = {str(val): {} for val in testing_vals}
    else:
        found_matches = {}

    start = time.time()
    res = records.map(lambda rec: parallel_eval(rec, algo_config, label_set, testing_field, testing_vals))
    res.cache()

    for evaluated in res.collect():
        # First element is the record's own patient id, second its links
        ridx = map_patient_ids_to_idxs([evaluated[0]], label_set)[0]
        if testing_field:
            # One list of linked ids per tested threshold, in testing_vals order
            for position, linked_pids in enumerate(evaluated[1]):
                at_threshold = found_matches[str(testing_vals[position])]
                at_threshold[ridx] = set(map_patient_ids_to_idxs(set(linked_pids), label_set))
        else:
            found_matches[ridx] = set(map_patient_ids_to_idxs(set(evaluated[1]), label_set))

    print("finished linking ", str(time.time() - start))
    return found_matches

'''
Due to transforming patient_ids back into indices, multiple tuples get inserted for each
match, i.e. we record the link (i,j) and the link (j,i), which would skew our stats.
This function eliminates these redundancies and makes sure each link is counted once.
'''
def dedupe_match_double_counts(match_dict, is_fuzzy_test=False):
    """
    Keep each symmetric link exactly once by retaining, for every root record,
    only the linked records whose index is greater than the root's.

    This drops both the mirrored link (j, i) of every (i, j) and any self-link
    (i, i). Indices are assumed to be non-negative integer row numbers.

    :param match_dict: Dict of row index -> set of linked row indices, or, when
      `is_fuzzy_test` is True, a dict of threshold -> such a dict.
    :param is_fuzzy_test: Whether `match_dict` is nested one level by threshold.
    :return: The same `match_dict` object, updated in place.
    """
    per_threshold = match_dict.values() if is_fuzzy_test else [match_dict]

    for links_by_root in per_threshold:
        for root in links_by_root:
            # Filtering by comparison avoids materialising set(range(root)) for
            # every root, which made the cleanup quadratic in the dataset size.
            links_by_root[root] = {
                linked for linked in links_by_root[root] if linked > root
            }

    return match_dict


# Swap real None values for the string placeholder "None" across the whole
# labeling set, so that cells can be compared and boolean-filtered in bulk
# as plain strings.
labeling_set = labeling_set.replace({None: "None"})

In [None]:
# EXPERIMENTAL THRESHOLD ANALYSIS
# Supplemental code used for experimentally determining the best fuzzy
# matching threshold for a given field

from phdi.linkage import DIBBS_BASIC
import copy

# Column whose fuzzy-match threshold is swept in this experiment
TESTING_FIELD = "first_name"
# Candidate thresholds; linkage is re-run once per value with this threshold
# applied to TESTING_FIELD
TESTING_VALS = [0.85, 0.88, 0.90, 0.92, 0.95]


def score_fuzzy_test(found_matches, true_matches, records_in_dataset, testing_vals):
    """
    Score the links found at each tested threshold against ground-truth links.

    :param found_matches: Dict of str(threshold) -> {row index: set of linked
      row indices}.
    :param true_matches: Dict of row index -> set of truly linked row indices.
    :param records_in_dataset: Number of records evaluated. Kept for interface
      compatibility; none of the reported statistics depend on it.
    :param testing_vals: Thresholds to score; each must have an entry in
      `found_matches` under str(threshold).
    :return: Dict of str(threshold) -> dict with keys "tp", "fp", "fn", "sens",
      "ppv", "f1" and "f_half". A ratio whose denominator is zero (e.g. no
      links found at a strict threshold) is reported as 0.0.
    """

    def _ratio(numerator, denominator):
        # A threshold that finds nothing, or a truth set with no links, would
        # otherwise abort the whole sweep with a ZeroDivisionError.
        if not denominator:
            return 0.0
        return round(numerator / denominator, 3)

    scores = {}
    for tv in testing_vals:
        matches_at_threshold = found_matches[str(tv)]
        true_positives = 0.0
        false_positives = 0.0
        false_negatives = 0.0

        for root_record, true_links in true_matches.items():
            # A root with no found links contributes only false negatives
            found_links = matches_at_threshold.get(root_record, set())
            true_positives += len(true_links.intersection(found_links))
            false_positives += len(found_links.difference(true_links))
            false_negatives += len(true_links.difference(found_links))

        # Links reported for roots that have no true links are all false positives
        for record in set(matches_at_threshold.keys()).difference(true_matches.keys()):
            false_positives += len(matches_at_threshold[record])

        # F-beta with beta = 0.5 weighs precision more heavily than recall
        f_half_num = (1.0 + 0.5**2) * true_positives
        f_half_denom = f_half_num + (0.5**2) * false_negatives + false_positives

        scores[str(tv)] = {
            "tp": true_positives,
            "fp": false_positives,
            "fn": false_negatives,
            "sens": _ratio(true_positives, true_positives + false_negatives),
            "ppv": _ratio(true_positives, true_positives + false_positives),
            "f1": _ratio(
                2 * true_positives,
                2 * true_positives + false_negatives + false_positives,
            ),
            "f_half": _ratio(f_half_num, f_half_denom),
        }

    return scores


# Start from a private copy of the stock DIBBs Basic configuration and give
# both passes the same per-column fuzzy thresholds.
new_algo = copy.deepcopy(DIBBS_BASIC)
col_thresholds = dict.fromkeys(
    ["address", "birthdate", "city", "first_name", "last_name", "mrn", "sex", "state", "zip"],
    0.85,
)
for pass_idx in (0, 1):
    new_algo[pass_idx]["kwargs"] = {"thresholds": col_thresholds}

# Link once per candidate threshold, collapse mirrored links, then score each
# threshold against the SEER labels.
found_matches_dibbs_basic = dedupe_match_double_counts(
    link_all_fhir_records_block_dataset(
        evaluation_set, new_algo, labeling_set, formatted_cols, TESTING_FIELD, TESTING_VALS
    ),
    True,
)
eval_scores = score_fuzzy_test(found_matches_dibbs_basic, seer_labels, EVALUATION_SIZE, TESTING_VALS)
for t, threshold_scores in eval_scores.items():
    print(t, threshold_scores)
    print()

In [None]:
# ALGORITHM EVALUATION: LAC EXISTING

# Every LAC pass fuzzy-compares the same four fields under the
# "eval_perfect_match" rule; the passes differ only in how the candidate
# block is drawn from the MPI.
LAC_ALGO = [
    {
        "funcs": {
            field: "feature_match_fuzzy_string"
            for field in ("first_name", "last_name", "address", "mrn")
        },
        "blocks": blocking_scheme,
        "matching_rule": "eval_perfect_match",
        "cluster_ratio": 0.9,
    }
    for blocking_scheme in (
        [
            {"value": "first_name", "transformation": "first4"},
            {"value": "last_name", "transformation": "first4"},
            {"value": "birthdate"},
        ],
        [
            {"value": "first_name", "transformation": "first4"},
            {"value": "last_name", "transformation": "first4"},
            {"value": "address", "transformation": "first4"},
        ],
        [
            {"value": "birthdate"},
        ],
    )
]

# Reuse previously stored results when available; otherwise link, collapse
# mirrored links, and persist the outcome.
found_matches_lac = load_linkage_results("lac_algorithm_results")
if found_matches_lac is None:
    found_matches_lac = dedupe_match_double_counts(
        link_all_fhir_records_block_dataset(evaluation_set, LAC_ALGO, labeling_set, formatted_cols)
    )
    write_linkage_results("lac_algorithm_results", found_matches_lac)

In [None]:
# ALGORITHM EVALUATION: DIBBs BASIC
from phdi.linkage import DIBBS_BASIC
import copy

# Private copy of the stock DIBBs Basic configuration, with the same
# per-column fuzzy thresholds applied to both passes.
new_algo = copy.deepcopy(DIBBS_BASIC)
col_thresholds = dict.fromkeys(
    ["address", "birthdate", "city", "first_name", "last_name", "mrn", "sex", "state", "zip"],
    0.85,
)
for pass_idx in (0, 1):
    new_algo[pass_idx]["kwargs"] = {"thresholds": col_thresholds}

# Reuse previously stored results when available; otherwise link, collapse
# mirrored links, and persist the outcome.
found_matches_dibbs_basic = load_linkage_results("dibbs_basic_algorithm_results")
if found_matches_dibbs_basic is None:
    found_matches_dibbs_basic = dedupe_match_double_counts(
        link_all_fhir_records_block_dataset(evaluation_set, new_algo, labeling_set, formatted_cols)
    )
    write_linkage_results("dibbs_basic_algorithm_results", found_matches_dibbs_basic)

In [None]:
# ALGORITHM EVALUATION: DIBBs ENHANCED
from phdi.linkage import DIBBS_ENHANCED

# Private copy of the stock DIBBs Enhanced configuration. Its single
# universal "threshold" is removed from both passes and replaced by the
# per-column thresholds below; the other kwargs of each pass are kept.
new_algo = copy.deepcopy(DIBBS_ENHANCED)
col_thresholds = dict.fromkeys(
    ["address", "birthdate", "city", "first_name", "last_name", "mrn", "sex", "state", "zip"],
    0.85,
)
for pass_idx in (0, 1):
    del new_algo[pass_idx]["kwargs"]["threshold"]
for pass_idx in (0, 1):
    new_algo[pass_idx]["kwargs"]["thresholds"] = col_thresholds

# Reuse previously stored results when available; otherwise link, collapse
# mirrored links, and persist the outcome.
found_matches_dibbs_enhanced = load_linkage_results("dibbs_enhanced_algorithm_results")
if found_matches_dibbs_enhanced is None:
    found_matches_dibbs_enhanced = dedupe_match_double_counts(
        link_all_fhir_records_block_dataset(evaluation_set, new_algo, labeling_set, formatted_cols)
    )
    write_linkage_results("dibbs_enhanced_algorithm_results", found_matches_dibbs_enhanced)

In [None]:
# RECOMPUTE AND EXPORT LOG-ODDS

from phdi.linkage import calculate_m_probs, calculate_log_odds
import json
from random import randint

def calculate_u_probs(
    data: pd.DataFrame,
    true_matches: dict,
    n_samples: int,
):
    """
    Estimate, per column, the probability that two records which are NOT a
    true match nevertheless agree on that column (the "u" probability).

    Non-match pairs are sampled at random rather than enumerated. Counts use
    add-one smoothing, so every returned probability is strictly positive.

    :param data: Dataframe of records; pairs are addressed by row position.
    :param true_matches: Dict of row position -> set of row positions that are
      true matches of it, keyed by the smaller position of each pair.
    :param n_samples: Number of distinct non-match pairs to sample.
    :return: Dict of column name -> u probability, or None (after printing a
      message) when `n_samples` is too large for the size of `data`.
    """
    n_records = len(data.index)

    # Quick heuristic check to make sure we can generate enough
    # negative samples to satisfy the parameter request
    max_combos = (n_records * (n_records - 1)) / 2.0
    # Based on bernoulli limits for deterministic runtimes, don't worry about the ln(2)
    # This is how many neg pairs you can expect to generate in reasonable time
    runtime_sample_neg_ceiling = np.log(2) * 0.10 * max_combos
    if n_samples >= runtime_sample_neg_ceiling:
        print("Too many samples requested for data size. Lower n_samples parameter.")
        return

    # Use speed of RNGers to take a sample out of all possible non-match pairs
    # without explicitly constructing the list
    neg_pairs = set()
    while len(neg_pairs) < n_samples:
        idx1 = randint(0, n_records - 1)
        idx2 = randint(0, n_records - 1)
        # A record paired with itself agrees on every field by construction;
        # counting it as a "non-match" would inflate every u probability.
        if idx1 == idx2:
            continue
        root = min(idx1, idx2)
        ref = max(idx1, idx2)
        if root not in true_matches or ref not in true_matches[root]:
            neg_pairs.add((root, ref))

    # Count up the number of sampled pairs that agree on each field, then
    # normalize per field. The column is pulled out once so the inner loop
    # is a plain array lookup instead of a pandas indexer call per cell.
    u_probs = {}
    for c in data.columns:
        values = data[c].to_numpy()
        agreeing = sum(1 for root, ref in neg_pairs if values[root] == values[ref])
        u_probs[c] = (agreeing + 1.0) / (n_samples + 1.0)

    return u_probs

# m-probabilities: per-field agreement rate among pairs labeled as true matches
# (computed by phdi from the SEER labels)
m_probs = calculate_m_probs(labeling_set, seer_labels)

# For small data sets (< 10k records), use 50k sample size.
# For mid sized (10k - 25k records), use 75k samples
# Any larger, don't go above 100k samples
u_probs = calculate_u_probs(labeling_set, seer_labels, n_samples=50000)

log_odds = calculate_log_odds(m_probs, u_probs)
# The identifier columns are not comparison features, so their weights are
# dropped before export
log_odds.pop("patient_id")
log_odds.pop("person_id")
# NOTE(review): the trailing True is presumably the overwrite flag -- confirm
# against the mssparkutils.fs.put documentation
mssparkutils.fs.put(LINKAGE_OUTPUTS_FILESYSTEM + "updated_log_odds.json", json.dumps(log_odds), True)

In [None]:
# PROFILE LOG-ODDS WEIGHTS FOR CUTOFF DETERMINATION
# Run computations on the log-odds cutoff scores for both matches and non-matches
# for later graphical evaluation

import matplotlib.pyplot as plt


def profiling_df_helper(data, idx_i, idx_j, fuzzy_cols, log_odds, col_to_idx):

    # Iteratively accumulate results of each feature-wise comparison
    match_score = 0.0
    ri = data[idx_i]
    rj = data[idx_j]

    for col in fuzzy_cols:
        col_odds = log_odds[col]
        cidx = col_to_idx[col]
        similarity_measure="JaroWinkler"
        min_sim_threshold = 0.85

        # Given name is a list (possibly including middle name), so our logic says
        # concatenate all the values together and then fuzzy compare
        if col == "first_name":
            feature_in_record = re.sub(r'\[|\]', "", ri[cidx])
            feature_in_record = feature_in_record.split(", ")
            feature_in_record = " ".join(feature_in_record)
            feature_in_mpi = re.sub(r'\[|\]', "", rj[cidx])
            feature_in_mpi = feature_in_mpi.split(", ")
            feature_in_mpi = " ".join(feature_in_mpi)
            feature_score = compare_strings(feature_in_mpi, feature_in_record, similarity_measure)

        # Address is also a list, but rather than concatenate them all, we check if each
        # line of an incoming address matches any line of an MPI address; this accounts for
        # a patient's change of residence history
        elif col == "address":
            feature_in_record = re.sub(r'\[|\]', "", ri[cidx])
            feature_in_record = feature_in_record.split(", ")
            feature_in_mpi = re.sub(r'\[|\]', "", rj[cidx])
            feature_in_mpi = feature_in_mpi.split(", ")
            feature_score = 0.0
            for r in feature_in_record:
                for m in feature_in_mpi:
                    feature_comp = compare_strings(r, m, similarity_measure)
                    if feature_comp > feature_score:
                        feature_score = feature_comp
        
        # Regular case: straight string comparison on the fields
        else:
            feature_in_record = ri[cidx]
            feature_in_mpi = rj[cidx]
            feature_score = compare_strings(feature_in_mpi, feature_in_record, similarity_measure)
        
        # Potential match gets no points if strings are too dissimilar
        # Prevents accumulations of minimal points from every field swaying the results
        if feature_score < min_sim_threshold:
            feature_score = 0.0
        match_score += feature_score * col_odds

    return match_score


def profile_log_odds_computation(
    data: pd.DataFrame,
    true_matches: dict,
    log_odds: dict,
    fuzzy_cols,
    neg_samples: int = 50000,
):
    """
    Compute log-odds match scores for all true-match pairs and for a random
    sample of non-match pairs, so the two score distributions can be compared.

    :param data: Dataframe of records; pairs are addressed by row position.
    :param true_matches: Dict of row position -> set of row positions that are
      true matches of it, keyed by the smaller position of each pair.
    :param log_odds: Dict of column name -> log-odds weight.
    :param fuzzy_cols: Names of the columns that contribute to the score.
    :param neg_samples: Number of distinct non-match pairs to sample.
    :return: Tuple of (true_match_scores, non_match_scores), two lists of floats.
    :raises ValueError: If `neg_samples` exceeds the number of distinct record
      pairs in `data`, which could never be satisfied.
    """
    n_records = len(data.index)
    # Without this check the sampling loop below would never terminate
    if neg_samples > (n_records * (n_records - 1)) // 2:
        raise ValueError("neg_samples exceeds the number of distinct record pairs in data.")

    neg_pairs = set()
    while len(neg_pairs) < neg_samples:
        idx1 = randint(0, n_records - 1)
        idx2 = randint(0, n_records - 1)
        # A record paired with itself scores as a perfect match and would
        # pollute the non-match distribution with maximal scores.
        if idx1 == idx2:
            continue
        root = min(idx1, idx2)
        ref = max(idx1, idx2)
        if root not in true_matches or ref not in true_matches[root]:
            neg_pairs.add((root, ref))

    neg_pairs = list(neg_pairs)

    # Plain nested lists are much cheaper to index per cell than the dataframe
    data_cols = list(data.columns)
    col_to_idx = dict(zip(data_cols, range(len(data_cols))))
    data = data.values.tolist()

    true_match_scores = [
        profiling_df_helper(data, root_record, pr, fuzzy_cols, log_odds, col_to_idx)
        for root_record, paired_records in true_matches.items()
        for pr in paired_records
    ]

    non_match_scores = [
        profiling_df_helper(data, record_1, record_2, fuzzy_cols, log_odds, col_to_idx)
        for record_1, record_2 in neg_pairs
    ]

    return true_match_scores, non_match_scores

# Profile the score distributions using only the "address" and "city" columns
match_scores, non_match_scores = profile_log_odds_computation(labeling_set, seer_labels, log_odds, ["address", "city"])

In [None]:
# VISUALIZE LOG-ODDS GRAPH
# Graphically displays elbow curves of the log-odds total match scores for both
# true matches and true non-matches (as determined by SEER labels) so that the
# separating hyperplane can be determined.

def show_profiling_graph(match_scores, non_match_scores):
    """
    Overlay histograms of the log-odds scores of true matches and of
    non-matches on a single set of axes, then display the figure.

    :param match_scores: Scores computed for true-match pairs.
    :param non_match_scores: Scores computed for non-match pairs.
    """

    # TODO: Verify non-min-length works on larger prod data set; if it doesn't,
    # uncomment this block

    # min_length = min(len(match_scores), len(non_match_scores))
    # match_scores = match_scores[:min_length]
    # non_match_scores = non_match_scores[:min_length]

    fig, ax = plt.subplots()
    fig.set_size_inches(12,6)
    # 75 bins spanning scores 0 to 25
    _, bins, _ = plt.hist(match_scores, bins=75, range=[0, 25])
    # Reuse the same bin edges so the two distributions are directly comparable;
    # drawn semi-transparent so the overlap stays visible
    _ = plt.hist(non_match_scores, bins=bins, alpha=0.5)

    # Adjust the density of tick marks here to find the best separation boundary
    ax.xaxis.set_major_locator(plt.MaxNLocator(20))

    # Use this min and max of the x axis to effectively zoom in
    # ax.set_xlim([6,8])
    plt.show()

# Plot the score distributions computed in the profiling cell
show_profiling_graph(match_scores, non_match_scores)

In [None]:
# RUN THE NUMBERS AND GET THE STATS FUNCTIONS

'''
To ensure accurate statistics, the matches and the true matches dictionaries
in the statistical evaluation function should have the following form:

{
    row_num_of_record_in_data: set(row_nums_of_linked_records)
}

Each row in the data should be represented as a key in both dictionaries.
The value for each of these keys should be a set that contains all other
row numbers for records in the data set that link to the key record.
'''

def score_linkage_vs_truth(found_matches, true_matches, records_in_dataset):
    """
    Score a set of found links against ground-truth links.

    :param found_matches: Dict of row index -> set of linked row indices
      produced by a linkage algorithm.
    :param true_matches: Dict of row index -> set of truly linked row indices.
    :param records_in_dataset: Number of records evaluated; fixes the number
      of possible pairs and therefore the true-negative count.
    :return: Dict with keys "tp", "fp", "fn", "sens", "spec", "ppv", "npv" and
      "f1". A ratio whose denominator is zero is reported as 0.0.
    """

    def _ratio(numerator, denominator):
        # Empty or degenerate inputs (no links found, no true links) would
        # otherwise abort the whole report with a ZeroDivisionError.
        if not denominator:
            return 0.0
        return round(numerator / denominator, 3)

    # Need division by 2 because ordering is irrelevant, matches are symmetric
    total_possible_matches = (records_in_dataset * (records_in_dataset - 1)) / 2.0
    true_positives = 0.0
    false_positives = 0.0
    false_negatives = 0.0

    for root_record, true_links in true_matches.items():
        # A root with no found links contributes only false negatives
        found_links = found_matches.get(root_record, set())
        true_positives += len(true_links.intersection(found_links))
        false_positives += len(found_links.difference(true_links))
        false_negatives += len(true_links.difference(found_links))

    # Links reported for roots that have no true links are all false positives
    for record in set(found_matches.keys()).difference(true_matches.keys()):
        false_positives += len(found_matches[record])

    true_negatives = (
        total_possible_matches - true_positives - false_positives - false_negatives
    )

    return {
        "tp": true_positives,
        "fp": false_positives,
        "fn": false_negatives,
        "sens": _ratio(true_positives, true_positives + false_negatives),
        "spec": _ratio(true_negatives, true_negatives + false_positives),
        "ppv": _ratio(true_positives, true_positives + false_positives),
        "npv": _ratio(true_negatives, true_negatives + false_negatives),
        "f1": _ratio(
            2 * true_positives,
            2 * true_positives + false_negatives + false_positives,
        ),
    }

# Build a plain-text report with one section per ground-truth labeling scheme
# and, within it, one block of statistics per linkage algorithm; print it and
# persist it alongside the other linkage outputs.
n_records = EVALUATION_SIZE

report_parts = []
for lbl_type, labels in (("va", va_labels), ("uk-nhs", uk_labels), ("nci-seer", seer_labels)):
    stats_dict_lac = score_linkage_vs_truth(found_matches_lac, labels, n_records)
    stats_dict_dibbs_b = score_linkage_vs_truth(found_matches_dibbs_basic, labels, n_records)
    stats_dict_dibbs_e = score_linkage_vs_truth(found_matches_dibbs_enhanced, labels, n_records)

    report_parts.append("DISPLAYING EVALUATION ON " + lbl_type.upper() + " LABELS:\n")
    report_parts.append("\n")

    for heading, scores in (
        ("LAC Existing Algorithm:\n", stats_dict_lac),
        ("DIBBs Basic Algorithm:\n", stats_dict_dibbs_b),
        ("DIBBs Log-Odds Algorithm:\n", stats_dict_dibbs_e),
    ):
        report_parts.append(heading)
        for caption, stat_key in (
            ("True Positives: ", "tp"),
            ("False Positives: ", "fp"),
            ("False Negatives: ", "fn"),
            ("Sensitivity: ", "sens"),
            ("Specificity: ", "spec"),
            ("PPV: ", "ppv"),
            ("NPV: ", "npv"),
            ("F1: ", "f1"),
        ):
            report_parts.append(caption + str(scores[stat_key]) + "\n")
        report_parts.append("\n")

    report_parts.append("\n")

display_str = "".join(report_parts)

print(display_str)

mssparkutils.fs.put(LINKAGE_OUTPUTS_FILESYSTEM + "results.txt", display_str, True)