### tabularizePatientTable
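This notebook extracts all data from the Patient table (`patient`) in the Master Patient Index (MPI) and tabularizes it as a List of Lists of Lists (LoLoL). It then uses that table to generate ground-truth match labels, runs the record linkage algorithms against the same records, and writes the resulting evaluation statistics and updated log-odds to storage.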

In [None]:
pip install psycopg2-binary azure-identity phdi recordlinkage azure-keyvault-secrets

In [None]:
# Database-related imports
import psycopg2
from psycopg2.sql import Identifier, SQL
from azure.identity import DefaultAzureCredential
from phdi.linkage.postgres import DIBBsConnectorClient

# Ground-truth labeling imports
import time
import pandas as pd
import recordlinkage as rl
from recordlinkage.base import BaseCompareFeature
import numpy as np
from phdi.harmonization import compare_strings

In [None]:
# GLOBAL VARIABLES FOR DATABASE ACCESS
# NOTE(review): the "$NAME" string values below look like deployment-time
# placeholders that are substituted before the notebook runs -- confirm.

# Set your Key Vault information
vault_name = "$KEY_VAULT"
# Not referenced again in the visible part of this notebook.
KEY_VAULT_URL = f"https://{vault_name}.vault.azure.net"
# Passed to TokenLibrary.getSecret together with vault_name.
vault_linked_service = "$KEY_VAULT_LINKED_SERVICE"

# Set up db_client
# These values are passed to DIBBsConnectorClient when the client is created.
DB_NAME = "DibbsMpiDB"
DB_USER = "postgres"
DB_HOST = "$MPI_DB_HOST"
DB_PORT = "5432"
DB_TABLE_PATIENT = "patient"
DB_TABLE_PERSON= "person"

In [None]:
# GLOBAL VARIABLES FOR GROUND-TRUTH LABELING
# Number of patient rows to keep from the table; None keeps every row.
DATA_SIZE = None    # Optional variable; if none, use whole table
# Window size handed to every sorted-neighbourhood indexer used for blocking.
WINDOW_INDEX_SIZE = 13
# Minimum similarity score for a fuzzy field comparison to count as agreement.
JARO_THRESHOLD = 0.85

In [None]:
# MPI ACCESS AND TABULATION FUNCTIONS

def generate_query(db_client):
    """
    Generate a query to extract all data from the patient table of the MPI.

    One ``jsonb_path_query_array`` column is selected per entry of
    ``db_client.fields_to_jsonpaths``, after the ``patient_id`` column.

    :param db_client: Client exposing ``fields_to_jsonpaths`` (field name ->
      JSON path) and ``patient_table`` (name of the table to read).
    :return: Tuple of (composed SQL query, list of JSON path parameters in the
      same order as the ``%s`` placeholders in the query).
    """
    select_query_stubs = []
    query_data = []
    # Emit the columns in sorted key order: format_data() builds its header
    # row from the sorted field names, so the SELECT list has to follow the
    # same order for headers and values to line up regardless of dict order.
    for key in sorted(db_client.fields_to_jsonpaths):
        select_query_stubs.append(
            f"jsonb_path_query_array(patient_resource,%s) as {key}"
        )
        query_data.append(db_client.fields_to_jsonpaths[key])

    select_query = "SELECT patient_id, " + ", ".join(select_query_stubs)

    # The table name is injected as a quoted identifier rather than by plain
    # string concatenation.
    query = SQL(select_query + " FROM {patient_table};").format(
        patient_table=Identifier(db_client.patient_table)
    )
    return query, query_data


def format_data(data, db_client):
    """
    Format returned data as Lists of Lists of Lists (LoLoL).

    A header row (``patient_id`` followed by the sorted field names of
    ``db_client.fields_to_jsonpaths``) is inserted at position 0, and every
    non-ID cell is replaced by its first element so values sit one list level
    higher. Empty or missing cells become ``""``.

    NOTE: this function also closes the module-level ``conn`` and ``cur``
    objects, so both must exist before it is called.

    :param data: List of row lists as fetched from the patient table; modified
      in place.
    :param db_client: Client exposing ``fields_to_jsonpaths`` and
      ``_close_connections``.
    :return: The same ``data`` list, now including the header row.
    """
    db_client._close_connections(db_conn=conn, db_cursor=cur)

    header = ["patient_id"] + sorted(db_client.fields_to_jsonpaths)
    data.insert(0, header)

    # Bring all data elements up one list level to avoid overly deep nesting.
    # Column 0 is the patient_id and is left as-is.
    for row in data[1:]:
        for j in range(1, len(row)):
            cell = row[j]
            # Truthiness covers both an empty list and a None cell, which
            # would otherwise make len() raise.
            row[j] = cell[0] if cell else ""
    return data

# Access the MPI Database
credential = DefaultAzureCredential()
db_password = TokenLibrary.getSecret(vault_name, "mpi-db-password", vault_linked_service)
db_client = DIBBsConnectorClient(
    database=DB_NAME,
    user=DB_USER,
    password=db_password,
    host=DB_HOST,
    port=DB_PORT,
    patient_table=DB_TABLE_PATIENT,
    person_table=DB_TABLE_PERSON,
)

# Create a connection and a cursor
conn = db_client.get_connection()
cur = conn.cursor()

# Query for the data and format it.
# format_data() closes `conn` and `cur` itself, so they are not closed here.
query, query_data = generate_query(db_client)
cur.execute(query, query_data)
data = [list(row) for row in cur.fetchall()]
formatted_data = format_data(data, db_client)

# Apply any size caveats, if desired. The +1 accounts for the header row that
# format_data() puts at position 0; slicing already clamps to the list length.
if DATA_SIZE is not None:
    dataset = formatted_data[:DATA_SIZE + 1]
else:
    dataset = formatted_data
labeling_set = pd.DataFrame(dataset[1:], columns=formatted_data[0])

# Now, we need a copy of the data in a FHIR format for the linkage algorithms
conn = db_client.get_connection()
cur = conn.cursor()
try:
    # Use the configured table name, quoted as an identifier, the same way
    # generate_query() does.
    query = SQL("SELECT patient_resource from {patient_table};").format(
        patient_table=Identifier(DB_TABLE_PATIENT)
    )
    cur.execute(query)
    fhir_data = [row[0] for row in cur.fetchall()]
finally:
    # Release the connection even if the query fails.
    db_client._close_connections(db_conn=conn, db_cursor=cur)

# fhir_data has no header row, so exactly DATA_SIZE records are kept; taking
# DATA_SIZE + 1 would include a record that is absent from labeling_set.
# NOTE(review): neither query has an ORDER BY, so this assumes both return
# rows in the same order -- confirm.
if DATA_SIZE is not None:
    evaluation_set = fhir_data[:DATA_SIZE]
else:
    evaluation_set = fhir_data

In [None]:
# GROUND TRUTH LABELING: VIRGINIA FUNCTIONS

from recordlinkage.index import SortedNeighbourhood, BaseIndexAlgorithm
from recordlinkage.utils import listify

class FirstNameSortedNeighborhood(BaseIndexAlgorithm):
    """
    A custom Indexing function built to operate compatibly on the first_name
    column returned from the MPI. Since that's a list of strings (because
    someone could have multiple given names), we need a way to cross-conjoin
    these entries and apply the same fuzzy blocking filter window that a
    regular column of strings would get. This performs joint name
    concatenation on copies of that column in the data, and then uses a
    sorted neighborhood window to find fuzzy blocking candidates.
    """

    def __init__(
        self,
        left_on=None,
        right_on=None,
        window=3,
        sorting_key_values=None,
        block_on=None,
        block_left_on=None,
        block_right_on=None,
        **kwargs
    ):
        """
        :param left_on: Column(s) of the left frame used as the sorting key.
        :param right_on: Column(s) of the right frame used as the sorting key;
          when None, ``left_on`` is used for both frames.
        :param window: Width of the sorted-neighbourhood window.
        :param sorting_key_values: Optional precomputed sorting key values;
          when None they are derived from the data on first use.
        :param block_on: Extra column(s) to block on in both frames.
        :param block_left_on: Extra column(s) to block on in the left frame.
        :param block_right_on: Extra column(s) to block on in the right frame.
        :param kwargs: Forwarded to ``BaseIndexAlgorithm.__init__``.
        """
        super(FirstNameSortedNeighborhood, self).__init__(**kwargs)

        # variables to block on
        self.left_on = left_on
        self.right_on = right_on
        self.window = window
        self.sorting_key_values = sorting_key_values
        # Defaults are None rather than [] so that instances never share one
        # mutable list object.
        self.block_on = [] if block_on is None else block_on
        self.block_left_on = [] if block_left_on is None else block_left_on
        self.block_right_on = [] if block_right_on is None else block_right_on

    def _get_left_and_right_on(self):
        """
        We only care about the de-dupe case which involves no self.right, but this
        still needs to be implemented for super compatibility.
        """
        if self.right_on is None:
            return (self.left_on, self.left_on)
        else:
            return (self.left_on, self.right_on)

    def _get_sorting_key_values(self, array1, array2):
        """
        Return the unique sorting key values found in either array. This
        function is required by the package for multi-index neighborhood
        filtering according to some papers it's built on.
        """

        concat_arrays = np.concatenate([array1, array2])
        return np.unique(concat_arrays)

    def _link_index(self, df_a, df_b):
        """
        Build the candidate pair MultiIndex between ``df_a`` and ``df_b``.

        Works on copies of both frames, so the caller's data is not modified.
        """
        df_a = df_a.copy()
        df_b = df_b.copy()
        # Collapse each record's list of given names into one string. Each
        # frame is joined from its OWN column: joining df_b from the already
        # joined df_a column would re-join the characters of each string.
        df_a["first_name"] = df_a["first_name"].str.join(" ")
        df_b["first_name"] = df_b["first_name"].str.join(" ")
        left_on, right_on = self._get_left_and_right_on()
        left_on = listify(left_on)
        right_on = listify(right_on)

        window = self.window

        # Correctly generate blocking keys
        block_left_on = listify(self.block_left_on)
        block_right_on = listify(self.block_right_on)

        if self.block_on:
            block_left_on = listify(self.block_on)
            block_right_on = listify(self.block_on)

        blocking_keys = ["sorting_key"] + [
            "blocking_key_%d" % i for i, v in enumerate(block_left_on)
        ]

        # Format the data to thread with index pairs
        data_left = pd.DataFrame(df_a[listify(left_on) + block_left_on], copy=False)
        data_left.columns = blocking_keys
        data_left["index_x"] = np.arange(len(df_a))
        data_left.dropna(axis=0, how="any", subset=blocking_keys, inplace=True)

        data_right = pd.DataFrame(df_b[listify(right_on) + block_right_on], copy=False)
        data_right.columns = blocking_keys
        data_right["index_y"] = np.arange(len(df_b))
        data_right.dropna(axis=0, how="any", subset=blocking_keys, inplace=True)

        # sorting_key_values is the terminology in Data Matching [Christen,
        # 2012]. Once computed, the values are kept on the instance.
        if self.sorting_key_values is None:
            self.sorting_key_values = self._get_sorting_key_values(
                data_left["sorting_key"].values, data_right["sorting_key"].values
            )

        # Map each distinct key value to its rank in sorted order
        sorting_key_factors = pd.Series(
            np.arange(len(self.sorting_key_values)), index=self.sorting_key_values
        )

        data_left["sorting_key"] = data_left["sorting_key"].map(sorting_key_factors)
        data_right["sorting_key"] = data_right["sorting_key"].map(sorting_key_factors)

        # Internal window size
        _window = int((window - 1) / 2)

        def merge_lagged(x, y, w):
            """Merge two dataframes with a lag on in the sorting key."""

            y = y.copy()
            y["sorting_key"] = y["sorting_key"] + w

            return x.merge(y, how="inner")

        # One inner merge per offset in the window pairs up records whose key
        # ranks differ by that offset
        pairs_concat = [
            merge_lagged(data_left, data_right, w) for w in range(-_window, _window + 1)
        ]

        pairs_df = pd.concat(pairs_concat, axis=0)

        return pd.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            codes=[pairs_df["index_x"].values, pairs_df["index_y"].values],
            verify_integrity=False,
        )


def get_pred_match_dict_from_multi_idx(mltidx, n_rows):
    """
    Transform a recordlinkage toolkit multi-index into a dictionary of matches.

    :param mltidx: Multi-index whose entries are pairs of row numbers.
    :param n_rows: Number of rows; keys 0..n_rows-1 are created up front.
    :return: Dict mapping each row number to the set of higher-numbered rows
      it is paired with (each pair is stored under its smaller member).
    """
    matches_by_row = {}
    for row in range(n_rows):
        matches_by_row[row] = set()

    for candidate in mltidx.to_list():
        lower, upper = min(candidate), max(candidate)
        matches_by_row[lower].add(upper)
    return matches_by_row


# Special class for comparing LoL elements by exact match.
# Only the leading entry of each nested value (`.str[0]`) is compared; any
# further entries, such as additional given names, are not considered.
class CompareNestedString(BaseCompareFeature):
    def _compute_vectorized(self, s1, s2):
        """Return 1.0 where the leading entries of s1 and s2 are equal, else 0.0."""
        leading_left = s1.str[0]
        leading_right = s2.str[0]
        agreement = leading_left == leading_right
        return agreement.astype(float)


def get_va_labels(data):
    """
    Label matches using exact agreement on four fields.

    Candidate pairs come from the union of sorted-neighbourhood indexes on
    last_name, birthdate, mrn and first_name. A pair is a match when
    first_name, last_name, birthdate and address all agree.

    :param data: DataFrame of tabulated patient records.
    :return: Dict mapping each row number to the set of rows matched to it.
    """
    started_at = time.time()

    # Create a windowed neighborhood index on patient table because full is
    # too expensive. Adding multiple different neighborhoods takes their
    # union so we don't over-block.
    indexer = rl.Index()
    for sort_col in ("last_name", "birthdate", "mrn"):
        indexer.add(SortedNeighbourhood(sort_col, window=WINDOW_INDEX_SIZE))
    indexer.add(FirstNameSortedNeighborhood("first_name", window=WINDOW_INDEX_SIZE))
    # Note: using a multi-indexer treats the row number as the index, so
    # results will automatically be in acceptable eval format
    candidate_links = indexer.index(data)

    print(len(candidate_links), "candidate pairs identified")

    # Apply feature comparisons on each supported field from the MPI
    comp = rl.Compare()
    comp.add(CompareNestedString("first_name", "first_name", label="first_name"))
    for exact_col in ("last_name", "birthdate"):
        comp.exact(exact_col, exact_col, label=exact_col)
    comp.add(CompareNestedString("address", "address", label="address"))
    features = comp.compute(candidate_links, data)

    # A pair is a match only when all four comparisons above agree
    agreement_counts = features.sum(axis=1)
    matches = features[agreement_counts == 4]

    finished_at = time.time()
    print("Computation took", str(round(finished_at - started_at, 2)), "seconds")

    n_rows = len(data) if DATA_SIZE is None else DATA_SIZE
    return get_pred_match_dict_from_multi_idx(matches.index, n_rows)


# First set of ground-truth labels; scored later under the "va" label type.
va_labels = get_va_labels(labeling_set)

In [None]:
# GROUND-TRUTH LABELING: RECORD LINKAGE TOOLKIT FUNCTIONS

# Special class for comparing LoL first name elements
# Use the full concatenation of all names to account for multiple given names
class CompareFirstName(BaseCompareFeature):
    def _compute_vectorized(self, s1, s2):
        """
        Return a boolean array that is True where the space-joined first
        names of s1 and s2 score at least JARO_THRESHOLD.
        """
        # otypes is given explicitly because np.vectorize cannot infer the
        # output type from zero-length input (no candidate pairs) and raises.
        score_names = np.vectorize(compare_strings, otypes=[float])
        jarowinklers = score_names(s1.str.join(" "), s2.str.join(" "))
        return jarowinklers >= JARO_THRESHOLD


# Special class for comparing LoL address line elements
# Check each address line against each other address line to account for moving
class CompareAddress(BaseCompareFeature):
    @staticmethod
    def _best_line_score(a1_list, a2_list):
        """Return the highest similarity between any line pair (0.0 if none)."""
        best_score = 0.0
        for a1 in a1_list:
            for a2 in a2_list:
                best_score = max(best_score, compare_strings(a1, a2))
        return best_score

    def _compute_vectorized(self, s1, s2):
        """
        Return a boolean array that is True where the best-scoring pair of
        address lines from s1 and s2 scores at least JARO_THRESHOLD.
        """
        # otypes is given explicitly because np.vectorize cannot infer the
        # output type from zero-length input (no candidate pairs) and raises.
        score_addresses = np.vectorize(self._best_line_score, otypes=[float])
        jarowinklers = score_addresses(s1, s2)
        return jarowinklers >= JARO_THRESHOLD
    

def predict_third_party_labels(data):
    """
    Label matches with the Record Linkage Toolkit's ECM classifier.

    Candidate pairs come from the union of sorted-neighbourhood indexes on
    last_name, birthdate, mrn and first_name. Each pair is turned into a
    thresholded comparison vector that the classifier is fit on and then
    labels.

    :param data: DataFrame of tabulated patient records.
    :return: Dict mapping each row number to the set of rows matched to it.
    """
    began = time.time()

    # Create a windowed neighborhood index on patient table because full is
    # too expensive with EM. Adding multiple different neighborhoods takes
    # their union so we don't over-block.
    indexer = rl.Index()
    for sort_col in ("last_name", "birthdate", "mrn"):
        indexer.add(SortedNeighbourhood(sort_col, window=WINDOW_INDEX_SIZE))
    indexer.add(FirstNameSortedNeighborhood("first_name", window=WINDOW_INDEX_SIZE))
    # Note: using a multi-indexer treats the row number as the index, so
    # results will automatically be in acceptable eval format
    candidate_links = indexer.index(data)

    print(len(candidate_links), "candidate pairs identified")

    def add_fuzzy_feature(comparer, field):
        # Thresholded Jaro-Winkler comparison of a plain string column
        comparer.string(
            field, field, method="jarowinkler", threshold=JARO_THRESHOLD, label=field
        )

    # Apply feature comparisons on each supported field from the MPI.
    # Features are registered in the same order as before.
    comp = rl.Compare()
    comp.add(CompareFirstName("first_name", "first_name", label="first_name"))
    for field in ("last_name", "mrn", "birthdate"):
        add_fuzzy_feature(comp, field)
    comp.add(CompareAddress("address", "address", label="address"))
    for field in ("city", "state", "zip", "sex"):
        add_fuzzy_feature(comp, field)
    features = comp.compute(candidate_links, data)

    # Create an EM Predictor and label the binary training vectors
    pred_links = rl.ECMClassifier().fit_predict(features)

    finished = time.time()
    print("Computation took", str(round(finished - began, 2)), "seconds")

    n_rows = len(data) if DATA_SIZE is None else DATA_SIZE
    return get_pred_match_dict_from_multi_idx(pred_links, n_rows)


# Second set of ground-truth labels; scored later under the "emc" label type
# and also used as the match set when recomputing log-odds.
third_party_labels = predict_third_party_labels(labeling_set)

In [None]:
# LINKAGE DRIVER FUNCTIONS

from phdi.linkage.link import (
    _flatten_patient_resource,
    _bind_func_names_to_invocations,
    extract_blocking_values_from_record,
    _compare_records,
    extract_blocking_values_from_record
)
from typing import List
import copy

def link_fhir_record_from_dataset(
    record: dict,
    algo_config: List[dict],
    db_client
) -> List:
    """
    Find every MPI record that matches a single incoming FHIR record.

    This is a modified version of the link_record_against_mpi function from
    the PHDI SDK: it neither assigns a person_id nor membership-tests each
    cluster for the best strength. It simply collects all records within each
    data block that match the incoming record.

    :param record: The incoming FHIR patient resource.
    :param algo_config: List of linkage passes, each with "blocks", "funcs",
      "matching_rule" and optionally "kwargs".
    :param db_client: Client whose block_data() returns the blocked rows.
    :return: First-column values (patient IDs) of all matching blocked rows,
      accumulated across passes; may contain repeats.
    """
    flattened_record = _flatten_patient_resource(record)

    # Bind function names back to their callables in link.py. This is done on
    # a deep copy so the caller's config keeps its string names.
    bound_config = _bind_func_names_to_invocations(copy.deepcopy(algo_config))

    # Accumulate all matches across all passes to return
    found_matches = []
    for linkage_pass in bound_config:
        # MPI will be able to find patients if *any* of their names or
        # addresses contains extracted values, so minimally block on the
        # first line if applicable
        field_blocks = extract_blocking_values_from_record(
            record, linkage_pass["blocks"]
        )
        if len(field_blocks) == 0:
            continue
        raw_block = db_client.block_data(field_blocks)

        # First row of the returned block is the column headers. Map column
        # name to position, not including the patient/person ID columns.
        col_to_idx = {name: pos for pos, name in enumerate(raw_block[0][2:])}
        candidates = raw_block[1:]

        # Blocked fields are returned as LoLoL. Every field after the two ID
        # columns is unwrapped by one list level, with "" for empty fields.
        for blocked_record in candidates:
            for j in range(2, len(blocked_record)):
                cell = blocked_record[j]
                blocked_record[j] = cell[0] if len(cell) > 0 else ""

        # Correct any None types that might be lurking.
        # NOTE(review): positions 2 and 5 are presumably the list-valued
        # slots of the flattened record -- confirm against
        # _flatten_patient_resource.
        for pos in (2, 5):
            if flattened_record[pos] is None:
                flattened_record[pos] = [""]

        # Check if incoming record matches each thing it blocked with
        pass_kwargs = linkage_pass.get("kwargs", {})
        found_matches.extend(
            blocked_record[0]
            for blocked_record in candidates
            if _compare_records(
                flattened_record,
                blocked_record,
                linkage_pass["funcs"],
                col_to_idx,
                linkage_pass["matching_rule"],
                **pass_kwargs
            )
        )

    return found_matches


def map_patient_ids_to_idxs(pids: List, data: pd.DataFrame):
    """
    Turn the patient_ids of identified "found matches" into the row indices
    that the ground truth labeler can understand. This way, all indices are
    expressed in the same scheme for statistical comparison.

    :param pids: Patient IDs to look up.
    :param data: DataFrame with a "patient_id" column.
    :return: For each ID found in data, the index of its first matching row,
      in the order of pids; IDs that are not found are skipped.
    """
    record_idxs = []
    for pid in pids:
        matching_rows = data.loc[data["patient_id"] == pid]
        idx_values = matching_rows.index.values
        if idx_values.size > 0:
            record_idxs.append(idx_values[0])
    return record_idxs


def link_all_fhir_records_block_dataset(records: List, algo_config: List[dict], db_client, label_set):
    """
    Find existing patient records in a dataset that map to each incoming
    record in a block of FHIR data. Since the FHIR data itself is pulled from
    the MPI, we can freely use it for querying for blocks.

    :param records: FHIR patient resources; each one's "id" is looked up in
      the "patient_id" column of label_set.
    :param algo_config: Linkage algorithm configuration (list of passes).
    :param db_client: Client used to fetch blocks of candidate records.
    :param label_set: DataFrame of labeled records that defines row indices.
    :return: Dict mapping each record's row index to the set of row indices
      of the other records it was linked to.
    """
    found_matches = {}
    for record in records:
        own_idxs = map_patient_ids_to_idxs([record.get("id")], label_set)
        if not own_idxs:
            # The record has no row in label_set (for example it fell outside
            # a DATA_SIZE cut), so it has no index to score against. Skip it
            # instead of failing on an empty lookup.
            continue
        ridx = own_idxs[0]

        linked_records = link_fhir_record_from_dataset(record, algo_config, db_client)
        idx_set = set(map_patient_ids_to_idxs(linked_records, label_set))
        # A record is not a match to itself
        idx_set.discard(ridx)
        found_matches[ridx] = idx_set
    return found_matches


def dedupe_match_double_counts(match_dict):
    """
    Due to transforming patient_ids back into indices, multiple tuples get
    inserted for each match, i.e. we record the link (i,j) and the link (j,i),
    which would skew our stats. This function eliminates these redundancies
    and makes sure each link is counted once, under its smaller row index.

    :param match_dict: Dict mapping a row index to a set of linked row
      indices; updated in place.
    :return: The same dict, where each set no longer holds indices in the
      range 0..key-1.
    """
    for k in match_dict:
        if k > 0:
            # Filter by comparison rather than building set(range(k)) for
            # every key, which costs time quadratic in the number of rows.
            match_dict[k] = {m for m in match_dict[k] if not 0 <= m < k}
    return match_dict

In [None]:
# ALGORITHM EVALUATION: LAC EXISTING

# Three linkage passes. All of them compare first_name, last_name, address
# and mrn with the same fuzzy string function and the same matching rule;
# they differ only in the fields they block on.
# "cluster_ratio" is not read by link_fhir_record_from_dataset, which only
# uses "blocks", "funcs", "matching_rule" and the optional "kwargs".
LAC_ALGO = [
    # Pass 1: block on first 4 chars of first and last name, plus birthdate
    {
        "funcs": {
            "first_name": "feature_match_fuzzy_string",
            "last_name": "feature_match_fuzzy_string",
            "address": "feature_match_fuzzy_string",
            "mrn": "feature_match_fuzzy_string",
        },
        "blocks": [
            {"value": "first_name", "transformation": "first4"},
            {"value": "last_name", "transformation": "first4"},
            {"value": "birthdate"},
        ],
        "matching_rule": "eval_perfect_match",
        "cluster_ratio": 0.9,
    },
    # Pass 2: block on first 4 chars of first name, last name and address
    {
        "funcs": {
            "first_name": "feature_match_fuzzy_string",
            "last_name": "feature_match_fuzzy_string",
            "address": "feature_match_fuzzy_string",
            "mrn": "feature_match_fuzzy_string",
        },
        "blocks": [
            {"value": "first_name", "transformation": "first4"},
            {"value": "last_name", "transformation": "first4"},
            {"value": "address", "transformation": "first4"},
        ],
        "matching_rule": "eval_perfect_match",
        "cluster_ratio": 0.9,
    },
    # Pass 3: block on birthdate only
    {
        "funcs": {
            "first_name": "feature_match_fuzzy_string",
            "last_name": "feature_match_fuzzy_string",
            "address": "feature_match_fuzzy_string",
            "mrn": "feature_match_fuzzy_string",
        },
        "blocks": [
            {"value": "birthdate"},
        ],
        "matching_rule": "eval_perfect_match",
        "cluster_ratio": 0.9,
    },
]

# Link every evaluation record, then keep each link only once
found_matches_lac = link_all_fhir_records_block_dataset(evaluation_set, LAC_ALGO, db_client, labeling_set)
found_matches_lac = dedupe_match_double_counts(found_matches_lac)

In [None]:
# ALGORITHM EVALUATION: DIBBs BASIC
# Same procedure as the LAC evaluation, using the DIBBS_BASIC configuration
# shipped with phdi: link every evaluation record, then keep each link once.
from phdi.linkage import DIBBS_BASIC
found_matches_dibbs_basic = link_all_fhir_records_block_dataset(evaluation_set, DIBBS_BASIC, db_client, labeling_set)
found_matches_dibbs_basic = dedupe_match_double_counts(found_matches_dibbs_basic)

In [None]:
# ALGORITHM EVALUATION: DIBBs ENHANCED
# Same procedure again with the DIBBS_ENHANCED configuration from phdi. These
# results are reported as the "DIBBs Log-Odds Algorithm" in the final output.
from phdi.linkage import DIBBS_ENHANCED
found_matches_dibbs_enhanced = link_all_fhir_records_block_dataset(evaluation_set, DIBBS_ENHANCED, db_client, labeling_set)
found_matches_dibbs_enhanced = dedupe_match_double_counts(found_matches_dibbs_enhanced)

In [None]:
# MOUNT THE FILE SYSTEM SO WE CAN WRITE THE OUTPUTS TO FILES

# Set paths
STORAGE_ACCOUNT = "$STORAGE_ACCOUNT"
LINKAGE_OUTPUTS_FILESYSTEM = f"abfss://linkage-notebook-outputs@{STORAGE_ACCOUNT}.dfs.core.windows.net/"
BLOB_STORAGE_LINKED_SERVICE = "$BLOB_STORAGE_LINKED_SERVICE"

from notebookutils import mssparkutils

# Set up for writing to blob storage
linkage_bucket_name = "linkage-notebook-outputs"
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(BLOB_STORAGE_LINKED_SERVICE)
wasb_path = 'wasbs://%s@%s.blob.core.windows.net/' % (linkage_bucket_name, STORAGE_ACCOUNT)
spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (linkage_bucket_name, STORAGE_ACCOUNT), blob_sas_token)
# Try mounting the remote storage directory at the mount point
try:
    mssparkutils.fs.mount(
        wasb_path,
        "/",
        {"LinkedService": f"${BLOB_STORAGE_LINKED_SERVICE}"}
    )
except Exception as err:
    # Still best-effort: a failed mount is treated as "already mounted", but
    # the underlying error is printed so a real failure is not hidden, and
    # interrupts are no longer swallowed.
    print("Already mounted (or mount failed):", err)

In [None]:
# RECOMPUTE AND EXPORT LOG-ODDS

from phdi.linkage import calculate_m_probs, calculate_u_probs, calculate_log_odds
import json

# The m- and u-probabilities are both computed from the tabulated records,
# using the third-party (ECM) labels as the set of known matches.
m_probs = calculate_m_probs(labeling_set, third_party_labels)
u_probs = calculate_u_probs(labeling_set, third_party_labels, n_samples=25000)
log_odds = calculate_log_odds(m_probs, u_probs)
# Drop the entry for the record identifier before exporting; this raises
# KeyError if "patient_id" is not among the computed fields.
log_odds.pop("patient_id")
# NOTE(review): the trailing True is presumably the overwrite flag -- confirm
# against the mssparkutils.fs.put documentation.
mssparkutils.fs.put(LINKAGE_OUTPUTS_FILESYSTEM + "updated_log_odds.json", json.dumps(log_odds), True)

In [None]:
# RUN THE NUMBERS AND GET THE STATS FUNCTIONS

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator rounded to 3 places, or NaN if undefined."""
    if denominator == 0:
        return float("nan")
    return round(numerator / denominator, 3)


def score_linkage_vs_truth(found_matches, true_matches, records_in_dataset):
    """
    Score a set of found links against ground-truth links.

    To ensure accurate statistics, the matches and the true matches
    dictionaries should have the following form:

    {
        row_num_of_record_in_data: set(row_nums_of_linked_records)
    }

    Each row in the data should be represented as a key in both dictionaries.
    The value for each of these keys should be a set that contains all other
    row numbers for records in the data set that link to the key record.

    :param found_matches: Links produced by the algorithm being evaluated.
    :param true_matches: Ground-truth links.
    :param records_in_dataset: Number of records the links were drawn from.
    :return: Dict with the counts "tp", "fp", "fn" and the rates "sens",
      "spec", "ppv", "npv", "f1" rounded to 3 places. A rate whose
      denominator is 0 (for example PPV when nothing was found) is NaN.
    """
    # Need division by 2 because ordering is irrelevant, matches are symmetric
    total_possible_matches = (records_in_dataset * (records_in_dataset - 1)) / 2.0
    true_positives = 0.0
    false_positives = 0.0
    false_negatives = 0.0

    for root_record, truth in true_matches.items():
        if root_record in found_matches:
            found = found_matches[root_record]
            true_positives += len(truth.intersection(found))
            false_positives += len(found.difference(truth))
            false_negatives += len(truth.difference(found))
        else:
            # Nothing was found for this record, so every true link is missed
            false_negatives += len(truth)

    # Links found for records absent from the ground truth are all false
    for record in set(found_matches).difference(true_matches):
        false_positives += len(found_matches[record])

    true_negatives = (
        total_possible_matches - true_positives - false_positives - false_negatives
    )

    return {
        "tp": true_positives,
        "fp": false_positives,
        "fn": false_negatives,
        "sens": _safe_ratio(true_positives, true_positives + false_negatives),
        "spec": _safe_ratio(true_negatives, true_negatives + false_positives),
        "ppv": _safe_ratio(true_positives, true_positives + false_positives),
        "npv": _safe_ratio(true_negatives, true_negatives + false_negatives),
        "f1": _safe_ratio(
            2 * true_positives,
            2 * true_positives + false_negatives + false_positives,
        ),
    }

# Score against the number of records that were actually labeled. DATA_SIZE
# can be larger than the table, in which case using it directly would
# overstate the number of possible pairs and therefore the true negatives.
n_records = len(labeling_set)

# (label type, ground-truth labels) in reporting order
label_sets = (
    ("va", va_labels),
    ("emc", third_party_labels),
)

# (section heading, links found by that algorithm) in reporting order
evaluated_algorithms = (
    ("LAC Existing Algorithm:\n", found_matches_lac),
    ("DIBBs Basic Algorithm:\n", found_matches_dibbs_basic),
    ("DIBBs Log-Odds Algorithm:\n", found_matches_dibbs_enhanced),
)

# (line caption, key in the dict returned by score_linkage_vs_truth)
stat_lines = (
    ("True Positives: ", "tp"),
    ("False Positives: ", "fp"),
    ("False Negatives: ", "fn"),
    ("Sensitivity: ", "sens"),
    ("Specificity: ", "spec"),
    ("PPV: ", "ppv"),
    ("NPV: ", "npv"),
    ("F1: ", "f1"),
)

# Collect the report pieces and join once at the end
report_parts = []
for lbl_type, labels in label_sets:
    report_parts.append("DISPLAYING EVALUATION ON " + lbl_type.upper() + " LABELS:\n")
    report_parts.append("\n")

    for heading, algo_matches in evaluated_algorithms:
        scores = score_linkage_vs_truth(algo_matches, labels, n_records)
        report_parts.append(heading)
        for caption, stat_key in stat_lines:
            report_parts.append(caption + str(scores[stat_key]) + "\n")
        report_parts.append("\n")

    report_parts.append("\n")

display_str = "".join(report_parts)

print(display_str)

mssparkutils.fs.put(LINKAGE_OUTPUTS_FILESYSTEM + "results.txt", display_str, True)