### tabularizePatientTable
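This notebook extracts all data from the Patient table (`patient`) in the Master Patient Index (MPI) and tabularizes it into a List of Lists of Lists (LoLoL). It then generates ground-truth match labels for those records and evaluates several record linkage algorithms against the labels.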

In [None]:
pip install psycopg2-binary azure-identity phdi recordlinkage

In [None]:
# Database-related imports
import psycopg2
from psycopg2.sql import Identifier, SQL
from azure.identity import DefaultAzureCredential
from phdi.linkage.postgres import DIBBsConnectorClient

# Ground-truth labeling imports
import time
import pandas as pd
import recordlinkage as rl
from phdi.linkage import score_linkage_vs_truth
from recordlinkage.base import BaseCompareFeature
import numpy as np
from phdi.harmonization import compare_strings

In [None]:
# GLOBAL VARIABLES FOR DATABASE ACCESS

# Set your Key Vault information
# NOTE(review): the "$..." values look like placeholders that must be replaced
# with real values before the notebook is run -- confirm against your
# deployment process.
vault_name = "$KEY_VAULT"
# Derived from vault_name; not referenced again in the visible cells.
KEY_VAULT_URL = f"https://{vault_name}.vault.azure.net"
vault_linked_service = "$KEY_VAULT_LINKED_SERVICE"

# Set up db_client
# These settings are passed to DIBBsConnectorClient when the MPI is accessed.
DB_NAME = "DibbsMpiDB"
DB_USER = "postgres"
DB_HOST = "$MPI_DB_HOST"
DB_PORT = "5432"
DB_TABLE_PATIENT = "patient"
DB_TABLE_PERSON= "person"

In [None]:
# GLOBAL VARIABLES FOR GROUND-TRUTH LABELING
# Upper bound on how many patient rows are labeled and evaluated.
DATA_SIZE = None    # Optional variable; if None, use the whole table

In [None]:
# MPI ACCESS AND TABULATION FUNCTIONS

# Generate a query to extract all data from the patient table of the MPI.
def generate_query(db_client):
    """
    Build a parameterized SELECT over the MPI patient table.

    One `jsonb_path_query_array` column is selected per entry in
    `db_client.fields_to_jsonpaths`, aliased with the field name, after the
    leading `patient_id` column.

    :param db_client: Object exposing `fields_to_jsonpaths` (field name ->
      JSON path) and `patient_table` (name of the patient table).
    :return: A tuple of the composed query and the list of JSON paths to bind
      to its `%s` placeholders, in column order.
    """
    # Emit the columns in sorted field order: format_data labels the result
    # columns with the sorted field names, so the SELECT order must agree with
    # it rather than depend on the dict's insertion order.
    field_names = sorted(db_client.fields_to_jsonpaths)
    query_data = [db_client.fields_to_jsonpaths[key] for key in field_names]

    # Joining the full column list (patient_id included) avoids a dangling
    # comma if there are no JSON-path fields at all.
    select_columns = ["patient_id"]
    select_columns.extend(
        f"jsonb_path_query_array(patient_resource,%s) as {key}"
        for key in field_names
    )
    select_query = "SELECT " + ", ".join(select_columns)

    # The table name is injected as a quoted identifier, never as raw text.
    query = SQL(select_query + " FROM {patient_table};").format(
        patient_table=Identifier(db_client.patient_table)
    )
    return query, query_data


# Format returned data as Lists of Lists of Lists (LoLoL)
def format_data(data, db_client):
    """
    Prepend a header row to `data` and un-nest every non-ID cell by one level.

    `data` is modified in place and also returned.

    :param data: List of row lists; column 0 is the patient ID and every
      other cell must support `len()` and indexing.
    :param db_client: Object exposing `fields_to_jsonpaths` and
      `_close_connections`.
    :return: The same `data` list, now starting with the header row.
    """
    # NOTE: `conn` and `cur` are not parameters; they are the module-level
    # connection and cursor that must exist before this function is called.
    db_client._close_connections(db_conn=conn, db_cursor=cur)

    # Header is the patient ID followed by the field names in sorted order
    header = ["patient_id"] + sorted(db_client.fields_to_jsonpaths.keys())
    data.insert(0, header)

    # Bring all data elements up one list level to avoid overly deep nesting;
    # the header row and each row's patient ID are left untouched
    for row in data[1:]:
        for col in range(1, len(row)):
            cell = row[col]
            row[col] = cell[0] if len(cell) > 0 else ""
    return data

# Access the MPI Database
credential = DefaultAzureCredential()
# NOTE(review): TokenLibrary is not imported anywhere in this notebook; it is
# presumably provided by the hosting notebook runtime -- confirm.
db_password = TokenLibrary.getSecret(vault_name, "mpi-db-password", vault_linked_service)
db_client = DIBBsConnectorClient(
    database=DB_NAME,
    user=DB_USER,
    password=db_password,
    host=DB_HOST,
    port=DB_PORT,
    patient_table=DB_TABLE_PATIENT,
    person_table=DB_TABLE_PERSON,
)

# Create a connection and a cursor
conn = db_client.get_connection()
cur = conn.cursor()

# Query for the data and format it
# NOTE: format_data closes `conn` and `cur` itself (it reads them as globals),
# so they are not closed again here.
query, query_data = generate_query(db_client)
cur.execute(query, query_data)
data = [list(row) for row in cur.fetchall()]
formatted_data = format_data(data, db_client)

# Apply any size caveats, if desired
# formatted_data starts with a header row, hence the +1 when truncating
if DATA_SIZE is not None:
    dataset = formatted_data[:DATA_SIZE + 1]
else:
    dataset = formatted_data
labeling_set = pd.DataFrame(dataset[1:], columns=formatted_data[0])

# Now, we need a copy of the data in a FHIR format for the linkage algorithms
conn = db_client.get_connection()
cur = conn.cursor()
try:
    # Use the configured patient table rather than a hard-coded table name
    query = SQL("SELECT patient_resource FROM {patient_table};").format(
        patient_table=Identifier(db_client.patient_table)
    )
    cur.execute(query)
    fhir_data = [row[0] for row in cur.fetchall()]
finally:
    # Release the connection even if the query fails
    db_client._close_connections(db_conn=conn, db_cursor=cur)

# fhir_data has no header row, so it is cut to exactly as many records as
# were kept for labeling (slicing with DATA_SIZE + 1 would keep one extra
# record that has no row in labeling_set).
# NOTE(review): neither query specifies an ORDER BY, so this assumes both
# return rows in the same order -- confirm.
if DATA_SIZE is not None:
    evaluation_set = fhir_data[:len(labeling_set)]
else:
    evaluation_set = fhir_data


In [None]:
# GROUND TRUTH LABELING: VIRGINIA FUNCTIONS

# Transform a recordlinkage toolkit multi-index into a dict that maps each
# row number to the set of higher row numbers it was paired with
def get_pred_match_dict_from_multi_idx(mltidx, n_rows):
    """
    Convert an index of record pairs into a row -> linked-rows dictionary.

    :param mltidx: Object whose `to_list()` returns the candidate pairs.
    :param n_rows: Number of rows; every value in `range(n_rows)` becomes a
      key of the result, initially mapped to an empty set.
    :return: Dict in which, for each pair, the larger member is added to the
      set stored under the smaller member.
    """
    pairs = mltidx.to_list()

    pred_matches = {}
    for row in range(n_rows):
        pred_matches[row] = set()

    # File each pair under its smaller member so a link is recorded only once
    for pair in pairs:
        low, high = min(pair), max(pair)
        pred_matches[low].add(high)
    return pred_matches


# Special class for comparing nested (LoL) elements
# Only the leading element (`.str[0]`) of each value is compared
class CompareNestedString(BaseCompareFeature):
    """Exact-match comparison on the first element of each value."""

    def _compute_vectorized(self, s1, s2):
        # Pull out the first element of every entry on both sides
        left_heads = s1.str[0]
        right_heads = s2.str[0]
        # Report agreement as 1.0 and disagreement as 0.0
        is_equal = left_heads == right_heads
        return is_equal.astype(float)

def get_va_labels(data):
    """
    Label record pairs as matches when all four compared fields agree.

    Every pair of rows in `data` is compared on first name, last name,
    birthdate and address; pairs whose feature values sum to 4 are kept.

    :param data: Table with `first_name`, `last_name`, `birthdate` and
      `address` columns.
    :return: Dict mapping each row number to the set of row numbers it
      matched, as built by `get_pred_match_dict_from_multi_idx`.
    """
    started_at = time.time()

    # Create a full index on patient table so we don't miss any pairs
    # Note: using a multi-indexer treats the row number as the index, so
    # results will automatically be in acceptable eval format
    full_indexer = rl.Index()
    full_indexer.full()
    candidate_links = full_indexer.index(data)

    print(len(candidate_links), "candidate pairs identified")

    # Apply feature comparisons on each supported field from the MPI
    comparer = rl.Compare()
    comparer.add(CompareNestedString("first_name", "first_name", label="first_name"))
    for exact_field in ("last_name", "birthdate"):
        comparer.exact(exact_field, exact_field, label=exact_field)
    comparer.add(CompareNestedString("address", "address", label="address"))
    features = comparer.compute(candidate_links, data)

    # A pair is a match only if all four features agree
    all_agree = features.sum(axis=1) == 4
    matched_pairs = features[all_agree]

    elapsed = time.time() - started_at
    print("Computation took", str(round(elapsed, 2)), "seconds")

    if DATA_SIZE is not None:
        n_rows = DATA_SIZE
    else:
        n_rows = len(data)
    return get_pred_match_dict_from_multi_idx(matched_pairs.index, n_rows)

va_labels = get_va_labels(labeling_set)

In [None]:
# GROUND-TRUTH LABELING: RECORD LINKAGE TOOLKIT FUNCTIONS

# Special class for comparing LoL first name elements
# Use the full concatenation of all names to account for multiple given names
class CompareFirstName(BaseCompareFeature):
    """Fuzzy comparison of space-joined given names, thresholded at 0.85."""

    def _compute_vectorized(self, s1, s2):
        # Declaring otypes lets the vectorized call accept empty inputs and
        # stops the output type from being inferred from the first pair
        score_pairs = np.vectorize(compare_strings, otypes=[float])
        jarowinklers = score_pairs(s1.str.join(" "), s2.str.join(" "))
        return jarowinklers >= 0.85


# Special class for comparing LoL address line elements
# Check each address line against each other address line to account for moving
class CompareAddress(BaseCompareFeature):
    """Fuzzy comparison of address lines: best pairwise score, cut at 0.85."""

    def _compute_vectorized(self, s1, s2):

        def comp_address_fields(a1_list, a2_list):
            # Best score over every pairing of lines; stays at 0.0 when
            # either side has no lines to iterate over
            best_score = 0.0
            for a1 in a1_list:
                for a2 in a2_list:
                    best_score = max(best_score, compare_strings(a1, a2))
            return best_score

        # Declaring otypes lets the vectorized call accept empty inputs and
        # stops the output type from being inferred from the first pair
        score_pairs = np.vectorize(comp_address_fields, otypes=[float])
        jarowinklers = score_pairs(s1, s2)
        return jarowinklers >= 0.85
    

def predict_third_party_labels(data):
    """
    Label record pairs with the Record Linkage Toolkit's ECM classifier.

    Every pair of rows in `data` is compared field by field, and the
    resulting feature vectors are passed to `ECMClassifier.fit_predict`.

    :param data: Table with `first_name`, `last_name`, `mrn`, `birthdate`,
      `address`, `city`, `state`, `zip` and `sex` columns.
    :return: Dict mapping each row number to the set of row numbers it
      matched, as built by `get_pred_match_dict_from_multi_idx`.
    """
    started_at = time.time()

    # Create a full index on patient table so we don't miss any pairs
    # Note: using a multi-indexer treats the row number as the index, so
    # results will automatically be in acceptable eval format
    full_indexer = rl.Index()
    full_indexer.full()
    candidate_links = full_indexer.index(data)

    print(len(candidate_links), "candidate pairs identified")

    # All plain string fields share the same Jaro-Winkler comparison
    def add_fuzzy_fields(comparer, fields):
        for field in fields:
            comparer.string(
                field, field, method="jarowinkler", threshold=0.85, label=field
            )

    # Apply feature comparisons on each supported field from the MPI,
    # keeping the features in their established order
    comparer = rl.Compare()
    comparer.add(CompareFirstName("first_name", "first_name", label="first_name"))
    add_fuzzy_fields(comparer, ("last_name", "mrn", "birthdate"))
    comparer.add(CompareAddress("address", "address", label="address"))
    add_fuzzy_fields(comparer, ("city", "state", "zip", "sex"))
    features = comparer.compute(candidate_links, data)

    # Create an EM Predictor and label the binary training vectors
    classifier = rl.ECMClassifier()
    pred_links = classifier.fit_predict(features)

    elapsed = time.time() - started_at
    print("Computation took", str(round(elapsed, 2)), "seconds")

    if DATA_SIZE is not None:
        n_rows = DATA_SIZE
    else:
        n_rows = len(data)
    return get_pred_match_dict_from_multi_idx(pred_links, n_rows)

third_party_labels = predict_third_party_labels(labeling_set)

In [None]:
# LINKAGE DRIVER FUNCTIONS

from phdi.linkage.link import (
    _flatten_patient_resource,
    _bind_func_names_to_invocations,
    extract_blocking_values_from_record,
    _compare_records,
    extract_blocking_values_from_record
)
from typing import List
import copy

def link_fhir_record_from_dataset(
    record: dict,
    algo_config: List[dict],
    db_client
) -> List:
    """
    Run every pass of a linkage algorithm for one record against the MPI.

    :param record: The patient resource to link.
    :param algo_config: List of linkage passes; each pass supplies `blocks`,
      `funcs`, `matching_rule` and, optionally, `kwargs`.
    :param db_client: Client whose `block_data` returns the blocked
      candidates, with a header row first.
    :return: The first column (patient ID) of every blocked record that
      matched, accumulated across all passes; duplicates are not removed.
    """
    flattened_record = _flatten_patient_resource(record)

    # Need to bind function names back to their symbolic invocations
    # in context of the module--i.e. turn the string of a function
    # name back into the callable defined in link.py. This is done on a
    # deep copy so the caller's configuration is left as it was.
    bound_config = _bind_func_names_to_invocations(copy.deepcopy(algo_config))

    # Accumulate all matches across all passes to return
    found_matches = []
    for linkage_pass in bound_config:
        # MPI will be able to find patients if *any* of their names or addresses
        # contains extracted values, so minimally block on the first line
        # if applicable
        field_blocks = extract_blocking_values_from_record(
            record, linkage_pass["blocks"]
        )
        block = db_client.block_data(field_blocks)

        # First row of returned block is column headers
        # Map column name to idx, not including patient/person IDs
        header = block[0]
        candidates = block[1:]
        col_to_idx = {name: pos for pos, name in enumerate(header[2:])}

        # Blocked fields are returned as LoLoL; bring every field after the
        # two ID columns up one list level (empty fields become "")
        for candidate in candidates:
            for pos in range(2, len(candidate)):
                cell = candidate[pos]
                candidate[pos] = cell[0] if len(cell) > 0 else ""

        # Check if incoming record matches each thing it blocked with
        kwargs = linkage_pass.get("kwargs", {})
        found_matches.extend(
            candidate[0]
            for candidate in candidates
            if _compare_records(
                flattened_record,
                candidate,
                linkage_pass["funcs"],
                col_to_idx,
                linkage_pass["matching_rule"],
                **kwargs
            )
        )

    return found_matches

def map_patient_ids_to_idxs(pids: List, data: pd.DataFrame):
    """
    Translate patient IDs into the index labels of their rows in `data`.

    :param pids: Patient IDs to look up.
    :param data: Table with a `patient_id` column.
    :return: List holding, for each ID found in `data`, the index label of
      its first matching row, in the order of `pids`. IDs with no row in
      `data` are skipped, so the result can be shorter than `pids`.
    """
    patient_ids = data["patient_id"]
    record_idxs = []
    for pid in pids:
        hits = data.index[(patient_ids == pid).to_numpy()]
        # Blocking queries the whole MPI, so a linked patient may fall
        # outside a size-limited table; skip it rather than raise IndexError.
        if len(hits) > 0:
            record_idxs.append(hits.values[0])
    return record_idxs


def link_all_fhir_records_block_dataset(records: List, algo_config: List[dict], db_client, label_set):
    """
    Link every record against the MPI and express the results as row indices.

    :param records: Patient resources; each one's `id` is looked up in
      `label_set` to find its own row index.
    :param algo_config: Linkage algorithm configuration (list of passes).
    :param db_client: Client used to block candidates from the MPI.
    :param label_set: Table used to map patient IDs to row indices.
    :return: Dict mapping each record's row index to the set of row indices
      it linked to, excluding the record itself.
    """
    found_matches = {}
    for record in records:
        own_idx = map_patient_ids_to_idxs([record.get("id")], label_set)[0]
        linked_pids = link_fhir_record_from_dataset(record, algo_config, db_client)
        linked_idxs = set(map_patient_ids_to_idxs(linked_pids, label_set))
        # A record always blocks with itself; that is not a real link
        linked_idxs.discard(own_idx)
        found_matches[own_idx] = linked_idxs
    return found_matches


def dedupe_match_double_counts(match_dict):
    """
    Drop links that point back to a lower-numbered row.

    For every key `k` greater than 0, members of its set that fall in
    `0 .. k-1` are removed, leaving each link recorded only under the
    smaller row number of the pair. Keys and set members are expected to be
    integer row indices.

    :param match_dict: Dict mapping a row index to a set of linked row
      indices. It is updated in place.
    :return: The same `match_dict` object.
    """
    for k in match_dict:
        if k > 0:
            # Filter with a range comparison rather than materializing
            # set(range(k)) for every key, which is quadratic overall
            match_dict[k] = {
                linked for linked in match_dict[k] if not 0 <= linked < k
            }
    return match_dict

In [None]:
# ALGORITHM EVALUATION: LAC EXISTING

# Three-pass linkage configuration. The passes share the same comparison
# functions and matching rule and differ only in the fields they block on.
# Per pass:
#   "funcs"         - field name -> name of the comparison function
#   "blocks"        - fields (with an optional transformation) used to block
#   "matching_rule" - name of the rule that decides whether a pair matches
# NOTE(review): "cluster_ratio" is not read by any code visible in this
# notebook -- confirm where it is consumed.
LAC_ALGO = [
    # Pass 1: block on first name, last name and birthdate
    {
        "funcs": {
            "first_name": "feature_match_fuzzy_string",
            "last_name": "feature_match_fuzzy_string",
            "address": "feature_match_fuzzy_string",
            "mrn": "feature_match_fuzzy_string",
        },
        "blocks": [
            {"value": "first_name", "transformation": "first4"},
            {"value": "last_name", "transformation": "first4"},
            {"value": "birthdate"},
        ],
        "matching_rule": "eval_perfect_match",
        "cluster_ratio": 0.9,
    },
    # Pass 2: block on first name, last name and address
    {
        "funcs": {
            "first_name": "feature_match_fuzzy_string",
            "last_name": "feature_match_fuzzy_string",
            "address": "feature_match_fuzzy_string",
            "mrn": "feature_match_fuzzy_string",
        },
        "blocks": [
            {"value": "first_name", "transformation": "first4"},
            {"value": "last_name", "transformation": "first4"},
            {"value": "address", "transformation": "first4"},
        ],
        "matching_rule": "eval_perfect_match",
        "cluster_ratio": 0.9,
    },
    # Pass 3: block on birthdate only
    {
        "funcs": {
            "first_name": "feature_match_fuzzy_string",
            "last_name": "feature_match_fuzzy_string",
            "address": "feature_match_fuzzy_string",
            "mrn": "feature_match_fuzzy_string",
        },
        "blocks": [
            {"value": "birthdate"},
        ],
        "matching_rule": "eval_perfect_match",
        "cluster_ratio": 0.9,
    },
]

# Evaluate the LAC configuration defined above. (Passing DIBBS_BASIC here
# would score the wrong algorithm, and that name is not imported until a
# later cell.)
found_matches_lac = link_all_fhir_records_block_dataset(evaluation_set, LAC_ALGO, db_client, labeling_set)
# Keep each link only under the lower row index of the pair
found_matches_lac = dedupe_match_double_counts(found_matches_lac)

In [None]:
# ALGORITHM EVALUATION: DIBBs BASIC
from phdi.linkage import DIBBS_BASIC
# Link every evaluation record with the DIBBS_BASIC configuration, then drop
# links that point back to a lower row index
found_matches_dibbs_basic = link_all_fhir_records_block_dataset(evaluation_set, DIBBS_BASIC, db_client, labeling_set)
found_matches_dibbs_basic = dedupe_match_double_counts(found_matches_dibbs_basic)

In [None]:
# ALGORITHM EVALUATION: DIBBs ENHANCED
from phdi.linkage import DIBBS_ENHANCED
# Link every evaluation record with the DIBBS_ENHANCED configuration, then
# drop links that point back to a lower row index
found_matches_dibbs_enhanced = link_all_fhir_records_block_dataset(evaluation_set, DIBBS_ENHANCED, db_client, labeling_set)
found_matches_dibbs_enhanced = dedupe_match_double_counts(found_matches_dibbs_enhanced)

In [None]:
# RUN THE NUMBERS AND GET THE STATS FUNCTIONS

'''
To ensure accurate statistics, the matches and the true matches dictionaries
in the statistical evaluation function should have the following form:

{
    row_num_of_record_in_data: set(row_nums_of_linked_records)
}

Each row in the data should be represented as a key in both dictionaries.
The value for each of these keys should be a set that contains all other
row numbers for records in the data set that link to the key record.
'''
def display_statistical_evaluation(
    matches: dict, true_matches: dict, num_recs: int, cluster_mode_used: bool = False
):
    """
    Score found matches against true matches and print the four statistics.

    :param matches: Dict of row number -> set of linked row numbers found by
      the algorithm under evaluation.
    :param true_matches: Dict of the same form holding the ground truth.
    :param num_recs: Number of records in the evaluated data set.
    :param cluster_mode_used: Forwarded unchanged to `score_linkage_vs_truth`.
    """
    scores = score_linkage_vs_truth(
        matches, true_matches, num_recs, cluster_mode_used
    )
    sensitivity, specificity, ppv, f1 = scores

    report = (
        ("Sensitivity:", sensitivity),
        ("Specificity:", specificity),
        ("PPV:", ppv),
        ("F1:", f1),
    )
    for label, value in report:
        print(label, value)

# Number of records being scored. Taken from the labeled table itself so the
# count stays right even when DATA_SIZE is larger than the patient table.
n_records = len(labeling_set)

# Algorithms to report on, in display order
evaluated_algorithms = [
    ("LAC Existing Algorithm:", found_matches_lac),
    ("DIBBs Basic Algorithm:", found_matches_dibbs_basic),
    ("DIBBs Log-Odds Algorithm:", found_matches_dibbs_enhanced),
]

# Ground-truth label sets to score against, in display order
label_sets = [
    ("DISPLAYING EVALUATION ON VA LABELS:", va_labels),
    ("DISPLAYING EVALUATION ON EMC LABELS:", third_party_labels),
]

for section_idx, (section_title, true_labels) in enumerate(label_sets):
    # Two blank lines separate consecutive sections
    if section_idx > 0:
        print()
        print()
    print(section_title)
    for algorithm_title, found in evaluated_algorithms:
        print()
        print(algorithm_title)
        display_statistical_evaluation(found, true_labels, n_records)
