### tabularizePatientTable
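This notebook extracts all data from the Patient table (`patient`) in the Master Patient Index (MPI) and tabularizes it into a List of Lists of Lists (LoLoL). It then generates ground-truth match labels for that data and provides a helper for scoring linkage algorithms against those labels.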

In [None]:
pip install psycopg2-binary azure-identity phdi==1.0.6 recordlinkage

In [None]:
# Database-related imports
import psycopg2
from psycopg2.sql import Identifier, SQL
from azure.identity import DefaultAzureCredential
from phdi.linkage.postgres import DIBBsConnectorClient

# Ground-truth labeling imports
import time
import pandas as pd
import recordlinkage as rl
from phdi.linkage import score_linkage_vs_truth

In [None]:
# GLOBAL VARIABLES FOR DATABASE ACCESS

# Set your Key Vault information
# NOTE(review): the "$..." values look like template placeholders that must be
# replaced with real values before the notebook is run — confirm against the
# deployment process.
vault_name = "$KEY_VAULT"
# NOTE(review): KEY_VAULT_URL is not referenced by any other visible cell.
KEY_VAULT_URL = f"https://{vault_name}.vault.azure.net"
vault_linked_service = "$KEY_VAULT_LINKED_SERVICE"

# Set up db_client
# These settings are passed to DIBBsConnectorClient in the MPI access cell.
DB_NAME = "DibbsMpiDB"
DB_USER = "postgres"
DB_HOST = "$MPI_DB_HOST"
DB_PORT = "5432"
DB_TABLE_PATIENT = "patient"
DB_TABLE_PERSON= "person"

In [None]:
# GLOBAL VARIABLES FOR GROUND-TRUTH LABELING
# Maximum number of patient rows to keep from the MPI extract (the header row
# is not counted). The evaluation cell also passes this value to the scorer.
DATA_SIZE = None    # Optional; None means use the whole table

In [None]:
# MPI ACCESS AND TABULATION FUNCTIONS

def generate_query(db_client):
    """
    Build the query that extracts every configured field from the patient
    table of the MPI.

    :param db_client: Connector client exposing `fields_to_jsonpaths` (each
      key is used as an output column alias, each value is the JSON path
      queried against `patient_resource`) and `patient_table` (the name of
      the table to select from).
    :return: A tuple of the composed psycopg2 SQL query and the list of JSON
      path parameters, in the same order as the selected columns.
    """
    # Select columns in sorted field-name order. format_data labels the
    # returned rows with the *sorted* field names, so relying on the dict's
    # own iteration order could attach headers to the wrong columns.
    field_names = sorted(db_client.fields_to_jsonpaths)

    select_query_stubs = [
        f"jsonb_path_query_array(patient_resource,%s) as {name}"
        for name in field_names
    ]
    query_data = [db_client.fields_to_jsonpaths[name] for name in field_names]

    select_query = "SELECT " + ", ".join(select_query_stubs)

    # The table name is injected as a quoted identifier; the JSON paths stay
    # as %s placeholders and are bound by cursor.execute().
    query = SQL(select_query + " FROM {patient_table};").format(
        patient_table=Identifier(db_client.patient_table)
    )
    return query, query_data


def format_data(data, db_client):
    """
    Format rows returned from the MPI as a List of Lists of Lists (LoLoL)
    by prepending a header row of field names.

    Side effects:
      * Closes the notebook-level `conn` and `cur` objects through
        `db_client._close_connections` (they are read as globals, not
        passed in).
      * Inserts the header into `data` in place; the same list is returned.

    :param data: List of row lists fetched from the patient table.
    :param db_client: Connector client exposing `fields_to_jsonpaths` and
      `_close_connections`.
    :return: `data`, with the sorted field names inserted as its first row.
    """
    db_client._close_connections(db_conn=conn, db_cursor=cur)
    header_row = sorted(db_client.fields_to_jsonpaths)
    data.insert(0, header_row)
    return data

# Access the MPI Database
# NOTE(review): `credential` is not referenced again in the visible cells.
credential = DefaultAzureCredential()
# NOTE(review): TokenLibrary is not imported anywhere in the visible cells;
# presumably it is supplied by the notebook runtime — confirm.
db_password =  TokenLibrary.getSecret(vault_name,"mpi-db-password",vault_linked_service)
db_client = DIBBsConnectorClient(database = DB_NAME, user = DB_USER, password = db_password, host= DB_HOST, port = DB_PORT, patient_table= DB_TABLE_PATIENT, person_table=DB_TABLE_PERSON)

# Create a connection and a cursor
# `conn` and `cur` must keep these names: format_data reads them as globals.
conn = db_client.get_connection()
cur = conn.cursor()

# Query for the data and format it
query, query_data = generate_query(db_client)
cur.execute(query, query_data)
# Convert each fetched row to a list so the result is a list of lists.
data = [list(row) for row in cur.fetchall()]
# format_data prepends the header row and also closes `conn` and `cur`.
formatted_data = format_data(data,db_client)

# Apply any size caveats, if desired
# Row 0 of formatted_data is the header, hence DATA_SIZE + 1 rows are kept.
if DATA_SIZE is not None:
    dataset = formatted_data[:min(DATA_SIZE+1, len(formatted_data))]
else:
    dataset = formatted_data

In [None]:
# GROUND TRUTH LABELING: VIRGINIA FUNCTIONS
# TODO: Write a simple labeling function that accepts a LoLoL and outputs a dictionary
# of "true matches" in this data. For the format of this dictionary, see the evaluation
# cell below. Since the order of the data doesn't matter, you can use each record's row
# number in the list as its index for computing purposes.

In [None]:

# GROUND-TRUTH LABELING: RECORD LINKAGE TOOLKIT FUNCTIONS

def get_pred_match_dict_from_multi_idx(mltidx, n_rows):
    """
    Transform a recordlinkage toolkit multi-index of predicted links into a
    dictionary of matches keyed by row number.

    Every row number in `range(n_rows)` gets a key. Each linked pair is
    recorded once, under the smaller row number of the pair, with the larger
    row number added to that key's set.

    :param mltidx: Multi-index of linked record pairs; must support
      `to_list()`.
    :param n_rows: Number of rows in the data set.
    :return: Dictionary mapping row number to the set of higher-numbered
      rows linked to it.
    """
    linked_pairs = mltidx.to_list()
    links_by_row = {row_num: set() for row_num in range(n_rows)}
    for linked_pair in linked_pairs:
        lower_row, higher_row = min(linked_pair), max(linked_pair)
        links_by_row[lower_row].add(higher_row)
    return links_by_row


def predict_third_party_labels(data):
    """
    Label matches in the extracted MPI data using the Record Linkage Toolkit.

    Builds every candidate pair of rows, compares each supported field with
    a thresholded Jaro-Winkler string comparison, and classifies the
    resulting comparison vectors with an unsupervised ECM classifier.

    :param data: LoLoL whose first row holds the column names and whose
      remaining rows hold the patient records.
    :return: Dictionary mapping each row number to the set of higher-numbered
      rows predicted to link to it.
    """
    data = pd.DataFrame(data[1:], columns=data[0])
    start = time.time()

    # Create a full index on patient table so we don't miss any pairs
    indexer = rl.Index()
    indexer.full()
    candidate_links = indexer.index(data)
    # Note: using a multi-indexer treats the row number as the index, so
    # results will automatically be in acceptable eval format

    print(len(candidate_links), "candidate pairs identified")

    # Apply feature comparisons on each supported field from the MPI
    # NOTE(review): the cells come from jsonb_path_query_array, so they are
    # presumably lists rather than plain strings — confirm the string
    # comparison handles them as intended.
    compared_fields = (
        "first_name",
        "last_name",
        "mrn",
        "birthdate",
        "address",
        "city",
        "state",
        "zip",
        "sex",
    )
    comp = rl.Compare()
    for field in compared_fields:
        comp.string(field, field, method="jarowinkler", threshold=0.85, label=field)
    features = comp.compute(candidate_links, data)

    # Create an EM Predictor and label the binary training vectors
    clf = rl.ECMClassifier()
    pred_links = clf.fit_predict(features)

    end = time.time()
    print("Computation took", str(round(end - start, 2)), "seconds")

    # Size the result from the rows actually received rather than the global
    # DATA_SIZE, which can differ from the real row count (for example when
    # it is larger than the table).
    matches = get_pred_match_dict_from_multi_idx(pred_links, len(data))
    return matches

# Label the extracted dataset with the Record Linkage Toolkit pipeline.
third_party_labels = predict_third_party_labels(dataset)

In [None]:
# ALGORITHM EVALUATION: LAC EXISTING
# TODO: Write a function that runs LAC's existing algorithm on the LoLoL
# data extracted from the MPI and creates matches following acceptable
# evaluation dictionary format.

In [None]:
# ALGORITHM EVALUATION: DIBBs BASIC
# TODO: Write a function that runs the DIBBs basic algorithm on the LoLoL
# data extracted from the MPI and outputs its matches in a dictionary that
# conforms to the row number format below.

In [None]:
# ALGORITHM EVALUATION: DIBBs ENHANCED
# TODO: As above but with the DIBBs enhanced algorithm

In [None]:
# RUN THE NUMBERS AND GET THE STATS FUNCTIONS

'''
To ensure accurate statistics, the matches and the true matches dictionaries
in the statistical evaluation function should have the following form:

{
    row_num_of_record_in_data: set(row_nums_of_linked_records)
}

Each row in the data should be represented as a key in both dictionaries.
The value for each of these keys should be a set that contains all other
row numbers for records in the data set that link to the key record.
'''
def display_statistical_evaluation(
    matches: dict, true_matches: dict, cluster_mode_used: bool = False
):
    """
    Score a set of predicted matches against the true matches and print the
    sensitivity, specificity, PPV, and F1 statistics.

    :param matches: Dictionary mapping each row number to the set of row
      numbers an algorithm linked to it.
    :param true_matches: Dictionary in the same format holding the
      ground-truth links; each row of the data should appear as a key.
    :param cluster_mode_used: Forwarded unchanged to `score_linkage_vs_truth`
      as its fourth argument.
    """
    # DATA_SIZE is None when the whole table is used, which is not a usable
    # record count. Every row is expected to be a key of true_matches, so its
    # length is the fallback.
    n_records = DATA_SIZE if DATA_SIZE is not None else len(true_matches)
    sensitivity, specificity, ppv, f1 = score_linkage_vs_truth(
        matches, true_matches, n_records, cluster_mode_used
    )
    print("Sensitivity:", sensitivity)
    print("Specificity:", specificity)
    print("PPV:", ppv)
    print("F1:", f1)

# Call this function once for each combination of label, linkage_algo
# display_statistical_evaluation(matches, true_matches)