# convertParquetMPI

This notebook reads in patient data from an uploaded parquet file (`mpi_incoming_file_path`), converts the data to FHIR bundles, and posts the FHIR bundles to the Record Linkage endpoint.

In [None]:
pip install azure-identity phdi

Set up parameters for connecting to the storage account, the key vault, and the record linkage and ingestion container apps.

In [None]:
from pyspark.sql import SparkSession
from phdi.linkage.seed import convert_to_patient_fhir_resources
from azure.identity import DefaultAzureCredential
import requests
import time
from datetime import date
from phdi.transport.http import http_request_with_retry

# Reuse the notebook's active Spark session, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Set up file client and endpoint credentials.
# NOTE(review): the "$..." values below look like template placeholders that are
# substituted before the notebook is run -- confirm against the deployment tooling.
account_name = "$STORAGE_ACCOUNT"
# Prefix of the OAuth scope; set_up_endpoint_access() appends "-<endpoint>/.default".
endpoint_scope = "$SCOPE"
# Container that convert() writes the pre-/post-standardization bundle snapshots to.
snapshot_bucket_name = "bundle-snapshots"
file_system_name = "patient-data"
file_path = "MPI.parquet"
storage_account_url = f"https://{account_name}.blob.core.windows.net"
# abfss:// URI of the uploaded MPI parquet file; passed to convert() at the end of the notebook.
mpi_incoming_file_path = f"abfss://{file_system_name}@{account_name}.dfs.core.windows.net/{file_path}"

# Set up key vault client
vault_name = "$KEY_VAULT"
# NOTE(review): vault_url is not referenced anywhere else in the visible notebook.
vault_url = f"https://{vault_name}.vault.azure.net/"
vault_linked_service = "$KEY_VAULT_LINKED_SERVICE"

# NOTE(review): `credential` is not referenced anywhere else in the visible notebook;
# secrets are read through TokenLibrary below instead.
credential = DefaultAzureCredential()

# Get client ID and secret for GitHub app registration.
# TokenLibrary is not imported in this notebook; presumably it is injected by the
# notebook runtime -- verify before running this code elsewhere.
client_id = TokenLibrary.getSecret(vault_name,"synapse-client-id",vault_linked_service)
client_secret = TokenLibrary.getSecret(vault_name,"synapse-client-secret",vault_linked_service)

# Token endpoint used by set_up_endpoint_access() to obtain Bearer tokens for the
# record linkage and ingestion container apps.
tenant_id = "$TENANT_ID"
endpoint_login_url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"

# Set up for record linkage and ingestion access
def set_up_endpoint_access(endpoint, url):
    """
    Request a Bearer access token for one of the container apps.

    Posts a client-credentials grant to the token endpoint at `url`, using the
    module-level `client_id`, `client_secret` and `endpoint_scope`.

    :param endpoint: Suffix identifying the container app; it is appended to
        `endpoint_scope` to build the requested scope
        (`api://{endpoint_scope}-{endpoint}/.default`).
    :param url: URL of the token endpoint to post the grant request to.
    :raises requests.HTTPError: If the token endpoint answers with a 4xx/5xx
        status (for example, wrong client secret or unknown scope).
    :raises KeyError: If a successful response carries no `access_token`.
    :return: A tuple `(data, headers)`: the grant payload that was posted and
        the `Authorization` header to attach to requests to the container app.
    """
    data = {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
        'scope': f'api://{endpoint_scope}-{endpoint}/.default'
    }

    response = requests.post(url, data=data)
    # Fail here with the real HTTP error instead of an opaque
    # KeyError('access_token') when the login request is rejected.
    response.raise_for_status()
    access_token = response.json()['access_token']
    headers = {
        'Authorization': f'Bearer {access_token}'
    }
    return data, headers

# Fetch an initial Bearer token for each container app. Every call returns the
# grant payload that was posted together with the Authorization header to use.
record_linkage_data, record_linkage_headers = set_up_endpoint_access(
    endpoint="record-linkage",
    url=endpoint_login_url,
)
ingestion_data, ingestion_headers = set_up_endpoint_access(
    endpoint="ingestion",
    url=endpoint_login_url,
)

# Define function for refreshing Bearer access token
def request_with_token_refresh(url,headers,data,endpoint_type,endpoint_login_url,max_token_refreshes=1):
    """
    POST `data` to `url`, refreshing the Bearer token when the call is rejected
    with HTTP 401.

    :param url: URL to post to.
    :param headers: Request headers holding the current `Authorization` value.
        When a token refresh happens and `headers` is a dict, it is updated in
        place, so the caller's next request already carries the fresh token
        instead of triggering another 401 + refresh round trip.
    :param data: Payload handed to `http_request_with_retry`.
    :param endpoint_type: Container app name passed to
        `set_up_endpoint_access` (e.g. "ingestion" or "record-linkage").
    :param endpoint_login_url: Token endpoint passed to `set_up_endpoint_access`.
    :param max_token_refreshes: Maximum number of token refreshes to attempt
        before giving up. Bounds the retry so that a permanently unauthorized
        client fails with an HTTP error rather than retrying without limit.
    :raises requests.HTTPError: If the final response has a 4xx/5xx status.
    :return: The successful response.
    """
    refreshes_left = max(0, max_token_refreshes)
    while True:
        resp = http_request_with_retry(url=url,retry_count=2,request_type="POST",headers=headers,allowed_methods=["POST"],data=data)
        if resp.status_code != 401 or refreshes_left == 0:
            break
        refreshes_left -= 1
        _, fresh_headers = set_up_endpoint_access(endpoint=endpoint_type,url=endpoint_login_url)
        if isinstance(headers, dict):
            # Mutate rather than rebind so the caller's header dict sees the new token.
            headers.update(fresh_headers)
        else:
            headers = fresh_headers
    resp.raise_for_status()
    return resp

# Base URLs of the container apps are stored as key vault secrets.
record_linkage_url = (
    TokenLibrary.getSecret(vault_name, "record-linkage-url", vault_linked_service)
    + "/link-record"
)
ingestion_url = TokenLibrary.getSecret(vault_name, "ingestion-url", vault_linked_service)

# Ingestion service routes called by convert(): standardization, geocoding
# and blob storage.
std_names_url = ingestion_url + "/fhir/harmonization/standardization/standardize_names"
std_phones_url = ingestion_url + "/fhir/harmonization/standardization/standardize_phones"
std_dob_url = ingestion_url + "/fhir/harmonization/standardization/standardize_dob"
std_address_url = ingestion_url + "/fhir/geospatial/geocode/geocode_bundle"
write_to_storage_url = ingestion_url + "/cloud/storage/write_blob_to_storage"

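Read the MPI parquet data into a Spark DataFrame. For each row of patient data, convert the row to a FHIR bundle and its associated `iris_id`, store a snapshot of the bundle, run the bundle through the ingestion service's standardization and geocoding endpoints, and store a second snapshot. Finally, send a POST request containing the standardized FHIR bundle and `iris_id` to the record linkage container app.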

In [None]:

def _tag_patient_source(fhir_bundle, source="uri:iris"):
    """
    Mark the bundle's first Patient resource as originating from IRIS.

    `meta` is set on the Patient *resource* (not on the enclosing bundle entry),
    and any other `meta` fields already present on the resource are kept.

    :param fhir_bundle: FHIR bundle dict; modified in place.
    :param source: Value stored in `meta.source`.
    :raises ValueError: If the bundle holds no Patient resource.
    """
    patient_resources = [
        entry["resource"]
        for entry in fhir_bundle.get("entry", [])
        if entry.get("resource", {}).get("resourceType", "") == "Patient"
    ]
    if not patient_resources:
        raise ValueError("Converted FHIR bundle does not contain a Patient resource.")
    patient_resources[0].setdefault("meta", {})["source"] = source


def _bundle_from_response(resp, step_url):
    """
    Return the `bundle` field of an ingestion response, failing loudly if absent.

    :raises ValueError: If the response body has no `bundle`, so that a missing
        bundle is not silently snapshotted and forwarded as `None`.
    """
    bundle = resp.json().get("bundle")
    if bundle is None:
        raise ValueError(f"No bundle returned by {step_url}")
    return bundle


def _snapshot_bundle(bundle, stage, curr_date, file_idx):
    """
    Write `bundle` to the snapshot container under `/{stage}/`.

    :param stage: Folder name in the snapshot container ("pre" or "post").
    :param curr_date: Date embedded in the snapshot file name.
    :param file_idx: 1-based row number embedded in the snapshot file name.
    """
    payload = {
        'blob': bundle,
        'cloud_provider': 'azure',
        'bucket_name': snapshot_bucket_name,
        'file_name': f"/{stage}/lac_extract_{curr_date}_{file_idx}.json",
        'storage_account_url': storage_account_url,
    }
    request_with_token_refresh(url=write_to_storage_url,headers=ingestion_headers,data=payload,endpoint_type="ingestion",endpoint_login_url=endpoint_login_url)


def _geocode_bundle(bundle):
    """
    Geocode the addresses in `bundle` via the ingestion service.

    The request is sent directly with `requests.post`, exactly as before, but
    now the token is refreshed once on HTTP 401 and any remaining error status
    raises instead of letting a bundle-less response flow downstream.

    :raises requests.HTTPError: If geocoding still fails after the refresh.
    :raises ValueError: If the response carries no bundle.
    :return: The geocoded bundle.
    """
    payload = {"bundle": bundle, "geocode_method": "smarty"}
    resp = requests.post(std_address_url, headers=ingestion_headers, json=payload)
    if resp.status_code == 401:
        _, fresh_headers = set_up_endpoint_access(endpoint="ingestion", url=endpoint_login_url)
        # Update in place so later ingestion calls reuse the fresh token.
        ingestion_headers.update(fresh_headers)
        resp = requests.post(std_address_url, headers=ingestion_headers, json=payload)
    resp.raise_for_status()
    return _bundle_from_response(resp, std_address_url)


# Convert and post data from mpi_incoming_file_path
def convert(mpi_incoming_file_path):
    """
    Convert each row of the MPI parquet file to a FHIR bundle and send it to
    the record linkage service.

    For every row, in order: build the bundle, tag the Patient as coming from
    IRIS, snapshot the raw bundle, standardize names, phones and date of birth,
    geocode addresses, snapshot the standardized bundle, and post the result to
    the record linkage endpoint with the row's IRIS id as `external_person_id`.

    :param mpi_incoming_file_path: Location of the parquet file, in a form
        accepted by `spark.read.parquet`.
    :raises ValueError: If a converted bundle has no Patient resource, or a
        standardization/geocoding response has no bundle.
    :raises requests.HTTPError: If any service call ends in an error status.
    """
    df = spark.read.parquet(mpi_incoming_file_path)
    curr_date = date.today()
    for file_idx, row in enumerate(df.collect(), start=1):
        iris_id, fhir_bundle = convert_to_patient_fhir_resources(row.asDict())

        # Add metadata to the LAC Extract patient denoting it came from IRIS.
        # TODO: the "uri:iris" source URI is a placeholder and still needs confirming.
        _tag_patient_source(fhir_bundle)

        # Store a snapshot of the FHIR bundle before it hits standardization
        _snapshot_bundle(fhir_bundle, "pre", curr_date, file_idx)

        # Perform pipeline standardization on extracted data to mirror how
        # non-extracted data gets handled by the pipeline
        bundle = fhir_bundle
        for std_url in (std_names_url, std_phones_url, std_dob_url):
            resp = request_with_token_refresh(url=std_url,headers=ingestion_headers,data={"data": bundle},endpoint_type="ingestion",endpoint_login_url=endpoint_login_url)
            bundle = _bundle_from_response(resp, std_url)
        bundle = _geocode_bundle(bundle)

        # Also store a copy of the bundle after it hits standardization
        _snapshot_bundle(bundle, "post", curr_date, file_idx)

        # Now we can send the extracted record off to the MPI
        data = {
            'bundle': bundle,
            'external_person_id': iris_id
        }
        request_with_token_refresh(url=record_linkage_url,headers=record_linkage_headers,data=data,endpoint_type="record-linkage",endpoint_login_url=endpoint_login_url)

# Run the conversion for the uploaded MPI parquet file configured at the top of the notebook.
convert(mpi_incoming_file_path)
