# convertParquetMPI

This notebook reads in patient data from an uploaded parquet file (`mpi_incoming_file_path`), converts the data to FHIR bundles, and posts the FHIR bundles to the Record Linkage endpoint.

In [None]:
pip install phdi=1.0.6 azure-identity

Set up parameters for connecting to the storage account, vault client, and record linkage container app.

In [None]:
from pyspark.sql import SparkSession
from phdi.linkage.seed import convert_to_patient_fhir_resources
from azure.identity import DefaultAzureCredential
import requests

spark = SparkSession.builder.getOrCreate()

# Set up file client and endpoint credentials.
# NOTE(review): the "$..." values look like placeholders substituted at deploy
# time -- confirm they are replaced before this notebook is run.
account_name = "$STORAGE_ACCOUNT"
file_system_name = "patient-data"
file_path = "MPI.parquet"
mpi_incoming_file_path = f"abfss://{file_system_name}@{account_name}.dfs.core.windows.net/{file_path}"


# Set up key vault client
vault_name = "$KEY_VAULT"
vault_url = f"https://{vault_name}.vault.azure.net/"
vault_linked_service = "AzureKeyVault1"

# Not referenced again in the cells visible here; kept in case other cells use it.
credential = DefaultAzureCredential()

# Get client ID and secret for GitHub app registration.
# NOTE(review): TokenLibrary is not imported in this notebook; presumably it is
# provided by the Synapse runtime -- confirm.
client_id = TokenLibrary.getSecret(vault_name, "synapse-client-id", vault_linked_service)
client_secret = TokenLibrary.getSecret(vault_name, "synapse-client-secret", vault_linked_service)

# Get access token for record linkage container app (OAuth2 client-credentials grant)
tenant_id = "$TENANT_ID"
url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"

data = {
    'grant_type': 'client_credentials',
    'client_id': client_id,
    'client_secret': client_secret,
    'scope': 'api://phdi-dev-record-linkage/.default'
}

response = requests.post(url, data=data)
# Fail here with the HTTP status instead of an opaque KeyError on
# 'access_token' when the token request is rejected.
response.raise_for_status()
access_token = response.json()['access_token']

# Build the record linkage and ingestion (standardization) endpoint URLs
record_linkage_url = TokenLibrary.getSecret(vault_name, "record-linkage-url", vault_linked_service) + "/link-record"
ingestion_url = TokenLibrary.getSecret(vault_name, "ingestion-url", vault_linked_service)
std_names_url = ingestion_url + "/standardize_names"
std_phones_url = ingestion_url + "/standardize_phones"
std_dob_url = ingestion_url + "/standardize_dob"
std_address_url = ingestion_url + "/geocode_bundle"

# Every request to the container apps carries the bearer token obtained above
headers = {
    'Authorization': f'Bearer {access_token}'
}


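Read the MPI parquet data into a Spark DataFrame. Iterate over each row of patient data in the DataFrame and convert it to a FHIR bundle with its associated `iris_id`. Run each bundle through the ingestion service's standardization endpoints (names, phones, date of birth, address), then send a POST request containing the FHIR bundle and `iris_id` to the record linkage container app.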

In [None]:
# Convert and post data from mpi_incoming_file_path
def _standardize_bundle(payload):
    """
    Pass the payload's bundle through each standardization endpoint in turn.

    The endpoints are called in the order names, phones, date of birth,
    address; after each call ``payload['bundle']`` is replaced with the
    "bundle" value from the JSON response body.

    :param payload: Dict with 'bundle' and 'external_person_id' keys; it is
        updated in place.
    :raises requests.HTTPError: If any endpoint responds with an error status.
    :return: The same ``payload`` dict, holding the standardized bundle.
    """
    for step_url in (std_names_url, std_phones_url, std_dob_url, std_address_url):
        resp = requests.post(step_url, headers=headers, json=payload)
        resp.raise_for_status()
        # A requests.Response has no .get(); the bundle lives in the JSON body.
        payload['bundle'] = resp.json().get("bundle")
    return payload


def convert(mpi_incoming_file_path):
    """
    Convert every row of the MPI parquet file to a FHIR bundle, standardize
    it, and post it to the record linkage endpoint.

    :param mpi_incoming_file_path: Path of the parquet file to read with Spark.
    :raises requests.HTTPError: If a standardization or record linkage request
        responds with an error status; processing stops at that record.
    """
    df = spark.read.parquet(mpi_incoming_file_path)
    for row in df.collect():
        iris_id, fhir_bundle = convert_to_patient_fhir_resources(row.asDict())

        # Add metadata to LAC Extract patient denoting it came from IRIS
        # We know this URI is ~probably~ not right but Brady might know what to use...
        patient_resources = [
            entry["resource"]
            for entry in fhir_bundle.get("entry", [])
            if entry.get("resource", {}).get("resourceType", "") == "Patient"
        ]
        # `meta` belongs on the Patient resource itself, not on the bundle
        # entry that wraps it; skip tagging when no Patient entry is present.
        if patient_resources:
            patient_resources[0].setdefault("meta", {})["source"] = "uri:iris"

        data = {
            'bundle': fhir_bundle,
            'external_person_id': iris_id
        }

        # Perform pipeline standardization on extracted data to mirror how
        # non-extracted data gets handled by the pipeline
        data = _standardize_bundle(data)

        # Now we can send the extracted record off to the MPI
        link_resp = requests.post(record_linkage_url, headers=headers, json=data)
        link_resp.raise_for_status()

convert(mpi_incoming_file_path)
    