# convertParquetMPI

This notebook reads in patient data from an uploaded parquet file (`mpi_incoming_file_path`), converts the data to FHIR bundles, and posts the FHIR bundles to the Record Linkage endpoint.

In [None]:
# Install session-scoped dependencies into the kernel's own environment.
%pip install phdi==1.0.6 azure-keyvault-secrets

Set up parameters for connecting to the storage account, vault client, and record linkage container app.

In [None]:
from pyspark.sql import SparkSession
from phdi.linkage.seed import convert_to_patient_fhir_resources
from azure.identity import ManagedIdentityCredential
from azure.core.credentials import AccessToken
from azure.keyvault.secrets import SecretClient
import requests
import time


# Get the active Spark session, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Location of the incoming MPI parquet file, expressed as an abfss:// URI built
# from the storage account name, the file system name, and the file path.
# NOTE(review): "$STORAGE_ACCOUNT" looks like a placeholder that is filled in
# before the notebook runs — confirm against the deployment tooling.
account_name = "$STORAGE_ACCOUNT"
file_system_name = "patient-data"
file_path = "MPI.parquet"
mpi_incoming_file_path = f"abfss://{file_system_name}@{account_name}.dfs.core.windows.net/{file_path}"

class spoof_token:
    """Token provider that wraps a Synapse-issued token in an `AccessToken`.

    An instance exposes `get_token`, so it can be used wherever an object with
    that method is expected.
    """

    def get_token(*args, **kwargs):
        """Return an `AccessToken` for the "vault" audience.

        Every positional argument (the instance itself included) and every
        keyword argument is accepted and ignored; the token is always requested
        from `mssparkutils` with audience "vault".
        """
        vault_token = mssparkutils.credentials.getToken(audience="vault")
        # some random time in future... Synapse doesn't document how to get the actual time
        assumed_expiry = int(time.time()) + 60 * 10
        return AccessToken(token=vault_token, expires_on=assumed_expiry)

# Build a credential object for the Key Vault client, then swap its private
# inner credential for the `spoof_token` provider so that tokens come from
# `mssparkutils` instead.
credential = ManagedIdentityCredential()
credential._credential = spoof_token() # monkey-patch the contents of the private `_credential`

# Set up key vault client
vault_name = "$KEY_VAULT"
vault_url = f"https://{vault_name}.vault.azure.net/"

client = SecretClient(vault_url=vault_url, credential=credential)

# Get the client ID and secret used below for the OAuth client-credentials request
client_id = client.get_secret("synapse-client-id").value
client_secret = client.get_secret("synapse-client-secret").value

# Get access token for record linkage container app
tenant_id = "$TENANT_ID"
url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"

data = {
    'grant_type': 'client_credentials',
    'client_id': client_id,
    'client_secret': client_secret,
    'scope': 'api://phdi-dev-record-linkage/.default'
}

# Bound the wait so an unreachable token endpoint cannot hang the notebook, and
# fail with the HTTP status if the token request is rejected, rather than with
# a KeyError on the missing 'access_token' field.
response = requests.post(url, data=data, timeout=30)
response.raise_for_status()
access_token = response.json()['access_token']

# Make request to record linkage container app
# Strip any trailing slash from the stored base URL so the path is not doubled.
record_linkage_url = client.get_secret("record-linkage-url").value.rstrip("/") + "/link-record"
headers = {
    'Authorization': f'Bearer {access_token}'
}


Read the MPI parquet data into a Spark DataFrame. Iterate over each row of patient data in the DataFrame and convert it to a FHIR bundle and its associated `iris_id`. Send a POST request containing the FHIR bundle and `iris_id` to the record linkage container app.

In [None]:
# Convert and post data from mpi_incoming_file_path
def convert(mpi_incoming_file_path):
    """Convert every row of a parquet file to a FHIR bundle and post it for linkage.

    Each row is turned into a dict and passed to
    `convert_to_patient_fhir_resources`, which yields an `iris_id` and a FHIR
    bundle. The pair is posted as JSON to `record_linkage_url` using the
    module-level `headers`.

    Rows are streamed to the driver rather than collected all at once, so the
    whole table never has to fit in driver memory. A non-success HTTP response
    does not stop the run; it is counted and reported so that failed
    submissions are visible.

    Args:
        mpi_incoming_file_path: Path or URI of the parquet file to read.

    Returns:
        None.
    """
    df = spark.read.parquet(mpi_incoming_file_path)

    total = 0
    failed = 0
    for total, row in enumerate(df.toLocalIterator(), start=1):
        iris_id, fhir_bundle = convert_to_patient_fhir_resources(row.asDict())
        payload = {
            'bundle': fhir_bundle,
            'external_person_id': iris_id,
        }
        response = requests.post(record_linkage_url, headers=headers, json=payload)
        if not response.ok:
            failed += 1
            # Report by row position only, to keep patient identifiers out of the output.
            print(f"Record linkage request for row {total} failed with HTTP status {response.status_code}")

    if failed:
        print(f"{failed} of {total} record linkage requests failed")

# Run the conversion and posting for the configured incoming MPI parquet file.
convert(mpi_incoming_file_path)

    