# convert_MPI_write_to_blob

This notebook reads patient data from an uploaded parquet file (`mpi_incoming_file_path`), converts each row to a FHIR bundle, and writes the bundles to blob storage as JSON files.

In [None]:
pip install azure-identity phdi

Import dependencies and set up the parameters for connecting to the storage account and for mounting the blob storage container that receives the converted data.

In [None]:
from notebookutils import mssparkutils
from phdi.linkage.seed import convert_to_patient_fhir_resources
from phdi.linkage.link import generate_hash_str
from datetime import date
import json
import asyncio
import nest_asyncio
nest_asyncio.apply()
from random import randint

# Set up file client
# NOTE(review): the "$..." values look like placeholders substituted at deployment
# time - confirm they are replaced before this notebook runs.
storage_account = "$STORAGE_ACCOUNT"
source_data_bucket = "source-data"    # container the converted FHIR bundles are written to
patient_data_bucket = "patient-data"  # container holding the incoming parquet file
file_path = "MPI.parquet"
storage_account_url = f"https://{storage_account}.blob.core.windows.net/"
mpi_incoming_file_path = f"abfss://{patient_data_bucket}@{storage_account}.dfs.core.windows.net/{file_path}"

# Set up for writing to blob storage
blob_relative_path = ""
blob_storage_linked_service = "$BLOB_STORAGE_LINKED_SERVICE"
# Credential for the linked service; it is a secret, so never print or log it.
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(blob_storage_linked_service)
wasb_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (source_data_bucket, storage_account, blob_relative_path)
spark.conf.set('fs.azure.sas.%s.%s.blob.core.windows.net' % (source_data_bucket, storage_account), blob_sas_token)
# Try mounting the remote storage directory at the mount point.
# This stays best-effort (a failure does not stop the notebook), but only
# ordinary exceptions are caught so KeyboardInterrupt/SystemExit still propagate,
# and the real error is shown instead of always claiming "Already mounted".
try:
    mssparkutils.fs.mount(
        wasb_path,
        "/",
        {"LinkedService": blob_storage_linked_service}
    )
except Exception as mount_error:
    print(
        "Mount skipped; assuming the storage is already mounted "
        f"({type(mount_error).__name__}: {mount_error})"
    )


In [7]:
async def convert_write_data(row, bucket_name, storage_account, curr_date):
    """
    Convert one row of patient data to a FHIR bundle and write it to storage as JSON.

    The bundle's first Patient entry is tagged with the source `uri:iris`, the
    bundle is serialized, and the JSON is written to
    `abfss://{bucket_name}@{storage_account}.dfs.core.windows.net/fhir/` under a
    file name built from `curr_date` and a salted hash of the serialized bundle.

    Although declared `async`, the body contains no `await`, so each call runs to
    completion without yielding to the event loop.

    :param row: Record exposing `asDict()`; the caller passes rows collected
        from a Spark DataFrame.
    :param bucket_name: Storage container that receives the file.
    :param storage_account: Storage account name used to build the `abfss://` URL.
    :param curr_date: Value embedded in the file name; the caller passes
        `date.today()`.
    :raises IndexError: If the converted bundle contains no Patient entry.
    """
    # The first element of the returned tuple (an id) is not needed here.
    _iris_id, fhir_bundle = convert_to_patient_fhir_resources(row.asDict())
    patient_entries = [
        entry
        for entry in fhir_bundle.get("entry", [])
        if entry.get("resource", {}).get("resourceType", "") == "Patient"
    ]
    if not patient_entries:
        # Same exception type the bare `[0]` lookup used to raise, but with a
        # message that names the actual problem.
        raise IndexError(
            "convert_to_patient_fhir_resources returned a bundle with no Patient entry"
        )
    # NOTE(review): this sets `meta` on the Bundle entry wrapper, not on
    # entry["resource"]. FHIR defines `meta` on resources - confirm which one
    # downstream consumers read before moving it.
    patient_entries[0]["meta"] = {"source": "uri:iris"}

    # Serialize once; the same text feeds both the hash and the written file.
    bundle_json = json.dumps(fhir_bundle)

    # generate unique hash for writing files
    salt_str = "salt"
    bundle_hash = generate_hash_str(bundle_json, salt_str)

    # Write file to storage pre-harmonization (third argument: overwrite existing file)
    pre_filename = f"abfss://{bucket_name}@{storage_account}.dfs.core.windows.net/fhir/lac_extract_{curr_date!s}_{bundle_hash!s}.json"
    mssparkutils.fs.put(pre_filename, bundle_json, True)

async def batch_seed(rows, bucket_name, storage_account, curr_date):
    """
    Convert and write every row in `rows` by awaiting one `convert_write_data`
    coroutine per row through `asyncio.gather`.

    :param rows: Iterable of records to convert.
    :param bucket_name: Storage container forwarded to `convert_write_data`.
    :param storage_account: Storage account name forwarded to `convert_write_data`.
    :param curr_date: Date value forwarded to `convert_write_data`.
    """
    pending = []
    for record in rows:
        pending.append(
            convert_write_data(record, bucket_name, storage_account, curr_date)
        )
    await asyncio.gather(*pending)

def harmonize(data):
    """
    Standardize names and phone numbers in `data` and, on a best-effort basis,
    dates of birth.

    A failure while standardizing dates of birth does not abort harmonization:
    it is reported and the data standardized so far is returned.

    NOTE(review): `standardize_names`, `standardize_phones` and `standardize_dob`
    are not imported in the visible notebook (presumably they come from phdi's
    FHIR harmonization module - confirm and import them), and this function is
    not called anywhere in the visible cells.

    :param data: Data passed to each standardization helper via its `data`
        keyword.
    :return: The data returned by the last helper that succeeded.
    """
    data = standardize_names(data=data)
    data = standardize_phones(data=data)
    try:
        data = standardize_dob(data=data)
    except Exception as dob_error:
        # Only the exception type is reported: the message could echo patient
        # data into the notebook output.
        print(
            "standardize_dob failed "
            f"({type(dob_error).__name__}); leaving dates of birth unstandardized"
        )
    return data

StatementMeta(sparkpool, 82, 8, Finished, Available)

In [8]:
# Convert the incoming MPI rows to FHIR bundles and write them to blob storage
# (`harmonize` is not invoked on this path).
def pre_process(mpi_incoming_file_path, batch_size=10000):
    """
    Read the parquet file, then convert and write its rows in batches.

    Each batch is handed to `batch_seed`, which writes one JSON file per row
    into `source_data_bucket` on `storage_account`.

    :param mpi_incoming_file_path: Path of the parquet file to read.
    :param batch_size: Maximum number of rows per batch. Default: 10000.
    :raises ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        # A negative step would silently process nothing and zero would fail
        # only after the file had been read; reject both up front.
        raise ValueError("batch_size must be a positive integer")

    curr_date = date.today()
    df = spark.read.parquet(mpi_incoming_file_path)
    # collect() materializes every row on the driver, so the whole file must
    # fit in driver memory.
    all_rows = df.collect()
    total_rows = len(all_rows)

    for lower_bound in range(0, total_rows, batch_size):
        # Clamp so the reported bound never exceeds the number of rows.
        upper_bound = min(lower_bound + batch_size, total_rows)
        print("lower_bound:", lower_bound)
        print("upper_bound:", upper_bound)
        batch = all_rows[lower_bound:upper_bound]
        # asyncio.run() is used from notebook code; this relies on the
        # nest_asyncio.apply() call made in the setup cell.
        asyncio.run(batch_seed(batch, source_data_bucket, storage_account, curr_date))

# Run the conversion for the incoming MPI parquet file configured in the setup cell.
pre_process(mpi_incoming_file_path)