# convertParquetMPI

This notebook reads patient data from an uploaded parquet file (`mpi_incoming_filename`), converts each row to a FHIR bundle, and writes the resulting bundles to blob storage.

In [None]:
# pip install --upgrade pip

In [None]:
# pip install git+https://github.com/CDCgov/phdi@main

In [None]:
# Name of the incoming parquet file. It is interpolated into `mpi_incoming_filename`
# and into the checkpoint/archive paths in later cells. Empty by default here;
# presumably overridden by whatever triggers this notebook (TODO confirm).
filename=""

In [None]:
# This script converts patient data from parquet to patient FHIR resources.
from typing import Dict, Tuple
import uuid
from datetime import datetime


def extract_given_name(data: Dict):
    """
    Builds the list of given names from the `first_name` and `middle_name` fields.

    Each field that is present (not None) is split on whitespace and its pieces are
    appended in order, first name pieces before middle name pieces.

    :param data: Dictionary of patient data that may contain `first_name` and
      `middle_name`.
    :return: List of given-name tokens, or None when no tokens were found.
    """
    raw_names = (data.get(key, None) for key in ("first_name", "middle_name"))
    tokens = [
        piece
        for raw_name in raw_names
        if raw_name is not None
        for piece in raw_name.split()
    ]
    return tokens if tokens else None


def adjust_birthdate(data: Dict):
    """
    Normalizes the `birthdate` field to `YYYY-MM-DD` when it carries a time part.

    A value containing ":" is parsed with the pattern `%d%b%Y:00:00:00.000`
    (e.g. `01JAN2020:00:00:00.000`) and re-formatted; any other value, including a
    missing one, is returned unchanged.

    :param data: Dictionary of patient data that may contain `birthdate`.
    :return: The (possibly re-formatted) birthdate, or None when it is absent.
    """
    # TODO: remove this function and pass in the `format` parameter to dob
    # standardization in ReadSourceData for LAC
    birthdate = data.get("birthdate", None)
    if birthdate is None or ":" not in birthdate:
        return birthdate

    parsed = datetime.strptime(birthdate, "%d%b%Y:00:00:00.000")
    return parsed.strftime("%Y-%m-%d")

def convert_to_patient_fhir_resources(data: Dict) -> Tuple:
    """
    Converts a row of patient data into a FHIR bundle of type "batch" holding a
    single Patient resource with a newly generated patient id, and returns it
    together with the row's `person_id`.

    Name, sex and address values that are missing or null are emitted as None (and
    an empty address `line` list) instead of the literal string "None", so that
    null source columns are not mistaken for real values.

    :param data: Dictionary of patient data that optionally includes the following
      fields: person_id, mrn, ssn, first_name, middle_name, last_name, home_phone,
      cell_phone, email, sex, birthdate, address, city, state, zip.
    :return: Tuple of the `external_person_id` (the `person_id` field, or None when
      absent) and the FHIR bundle wrapping the patient resource.
    """

    def _text_or_none(field: str):
        # Present values are stringified exactly as before; missing/null values
        # stay None because formatting None in an f-string yields the text "None".
        value = data.get(field, None)
        return None if value is None else f"{value}"

    patient_id = str(uuid.uuid4())
    address_line = _text_or_none("address")

    patient_resource = {
        "resourceType": "Patient",
        "id": patient_id,
        "name": [
            {
                "family": _text_or_none("last_name"),
                "given": extract_given_name(data),
            }
        ],
        "gender": _text_or_none("sex"),
        "birthDate": adjust_birthdate(data),
        "address": [
            {
                "use": "home",
                "line": [] if address_line is None else [address_line],
                "city": _text_or_none("city"),
                "state": _text_or_none("state"),
                "postalCode": _text_or_none("zip"),
            }
        ],
    }

    # Identifiers: medical record number (MR) first, then social security number (SS).
    identifiers = []
    for field, code in (("mrn", "MR"), ("ssn", "SS")):
        value = data.get(field, None)
        if value is not None:
            identifiers.append(
                {
                    "type": {
                        "coding": [
                            {
                                "system": "http://terminology.hl7.org/CodeSystem/v2-0203",
                                "code": code,
                            }
                        ]
                    },
                    "value": value,
                }
            )

    # Contact points: home phone, mobile phone, then email.
    telecom = []
    for field, use in (("home_phone", "home"), ("cell_phone", "mobile")):
        value = data.get(field, None)
        if value is not None:
            telecom.append({"system": "phone", "value": value, "use": use})
    email = data.get("email", None)
    if email is not None:
        telecom.append({"value": email, "system": "email"})

    # Only attach the optional sections when they have content.
    if identifiers:
        patient_resource["identifier"] = identifiers
    if telecom:
        patient_resource["telecom"] = telecom

    fhir_bundle = {
        "resourceType": "Bundle",
        "type": "batch",
        "id": str(uuid.uuid4()),
        "entry": [
            {
                "fullUrl": f"urn:uuid:{patient_id}",
                "resource": patient_resource,
                "request": {"method": "PUT", "url": f"Patient/{patient_id}"},
            },
        ],
    }

    external_person_id = data.get("person_id", None)
    return (external_person_id, fhir_bundle)

In [None]:
from notebookutils import mssparkutils
# from phdi.linkage.seed import convert_to_patient_fhir_resources
from datetime import date
import json
from pyspark.sql import SparkSession
import os
from datetime import datetime, timezone, timedelta
import pytz 
import time

spark = SparkSession.builder.appName("ProcessRowsInChunks").getOrCreate()

# Set up number of rows to be processed at a time
n_rows = 1000

# Set up file client
storage_account = "$STORAGE_ACCOUNT"
source_data_bucket = "source-data"
patient_data_bucket = "patient-data"
storage_account_url = f"https://{storage_account}.blob.core.windows.net/"
mpi_incoming_filename = f"abfss://{patient_data_bucket}@{storage_account}.dfs.core.windows.net/{filename}"

# Set up for writing to blob storage
blob_relative_path = ""
blob_storage_linked_service = "$BLOB_STORAGE_LINKED_SERVICE"
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(blob_storage_linked_service)
wasb_path = f"wasbs://{source_data_bucket}@{storage_account}.blob.core.windows.net/{blob_relative_path}"
spark.conf.set(f"fs.azure.sas.{source_data_bucket}.{storage_account}.blob.core.windows.net", blob_sas_token)

# Try mounting the remote storage directory at the mount point.
# Mounting stays best-effort (a failure does not stop the notebook), but only
# ordinary exceptions are caught and the cause is printed, so a real failure is
# no longer reported as a plain "Already mounted".
try:
    mssparkutils.fs.mount(
        wasb_path,
        "/",
        {"LinkedService": blob_storage_linked_service}
    )
except Exception as mount_error:
    print(f"Already mounted (mount raised: {mount_error})")


In [None]:
def get_row_start(filename, patient_data_bucket, storage_account):
    """
    Checks where in the seed (or large) file to start processing.

    Lists the root of the given container and looks for a checkpoint file named
    `last_row_added_to_mpi_<stem>.json`, where `<stem>` is the part of `filename`
    before its first ".". When the checkpoint exists, the row index stored under
    its `last_row_added_to_mpi` key is used as the starting row.

    :param filename: Name of the incoming data file.
    :param patient_data_bucket: Name of the container that is listed for the
      checkpoint file.
    :param storage_account: Name of the storage account hosting the container.
    :return: Tuple of the row index to start from (0 when no checkpoint file
      exists) and the checkpoint file name.
    """
    # Imported locally: needed only to parse the checkpoint file.
    import json

    row_count_filename = f"last_row_added_to_mpi_{filename.split('.')[0]}.json"
    incoming_file_dir = f"abfss://{patient_data_bucket}@{storage_account}.dfs.core.windows.net/"

    existing_names = {file.name for file in mssparkutils.fs.ls(incoming_file_dir)}
    if row_count_filename not in existing_names:
        return 0, row_count_filename

    checkpoint_text = mssparkutils.fs.head(incoming_file_dir + row_count_filename)
    try:
        # Parse the checkpoint as JSON so trailing whitespace/newlines or extra
        # keys do not break the resume point.
        row_start = int(json.loads(checkpoint_text)["last_row_added_to_mpi"])
    except (ValueError, KeyError, TypeError):
        # Fall back to the previous parsing rule (text after the last ":" minus
        # the final character) for checkpoint contents that are not valid JSON.
        row_start = int(checkpoint_text.split(":")[-1][:-1])

    return row_start, row_count_filename

def is_valid_time_window():
    """
    Reports whether the MPI may be updated right now, i.e. whether the current
    US/Pacific time falls outside the 9:30am-11:30am window in which eCR data is
    likely being processed.

    :return: False when the current time is within 9:30am-11:30am PT (both ends
      inclusive), True otherwise.
    """
    pacific = pytz.timezone("US/Pacific")
    now_pt = datetime.now().astimezone(pacific)

    # Boundaries of the blocked window on the current Pacific-time day
    window_open = now_pt.replace(hour=9, minute=30, second=0, microsecond=0)
    window_close = now_pt.replace(hour=11, minute=30, second=0, microsecond=0)

    # Valid only strictly before the window opens or strictly after it closes
    return now_pt < window_open or now_pt > window_close


def process_rows_in_chunks(dataframe, last_processed_row, patient_data_bucket, storage_account, row_count_filename, chunk_size):
    """
    Processes rows to seed data in chunks of `chunk_size` rows outside the time
    window when eCR data is being processed.

    Every row is converted with `convert_to_patient_fhir_resources` and written as
    its own JSON file under `fhir/` in the container named by the module-level
    `source_data_bucket`. After each chunk, the index of the last row written is
    saved to the checkpoint file. While `is_valid_time_window()` is False the
    function sleeps for 15 minutes and checks again.

    :param dataframe: DataFrame of patient rows; it is collected once up front.
    :param last_processed_row: Number of rows already processed; processing
      resumes at this offset.
    :param patient_data_bucket: Name of the container holding the checkpoint file.
    :param storage_account: Name of the storage account hosting the containers.
    :param row_count_filename: Name of the checkpoint file to update.
    :param chunk_size: Number of rows to process between checkpoint updates.
    :raises ValueError: If `chunk_size` is not positive (the loop could never
      advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    curr_date = date.today()
    checkpoint_path = f"abfss://{patient_data_bucket}@{storage_account}.dfs.core.windows.net/{row_count_filename}"

    # Collect once instead of once per chunk: avoids re-reading the whole
    # DataFrame for every chunk and keeps the row order identical across chunks.
    rows = dataframe.collect()
    total_rows = len(rows)
    start = last_processed_row
    idx = start

    while start < total_rows:

        if not is_valid_time_window():
            # Inside the eCR processing window: wait 15 minutes before rechecking
            time.sleep(900)
            continue

        # Process the chunk of data
        for row in rows[start:start + chunk_size]:
            idx += 1
            iris_id, fhir_bundle = convert_to_patient_fhir_resources(row.asDict())
            fhir_bundle["meta"] = {"source": "uri:iris"}

            data = {
                'bundle': fhir_bundle,
                'external_person_id': iris_id
            }

            pre_filename = f"abfss://{source_data_bucket}@{storage_account}.dfs.core.windows.net/fhir/lac_extract_{str(curr_date)}_{str(idx)}.json"
            mssparkutils.fs.put(pre_filename, json.dumps(data), True)

        start += chunk_size

        # Update the last processed row in the checkpoint file
        last_row_data = {"last_row_added_to_mpi": idx}
        mssparkutils.fs.put(checkpoint_path, json.dumps(last_row_data), True)


In [None]:
# Load the MPI seed data from the incoming parquet file
df = spark.read.parquet(mpi_incoming_filename)

# Look up the resume point, then convert and write the remaining rows
# n_rows at a time
last_processed_row, row_count_filename = get_row_start(
    filename=filename,
    patient_data_bucket=patient_data_bucket,
    storage_account=storage_account,
)
process_rows_in_chunks(
    dataframe=df,
    last_processed_row=last_processed_row,
    patient_data_bucket=patient_data_bucket,
    storage_account=storage_account,
    row_count_filename=row_count_filename,
    chunk_size=n_rows,
)


In [None]:
# Archive both the file that triggered the MPI update event and its
# row_count_filename checkpoint by moving them into the archive folder
patient_data_root = f"abfss://{patient_data_bucket}@{storage_account}.dfs.core.windows.net/"
for f in (filename, row_count_filename):
    source = f"{patient_data_root}{f}"
    destination = f"{patient_data_root}archive/{f}"
    mssparkutils.fs.mv(src=source, dest=destination, create_path=True)