# updateECRdatastore

This notebook updates the ECR datastore Delta tables with new ECR records read from `PARSED_ECR_PATH`. A separate Delta table is maintained for each record type (core, labs, and active problems), and any table that does not already exist is created.

In [None]:
# Storage locations used throughout the notebook.
# "$STORAGE_ACCOUNT" is kept verbatim; every other path is derived from it.
STORAGE_ACCOUNT = "$STORAGE_ACCOUNT"
BASE_DATASTORE_DIRECTORY = "ecr-datastore"

# Root of the `delta-tables` filesystem (note the trailing slash).
DELTA_TABLES_FILESYSTEM = f"abfss://delta-tables@{STORAGE_ACCOUNT}.dfs.core.windows.net/"

# Locations below the filesystem root.
ECR_DATASTORE_PATH = f"{DELTA_TABLES_FILESYSTEM}ecr-datastore"
ECR_DATASTORE_DAILY_EXTRACT_PATH = f"{DELTA_TABLES_FILESYSTEM}ecr-datastore"
PARSED_ECR_PATH = f"{DELTA_TABLES_FILESYSTEM}raw_data"

# File formats written for each table's extract.
DAILY_EXTRACT_FORMATS = ["parquet", "csv"]

In [None]:
from notebookutils import mssparkutils

# Set up for writing to blob storage
delta_bucket_name = "delta-tables"
linked_service_name = "$BLOB_STORAGE_LINKED_SERVICE"

# Fetch the credential for the linked service and register it with Spark as
# the SAS token for the container.
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)
wasb_path = f"wasbs://{delta_bucket_name}@{STORAGE_ACCOUNT}.blob.core.windows.net/"
spark.conf.set(
    f"fs.azure.sas.{delta_bucket_name}.{STORAGE_ACCOUNT}.blob.core.windows.net",
    blob_sas_token,
)

# Try mounting the remote storage directory at the mount point.
# Mounting stays best-effort (a failure does not stop the notebook), but only
# ordinary exceptions are caught and the real error is reported, so a genuine
# failure is no longer indistinguishable from "already mounted".
try:
    mssparkutils.fs.mount(
        wasb_path,
        "/",
        {"LinkedService": linked_service_name}
    )
except Exception as mount_error:
    print(f"Mount skipped (assuming already mounted): {mount_error}")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    FloatType,
    BooleanType,
    DateType,
    TimestampType,
    ArrayType
)
from pyspark.sql.functions import explode_outer

from delta.tables import *
import json
from typing import Tuple

# Load the datastore schema configuration, which is stored as a multi-line JSON file
ecr_schema_path = f"abfss://delta-tables@{STORAGE_ACCOUNT}.dfs.core.windows.net/ecr_datastore_config.json"
ecr_schema = spark.read.option("multiLine", True).json(ecr_schema_path)

def prepare_schemas(ecr_schema):
    """
    Build per-table schema definitions from the ECR datastore configuration.

    The first row of `ecr_schema` is read as a mapping of column name to column
    configuration. Every column whose `data_type` is not "array" is added to the
    "core" table, alongside a fixed set of identifier columns. Every "array"
    column gets a table of its own, named after the column, that holds an
    `eicr_id` column plus a nested mapping built from the column's
    `secondary_schema`.

    :param ecr_schema: Object whose `collect()` returns rows; the first row's
        `asDict()` must map each column name to a configuration supporting
        `['data_type']`, `['nullable']` and, for array columns,
        `['secondary_schema'].asDict()`.
    :return: A dictionary keyed by table name. Scalar fields map to
        `[<data type>, <nullable>]`; an array field maps to a nested dictionary
        of such entries.
    """
    table_schemas = {
        "core": {
            "patient_id": ["string", False],
            "person_id": ["string", False],
            "person_id_date_added": ["timestamp", True],
            "iris_id": ["string", True],
            "iris_id_date_added": ["timestamp", True],
            "incident_id": ["string", True],
            "incident_id_date_added": ["timestamp", True],
        }
    }

    # Only the first row of the configuration is used.
    row = ecr_schema.collect()[0].asDict()

    for column_name, column_data in row.items():
        if column_data["data_type"] != "array":
            table_schemas["core"][column_name] = [
                column_data["data_type"],
                column_data["nullable"],
            ]
            continue

        # Array columns are split out into a table of their own, linked back
        # to the record through `eicr_id`.
        secondary_schema = column_data["secondary_schema"].asDict()
        table_schemas[column_name] = {
            "eicr_id": ["string", False],
            column_name: {
                secondary_name: [secondary_data["data_type"], secondary_data["nullable"]]
                for secondary_name, secondary_data in secondary_schema.items()
            },
        }

    return table_schemas

def get_schemas(table_schemas: dict) -> dict:
    """
    Convert table schema definitions into Spark schemas and merge mappings.

    :param table_schemas: A dictionary keyed by table name. Each value maps a
        field name either to `[<data type>, <nullable?(True/False)>]` or, for an
        array field, to a nested dictionary of such entries describing the
        elements of the array.
    :return: A dictionary keyed by table name. Each value is a dictionary with:
        `spark_schema`, a StructType in which array fields are kept as an
        ArrayType of StructType;
        `merge_schema`, a mapping of every flattened field name to
        `"new.<field name>"` for merge operations;
        `flattened_df_schema`, a StructType in which the elements of array
        fields are listed as top-level fields.
    :raises KeyError: If a data type is not present in the type map.
    """

    # NOTE(review): "datetime" maps to DateType, which carries no time
    # component - confirm this is intended rather than TimestampType.
    schema_type_map = {
        "string": StringType(),
        "integer": IntegerType(),
        "float": FloatType(),
        "boolean": BooleanType(),
        "date": DateType(),
        "timestamp": TimestampType(),
        "datetime": DateType(),
        "number": IntegerType()
    }
    spark_schemas = {}

    for table_name, schema in table_schemas.items():
        spark_schema = StructType()
        flattened_df_schema = StructType()
        merge_schema = {}

        for field, definition in schema.items():
            if isinstance(definition, dict):
                # Use a fresh list for each array field so that the element
                # struct of one array never picks up fields from another.
                array_fields = []
                for array_field, data in definition.items():
                    array_fields.append(
                        StructField(array_field, schema_type_map[data[0]], data[1])
                    )
                    flattened_df_schema.add(
                        StructField(array_field, schema_type_map[data[0]], data[1])
                    )
                    merge_schema[array_field] = "new." + array_field
                spark_schema.add(StructField(field, ArrayType(StructType(array_fields))))
            else:
                spark_schema.add(
                    StructField(field, schema_type_map[definition[0]], definition[1])
                )
                flattened_df_schema.add(
                    StructField(field, schema_type_map[definition[0]], definition[1])
                )
                merge_schema[field] = "new." + field

        spark_schemas[table_name] = {
            "spark_schema": spark_schema,
            "merge_schema": merge_schema,
            "flattened_df_schema": flattened_df_schema
        }

    return spark_schemas

# Build the per-table schema definitions from the configuration held in `ecr_schema`
table_schemas = prepare_schemas(ecr_schema)

# Format table schemas for spark; the result is keyed by table name
spark_schemas = get_schemas(table_schemas)

# Initialize Spark session with the Delta Lake SQL extension and catalog configured.
# NOTE(review): `spark` is already referenced earlier in this notebook, so a
# session presumably exists before this point and `getOrCreate()` would hand
# that one back - confirm the settings below take effect in that case.
spark = (
    SparkSession.builder.master("local[*]")
    .appName("Update eCR Datastore")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)  

In [None]:
def transform_active_problems_ecr_records(active_problems: DataFrame) -> DataFrame:
    """
    Formats eCR records with active problems into a DataFrame that has
     three columns, `eicr_id`, `problem`, and `problem_date`, and 1
     active problem per row.

    The output schema is taken from the module-level `spark_schemas` entry for
     the "active_problems" table. Records whose `active_problems` array is
     null or empty are kept as a single row with null problem values.

    :param active_problems: Spark DataFrame with `eicr_id` and
        `active_problems` columns.
    :return: Spark DataFrame with 1 active problem per row.

    """
    flattened_schema = spark_schemas["active_problems"]["flattened_df_schema"]

    def _flatten(row):
        # `explode_outer` yields a null struct for null/empty arrays; guard it
        # so that those records do not fail on subscripting None.
        problem = row.problems
        if problem is None:
            return (row.eicr_id, None, None)
        return (row.eicr_id, problem['problem'], problem['problem_date'])

    exploded = active_problems.select(
        active_problems.eicr_id,
        explode_outer(active_problems.active_problems).alias("problems"),
    )
    return exploded.rdd.map(_flatten).toDF(flattened_schema)


def transform_labs_ecr_records(labs: DataFrame) -> DataFrame:
    """
    Formats eCR records with labs into a DataFrame that has
     1 lab test and associated results per row.

    The output schema is taken from the module-level `spark_schemas` entry for
     the "labs" table. Records whose `labs` array is null or empty are kept as
     a single row with null lab values.

    :param labs: Spark DataFrame with `eicr_id` and `labs` columns.
    :return: Spark DataFrame with 1 lab per row.

    """
    # Lab fields in the order they are emitted after `eicr_id`.
    lab_fields = (
        'performing_lab',
        'specimen_collection_date',
        'specimen_type',
        'test_result_code',
        'test_result_code_display',
        'test_result_code_system',
        'test_result_interp',
        'test_result_interp_code',
        'test_result_interp_system',
        'test_result_qualitative',
        'test_result_quantitative',
        'test_result_ref_range_high',
        'test_result_ref_range_high_units',
        'test_result_ref_range_low',
        'test_result_ref_range_low_units',
        'test_result_units',
        'test_type',
        'test_type_code',
        'test_type_system',
    )
    flattened_schema = spark_schemas["labs"]["flattened_df_schema"]

    def _flatten(row):
        # `explode_outer` yields a null struct for null/empty arrays; guard it
        # so that those records do not fail on subscripting None.
        lab = row.labs
        if lab is None:
            return (row.eicr_id,) + (None,) * len(lab_fields)
        return (row.eicr_id,) + tuple(lab[name] for name in lab_fields)

    exploded = labs.select(labs.eicr_id, explode_outer(labs.labs).alias("labs"))
    return exploded.rdd.map(_flatten).toDF(flattened_schema)


def update_ecr_datastore(schema,merge_schema,table_name, ECR_DATASTORE_PATH, DELTA_TABLES_FILESYSTEM):
    """
    Merge newly parsed eCR records into one Delta table and refresh its extracts.

    New records are read from `PARSED_ECR_PATH`, flattened for the "labs" and
    "active_problems" tables, and inserted into the Delta table at
    `<ECR_DATASTORE_PATH>-<table_name>` when their `eicr_id` is not already
    present. The table is created if it does not exist. The full table is then
    written once per entry in `DAILY_EXTRACT_FORMATS` and the resulting data
    file is copied to
    `<DELTA_TABLES_FILESYSTEM>/updated_ecr_datastore/updated_ecr_datastore-<table_name>.<format>`.

    :param schema: Spark schema used to read the parsed eCR JSON files.
    :param merge_schema: Mapping of target column to source expression used
        when inserting unmatched records.
    :param table_name: Name of the table to update.
    :param ECR_DATASTORE_PATH: Base path of the datastore; the table name is
        appended as a suffix.
    :param DELTA_TABLES_FILESYSTEM: Root of the filesystem that receives the
        extract files.
    :raises FileNotFoundError: If Spark produced no `part-*` data file for an
        extract format.
    """
    # Read JSON files into a DataFrame with the specified schema
    new_ecr_records = spark.read.schema(schema).json(PARSED_ECR_PATH, multiLine=True)

    if table_name == "labs":
        new_ecr_records = transform_labs_ecr_records(new_ecr_records)
    elif table_name == "active_problems":
        new_ecr_records = transform_active_problems_ecr_records(new_ecr_records)

    # Each table lives at its own Delta location, suffixed with the table name.
    table_path = ECR_DATASTORE_PATH + f"-{table_name}"

    # Check if Delta table exists
    if DeltaTable.isDeltaTable(spark, table_path):
        # If the table exists, insert only records whose eicr_id is not yet present.
        ecr_datastore = DeltaTable.forPath(spark, table_path)

        ecr_datastore.alias("old").merge(
            new_ecr_records.alias("new"), "old.eicr_id = new.eicr_id"
        ).whenNotMatchedInsert(values=merge_schema).execute()
    else:
        # If Delta table doesn't exist, create it.
        new_ecr_records.write.format("delta").mode("append").save(table_path)

    # Make a copy of the Delta table in each extract format for easy access.
    ecr_datastore = DeltaTable.forPath(spark, table_path).toDF()

    # Strip any trailing slash so the destination never contains a doubled "/".
    extract_directory = DELTA_TABLES_FILESYSTEM.rstrip("/") + "/updated_ecr_datastore"

    for extract_format in DAILY_EXTRACT_FORMATS:

        # Write standard pyspark directories for each file format
        # Force pyspark to coalesce the results into a single file
        format_path = table_path + "." + extract_format
        (
            ecr_datastore.coalesce(1)
            .write.format(extract_format)
            .option("header", True)
            .mode('overwrite')
            .save(format_path)
        )

        # Locate the file which actually has the data amidst the pyspark kruft
        part_file = None
        for entry in mssparkutils.fs.ls(format_path):
            file_name = entry.path.split("/")[-1]
            if file_name.startswith("part-") and file_name.endswith("." + extract_format):
                part_file = entry.path

        if part_file is None:
            # Fail with a clear message instead of passing an empty path to cp.
            raise FileNotFoundError(
                f"No part-*.{extract_format} file found under {format_path}"
            )

        # Create a copy of just the data at the root level, formatted appropriately
        mssparkutils.fs.cp(
            part_file,
            f"{extract_directory}/updated_ecr_datastore-{table_name}.{extract_format}",
        )

        # Now delete the pyspark junk folder by deleting all virtual filepaths
        mssparkutils.fs.rm(format_path, recurse=True)

In [None]:
# Update each datastore table in turn: core, labs, then active problems.
# The Spark and merge schemas are looked up in `spark_schemas`, which is keyed
# by table name; standalone `<table>_spark_schema` / `<table>_merge_schema`
# variables are not defined anywhere in this notebook.
for table_name in ("core", "labs", "active_problems"):
    update_ecr_datastore(
        spark_schemas[table_name]["spark_schema"],
        spark_schemas[table_name]["merge_schema"],
        table_name=table_name,
        ECR_DATASTORE_PATH=ECR_DATASTORE_PATH,
        DELTA_TABLES_FILESYSTEM=DELTA_TABLES_FILESYSTEM,
    )
