# updateECRdatastore

This notebook updates the ECR datastore delta table with new ECR records (`PARSED_ECR_PATH`); a new ECR datastore delta table is created if one does not already exist.

In [None]:
# Set paths
# NOTE(review): "$STORAGE_ACCOUNT" looks like a template placeholder that is
# substituted before the notebook runs -- confirm against the deployment process.
STORAGE_ACCOUNT = "$STORAGE_ACCOUNT"
BASE_DATASTORE_DIRECTORY = "ecr-datastore"
# Root of the "delta-tables" container, addressed with the abfss:// scheme.
DELTA_TABLES_FILESYSTEM = f"abfss://delta-tables@{STORAGE_ACCOUNT}.dfs.core.windows.net/"
# Both datastore paths are derived from BASE_DATASTORE_DIRECTORY so the directory
# name is defined in exactly one place and the values cannot drift apart.
ECR_DATASTORE_PATH = DELTA_TABLES_FILESYSTEM + BASE_DATASTORE_DIRECTORY
ECR_DATASTORE_DAILY_EXTRACT_PATH = DELTA_TABLES_FILESYSTEM + BASE_DATASTORE_DIRECTORY
# Location of the parsed ECR records that are loaded into the datastore.
PARSED_ECR_PATH = DELTA_TABLES_FILESYSTEM + "raw_data"
# File formats written when the datastore is exported.
DAILY_EXTRACT_FORMATS = ["parquet", "csv"]

In [None]:
from notebookutils import mssparkutils

# Set up for writing to blob storage
delta_bucket_name = "delta-tables"
linked_service_name = "$BLOB_STORAGE_LINKED_SERVICE"
# Fetch the credential for the linked service and register it with Spark as the
# SAS token for the delta-tables container.
blob_sas_token = mssparkutils.credentials.getConnectionStringOrCreds(linked_service_name)
wasb_path = f"wasbs://{delta_bucket_name}@{STORAGE_ACCOUNT}.blob.core.windows.net/"
spark.conf.set(
    f"fs.azure.sas.{delta_bucket_name}.{STORAGE_ACCOUNT}.blob.core.windows.net",
    blob_sas_token,
)
# Try mounting the remote storage directory at the mount point
try:
    mssparkutils.fs.mount(
        wasb_path,
        "/",
        {"LinkedService": linked_service_name},
    )
except Exception as mount_error:
    # Mounting stays best-effort (an existing mount is expected on re-runs), but the
    # underlying error is printed so a genuine failure is not mistaken for
    # "already mounted". Catching Exception rather than using a bare except lets
    # KeyboardInterrupt/SystemExit propagate.
    print(f"Already mounted (or mount failed): {mount_error}")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    FloatType,
    BooleanType,
    DateType,
    TimestampType,
    ArrayType
)
from delta.tables import *
import json
from typing import Tuple

# Prepare Schemas
# Each dictionary maps a column name to [<data type>, <nullable>].
core_table_schema = {}
labs_table_schema = {}
active_problems_table_schema = {}

ecr_schema_path = f"abfss://delta-tables@{STORAGE_ACCOUNT}.dfs.core.windows.net/ecr_datastore_config.json"
ecr_schema = spark.read.json(ecr_schema_path, multiLine=True)

# Array-typed columns are described by a nested 'secondary_schema'; this maps the
# array column's name to the table schema its nested fields are collected into.
secondary_table_schemas = {
    'labs': labs_table_schema,
    'active_problems': active_problems_table_schema,
}

for config_row in ecr_schema.collect():
    config = config_row.asDict()
    for column_name, column_spec in config.items():
        if column_spec['data_type'] != 'array':
            # Non-array columns belong to the core table.
            core_table_schema[column_name] = [column_spec['data_type'], column_spec['nullable']]
            continue

        # Any array column causes both secondary tables to carry eicr_id.
        eicr_id_spec = config['eicr_id']
        labs_table_schema['eicr_id'] = [eicr_id_spec['data_type'], eicr_id_spec['nullable']]
        active_problems_table_schema['eicr_id'] = [eicr_id_spec['data_type'], eicr_id_spec['nullable']]

        nested_columns = column_spec.asDict()['secondary_schema'].asDict()
        target_schema = secondary_table_schemas.get(column_name)
        if target_schema is None:
            # Array columns other than 'labs' / 'active_problems' are not collected.
            continue
        for nested_name, nested_spec in nested_columns.items():
            nested_spec = nested_spec.asDict()
            target_schema[nested_name] = [nested_spec['data_type'], nested_spec['nullable']]

def get_schemas(schema: dict) -> Tuple[StructType, dict]:
    """
    Build a Spark StructType and a merge mapping from a schema dictionary.

    :param schema: A dictionary defining the schema of an ECR datastore table,
        including the data type of each field and whether null values are allowed.
        Should be of the form: {"fieldname": [<data type>, <nullable?(True/False)>]}.
        Supported data types are "string", "integer", "float", "boolean", "date",
        and "timestamp".
    :return: A tuple containing a Spark StructType object representing the schema
        and a dictionary mapping each field name to "new.<field name>" for use in
        merge operations.
    :raises KeyError: If a field declares a data type that is not supported.
    """

    schema_type_map = {
        "string": StringType(),
        "integer": IntegerType(),
        "float": FloatType(),
        "boolean": BooleanType(),
        "date": DateType(),
        "timestamp": TimestampType(),
    }
    spark_schema = StructType()
    merge_schema = {}
    for field, field_spec in schema.items():
        data_type, nullable = field_spec[0], field_spec[1]
        try:
            spark_type = schema_type_map[data_type]
        except KeyError:
            # Same exception type as the plain dictionary lookup, but the message
            # names the offending field and lists the accepted data types.
            raise KeyError(
                f"Unsupported data type {data_type!r} for field {field!r}; "
                f"expected one of {sorted(schema_type_map)}"
            ) from None
        spark_schema.add(StructField(field, spark_type, nullable))
        # Merge values refer to the incoming DataFrame through the "new" alias.
        merge_schema[field] = "new." + field
    return spark_schema, merge_schema


# Build the Spark schema and the merge mapping for each of the three tables.
core_spark_schema, core_merge_schema = get_schemas(core_table_schema)
labs_spark_schema, labs_merge_schema = get_schemas(labs_table_schema)
active_problems_spark_schema, active_problems_merge_schema = get_schemas(
    active_problems_table_schema
)

# Initialize Spark session
# The builder is configured step by step with the Delta Lake SQL extension and
# catalog; getOrCreate() returns an existing session if one is already running.
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.appName("Update eCR Datastore")
session_builder = session_builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
session_builder = session_builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
spark = session_builder.getOrCreate()

In [None]:

# Schemas for reading the nested arrays of the test records: each pairs the
# non-nullable eicr_id with one nullable array-of-struct column.

# One entry of the "active_problems" array.
_active_problem_struct = StructType(
    [
        StructField("problem", StringType(), True),
        StructField("problem_date", DateType(), True),
    ]
)

test_active_problems_schema = StructType(
    [
        StructField('eicr_id', StringType(), False),
        StructField("active_problems", ArrayType(_active_problem_struct), True),
    ]
)

# Nullable string fields of one "labs" entry, in schema order.
_lab_string_fields = (
    "test_type",
    "test_type_code",
    "test_type_system",
    "test_result_qualitative",
    "test_result_quantitative",
    "test_result_units",
    "test_result_code",
    "test_result_code_display",
    "test_result_interp",
    "test_result_interp_code",
    "test_result_interp_system",
    "test_result_ref_range_low",
    "test_result_ref_range_low_units",
    "test_result_ref_range_high",
    "test_result_ref_range_high_units",
    "specimen_type",
    "performing_lab",
)

# One entry of the "labs" array: the string fields followed by the single
# timestamp field.
_lab_struct = StructType(
    [StructField(field_name, StringType(), True) for field_name in _lab_string_fields]
    + [StructField("specimen_collection_date", TimestampType(), True)]
)

test_labs_schema = StructType(
    [
        StructField('eicr_id', StringType(), False),
        StructField("labs", ArrayType(_lab_struct), True),
    ]
)
    

In [None]:
from pyspark.sql.functions import explode_outer

test_PARSED_ECR_PATH = DELTA_TABLES_FILESYSTEM + "raw_data_test"

# Read eicr_id and the nested active_problems array from the test records.
active_problems = spark.read.schema(test_active_problems_schema).json(test_PARSED_ECR_PATH, multiLine=True)
# explode_outer yields one row per array entry and keeps records whose array is
# null or empty as a single row with a null struct. Selecting the struct fields as
# columns returns nulls for those rows, whereas indexing the struct inside an RDD
# lambda fails on None. Column selection also keeps the declared column types.
active_problems = active_problems.select(
    active_problems.eicr_id,
    explode_outer(active_problems.active_problems).alias("problems"),
).select("eicr_id", "problems.problem", "problems.problem_date")
active_problems.show()

In [None]:
from pyspark.sql.functions import explode_outer

test_PARSED_ECR_PATH = DELTA_TABLES_FILESYSTEM + "raw_data_test"

# Fields of each lab entry, in the order they appear as output columns.
lab_columns = [
    'test_type',
    'test_type_code',
    'test_type_system',
    'test_result_qualitative',
    'test_result_quantitative',
    'test_result_units',
    'test_result_code',
    'test_result_code_display',
    'test_result_interp',
    'test_result_interp_code',
    'test_result_interp_system',
    'test_result_ref_range_low',
    'test_result_ref_range_low_units',
    'test_result_ref_range_high',
    'test_result_ref_range_high_units',
    'specimen_type',
    'performing_lab',
    'specimen_collection_date',
]

# Read eicr_id and the nested labs array from the test records.
labs = spark.read.schema(test_labs_schema).json(test_PARSED_ECR_PATH, multiLine=True)
# explode_outer yields one row per lab entry and keeps records whose array is null
# or empty as a single row with a null struct. Selecting the struct fields as
# columns returns nulls for those rows, whereas indexing the struct inside an RDD
# lambda fails on None. Column selection also keeps the declared column types.
labs = labs.select(
    labs.eicr_id,
    explode_outer(labs.labs).alias("labs"),
).select("eicr_id", *[f"labs.{column_name}" for column_name in lab_columns])
labs.show()



In [None]:
# Read JSON files into a DataFrame with the specified schema
new_core_ecr_records = spark.read.schema(core_spark_schema).json(PARSED_ECR_PATH, multiLine=True)
new_labs_ecr_records = spark.read.schema(labs_spark_schema).json(PARSED_ECR_PATH, multiLine=True)
new_active_problems_ecr_records = spark.read.schema(active_problems_spark_schema).json(PARSED_ECR_PATH, multiLine=True)
# NOTE(review): the labs and active problems DataFrames are read here but are not
# written anywhere in this cell -- confirm whether separate tables are intended.

# Check if Delta table exists
if DeltaTable.isDeltaTable(spark, ECR_DATASTORE_PATH):
    # If the table exists add new records. Only records whose eicr_id is not
    # already present are inserted; existing rows are left untouched.
    ecr_datastore = DeltaTable.forPath(spark, ECR_DATASTORE_PATH)

    # Uses the core records and core merge mapping defined earlier in the notebook;
    # the previously referenced names new_ecr_records and merge_schema are never
    # defined and raised NameError.
    ecr_datastore.alias("old").merge(
        new_core_ecr_records.alias("new"), "old.eicr_id = new.eicr_id"
    ).whenNotMatchedInsert(values=core_merge_schema).execute()
else:
    # If Delta table doesn't exist, create it.
    new_core_ecr_records.write.format("delta").mode("append").save(ECR_DATASTORE_PATH)

# Make a copy of the Delta table in each of DAILY_EXTRACT_FORMATS for easy access.
ecr_datastore = DeltaTable.forPath(spark, ECR_DATASTORE_PATH).toDF()

# Base URL of the storage account's blob endpoint.
# NOTE(review): not referenced anywhere else in this cell.
container_url = f"https://{STORAGE_ACCOUNT}.blob.core.windows.net/"

# The loop variable is named extract_format so the builtin format() is not shadowed.
for extract_format in DAILY_EXTRACT_FORMATS:

    # Write standard pyspark directories for each file format
    # Force pyspark to coalesce the results into a single file
    format_path = ECR_DATASTORE_DAILY_EXTRACT_PATH + "." + extract_format
    # NOTE(review): not referenced anywhere else in this cell.
    modified_datastore_directory = BASE_DATASTORE_DIRECTORY + "." + extract_format + "/"
    ecr_datastore.coalesce(1).write.format(extract_format).option("header", True).mode('overwrite').save(format_path)

    # Locate the file which actually has the data amidst the pyspark kruft
    partial_file = ""
    for f in mssparkutils.fs.ls(format_path):
        file_in_namespace = f.path.split("/")[-1]
        if file_in_namespace.startswith("part-") and file_in_namespace.endswith("." + extract_format):
            partial_file = f.path

    # Fail with a clear message instead of attempting a copy from an empty path.
    if not partial_file:
        raise FileNotFoundError(
            f"No part-*.{extract_format} file found in {format_path}"
        )

    # Create a copy of just the data at the root level, formatted appropriately
    mssparkutils.fs.cp(partial_file, DELTA_TABLES_FILESYSTEM + "updated_ecr_datastore." + extract_format)

    # Now delete the pyspark junk folder by deleting all virtual filepaths
    mssparkutils.fs.rm(format_path, recurse=True)