In [None]:
# Set paths
# NOTE(review): "$STORAGE_ACCOUNT" looks like a placeholder that is replaced
# before this notebook runs -- confirm against the deployment tooling.
storage_account_name = "$STORAGE_ACCOUNT"
# Both locations live in the same "delta-tables" container of the account.
_delta_tables_root = f"abfss://delta-tables@{storage_account_name}.dfs.core.windows.net"
ECR_DATASTORE_PATH = f"{_delta_tables_root}/ecr-datastore"
PARSED_ECR_PATH = f"{_delta_tables_root}/raw_data"



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    FloatType,
    BooleanType,
    DateType,
    TimestampType,
)
from delta.tables import *
import json
from typing import Tuple

# Prepare Schema
# Each entry maps a field name to [<data type>, <nullable?>]. Insertion order
# matters: get_schemas adds fields to the Spark schema in this order.
schema = {
    # Fixed, one-per-record fields.
    "patient_id": ["string", False],
    "person_id": ["string", False],
    "iris_id": ["string", True],
    "incident_id": ["string", True],
    "last_name": ["string", True],
    "first_name": ["string", True],
    "rr_id": ["string", True],
    "status": ["string", True],
    "conditions": ["string", True],
    "eicr_id": ["string", False],
    "eicr_version_number": ["integer", True],
    "authoring_datetime": ["timestamp", True],
    "provider_id": ["string", True],
    "facility_id_number": ["string", True],
    "facility_name": ["string", True],
    "facility_type": ["string", True],
    "encounter_type": ["string", True],
    "encounter_start_date": ["date", True],
    "encounter_end_date": ["date", True],
    # active_problem_<n> / active_problem_date_<n> for n = 1..5, interleaved.
    **{
        f"{prefix}_{n}": [data_type, True]
        for n in range(1, 6)
        for prefix, data_type in (
            ("active_problem", "string"),
            ("active_problem_date", "date"),
        )
    },
    "reason_for_visit": ["string", True],
    # The same eight nullable fields repeated for test slots n = 1..20.
    **{
        f"{prefix}_{n}": [data_type, True]
        for n in range(1, 21)
        for prefix, data_type in (
            ("test_type", "string"),
            ("test_type_code", "string"),
            ("test_result", "string"),
            ("test_result_interp", "string"),
            ("specimen_type", "string"),
            ("performing_lab", "string"),
            ("specimen_collection_date", "timestamp"),
            ("result_date", "timestamp"),
        )
    },
}

def get_schemas(schema: dict) -> Tuple[StructType, dict]:
    """
    Build the Spark schema and the merge column mapping for the eCR data store.

    :param schema: A dictionary describing each field of the eCR data store, of the
        form `{"fieldname": [<data type>, <nullable?(True/False)>]}`. Supported data
        types are "string", "integer", "float", "boolean", "date" and "timestamp".
    :return: A tuple of (1) a Spark StructType holding one StructField per entry of
        `schema`, in iteration order, and (2) a dictionary mapping every field name
        to "new.<fieldname>", suitable as insert values for a merge whose source is
        aliased "new".
    :raises KeyError: If a field's data type is not one of the supported names.
    """
    spark_types = {
        "string": StringType(),
        "integer": IntegerType(),
        "float": FloatType(),
        "boolean": BooleanType(),
        "date": DateType(),
        "timestamp": TimestampType(),
    }

    struct_fields = []
    merge_schema = {}
    for name, spec in schema.items():
        # spec is [<data type>, <nullable?>]
        struct_fields.append(StructField(name, spark_types[spec[0]], spec[1]))
        merge_schema[name] = "new." + name

    return StructType(struct_fields), merge_schema


# Derive the Spark read schema and the merge insert mapping from the schema definition.
spark_schema, merge_schema = get_schemas(schema)

# Initialize Spark session
session_builder = SparkSession.builder.master("local[*]")
session_builder = session_builder.appName("Update eCR Datastore")
# Enable the Delta Lake SQL extension and register the Delta catalog.
session_builder = session_builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
session_builder = session_builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)
spark = session_builder.getOrCreate()

# Load the parsed eCR JSON files, applying the explicit schema.
new_ecr_records = spark.read.schema(spark_schema).json(PARSED_ECR_PATH)

if not DeltaTable.isDeltaTable(spark, ECR_DATASTORE_PATH):
    # No Delta table at the data store path yet: create it from the new records.
    new_ecr_records.write.format("delta").mode("append").save(ECR_DATASTORE_PATH)
else:
    # Table already exists: insert only records whose eicr_id is not present yet.
    ecr_datastore = DeltaTable.forPath(spark, ECR_DATASTORE_PATH)
    insert_unseen_records = ecr_datastore.alias("old").merge(
        new_ecr_records.alias("new"), "old.eicr_id = new.eicr_id"
    ).whenNotMatchedInsert(values=merge_schema)
    insert_unseen_records.execute()

# Re-open the data store from its path and display a sample plus summary output.
datastore_table = DeltaTable.forPath(spark, ECR_DATASTORE_PATH)
ecr_datastore = datastore_table.toDF()
ecr_datastore.show(n=20)

# Print whether the path holds a Delta table, then the total row count.
is_delta_table = DeltaTable.isDeltaTable(spark, ECR_DATASTORE_PATH)
print(is_delta_table)
record_count = ecr_datastore.count()
print(record_count)
