In [None]:
from pyspark.sql import SparkSession
from delta.tables import *
from pyspark.sql.functions import *

# Storage locations: the two Delta tables and the JSON config that lists which
# conditions / test types / results identify a covid case.
ECR_DELTA_TABLE_FILE_PATH = "/delta-tables/ecr_datastore"
MCI_DELTA_TABLE_FILE_PATH = "/delta-tables/mci"
COVID_IDENTIFICATION_CONFIG_FILE_PATH = "/config/covid_identification_config.json"

spark = SparkSession.builder.getOrCreate()

# Read in data
ecr = spark.read.format("delta").load(ECR_DELTA_TABLE_FILE_PATH)

# Only three MCI columns are needed; each gets an `_mci` suffix so it cannot
# collide with a same-named ECR column once the two frames are joined.
mci = (
    spark.read.format("delta")
    .load(MCI_DELTA_TABLE_FILE_PATH)
    .select(
        col("incident_id").alias("incident_id_mci"),
        col("person_id").alias("person_id_mci"),
        col("specimen_collection_date").alias("specimen_collection_date_mci"),
    )
)


def _first_config_value(config_df, field):
    """Return the value stored under `field` in the first row of `config_df`.

    Raises IndexError when the config DataFrame has no rows.
    """
    return config_df.select(field).collect()[0][0]


# Covid identification data
# NOTE(review): each field is presumably a JSON array of lowercase strings, since
# the values are later passed to `isin` against lowercased columns -- confirm.
df = spark.read.json(COVID_IDENTIFICATION_CONFIG_FILE_PATH)
covid_conditions = _first_config_value(df, 'covid_conditions')
covid_test_types = _first_config_value(df, 'covid_test_types')
covid_positive_results = _first_config_value(df, 'covid_positive_results')

In [None]:
# Add `comparison_date` column to ecr data ahead of join with mci to find positive covid tests.
#
# Each ECR row carries up to 20 numbered test slots (test_type_N, test_result_N,
# specimen_collection_date_N). `comparison_date` is the specimen collection date of the
# lowest-numbered slot whose test type and result both identify a positive covid test,
# provided the row's `conditions` value is a covid condition; otherwise it is null.
#
# The branches are generated in a loop so that every slot is checked against its own
# three columns (a hand-written chain makes it easy to pair slot 13's result with
# slot 12's test type).
ECR_TEST_SLOT_COUNT = 20

# The condition check does not depend on the slot, so build it once.
is_covid_condition = lower(ecr.conditions).isin(covid_conditions)

comparison_date = None
for slot in range(1, ECR_TEST_SLOT_COUNT + 1):
    is_positive_covid_test = (
        lower(ecr[f"test_type_{slot}"]).isin(covid_test_types)
        & lower(ecr[f"test_result_{slot}"]).isin(covid_positive_results)
        & is_covid_condition
    )
    slot_date = ecr[f"specimen_collection_date_{slot}"]
    # The first branch starts the CASE expression; later ones extend it, preserving
    # slot order so the lowest matching slot wins.
    comparison_date = (
        when(is_positive_covid_test, slot_date)
        if comparison_date is None
        else comparison_date.when(is_positive_covid_test, slot_date)
    )

ecr = ecr.withColumn("comparison_date", comparison_date.otherwise(lit(None)))


In [None]:
# Join MCI and ECR to get ecr updates (positive covid tests).
# A row pairs up when the ECR `iris_id` equals the MCI person id and the ECR
# comparison date is at most 90 days after the MCI specimen collection date.
# Rows whose `comparison_date` is null never satisfy the date predicate, so the
# inner join drops them.
# NOTE(review): `datediff(a, b) <= 90` has no lower bound, so a comparison date
# that is arbitrarily earlier than the MCI date also matches -- confirm whether
# an absolute or 0..90 day window was intended.
same_person = ecr.iris_id == mci.person_id_mci
within_window = datediff(ecr.comparison_date, mci.specimen_collection_date_mci) <= 90

ecr_updates = (
    ecr.join(mci, same_person & within_window, "inner")
    .select("iris_id", "incident_id_mci")
    # Re-declare the column names on the projected frame (kept from the original flow).
    .toDF("iris_id", "incident_id_mci")
)


In [None]:
# Load ecr delta table
ecr_main = DeltaTable.forPath(spark, ECR_DELTA_TABLE_FILE_PATH)

# Merge in ecr updates such that the incident_id is updated.
#
# Delta rejects a MERGE in which more than one source row matches the same target
# row. The join that builds `ecr_updates` can emit the same (iris_id, incident_id_mci)
# pair several times, so exact duplicates are collapsed before merging.
# NOTE(review): if one iris_id maps to several *different* incident ids the merge
# will still fail -- a rule for choosing one incident per person is needed.
# NOTE(review): the match key here is `ecr.person_id`, whereas `ecr_updates` was
# built by joining on `ecr.iris_id` -- confirm these identify the same person key.
(
    ecr_main.alias("ecr")
    .merge(
        ecr_updates.distinct().alias("ecr_updates"),
        "ecr.person_id = ecr_updates.iris_id",
    )
    .whenMatchedUpdate(set={"incident_id": "ecr_updates.incident_id_mci"})
    .execute()
)
