In [None]:
import pyspark
from delta import *
from pyspark.sql import Row
from pyspark.sql.functions import lit, col
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType
from datetime import datetime

# Attach to the active Spark session, creating one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Timestamp of this run; it becomes the file name of both TSV outputs below.
current_datetime = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

# NOTE(review): "$STORAGE_ACCOUNT" looks like a placeholder that is substituted
# before the notebook is executed -- confirm against the deployment tooling.
STORAGE_ACCOUNT = "$STORAGE_ACCOUNT"

# Input locations.
ECR_DELTA_TABLE_FILE_PATH = "ecr-datastore"
COVID_IDENTIFICATION_CONFIG_FILE_PATH = f"abfss://delta-tables@{STORAGE_ACCOUNT}.dfs.core.windows.net/covid_identification_config.json"

# Output locations: one for rows that have an incident ID, one for rows without.
ECR_WITH_IDS_FILE_PATH = f"abfss://delta-tables@{STORAGE_ACCOUNT}.dfs.core.windows.net/ecr_tsvs/matched_ecrs/{current_datetime}.tsv"
ECR_WITHOUT_IDS_FILE_PATH = f"abfss://delta-tables@{STORAGE_ACCOUNT}.dfs.core.windows.net/ecr_tsvs/unmatched_ecrs/{current_datetime}.tsv"

# ECR records, read from the Delta table.
ecr = spark.read.format("delta").load(ECR_DELTA_TABLE_FILE_PATH)

# COVID identification config: a multi-line JSON document. The value of its
# `covid_test_type_codes` field in the first row is pulled back to the driver.
df = spark.read.json(COVID_IDENTIFICATION_CONFIG_FILE_PATH, multiLine=True)
covid_test_type_codes = df.select("covid_test_type_codes").collect()[0][0]


# Column-name prefixes that together describe one lab test. Appending a test
# number (e.g. "1") to each prefix gives that test's column names.
TEST_TEMPLATE = [
    f"{field_stem}_"
    for field_stem in (
        "test_type",
        "test_result",
        "test_result_interp",
        "specimen_type",
        "performing_lab",
        "specimen_collection_date",
        "result_date",
    )
]

# Maps ECR datastore column names to the column headers used in the TSV output.
# Headers that have no source column ("Section action", "Section instance",
# "Comments", "Note") are not listed here, and no alias is defined for the
# test_result_interp_* columns.
iris_tsv_column_aliases = {
    "incident_id": "Incident ID",
    # NOTE(review): the header below starts with a space; kept as-is because it
    # is part of the emitted output -- confirm whether that is intentional.
    "last_name": " Last name",
    "first_name": "First name",
    "rr_id": "RR ID",
    "status": "Status",
    "conditions": "Conditions",
    "eicr_id": "eICR ID",
    "eicr_version_number": "eICR Version Number",
    "authoring_datetime": "Authoring date/time",
    "provider_id": "Provider ID",
    "facility_id_number": "Facility ID Number",
    "facility_name": "Facility Name",
    "facility_type": "Facility Type/Hospital unit",
    "encounter_type": "Encounter Details: type",
    "encounter_start_date": "Encounter Details: date (from)",
    "encounter_end_date": "Encounter Details: date (to)",
    # Active problems 1-5, each immediately followed by its noted date.
    **{
        source_column: header
        for n in range(1, 6)
        for source_column, header in (
            (f"active_problem_{n}", f"Active Problem {n}"),
            (f"active_problem_date_{n}", f"Active Problem Noted Date {n}"),
        )
    },
    "reason_for_visit": "Reason for visit",
    # Tests 1 and 2: six aliased fields per test, in output order.
    **{
        f"{field_stem}_{n}": f"{header_stem} {n}"
        for n in (1, 2)
        for field_stem, header_stem in (
            ("test_type", "Test Type"),
            ("test_result", "Test Result"),
            ("specimen_type", "Specimen Type"),
            ("performing_lab", "Performing Lab"),
            ("specimen_collection_date", "Specimen Collection Date"),
            ("result_date", "Result Date"),
        )
    },
}


def filter_tests(df_row, test_cols):
    """
    Convert a row to a dictionary and record which of its tests are COVID tests.

    Each column named in `test_cols` is checked against the module-level
    `covid_test_type_codes`. For every column whose value is one of those codes,
    the text after the column name's last underscore (the test number, as a
    string) is collected. The resulting list is stored under the new
    `covid_tests` key of the returned dictionary.
    """
    row_dict = df_row.asDict()
    covid_test_numbers = []
    for type_column in test_cols:
        if df_row[type_column] in covid_test_type_codes:
            covid_test_numbers.append(type_column.rsplit("_", 1)[-1])
    row_dict["covid_tests"] = covid_test_numbers
    return row_dict

def identify_recent_tests(row_dict):
    """
    Record which COVID tests in a row are the most and second most recent.

    `row_dict` must contain a `covid_tests` list of test numbers and, for each
    listed number N, a `result_date_N` entry. Two keys are added to the
    dictionary, which is modified in place and also returned:

    * `recent_covid_test_1`: number of the test with the latest result date.
    * `recent_covid_test_2`: number of the test with the next latest date.

    With no COVID tests both keys are None; with exactly one, only
    `recent_covid_test_1` is populated. Tests whose result date is None are
    ranked after every dated test instead of raising a comparison error.
    """
    covid_tests = row_dict["covid_tests"]
    row_dict["recent_covid_test_1"] = None
    row_dict["recent_covid_test_2"] = None
    if not covid_tests:
        return row_dict
    if len(covid_tests) == 1:
        # Assignment, not comparison: the lone test is the most recent one.
        row_dict["recent_covid_test_1"] = covid_tests[0]
        return row_dict

    def recency(test_number):
        result_date = row_dict["result_date_" + str(test_number)]
        # The leading flag orders undated tests below dated ones, so None is
        # never compared against a real date.
        return (result_date is not None, result_date)

    # The sort is stable, so tests sharing a date keep their original order.
    newest_first = sorted(covid_tests, key=recency, reverse=True)
    row_dict["recent_covid_test_1"] = newest_first[0]
    row_dict["recent_covid_test_2"] = newest_first[1]
    return row_dict

def rewrite_test_x_with_test_y(row_dict, test_to_rewrite):
    """
    Overwrite one test slot of a row with the matching recent COVID test.

    The test number stored under `recent_covid_test_<test_to_rewrite>` names
    the source test. Every field listed in `TEST_TEMPLATE` is copied from the
    source test's columns into the columns of test `test_to_rewrite`. When no
    recent test is recorded (the value is None) the row is returned untouched.
    The dictionary is modified in place and also returned.

    NOTE(review): the copy reads the row's current values. If this is called
    for slot 1 and then slot 2 on the same dictionary, and the second most
    recent test is test 1, slot 2 receives the values already written to slot 1.
    """
    target_slot = str(test_to_rewrite)
    source_test = row_dict["recent_covid_test_" + target_slot]
    if source_test is not None:
        for field_prefix in TEST_TEMPLATE:
            row_dict[field_prefix + target_slot] = row_dict[field_prefix + source_test]
    return row_dict
    

# Explicitly define new schema to prevent interpretation breakage during RDD map.
# A fresh StructType is built because StructType.add() appends to the schema
# object it is called on, which here would be the one held by `ecr`.
new_schema = StructType(
    list(ecr.schema.fields)
    + [
        StructField("covid_tests", StringType(), True),
        StructField("recent_covid_test_1", StringType(), True),
        StructField("recent_covid_test_2", StringType(), True),
    ]
)


def _rewrite_top_two_tests(row_dict):
    """
    Fill test slots 1 and 2 with the most and second most recent COVID tests.

    Both slots are computed from an unmodified snapshot of the row. Rewriting
    slot 1 and then slot 2 on the same dictionary would let slot 2 read values
    that slot 1 had just overwritten whenever the second most recent test is
    test 1, duplicating the most recent test into both slots.
    """
    pristine = dict(row_dict)
    for slot in (1, 2):
        # Each call works on its own copy, so it always sees original values.
        rewritten = rewrite_test_x_with_test_y(dict(pristine), slot)
        for test_field in TEST_TEMPLATE:
            slot_key = test_field + str(slot)
            if slot_key in rewritten:
                row_dict[slot_key] = rewritten[slot_key]
    return row_dict


# Apply map functions to parallelize row processing and identify covid tests
test_cols = [c for c in ecr.columns if "test_type" in c]
rdd2 = ecr.rdd.map(lambda x: filter_tests(x, test_cols))
rdd3 = rdd2.map(identify_recent_tests)
rdd4 = rdd3.map(_rewrite_top_two_tests)
rdd5 = rdd4  # same RDD; name retained in case other cells refer to it

# Convert back to DF and drop the temp cols we created as well as superfluous tests
ecr = spark.createDataFrame(rdd5, new_schema)
cols_to_drop = ["covid_tests", "recent_covid_test_1", "recent_covid_test_2"]
# Only tests 1 and 2 are kept; the columns of tests 3 through 20 are removed.
for i in range(3, 21):
    for col_name in TEST_TEMPLATE:
        cols_to_drop.append(col_name + str(i))
ecr = ecr.drop(*cols_to_drop)

# Now, rename all the processed columns to what LAC would like in their output
def _aliased(*source_columns):
    """Return the given columns, each renamed via `iris_tsv_column_aliases`."""
    return [col(name).alias(iris_tsv_column_aliases[name]) for name in source_columns]


# Output columns in their final left-to-right order. Literal columns are the
# ones that have no counterpart in the source data.
output_columns = [
    *_aliased("incident_id"),
    # Not selected: iris_id, patient_id, person_id.
    *_aliased("first_name", "last_name"),
    lit("[ListSectionInsert]").alias("Section action"),
    lit(None).cast('string').alias("Section instance"),
    *_aliased(
        "rr_id",
        "status",
        "conditions",
        "eicr_id",
        "eicr_version_number",
        "authoring_datetime",
        "provider_id",
        "facility_id_number",
        "facility_name",
        "facility_type",
        "encounter_type",
        "encounter_start_date",
        "encounter_end_date",
        "active_problem_1",
        "active_problem_date_1",
        "active_problem_2",
        "active_problem_date_2",
        "active_problem_3",
        "active_problem_date_3",
        "active_problem_4",
        "active_problem_date_4",
        "active_problem_5",
        "active_problem_date_5",
        "reason_for_visit",
    ),
    lit(None).cast('string').alias("Comments"),
    *_aliased(
        "test_type_1",
        "test_result_1",
        "specimen_type_1",
        "performing_lab_1",
        "specimen_collection_date_1",
        "result_date_1",
        "test_type_2",
        "test_result_2",
        "specimen_type_2",
        "performing_lab_2",
        "specimen_collection_date_2",
        "result_date_2",
    ),
    lit("eCR data added to UDF through import utility").alias("Note"),
]
ecr = ecr.select(*output_columns)

In [None]:
def tsv_flatten(row):
    """
    Expand one output row into the pair of TSV rows written for it.

    The first row of the pair is a header-style row: it carries the incident
    ID, first name, last name, section action, section instance, comments and
    note of `row`, with None in every other position. The second row of the
    pair is `row` itself. An iterator over the two is returned so the function
    can be used with `flatMap`.
    """
    leading_values = tuple(
        row[column]
        for column in (
            iris_tsv_column_aliases["incident_id"],
            iris_tsv_column_aliases["first_name"],
            iris_tsv_column_aliases["last_name"],
            "Section action",
            "Section instance",
        )
    )
    header_row = (
        leading_values
        + (None,) * 24  # the 24 columns between "Section instance" and "Comments"
        + (row["Comments"],)
        + (None,) * 12  # the 12 test columns between "Comments" and "Note"
        + (row["Note"],)
    )
    return iter([header_row, row])

# Every row becomes two rows: a sparse header row followed by the full row.
flattend_rdd = ecr.rdd.flatMap(tsv_flatten)

# The header rows contain None in most positions, so building a DataFrame
# against a schema with non-nullable fields would fail. Since this data is only
# being written out, every field is marked nullable first.
for schema_field in ecr.schema:
    schema_field.nullable = True
flattened_df = spark.createDataFrame(flattend_rdd, ecr.schema)


# Now split this up into those that have incident IDs and those that don't
df_with_ids = flattened_df.filter(flattened_df["Incident ID"].isNotNull())
df_no_ids = flattened_df.filter(flattened_df["Incident ID"].isNull())

# Each subset is written as one tab-separated output with a header line,
# replacing anything already at the destination.
for subset, destination in (
    (df_with_ids, ECR_WITH_IDS_FILE_PATH),
    (df_no_ids, ECR_WITHOUT_IDS_FILE_PATH),
):
    tsv_writer = (
        subset.coalesce(1)
        .write.format("csv")
        .mode("overwrite")
        .option("header", "true")
        .option("sep", "\t")
    )
    tsv_writer.csv(destination)