In [0]:
%run ./00_functions_and_libraries

In [0]:
%run ./01_params

In [0]:
# Session-level Spark settings: Arrow execution switched on with its fallback
# switched off, and the SQL session time zone pinned to America/New_York.
for setting_name, setting_value in (
    ('spark.sql.execution.arrow.enabled', True),
    ('spark.sql.execution.arrow.fallback.enabled', False),
    ("spark.sql.session.timeZone", "America/New_York"),
):
    spark.conf.set(setting_name, setting_value)

# The config key comes from params and its value from the Databricks secret
# scope, so the secret never appears in the notebook source.
# (Presumably this registers an Azure SAS token for storage access - confirm in 01_params.)
spark.conf.set(
    params["AzureSASLocation"],
    dbutils.secrets.get(scope=params["AzureSASScope"], key=params["AzureSASKey"]),
)

# Base directory that later cells read intermediate files from.
tempDataDir = params["tempdir"]

In [0]:
# Weights file from the temp data directory (pipe-delimited CSV with a header row).
df_weights = (
    spark.read.format("csv")
    .option('delimiter', '|')
    .option('header', True)
    .load(f"{tempDataDir}/weights")
)

# mp_ids that are shared by more than one pcr_master row.
# NULL is filtered out on purpose: `x NOT IN (subquery)` can never be TRUE once
# the subquery returns a NULL, so a single NULL group (two or more rows with a
# NULL mp_id) would silently empty the whole unmatched result. The filter has
# no effect on the `IN` (matched) branch.
_DUPLICATE_MPID_SUBQUERY = (
    'Select mp_id from pcr_master where mp_id is not null '
    'group by mp_id having count(*)>1'
)

# pcr_master column name -> export column name.
_PCR_COLUMN_RENAMES = [
    ('address', 'home_address'),
    ('city', 'home_city'),
    ('county', 'home_county'),
    ('state', 'home_state'),
    ('zip_code', 'home_zip_code'),
    ('dob', 'date_of_birth'),
    ('alternative_address', 'alternate_home_address'),
    ('mp_id', 'mpid'),
    ('pc_flag', 'pm_record_type'),
]

# Columns (and their order) of the exported frames.
_PCR_EXPORT_COLUMNS = [
    'patient_id', 'pcr_number', 'last_name', 'first_name', 'middle_name',
    'home_address', 'home_city', 'home_county', 'home_state', 'home_zip_code',
    'ssn', 'gender', 'race', 'age', 'age_units', 'date_of_birth',
    'state_issuing_drivers_license', 'drivers_license_number',
    'alternate_home_address', 'mbi', 'agency_unique_state_id', 'agency_id',
    'agency_state', 'uuid', 'source', 'official_name_flag', 'pm_overwrite_flag',
    'dispatch_timestamp', 'pcr_received_timestamp', 'claims_update_timestamp',
    'action', 'mpid', 'pm_score', 'pm_status', 'pm_record_type', 'pm_date',
    'pm_timestamp',
    'aka_first_name_1', 'aka_first_name_2', 'aka_first_name_3',
    'aka_first_name_4', 'aka_first_name_5',
    'aka_last_name_1', 'aka_last_name_2', 'aka_last_name_3',
    'aka_last_name_4', 'aka_last_name_5',
]

# Timestamp columns that are rendered as strings in the export.
_PCR_TIMESTAMP_COLUMNS = [
    'dispatch_timestamp',
    'pcr_received_timestamp',
    'claims_update_timestamp',
    'pm_timestamp',
]
_PCR_TIMESTAMP_FORMAT = 'dd-MMM-yy hh.mm.ss.SSSSSSSSS a'


def _build_pcr_export(matched):
    """Build the export frame for matched or unmatched pcr_master rows.

    Args:
        matched: True selects rows whose mp_id occurs on more than one
            pcr_master row (pm_status 'M'); False selects rows whose mp_id
            occurs exactly once (pm_status 'U') and nulls out pm_record_type.

    Returns:
        A DataFrame with the columns of _PCR_EXPORT_COLUMNS, pm_score cast to
        integer and the timestamp columns formatted as strings.

    NOTE(review): rows whose own mp_id is NULL satisfy neither `IN` nor
    `NOT IN` and therefore appear in neither frame (same as before this
    refactor) - confirm that is intended.
    """
    membership = 'in' if matched else 'not in'
    frame = spark.sql(
        f'select * from pcr_master where mp_id {membership} ({_DUPLICATE_MPID_SUBQUERY})'
    )

    for source_name, export_name in _PCR_COLUMN_RENAMES:
        frame = frame.withColumnRenamed(source_name, export_name)

    frame = (
        frame
        .withColumn('pm_status', lit('M' if matched else 'U'))
        .withColumn('pm_date', current_date())
        .withColumn('pm_timestamp', current_timestamp())
        .select(*_PCR_EXPORT_COLUMNS)
    )

    frame = frame.withColumn('pm_score', frame['pm_score'].cast(IntegerType()))
    for timestamp_column in _PCR_TIMESTAMP_COLUMNS:
        frame = frame.withColumn(
            timestamp_column, date_format(timestamp_column, _PCR_TIMESTAMP_FORMAT)
        )

    if not matched:
        # change 'P' to null for unmatched records
        frame = frame.withColumn('pm_record_type', lit(None).cast(StringType()))

    return frame


# matched_results: mp_id shared by two or more records
df_matched = _build_pcr_export(matched=True)

# updated_unmatched_results: mp_id that occurs only once
df_unmatched = _build_pcr_export(matched=False)

In [0]:
# Sanity-check counts for the matched frame: total rows, fully distinct rows,
# and row / distinct counts for the key columns.
print('***MATCHED COUNTS***')
for count_label, counted_frame in (
    ('Matched count:', df_matched),
    ('Matched distinct count:', df_matched.distinct()),
    ('Count MPIDs:', df_matched.select('mpid')),
    ('Unique MPIDs:', df_matched.select('mpid').distinct()),
    ('Unique Patient IDs:', df_matched.select('patient_id').distinct()),
    ('pm_status count:', df_matched.select('pm_status')),
):
    # Each .count() triggers its own Spark job.
    print(count_label, counted_frame.count())

In [0]:
# Sanity-check counts for the unmatched frame: total rows, fully distinct rows,
# and row / distinct counts for the key columns.
print('***UNMATCHED COUNTS***')
for count_label, counted_frame in (
    ('Unmatched count:', df_unmatched),
    ('Unmatched distinct count:', df_unmatched.distinct()),
    ('Count MPIDs:', df_unmatched.select('mpid')),
    ('Unique MPIDs:', df_unmatched.select('mpid').distinct()),
    ('Unique Patient IDs:', df_unmatched.select('patient_id').distinct()),
    ('pm_status count:', df_unmatched.select('pm_status')),
):
    # Each .count() triggers its own Spark job.
    print(count_label, counted_frame.count())

In [0]:
# Columns whose values are replaced by a fixed mask in the de-identified copies.
# Columns not listed here (e.g. uuid, source, the timestamps, pm_* fields)
# pass through unchanged.
_MASKED_COLUMNS = (
    'patient_id',
    'pcr_number',
    'last_name',
    'first_name',
    'middle_name',
    'home_address',
    'home_city',
    'home_county',
    'home_state',
    'home_zip_code',
    'ssn',
    'gender',
    'race',
    'age',
    'age_units',
    'date_of_birth',
    'state_issuing_drivers_license',
    'drivers_license_number',
    'alternate_home_address',
    'mbi',
    'agency_unique_state_id',
    'agency_id',
    'agency_state',
    'mpid',
    'aka_first_name_1',
    'aka_first_name_2',
    'aka_first_name_3',
    'aka_first_name_4',
    'aka_first_name_5',
    'aka_last_name_1',
    'aka_last_name_2',
    'aka_last_name_3',
    'aka_last_name_4',
    'aka_last_name_5',
)


def _mask_identifiers(frame):
    """Return a copy of `frame` with every column in _MASKED_COLUMNS set to the literal '*****'.

    Column positions are preserved because withColumn replaces an existing
    column in place.
    """
    masked = frame
    for column_name in _MASKED_COLUMNS:
        masked = masked.withColumn(column_name, lit('*****'))
    return masked


df_matched_deid = _mask_identifiers(df_matched)
df_unmatched_deid = _mask_identifiers(df_unmatched)


In [0]:
# Render each frame next to its masked counterpart for a visual comparison.
# NOTE(review): df_matched and df_unmatched are the unmasked frames, so this
# cell's output contains the original identifier values - clear the output
# before sharing or committing the notebook.
for preview_frame in (df_matched, df_matched_deid, df_unmatched, df_unmatched_deid):
    preview_frame.display()

In [0]:
# Reset the DEID export folders: remove each one recursively, then recreate it
# empty so the writes below start from a clean location.
# (The original cell had an unbalanced "((" on two of the rm calls and could
# not be parsed.)
deid_root = "/mnt/dev_et3/impl/snowflake/DEID"
deid_folders = [f"{deid_root}/{folder_name}" for folder_name in ("unmatched", "matched", "weights")]

for deid_folder in deid_folders:
    dbutils.fs.rm(deid_folder, True)  # second argument: recurse

for deid_folder in deid_folders:
    dbutils.fs.mkdirs(deid_folder)

In [0]:
# Write the weights and the de-identified frames as single-file CSV exports.
# The original cell referenced df_updated_unmatched_deid / df_updated_matched_deid,
# which are not defined in this notebook; the masked frames built above are
# df_unmatched_deid / df_matched_deid, so those are what gets written.
# NOTE(review): the CSV writer defaults apply here (comma delimiter, no header
# row) even though the weights input was pipe-delimited with a header -
# confirm that is what the downstream load expects.
deid_root = "/mnt/dev_et3/impl/snowflake/DEID"

for export_frame, folder_name in (
    (df_weights, "weights"),
    (df_unmatched_deid, "unmatched"),
    (df_matched_deid, "matched"),
):
    # coalesce(1) yields a single part file per folder.
    (
        export_frame.coalesce(1)
        .write.format("csv")
        .mode("overwrite")
        .save(f"{deid_root}/{folder_name}")
    )