In [0]:
%run ./00_functions_and_libraries

In [0]:
%run ./01_params

In [0]:
# Session-level Spark settings: Arrow enabled, Arrow fallback disabled, and the
# session time zone pinned to America/New_York (the run timestamp below is
# rendered by Spark SQL, so it follows this time zone).
for conf_key, conf_value in (
    ('spark.sql.execution.arrow.enabled', True),
    ('spark.sql.execution.arrow.fallback.enabled', False),
    ("spark.sql.session.timeZone", "America/New_York"),
):
    spark.conf.set(conf_key, conf_value)

# Set the Spark conf key named by params["AzureSASLocation"] to the value held in
# the secret scope/key that params points at; the secret is never bound to a name.
spark.conf.set(
    params["AzureSASLocation"],
    dbutils.secrets.get(scope=params["AzureSASScope"], key=params["AzureSASKey"]),
)

db = params["Database"]

# Working directories supplied by the params notebook.
tempDataDir = params["tempdir"]
tempDataDir_claims = params["tempdir_claims"]

# Everything below hangs off the ET3 mount prefix.
et3_mount = params["AzureET3Mount"]
outboundDir = et3_mount + "prod/outbound"
archiveDir = et3_mount + "prod/archive/outbound"
snowflakeDir = et3_mount + "prod/snowflake/"

# Run timestamp (yyMMdd_HHmmss) used to name this run's output files.
timestamp_row = spark.sql("select string(date_format(current_timestamp, 'yyMMdd_HHmmss'))").collect()[0]
timestamp = timestamp_row[0]

# Make sure the working database exists, then make it the session default.
for statement in (f"create database if not exists {db}", f"use {db}"):
    spark.sql(statement)

print(f"Input Parameters:\n   Database: {db}\n   Temp Dir:{tempDataDir}\n   Temp Dir (claims):{tempDataDir_claims}\n   Outbound Dir:{outboundDir}\n   Archive Dir:{archiveDir}\n   Snowflake Dir:{snowflakeDir}\n")

In [0]:
def _first_csv_path(subdir):
    """Return the /dbfs path of the first '.csv' entry under {tempDataDir}/{subdir}.

    Args:
        subdir: directory name below tempDataDir (e.g. "weights").

    Returns:
        The local-file-API path ("/dbfs" + directory + "/" + file name) of the
        first listed entry whose name contains ".csv".

    Raises:
        FileNotFoundError: if no listed entry has ".csv" in its name. This
            replaces the opaque IndexError a bare [0] would give.
    """
    source_dir = f"{tempDataDir}/{subdir}"
    for entry in dbutils.fs.ls(source_dir):
        if ".csv" in entry.name:
            return f"/dbfs{source_dir}/{entry.name}"
    raise FileNotFoundError(f"No .csv file found in {source_dir}")


# Resolve every input before the output file is created: the output is opened in
# 'x' (exclusive-create) mode, so a failure after creation would leave a partial
# file behind that blocks a re-run with the same timestamp.
weights_file = _first_csv_path("weights")
updated_matched_file = _first_csv_path("updated_matched")
updated_unmatched_file = _first_csv_path("updated_unmatched")
file_names = [weights_file, updated_matched_file, updated_unmatched_file]

# Concatenate the three CSVs, in this order, into a single PCR response file.
with open(f"/dbfs{outboundDir}/pm_response_output_{timestamp}.txt", 'x') as outfile:
    for fname in file_names:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)

In [0]:
# Locate the claims CSV under {tempDataDir_claims}/claims_to_rf. The first listed
# entry whose name contains ".csv" is used; an explicit error replaces the opaque
# IndexError a bare [0] would raise when the directory holds no CSV.
claims_dir = f"{tempDataDir_claims}/claims_to_rf"
claims_csv_names = [f.name for f in dbutils.fs.ls(claims_dir) if ".csv" in f.name]
if not claims_csv_names:
  raise FileNotFoundError(f"No .csv file found in {claims_dir}")
claims_file = f"/dbfs{claims_dir}/{claims_csv_names[0]}"

# Copy the claims CSV line by line into the timestamped claims response file.
# 'x' mode refuses to overwrite an existing file with the same timestamp.
with open(f"/dbfs{outboundDir}/pm_response_output_claims_{timestamp}.txt", 'x') as outfile:
  with open(claims_file) as infile:
    for line in infile:
      outfile.write(line)

In [0]:
# Per-field match weights and the overall match threshold, as supplied by params.
match_threshold = params["Match Threshold"]

# Name fields
w_LName = params["WEIGHT: Last Name"]
w_FName = params["WEIGHT: First Name"]
w_MName = params["WEIGHT: Middle Initial/Name"]

# Home address fields
w_Street = params["WEIGHT: Patient's Home Address"]
w_City = params["WEIGHT: Patient's Home City"]
w_County = params["WEIGHT: Patient's Home County"]
w_State = params["WEIGHT: Patient's Home State"]
w_Zip = params["WEIGHT: Patient's Home ZIP Code"]

# Identifiers and demographics
w_SSN = params["WEIGHT: Social Security Number"]
w_Gender = params["WEIGHT: Gender"]
w_Race = params["WEIGHT: Race"]
w_Age = params["WEIGHT: Age"]
w_Dob = params["WEIGHT: Date of Birth"]
w_Lic = params["WEIGHT: Driver's License Number"]
w_MBI = params["WEIGHT: MBI"]
w_StateDL = params["WEIGHT: State Issuing Driver's License"]

# Two non-nullable columns: the published parameter name and its numeric weight.
w_schema = (
    StructType()
    .add('param_name', StringType(), False)
    .add('param_weight', FloatType(), False)
)

# (published name, raw value) pairs in the row order of the output table.
raw_weights = [
    ('Threshold_Percentage', match_threshold),
    ('Last_Name_WP', w_LName),
    ('First_Name_WP', w_FName),
    ('Middle_Name_WP', w_MName),
    ("Address_WP", w_Street),
    ('City_WP', w_City),
    ('County_WP', w_County),
    ('State_WP', w_State),
    ('ZIP_Code_WP', w_Zip),
    ('SSN_WP', w_SSN),
    ('Gender_WP', w_Gender),
    ('Race_WP', w_Race),
    ('Age_WP', w_Age),
    ('DOB_WP', w_Dob),
    ('Drivers_License_Number_WP', w_Lic),
    ('MBI_WP', w_MBI),
    ('State_Issuing_Drivers_License_WP', w_StateDL),
]

# Coerce every value to float so it satisfies the FloatType column.
weights = [(param_name, float(raw_value)) for param_name, raw_value in raw_weights]
df_weights = spark.createDataFrame(weights, w_schema)

In [0]:
# Build the matched / unmatched patient-matching result sets from pcr_master and
# publish them, together with df_weights, to Snowflake.
#
# The matched and unmatched sets go through the same rename / select / format
# pipeline, and all three tables are written with the same writer settings, so
# both steps live in helpers instead of being copy-pasted.

# pcr_master column -> published column name.
_COLUMN_RENAMES = [
    ('address', 'home_address'),
    ('city', 'home_city'),
    ('county', 'home_county'),
    ('state', 'home_state'),
    ('zip_code', 'home_zip_code'),
    ('dob', 'date_of_birth'),
    ('alternative_address', 'alternate_home_address'),
    ('mp_id', 'mpid'),
    ('pc_flag', 'pm_record_type'),
]

# Published column order, shared by the matched and unmatched tables.
_RESULT_COLUMNS = [
    'patient_id', 'pcr_number', 'last_name', 'first_name', 'middle_name',
    'home_address', 'home_city', 'home_county', 'home_state', 'home_zip_code',
    'ssn', 'gender', 'race', 'age', 'age_units', 'date_of_birth',
    'state_issuing_drivers_license', 'drivers_license_number',
    'alternate_home_address', 'mbi', 'agency_unique_state_id', 'agency_id',
    'agency_state', 'uuid', 'source', 'official_name_flag', 'pm_overwrite_flag',
    'dispatch_timestamp', 'pcr_received_timestamp', 'claims_update_timestamp',
    'action', 'mpid', 'pm_score', 'pm_status', 'pm_record_type', 'pm_date',
    'pm_timestamp',
    'aka_first_name_1', 'aka_first_name_2', 'aka_first_name_3', 'aka_first_name_4', 'aka_first_name_5',
    'aka_last_name_1', 'aka_last_name_2', 'aka_last_name_3', 'aka_last_name_4', 'aka_last_name_5',
]

# Timestamp columns that are published as formatted strings, and their pattern.
_TIMESTAMP_COLUMNS = ['dispatch_timestamp', 'pcr_received_timestamp', 'claims_update_timestamp', 'pm_timestamp']
_TIMESTAMP_PATTERN = 'dd-MMM-yy hh.mm.ss.SSSSSSSSS a'


def _build_results(query, pm_status):
    """Run `query` against pcr_master and shape the rows for publication.

    Args:
        query: SQL text selecting the pcr_master rows for this result set.
        pm_status: literal stored in the pm_status column ('M' or 'U' below).

    Returns:
        A DataFrame with the pcr_master columns renamed, pm_status / pm_date /
        pm_timestamp added, columns in _RESULT_COLUMNS order, pm_score cast to
        integer, and the timestamp columns rendered as formatted strings.
    """
    results = spark.sql(query)
    for source_name, published_name in _COLUMN_RENAMES:
        results = results.withColumnRenamed(source_name, published_name)
    results = (results
               .withColumn('pm_status', lit(pm_status))
               .withColumn('pm_date', current_date())
               .withColumn('pm_timestamp', current_timestamp())
               .select(*_RESULT_COLUMNS))
    results = results.withColumn('pm_score', results['pm_score'].cast(IntegerType()))
    for ts_column in _TIMESTAMP_COLUMNS:
        results = results.withColumn(ts_column, date_format(ts_column, _TIMESTAMP_PATTERN))
    return results


def _write_to_snowflake(frame, table):
    """Overwrite the Snowflake table `table` with `frame` using the shared `options`."""
    (frame.write
          .format("snowflake")
          .options(**options)
          .option("parallelism", "8")
          .option("dbtable", table)
          .mode('overwrite')
          .save())


# updated_matched_results: rows whose mp_id is shared by more than one pcr_master row.
df_matched = _build_results(
    'select * from pcr_master where mp_id in (Select mp_id from pcr_master group by mp_id having count(*)>1)',
    'M',
)

# updated_unmatched_results: rows whose mp_id is not in that shared set.
# NOTE(review): with SQL NOT IN, a NULL mp_id coming out of the subquery makes the
# predicate unknown for every row (empty result), and rows whose own mp_id is NULL
# land in neither set - confirm mp_id is never NULL in pcr_master.
df_unmatched = _build_results(
    "Select * from pcr_master where mp_id not in (Select mp_id from pcr_master group by mp_id having count(*)>1)",
    'U',
)

# Unmatched rows additionally get pm_record_type replaced by a NULL string and
# explicit casts on patient_id / date_of_birth / pm_date.
df_unmatched = (df_unmatched
                .withColumn('pm_record_type', lit(None).cast(StringType()))
                .withColumn("patient_id", df_unmatched.patient_id.cast(IntegerType()))
                .withColumn("date_of_birth", df_unmatched.date_of_birth.cast(DateType()))
                .withColumn("pm_date", df_unmatched.pm_date.cast(DateType())))

# Snowflake connection details come from the "developers" secret scope.
user = dbutils.secrets.get("developers", "snowflake-databricks-username")
password = dbutils.secrets.get("developers", "snowflake-databricks-pw")
database = dbutils.secrets.get("developers", "snowflake-databricks-database")
warehouse = dbutils.secrets.get("developers", "snowflake-databricks-warehouse")

# Connector options shared by every write below.
# NOTE(review): overwrite mode with truncate_table ON and usestagingtable OFF
# writes straight into the target table - confirm a failed load leaving the table
# empty or partial is acceptable.
options = {"sfUrl": "cms_idos.us-gov-virginia.azure.snowflakecomputing.com:443",
           "sfUser": user,
           "sfPassword": password,
           "sfDatabase": database,
           "sfSchema": "ET3_LOOKER_SCHEMA",
           "truncate_table": "ON",
           "sfWarehouse": warehouse,
           "usestagingtable": "OFF"}

# Write to the Prod environment (same order as before: matched, unmatched, weights).
_write_to_snowflake(df_matched, "ET3_LOOKER_SCHEMA.UPDATED_MATCHED_RESULTS")
_write_to_snowflake(df_unmatched, "ET3_LOOKER_SCHEMA.UPDATED_UNMATCHED_RESULTS")
_write_to_snowflake(df_weights, "ET3_LOOKER_SCHEMA.WEIGHTS")

In [0]:
%sh
# RF has requested that both the PCR and Claims response files be EFT'd to them in a single gzip. The current DevOps EFT script will gzip, but we must TAR the files into a single archive file first, because the GZIP utility only compresses one file at a time. Below is the filename breakdown required for the final TAR:

# [Var0]#EFT.ON.DR[Var1].ET3PMR.PO1.D[Var2].T[Var3]0
# Var0 = P for PROD, T for all other environments
# Var1 = the destination folder in MAG. Possible values and their corresponding destinations:
#   1 = RF DEV0
#   2 = RF DEV1
#   3 = RF VAL0
#   4 = RF VAL1
#   X = RF PROD
# Var2 = YYMMDD
# Var3 = HHMMSS
# NOTE: there is a literal '0' stuck on the end.
# NOTE2: the tar must contain files, not a directory. The latter creates metadata that is incompatible with RF's ingestion script, so you must navigate to the folder before running tar.
# NOTE3: be sure to change the filenames and locations to match the Databricks environment, the RF destination, and the current files' timestamps!
# NOTE(review): the template above spells the segment "PO1" (letter O) while the command below uses "P01" (zero) - confirm which one RF expects.
# NOTE(review): the cells that create the response files name them with a ".txt" suffix, but the input names below have none - confirm they match what is on disk.
# NOTE(review): the "z" flag makes tar gzip the archive itself; the text above says the DevOps EFT script also gzips - confirm the double compression is intended.

cd /dbfs/mnt/edfr/et3/prod/outbound
tar cvzf P#EFT.ON.DRX.ET3PMR.P01.D220502.T1553330 pm_response_output_220502_155333 pm_response_output_claims_220311_163756

In [0]:
%sh
# Print the SHA-256 checksum of the archive in the outbound folder.
# The archive name is hard-coded and must be updated by hand to match the file being checked.
cd /dbfs/mnt/edfr/et3/prod/outbound
sha256sum P#EFT.ON.DRX.ET3PMR.P01.D220502.T1553330

In [0]:
%sql
-- Ad hoc lookup of two pcr_master rows by patient_id.
-- NOTE(review): the saved result below this query contains patient-identifying
-- fields (name, address, SSN, DOB, MBI); clear the cell output before sharing.
SELECT *
FROM pcr_master
WHERE patient_id IN ('10002495', '10004313')

patient_id,pcr_number,last_name,first_name,middle_name,address,city,county,state,zip_code,ssn,gender,race,age,age_units,dob,state_issuing_drivers_license,drivers_license_number,alternative_address,mbi,agency_unique_state_id,agency_id,agency_state,uuid,source,official_name_flag,pm_overwrite_flag,dispatch_timestamp,pcr_received_timestamp,claims_update_timestamp,action,mp_id,pc_flag,pm_score,aka_first_name_1,aka_first_name_2,aka_first_name_3,aka_first_name_4,aka_first_name_5,aka_last_name_1,aka_last_name_2,aka_last_name_3,aka_last_name_4,aka_last_name_5,er_flag
10004313,c4eea0d2172648ad9723acc7007cd788,AGOSTO,LYNETTE,,57 NORTH 18TH STREET,City of Harrisburg,Dauphin,PA,17103,590187614.0,Female,White,39,Years,1981-08-20,,,,9Q53T82YW91,22027,22027,PA,,PCR,N,N,2021-02-06T14:30:08.000+0000,2021-11-10T19:59:03.000+0000,2022-02-04T11:22:13.650+0000,A,M10004313,P,,LYNETTE,,,,,AGOSTO,,,,,1
10002495,59bb450d379f46a08cb5acc300836a27,AGOSTO,LYNETTE,,57 N 18th St,City of Harrisburg,Dauphin,PA,17104,,Female,Hispanic or Latino,39,Years,1981-08-20,,,,,22027,22027,PA,,PCR,N,N,2021-02-02T14:55:00.000+0000,2021-11-10T19:58:54.000+0000,,A,M10002495,P,,LYNETTE,,,,,AGOSTO,,,,,1


In [0]:
# Ad hoc inspection: load claims_master and render it in the notebook.
claims_master_query = 'select * from claims_master'
df = spark.sql(claims_master_query)
df.display()