In [0]:
%run ./00_functions_and_libraries

In [0]:
%run ./01_params

In [0]:
# Session-level Spark settings: Arrow execution on with fallback off, and a
# fixed session time zone.
session_settings = (
  ('spark.sql.execution.arrow.enabled', True),
  ('spark.sql.execution.arrow.fallback.enabled', False),
  ("spark.sql.session.timeZone", "America/New_York"),
)
for setting_name, setting_value in session_settings:
  spark.conf.set(setting_name, setting_value)

# The config key named by params["AzureSASLocation"] is set to a value read from
# the secret store. The secret is passed straight through and never bound to a
# notebook variable.
spark.conf.set(
  params["AzureSASLocation"],
  dbutils.secrets.get(scope=params["AzureSASScope"], key=params["AzureSASKey"]),
)

# Make the configured database the default for the unqualified table names used below.
spark.sql(f'use {params["Database"]}')

In [0]:
# List the inbound folder ONCE so the path and the file name are guaranteed to
# describe the same entry (two separate listings could disagree if the folder
# changes in between, and each listing is a remote call).
inbound_listing = dbutils.fs.ls(f'{params["AzureET3Container"]}/prod/inbound/')

# NOTE(review): the entry is picked by fixed position (third item of the listing).
# This depends on the folder contents and listing order - confirm this is intended.
inbound_entry = inbound_listing[2]
inbound_file = inbound_entry.path      # full path, used to read and later archive the file
inbound_filename = inbound_entry.name  # bare file name, used to build the archive path
print("inbound filepath-> {}".format(inbound_file),"\n","inbound filename -> {}".format(inbound_filename))

In [0]:
# Read the pipe-delimited inbound file; the first line holds the column names.
sample_data = (
  spark.read
  .format('csv')
  .options(header=True, delimiter='|')
  .load(inbound_file)
)

# Stop here if the file layout differs from InboundFileExpectedSchema
# (defined outside this notebook).
schema_matches = sample_data.schema == InboundFileExpectedSchema
assert schema_matches, "Inbound File Does Not Match Expected Schema!"
print("Inbound File Matches Expected Schema")

In [0]:
# Number of inbound rows per agency_id.
(sample_data
  .groupBy('agency_id')
  .count()
  .display())

In [0]:
# Total inbound rows (printed), then the number of rows with a non-empty mbi
# (shown as the cell result).
inbound_row_count = sample_data.count()
print(inbound_row_count)
sample_data.where('mbi is not null and length(mbi)>0').count()

In [0]:
# Keep a single row per patient_id.
sample_data_dedup = sample_data.drop_duplicates(['patient_id'])

# Sanity check: lists any patient_id that still occurs more than once
# (expected to show no rows).
(sample_data_dedup
  .groupBy('patient_id')
  .count()
  .where('count>1')
  .display())

In [0]:
# Inbound column name -> name used by the rest of this notebook.
column_renames = {
  'home_address': 'address',
  'home_city': 'city',
  'home_county': 'county',
  'home_state': 'state',
  'home_zip_code': 'zip_code',
  'date_of_birth': 'dob',
  'alternate_home_address': 'alternative_address',
}

sample_data_clean = sample_data_dedup
for inbound_name, renamed_to in column_renames.items():
  sample_data_clean = sample_data_clean.withColumnRenamed(inbound_name, renamed_to)

In [0]:
# User overrides
# --------------
# Records flagged as overridden by a user are captured here so the next notebook
# can enforce them. Each flagged record is paired with every other record that
# currently shares its mp_id in pcr_master. This must run BEFORE pcr_master is
# updated further down, so that the groupings looked up are the current ones.

# Current groupings: only the record id and the grouping id are needed.
pcr_master_current = spark.sql("Select patient_id, mp_id from pcr_master")

# Distinct ids of incoming records whose overwrite flag is "Y" (case-insensitive).
overwrite_requested = upper(col("PM_OVERWRITE_FLAG")) == "Y"
new_override_ids = (
  sample_data_clean
  .filter(overwrite_requested)
  .select("PATIENT_ID")
  .withColumnRenamed("PATIENT_ID", "flagged_id")
  .distinct()
)

# Look up the mp_id each flagged record currently belongs to (inner join, so
# flagged ids that are not in pcr_master yet drop out here).
flagged_id_matches_master = new_override_ids.flagged_id == pcr_master_current.patient_id
new_override_ids_with_mpids = (
  new_override_ids
  .join(pcr_master_current, flagged_id_matches_master)
  .drop("patient_id")
)

# Expand each flagged record to all records sharing its mp_id; these are the
# "delinked pairs". Self-pairs (same id on both sides) are filtered out, which
# also removes rows where the partner id is NULL.
is_not_self_pair = col("flagged_id") != col("delinked_partner_id")
new_override_pairs = (
  new_override_ids_with_mpids
  .join(pcr_master_current, "mp_id", "left")
  .withColumnRenamed("patient_id", "delinked_partner_id")
  .filter(is_not_self_pair)
  .select("flagged_id", "delinked_partner_id")
)
new_override_pairs.createOrReplaceTempView("delinking_changes")

# Upsert the new pairs into delinked_pairs, keyed on (flagged_id, delinked_partner_id).
delinked_pairs_merge_sql = """
  MERGE INTO delinked_pairs M USING delinking_changes C
  ON M.flagged_id == C.flagged_id AND M.delinked_partner_id == C.delinked_partner_id
  WHEN MATCHED 
    THEN UPDATE SET *
  WHEN NOT MATCHED 
    THEN INSERT *
"""
spark.sql(delinked_pairs_merge_sql)

In [0]:
batch_df = sample_data_clean

#######################################################################
# Parse the string timestamp fields into timestamp columns
#######################################################################
inbound_timestamp_format = "dd-MMM-yy hh.mm.ss.SSSSSSSSS a"
for timestamp_column in ("DISPATCH_TIMESTAMP", "PCR_RECEIVED_TIMESTAMP", "CLAIMS_UPDATE_TIMESTAMP"):
  # Each column is replaced in place by its parsed value.
  batch_df = batch_df.withColumn(timestamp_column, to_timestamp(timestamp_column, inbound_timestamp_format))

################################################################
# Add empty name-alias ("aka") columns to the batch data
################################################################
# Creates aka_first_name_1..5 followed by aka_last_name_1..5, all NULL strings.
for name_part in ('first', 'last'):
  for alias_number in range(1, 6):
    batch_df = batch_df.withColumn(
      'aka_{}_name_{}'.format(name_part, alias_number),
      lit(None).cast(StringType()),
    )
                   
#####################################
#      DATA CLEANSING
#####################################

# Address values that carry no usable home address. They are compared against the
# trimmed, lower-cased address and the address is set to NULL on a match. The
# list holds generic placeholders ('homeless', 'refused', 'unknown', ...) plus a
# set of specific street addresses.
# NOTE(review): one entry contains the replacement character U+FFFD
# ('...children�s shelter'), which looks like a mis-encoded apostrophe. It is
# kept byte-for-byte because it may be exactly what the source data contains.
placeholder_addresses = ['homeless','refused','unknown','na','transient','homless','no fixed address',
                         '2400 cypress st','2401 cypress st','2400 cypress','2400 cypress st.','2400 cypress st presbyterian night shelter',
                         '2400 cypress st homeless','2401 cypress','2400 cypress ave','2400 cypress homeless','2400 cyprus','e presidio st / cypress st homeless',
                         'cypress st / e lancaster ave','1513 e persideo','2401 cypress st women and children�s shelter',
                         '620 fallsway','620 the fallsway','421 fallsway','725 fallsway','620 fallsway homeless',
                         '1513 e presidio st','1513 e presidio','1513 presidio','1513 presidio st','1513 e presidio junction','1513 e. presidio','1513 e precidio',
                         '600 n henderson st','1321 e lancaster ave','1331 e lancaster ave','2700 n charles st', '123 homeless','1234 homeless','9999 homeless',
                         'unable to obtain', '1306 goodwood ave','2605 loyola southway','2434 w belvedere av']

# An address consisting of exactly one space becomes NULL.
batch_df = batch_df.withColumn('address', when(col('address') == ' ', None).otherwise(col('address')))
batch_df = batch_df.withColumn('address', when(trim(lower(col('address'))).isin(placeholder_addresses), None).otherwise(col('address')))

# Replace literal question marks with a space. The pattern is a raw string:
# '\?' in a normal string literal is an invalid Python escape sequence, which
# raises a warning today and is slated to become an error. The regex is unchanged.
batch_df = batch_df.withColumn('address', regexp_replace('address', r'\?', ' '))

# SSN values treated as placeholders; a matching ssn is set to NULL.
# https://www.ssa.gov/employer/randomization.html add SSNs combos that aren't possible.
placeholder_ssns = [
  '999999999', '000000000', '222222222', '777777777', '555555555',
  '111111111', '666666666', '999999990', '998999999', '123456789',
]
batch_df = batch_df.withColumn(
  'ssn',
  when(col('ssn').isin(placeholder_ssns), None).otherwise(col('ssn')),
)

# Null out BOTH name parts when the (trimmed, lower-cased) first/last name pair
# is one of these placeholder identities.
placeholder_names = (('john', 'doe'), ('jane', 'doe'), ('unknown', 'unknown'))

for placeholder_first, placeholder_last in placeholder_names:
  is_placeholder = (
    (trim(lower(col('first_name'))) == placeholder_first)
    & (trim(lower(col('last_name'))) == placeholder_last)
  )
  # The cleaned first name is staged in a temporary column so that the last-name
  # step can still test against the original first name.
  batch_df = batch_df.withColumn('first_name_temp', when(is_placeholder, None).otherwise(col('first_name')))
  batch_df = batch_df.withColumn('last_name', when(is_placeholder, None).otherwise(col('last_name')))
  batch_df = batch_df.withColumn('first_name', col('first_name_temp')).drop('first_name_temp')

# Set AGE to NULL when it contains any letter, so there is no need to maintain a
# list of arbitrary text values that show up in the column.
# The value is upper-cased first and then tested for at least one character in
# A-Z. This single regex predicate gives the same result as exploding the value
# into a character array and overlapping it with an alphabet array, without the
# temporary columns.
# Only letters are caught: values made of digits plus other symbols (e.g. '?'
# or '-') are left unchanged, exactly as before.
age_contains_letter = upper(col('AGE')).rlike('[A-Z]')
batch_df = batch_df.withColumn('AGE', when(age_contains_letter, None).otherwise(col('AGE')))

#####################################
#     END OF CLEANSING
#####################################


In [0]:
#################################################################
#   Identify data worth passing to ER model
#################################################################

# er_flag is 1 when the non-null count over the fields below is GREATER THAN
# ER_NONNULL_THRESHOLD, otherwise 0.
# (The previous comment said "> 4", but the code has always compared against 10.)
ER_NONNULL_THRESHOLD = 10

# Fields that take part in the non-null count.
er_counted_columns = ["FIRST_NAME", "LAST_NAME", "MIDDLE_NAME", "address",
                      'city', 'state', 'zip_code', 'SSN', 'county',
                      'GENDER', 'RACE', 'dob', 'MBI', 'alternative_address',
                      'AGE', 'AGE_UNITS', 'DRIVERS_LICENSE_NUMBER', 'STATE_ISSUING_DRIVERS_LICENSE']

# nonnull_count is defined outside this notebook; presumably it returns how many
# of the columns passed to it are non-null - TODO confirm.
batch_df = batch_df.withColumn("nonnull_count", nonnull_count(*[col(counted_column) for counted_column in er_counted_columns]))

# The helper column is only needed to derive the flag, so it is dropped right away.
batch_df = batch_df.withColumn('er_flag', when(col('nonnull_count') > ER_NONNULL_THRESHOLD, lit(1)).otherwise(lit(0))).drop('nonnull_count')

  
# Existing assignments from the mp_id table. Every 'M' character is removed from
# mp_id before the value is cast to an integer.
mpid_df = (
  spark.sql("select * from mp_id")
  .withColumn('mp_id', regexp_replace('mp_id', 'M', '').cast(IntegerType()))
)

# Left join keeps every batch record. Records already present in mp_id keep
# their mp_id; the others fall back to their own patient_id as mp_id.
# (The join key is patient_id; an earlier revision used pcr_id.)
join_result = (
  batch_df
  .join(mpid_df, on="patient_id", how="left")
  .withColumn("mp_id", coalesce(col("mp_id"), col("patient_id")))
  .withColumn("pc_flag", lit("P"))
  .withColumn("pm_score", lit(None))
)
join_result.createOrReplaceTempView("PCR_master_changes")

# patient_id -> mp_id pairs taken from the join result, exposed as a temp view
# for the mp_id merge.
mpid_merge_source = join_result.select("patient_id", "mp_id")
mpid_merge_source.createOrReplaceTempView("mpid_changes")

# Both merges below are keyed on patient_id. Earlier revisions of these
# statements were keyed on pcr_id and are superseded.

# Upsert the patient_id -> mp_id pairs into the mp_id table.
mp_id_merge_sql = """
  MERGE INTO mp_id M USING mpid_changes C
  ON M.patient_id == C.patient_id
  WHEN MATCHED 
    THEN UPDATE SET *
  WHEN NOT MATCHED 
    THEN INSERT *
"""
spark.sql(mp_id_merge_sql)

# Upsert the joined batch into the master table.
# <<TO DO>>: Discuss with NewWave: should we rely on key or action provided in the data.
# The alternative considered was to UPDATE only when C.action == "U" and INSERT
# only when C.action == "A". For now the "action" flag is ignored and patient_id
# alone decides whether a record has been processed before.
pcr_master_merge_sql = '''
  MERGE INTO pcr_master M USING PCR_master_changes C
  ON M.patient_id == C.patient_id
  WHEN MATCHED 
    THEN UPDATE SET *
  WHEN NOT MATCHED
    THEN INSERT *
 '''
spark.sql(pcr_master_merge_sql)

In [0]:
%sql
-- Row count of the mp_id table; this cell runs after the cell that merges into mp_id.
select count(*) from mp_id

In [0]:
%sql
-- Row count of the PCR_master table; this cell runs after the cell that merges into it.
select count(*) from PCR_master

In [0]:
# Row count of pcr_master as of the table version just before the latest one,
# for comparison with the current count.
# The version is read from the table history instead of being pinned to a
# literal (previously @v61). A pinned version goes stale after every run and
# makes this cell fail once that version is no longer retained.
# NOTE(review): assumes the intent was "state before the most recent write" -
# confirm; if a fixed baseline is wanted, it should come from params instead.
pcr_master_recent_versions = [
  history_row['version']
  for history_row in (spark.sql('describe history pcr_master')
                      .select('version')
                      .orderBy('version', ascending=False)
                      .limit(2)
                      .collect())
]

if len(pcr_master_recent_versions) > 1:
  pcr_master_previous_count = spark.sql(
    'select * from pcr_master@v{}'.format(pcr_master_recent_versions[1])
  ).count()
else:
  # The table has a single version, so there is nothing earlier to compare against.
  pcr_master_previous_count = None

pcr_master_previous_count

In [0]:
# Completeness report: for the records flagged for entity resolution
# (er_flag == 1) in pcr_master, how many have each matching column populated,
# as a count and as a fraction.
match_cols = ["FIRST_NAME", "LAST_NAME", "MIDDLE_NAME", "address", 'city', 'state', 'zip_code', 'SSN',
              'county', 'GENDER', 'RACE', 'dob', 'MBI', 'AGE', 'DRIVERS_LICENSE_NUMBER',
              'STATE_ISSUING_DRIVERS_LICENSE']

df = spark.sql('select * from pcr_master').filter('er_flag==1')

# A single aggregation replaces one Spark job per column: count(*) is the number
# of flagged records and count(<column>) counts only that column's non-NULL values.
completeness_row = df.selectExpr(
  'count(*) as er_record_count',
  *['count(`{0}`) as `{0}`'.format(match_col) for match_col in match_cols]
).first()

# Deliberately NOT named `count`: that would rebind the global `count` for every
# cell that runs afterwards.
er_record_count = completeness_row['er_record_count']

Results = []
for match_col in match_cols:
  not_null_count = completeness_row[match_col]
  # With no flagged records the share is undefined; report NaN instead of
  # failing with ZeroDivisionError and stopping the notebook.
  not_null_share = not_null_count / er_record_count if er_record_count else float('nan')
  Results.append([match_col, not_null_count, not_null_share])

Results_df = pd.DataFrame(Results, columns=['Variable', 'Count_NotNull', 'Pct_NotNull'])
print(er_record_count)
Results_df

In [0]:
# Copy the processed inbound file to the archive folder under the same file
# name. This is a copy: the inbound file itself is left in place.
archive_filepath = "".join((params["AzureET3Mount"], "/prod/archive/inbound/", inbound_filename))
dbutils.fs.cp(inbound_file, archive_filepath)

In [0]:
# End the notebook run and return this status message as its exit value.
exit_message = "Step 2: PCR Batch Ingest with Sample Data completed successfully"
dbutils.notebook.exit(exit_message)