In [0]:
from pyspark.sql import SparkSession, functions as f

# Load the bronze-layer encounter extracts for each hospital and preview them.
# `spark` and `display` are supplied by the notebook runtime.

# Hospital A encounters
df_hosa = spark.read.format("parquet").load("/mnt/bronze/hosa/encounters")
display(df_hosa)

# Hospital B encounters
df_hosb = spark.read.format("parquet").load("/mnt/bronze/hosb/encounters")
display(df_hosb)


In [0]:
# Stack the two hospital extracts into one DataFrame. unionByName aligns
# columns by name rather than by position, so both inputs must expose the
# same column names.
df_merged = df_hosa.unionByName(df_hosb)
# Register the combined data as the temp view "encounters" so the %sql cells
# below can query it.
df_merged.createOrReplaceTempView("encounters")

In [0]:
%sql
-- Preview the combined encounters from both hospitals.
SELECT *
FROM encounters;

In [0]:
%sql
-- Build the quality_check view: the encounters data reshaped for the silver
-- table, plus a quarantine flag for rows missing a required identifier.
CREATE OR REPLACE TEMP VIEW quality_check AS
SELECT
  -- Surrogate key: source encounter id + '-' + datasource.
  -- concat() returns NULL when either input is NULL, so such rows get a NULL key.
  concat(EncounterID, '-', datasource) AS EncounterID,
  -- Original source values are kept under SRC_-prefixed names.
  EncounterID                          AS SRC_EncounterID,
  PatientID,
  EncounterDate,
  EncounterType,
  ProviderID,
  DepartmentID,
  ProcedureCode,
  InsertedDate                         AS SRC_InsertedDate,
  ModifiedDate                         AS SRC_ModifiedDate,
  datasource,
  -- True when the encounter id or the patient id is missing.
  (EncounterID IS NULL OR PatientID IS NULL) AS is_quarantined
FROM encounters;


In [0]:
%sql
-- Preview the quality-checked rows, including the is_quarantined flag.
SELECT *
FROM quality_check;

In [0]:
%sql
-- Create the silver encounters Delta table on first run; a no-op when it
-- already exists.
CREATE TABLE IF NOT EXISTS `hrcm-data-catalog`.silver.encounters (
  -- Keys
  EncounterID        STRING,
  SRC_EncounterID    STRING,
  -- Encounter attributes
  PatientID          STRING,
  EncounterDate      DATE,
  EncounterType      STRING,
  ProviderID         STRING,
  DepartmentID       STRING,
  ProcedureCode      INT,
  -- Source-system dates and origin
  SRC_InsertedDate   DATE,
  SRC_ModifiedDate   DATE,
  datasource         STRING,
  -- Data-quality flag carried over from the quality_check view
  is_quarantined     BOOLEAN,
  -- Audit / versioning columns maintained by the merge cells below
  audit_inserteddate TIMESTAMP,
  audit_modifieddate TIMESTAMP,
  is_current         BOOLEAN
)
USING DELTA;

In [0]:
%sql
-- SCD2 step 1: expire the current version of every encounter whose attributes
-- differ from the incoming quality_check row. Only is_current and
-- audit_modifieddate are touched here; the replacement version is inserted by
-- the following merge cell.
--
-- Change detection uses the null-safe equality operator (<=>). A plain `!=`
-- evaluates to NULL when either side is NULL, so a value changing from NULL to
-- non-NULL (or back) would never be detected and the stale row would stay
-- current. NOT (a <=> b) is true exactly when the two values differ, treating
-- NULL as a comparable value.
MERGE INTO `hrcm-data-catalog`.silver.encounters AS target
USING quality_check AS source
ON target.EncounterID = source.EncounterID
AND target.is_current = true
WHEN MATCHED AND (
  NOT (target.SRC_EncounterID  <=> source.SRC_EncounterID)  OR
  NOT (target.PatientID        <=> source.PatientID)        OR
  NOT (target.EncounterDate    <=> source.EncounterDate)    OR
  NOT (target.EncounterType    <=> source.EncounterType)    OR
  NOT (target.ProviderID       <=> source.ProviderID)       OR
  NOT (target.DepartmentID     <=> source.DepartmentID)     OR
  NOT (target.ProcedureCode    <=> source.ProcedureCode)    OR
  NOT (target.SRC_InsertedDate <=> source.SRC_InsertedDate) OR
  NOT (target.SRC_ModifiedDate <=> source.SRC_ModifiedDate) OR
  NOT (target.datasource       <=> source.datasource)       OR
  NOT (target.is_quarantined   <=> source.is_quarantined)
)
THEN UPDATE
SET target.is_current = false,
    target.audit_modifieddate = current_timestamp()

In [0]:
%sql
-- SCD2 step 2: insert every quality_check row that has no current version in
-- the silver table. This covers brand-new encounters as well as encounters
-- whose previous version was just expired by the preceding merge cell.
-- Inserted rows are stamped with the load time and marked is_current = true.
--
-- NOTE(review): a source row whose EncounterID key is NULL can never satisfy
-- the equality in the ON clause, so it is treated as "not matched" and is
-- inserted again on every run. Confirm whether that is intended.
MERGE INTO `hrcm-data-catalog`.silver.encounters AS target
USING quality_check AS source
ON target.EncounterID = source.EncounterID
AND target.is_current = true
WHEN NOT MATCHED THEN
  INSERT (
    EncounterID,
    SRC_EncounterID,
    PatientID,
    EncounterDate,
    EncounterType,
    ProviderID,
    DepartmentID,
    ProcedureCode,
    SRC_InsertedDate,
    SRC_ModifiedDate,
    datasource,
    is_quarantined,
    audit_inserteddate,
    audit_modifieddate,
    is_current
  )
  VALUES (
    source.EncounterID,
    source.SRC_EncounterID,
    source.PatientID,
    source.EncounterDate,
    source.EncounterType,
    source.ProviderID,
    source.DepartmentID,
    source.ProcedureCode,
    source.SRC_InsertedDate,
    source.SRC_ModifiedDate,
    source.datasource,
    source.is_quarantined,
    current_timestamp(),  -- audit_inserteddate
    current_timestamp(),  -- audit_modifieddate
    true                  -- is_current
  )

In [0]:
# Read the full silver encounters table (every version, current and expired).
df_silver = spark.sql("SELECT * FROM `hrcm-data-catalog`.silver.encounters")

# Write it out in Delta format to the silver mount, replacing whatever is
# already stored at that path.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/silver/encounters")
)