In [0]:
from pyspark.sql import SparkSession, functions as f

# Bronze-layer transaction extracts, one parquet dataset per hospital.
hosa_transactions_path = "/mnt/bronze/hosa/transactions"
hosb_transactions_path = "/mnt/bronze/hosb/transactions"

# Hospital A: load, then preview.
df_hosa = spark.read.parquet(hosa_transactions_path)
display(df_hosa)

# Hospital B: load, then preview.
df_hosb = spark.read.parquet(hosb_transactions_path)
display(df_hosb)

In [0]:
# Name under which the combined frame is exposed to the %sql cells.
merged_view_name = "transactions"

# Stack both hospital feeds, aligning columns by name rather than by position.
df_merged = df_hosa.unionByName(df_hosb)
df_merged.createOrReplaceTempView(merged_view_name)

display(df_merged)

In [0]:
%sql
-- Builds the `quality_check` temp view on top of the merged `transactions` view:
--   * TransactionID becomes <source TransactionID>-<datasource>, and the original
--     value is kept as SRC_TransactionID;
--   * InsertDate / ModifiedDate are renamed with an SRC_ prefix;
--   * is_quarantined is true when any of TransactionID, PatientID, EncounterID
--     or VisitDate is missing, false otherwise.
CREATE OR REPLACE TEMP VIEW quality_check AS
SELECT
  concat(TransactionID, '-', datasource) AS TransactionID,
  TransactionID                          AS SRC_TransactionID,
  EncounterID,
  PatientID,
  ProviderID,
  DeptID,
  VisitDate,
  ServiceDate,
  PaidDate,
  VisitType,
  Amount,
  AmountType,
  PaidAmount,
  ClaimID,
  PayorID,
  ProcedureCode,
  ICDCode,
  LineOfBusiness,
  MedicaidID,
  MedicareID,
  InsertDate                             AS SRC_InsertDate,
  ModifiedDate                           AS SRC_ModifiedDate,
  datasource,
  -- IS NULL predicates are never NULL themselves, so this is always true/false.
  (
    TransactionID IS NULL
    OR PatientID IS NULL
    OR EncounterID IS NULL
    OR VisitDate IS NULL
  )                                      AS is_quarantined
FROM transactions;


In [0]:
%sql
-- Preview the quality-checked rows, quarantined ones (is_quarantined = true) first.
SELECT *
FROM quality_check
ORDER BY is_quarantined DESC;

In [0]:
%sql
-- Silver-layer Delta table for transactions. Created only when it does not
-- already exist, so re-running this cell leaves an existing table untouched.
CREATE TABLE IF NOT EXISTS `hrcm-data-catalog`.silver.transactions (
  -- keys
  TransactionID      STRING,
  SRC_TransactionID  STRING,
  EncounterID        STRING,
  PatientID          STRING,
  ProviderID         STRING,
  DeptID             STRING,
  -- dates
  VisitDate          DATE,
  ServiceDate        DATE,
  PaidDate           DATE,
  -- transaction attributes
  VisitType          STRING,
  Amount             DOUBLE,
  AmountType         STRING,
  PaidAmount         DOUBLE,
  ClaimID            STRING,
  PayorID            STRING,
  ProcedureCode      INT,
  ICDCode            STRING,
  LineOfBusiness     STRING,
  MedicaidID         STRING,
  MedicareID         STRING,
  -- columns carried over from the source feed
  SRC_InsertDate     DATE,
  SRC_ModifiedDate   DATE,
  datasource         STRING,
  -- data-quality flag and SCD Type 2 bookkeeping
  is_quarantined     BOOLEAN,
  audit_insertdate   TIMESTAMP,
  audit_modifieddate TIMESTAMP,
  is_current         BOOLEAN
)
USING DELTA;

In [0]:
%sql
-- SCD Type 2, step 1: expire the current version of every transaction whose
-- incoming attributes differ from the stored ones.
--
-- A target row is matched on its TransactionID while it is still flagged
-- is_current = true. When at least one tracked column differs, the row is closed
-- out: is_current is set to false and audit_modifieddate is stamped with the
-- current time. Once closed it no longer satisfies is_current = true.
--
-- Change detection uses the null-safe equality operator (<=>) instead of !=.
-- With != a comparison involving NULL evaluates to NULL, so a column changing
-- from NULL to a value (or from a value to NULL) was silently treated as
-- "unchanged" and the stale version stayed current. <=> always returns
-- true/false, and NOT (a <=> a' AND b <=> b' ...) is true exactly when at
-- least one column differs, NULLs included.
merge into `hrcm-data-catalog`.silver.transactions as target
using quality_check as source
on target.TransactionID = source.TransactionID
and target.is_current = true
when matched
and not (
  target.SRC_TransactionID <=> source.SRC_TransactionID and
  target.EncounterID <=> source.EncounterID and
  target.PatientID <=> source.PatientID and
  target.ProviderID <=> source.ProviderID and
  target.DeptID <=> source.DeptID and
  target.VisitDate <=> source.VisitDate and
  target.ServiceDate <=> source.ServiceDate and
  target.PaidDate <=> source.PaidDate and
  target.VisitType <=> source.VisitType and
  target.Amount <=> source.Amount and
  target.AmountType <=> source.AmountType and
  target.PaidAmount <=> source.PaidAmount and
  target.ClaimID <=> source.ClaimID and
  target.PayorID <=> source.PayorID and
  target.ProcedureCode <=> source.ProcedureCode and
  target.ICDCode <=> source.ICDCode and
  target.LineOfBusiness <=> source.LineOfBusiness and
  target.MedicaidID <=> source.MedicaidID and
  target.MedicareID <=> source.MedicareID and
  target.SRC_InsertDate <=> source.SRC_InsertDate and
  target.SRC_ModifiedDate <=> source.SRC_ModifiedDate and
  target.datasource <=> source.datasource and
  target.is_quarantined <=> source.is_quarantined
)
then update
set target.audit_modifieddate = current_timestamp(),
    target.is_current = false

In [0]:
%sql
-- SCD Type 2, step 2: insert a fresh current version for every source row that
-- has no current (is_current = true) row with the same TransactionID in the
-- silver table. Inserted rows get both audit timestamps set to the current time
-- and is_current = true.
--
-- NOTE(review): a source row whose TransactionID is NULL cannot satisfy the
-- equality in the ON clause, so it looks like such rows are inserted again on
-- every run - confirm whether that is intended.
MERGE INTO `hrcm-data-catalog`.silver.transactions AS tgt
USING quality_check AS src
  ON  tgt.TransactionID = src.TransactionID
  AND tgt.is_current = true
WHEN NOT MATCHED THEN
  INSERT (
    TransactionID,
    SRC_TransactionID,
    EncounterID,
    PatientID,
    ProviderID,
    DeptID,
    VisitDate,
    ServiceDate,
    PaidDate,
    VisitType,
    Amount,
    AmountType,
    PaidAmount,
    ClaimID,
    PayorID,
    ProcedureCode,
    ICDCode,
    LineOfBusiness,
    MedicaidID,
    MedicareID,
    SRC_InsertDate,
    SRC_ModifiedDate,
    datasource,
    is_quarantined,
    audit_insertdate,
    audit_modifieddate,
    is_current
  )
  VALUES (
    src.TransactionID,
    src.SRC_TransactionID,
    src.EncounterID,
    src.PatientID,
    src.ProviderID,
    src.DeptID,
    src.VisitDate,
    src.ServiceDate,
    src.PaidDate,
    src.VisitType,
    src.Amount,
    src.AmountType,
    src.PaidAmount,
    src.ClaimID,
    src.PayorID,
    src.ProcedureCode,
    src.ICDCode,
    src.LineOfBusiness,
    src.MedicaidID,
    src.MedicareID,
    src.SRC_InsertDate,
    src.SRC_ModifiedDate,
    src.datasource,
    src.is_quarantined,
    current_timestamp(),  -- audit_insertdate
    current_timestamp(),  -- audit_modifieddate
    true                  -- is_current
  )


In [0]:
# Location the silver snapshot is written to.
silver_transactions_path = "/mnt/silver/transactions"

# Read back the full silver table (all versions, current and expired).
df_silver = spark.sql("select * from `hrcm-data-catalog`.silver.transactions")

# Replace whatever is at the target path with the snapshot, in Delta format.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .save(silver_transactions_path)
)