In [0]:
from pyspark.sql import SparkSession, functions as f

In [0]:
# Location of the "hosa" source's transaction extract under the bronze mount.
hosa_encounters_path = "/mnt/rcmabhi/bronze/hosa/transactions"
# `spark` is not created in the visible cells — presumably the session supplied
# by the notebook runtime.
df_hosa = spark.read.format("parquet").load(hosa_encounters_path)

In [0]:
# Location of the "hosb" source's transaction extract under the bronze mount.
hosb_encounters_path = "/mnt/rcmabhi/bronze/hosb/transactions"
# Read from the hosb path. This cell previously passed hosa_encounters_path,
# which made df_hosb a second copy of the hosa data: hosb rows were never
# loaded and every hosa row appeared twice after the union.
df_hosb = spark.read.parquet(hosb_encounters_path)

In [0]:
# Stack the two source frames into one; unionByName lines columns up by name
# rather than by position.
df_merged = df_hosa.unionByName(df_hosb)
# Render the combined frame for a visual check.
# NOTE(review): this puts patient-level rows into the notebook output —
# confirm that is acceptable before sharing or committing the notebook.
display(df_merged)

In [0]:
# Register the combined frame as the temp view `transactions` so the %sql
# cells below can select from it.
df_merged.createOrReplaceTempView("transactions")

In [0]:
%sql
-- Staging view over `transactions`.
--   * TransactionID becomes "<source TransactionID>-<datasource>"; the raw
--     value is kept as SRC_TransactionID.
--   * InsertDate / ModifiedDate are exposed as SRC_InsertDate / SRC_ModifiedDate.
--   * is_quarantined flags rows missing any of the four required fields.
CREATE OR REPLACE TEMP VIEW quality_checks AS
SELECT
    CONCAT(TransactionID, '-', datasource) AS TransactionID,
    TransactionID                          AS SRC_TransactionID,
    EncounterID,
    PatientID,
    ProviderID,
    DeptID,
    VisitDate,
    ServiceDate,
    PaidDate,
    VisitType,
    Amount,
    AmountType,
    PaidAmount,
    ClaimID,
    PayorID,
    ProcedureCode,
    ICDCode,
    LineOfBusiness,
    MedicaidID,
    MedicareID,
    InsertDate                             AS SRC_InsertDate,
    ModifiedDate                           AS SRC_ModifiedDate,
    datasource,
    -- IS NULL always evaluates to TRUE or FALSE (never NULL), so the OR-chain
    -- is itself the flag. TransactionID here is the column of `transactions`,
    -- not the concatenated alias defined above.
    (
           EncounterID   IS NULL
        OR PatientID     IS NULL
        OR TransactionID IS NULL
        OR VisitDate     IS NULL
    )                                      AS is_quarantined
FROM transactions;


In [0]:
%sql
-- Create the Delta table silver.transactions if it does not exist yet.
-- IF NOT EXISTS leaves an already-created table untouched, so editing this
-- DDL does not change an existing table's schema.
-- The first 24 columns match the columns selected by the quality_checks view;
-- the last three are populated by the MERGE cells that follow.
CREATE TABLE IF NOT EXISTS silver.transactions (
  -- "<source TransactionID>-<datasource>" as built in quality_checks; used as the MERGE match key.
  TransactionID STRING,
  -- TransactionID as delivered by the source.
  SRC_TransactionID STRING,
  EncounterID STRING,
  PatientID STRING,
  ProviderID STRING,
  DeptID STRING,
  VisitDate DATE,
  ServiceDate DATE,
  PaidDate DATE,
  VisitType STRING,
  -- NOTE(review): DOUBLE is binary floating point — confirm exact decimal
  -- arithmetic is not required for Amount / PaidAmount.
  Amount DOUBLE,
  AmountType STRING,
  PaidAmount DOUBLE,
  ClaimID STRING,
  PayorID STRING,
  -- NOTE(review): INT cannot hold alphanumeric codes — confirm the source
  -- procedure codes are purely numeric.
  ProcedureCode INT,
  ICDCode STRING,
  LineOfBusiness STRING,
  MedicaidID STRING,
  MedicareID STRING,
  -- Source InsertDate / ModifiedDate (renamed in quality_checks).
  SRC_InsertDate DATE,
  SRC_ModifiedDate DATE,
  datasource STRING,
  -- TRUE when a required field was NULL (computed in quality_checks).
  is_quarantined BOOLEAN,
  -- Load bookkeeping: both timestamps are set on insert; audit_modifieddate is
  -- set again when a row is expired.
  audit_insertdate TIMESTAMP,
  audit_modifieddate TIMESTAMP,
  -- TRUE on insert; set to FALSE by the expiring MERGE when a changed version arrives.
  is_current BOOLEAN
)
USING DELTA;


In [0]:
%sql
-- Step 1 of the two-step load: expire the current version of every
-- transaction whose staged row differs in at least one tracked column.
-- Expired rows keep their data; only is_current and audit_modifieddate change.
-- The following MERGE cell then inserts the new version, because the
-- transaction no longer has a current row to match against.
--
-- Change detection uses the null-safe equality operator (<=>): NULL <=> NULL
-- is TRUE and NULL <=> value is FALSE. The earlier `<>` form evaluated to NULL
-- whenever either side was NULL, so a column going NULL -> value or
-- value -> NULL was never seen as a change and the stale row stayed current.
MERGE INTO silver.transactions AS tgt
USING quality_checks AS src
ON tgt.TransactionID = src.TransactionID
AND tgt.is_current = true

WHEN MATCHED
AND (
  NOT (tgt.SRC_TransactionID <=> src.SRC_TransactionID)
  OR NOT (tgt.EncounterID <=> src.EncounterID)
  OR NOT (tgt.PatientID <=> src.PatientID)
  OR NOT (tgt.ProviderID <=> src.ProviderID)
  OR NOT (tgt.DeptID <=> src.DeptID)
  OR NOT (tgt.VisitDate <=> src.VisitDate)
  OR NOT (tgt.ServiceDate <=> src.ServiceDate)
  OR NOT (tgt.PaidDate <=> src.PaidDate)
  OR NOT (tgt.VisitType <=> src.VisitType)
  OR NOT (tgt.Amount <=> src.Amount)
  OR NOT (tgt.AmountType <=> src.AmountType)
  OR NOT (tgt.PaidAmount <=> src.PaidAmount)
  OR NOT (tgt.ClaimID <=> src.ClaimID)
  OR NOT (tgt.PayorID <=> src.PayorID)
  OR NOT (tgt.ProcedureCode <=> src.ProcedureCode)
  OR NOT (tgt.ICDCode <=> src.ICDCode)
  OR NOT (tgt.LineOfBusiness <=> src.LineOfBusiness)
  OR NOT (tgt.MedicaidID <=> src.MedicaidID)
  OR NOT (tgt.MedicareID <=> src.MedicareID)
  OR NOT (tgt.SRC_InsertDate <=> src.SRC_InsertDate)
  OR NOT (tgt.SRC_ModifiedDate <=> src.SRC_ModifiedDate)
  OR NOT (tgt.datasource <=> src.datasource)
  OR NOT (tgt.is_quarantined <=> src.is_quarantined)
)
THEN UPDATE SET
  tgt.is_current = false,
  tgt.audit_modifieddate = current_timestamp();


In [0]:
%sql
-- Step 2 of the two-step load: insert a row for every staged transaction that
-- has no current version in silver.transactions. That covers transactions
-- never loaded before and those whose current row was expired by the previous
-- MERGE cell. Inserted rows get both audit timestamps set to the load time and
-- is_current = true.
-- NOTE(review): a staged row whose TransactionID is NULL can never satisfy the
-- equality in the ON clause, so it is inserted again on every run — confirm
-- that is intended.
MERGE INTO silver.transactions AS cur
USING quality_checks AS stg
ON  cur.is_current = true
AND cur.TransactionID = stg.TransactionID

WHEN NOT MATCHED THEN INSERT (
    TransactionID, SRC_TransactionID,
    EncounterID, PatientID, ProviderID, DeptID,
    VisitDate, ServiceDate, PaidDate, VisitType,
    Amount, AmountType, PaidAmount,
    ClaimID, PayorID, ProcedureCode, ICDCode,
    LineOfBusiness, MedicaidID, MedicareID,
    SRC_InsertDate, SRC_ModifiedDate,
    datasource, is_quarantined,
    audit_insertdate, audit_modifieddate, is_current
)
VALUES (
    stg.TransactionID, stg.SRC_TransactionID,
    stg.EncounterID, stg.PatientID, stg.ProviderID, stg.DeptID,
    stg.VisitDate, stg.ServiceDate, stg.PaidDate, stg.VisitType,
    stg.Amount, stg.AmountType, stg.PaidAmount,
    stg.ClaimID, stg.PayorID, stg.ProcedureCode, stg.ICDCode,
    stg.LineOfBusiness, stg.MedicaidID, stg.MedicareID,
    stg.SRC_InsertDate, stg.SRC_ModifiedDate,
    stg.datasource, stg.is_quarantined,
    -- audit_insertdate, audit_modifieddate, is_current
    current_timestamp(), current_timestamp(), true
);
