In [0]:
from pyspark.sql import functions as f

In [0]:
# Read the Hospital A provider records from the bronze layer (Parquet) and preview them.
df_hosa = (
    spark.read
    .format("parquet")
    .load("/mnt/rcmabhi/bronze/hosa/providers")
)
display(df_hosa)

In [0]:
# Read the Hospital B provider records from the bronze layer (Parquet) and preview them.
df_hosb = (
    spark.read
    .format("parquet")
    .load("/mnt/rcmabhi/bronze/hosb/providers")
)
display(df_hosb)

In [0]:
# Stack the two hospital extracts into one DataFrame. unionByName pairs columns
# by name rather than by position, and it does not remove duplicate rows.
df_merged = (
    df_hosa
    .unionByName(df_hosb)
)

In [0]:
# Preview the combined provider rows from both hospitals before any cleaning.
display(df_merged)

In [0]:
%sql
-- Silver-layer providers table, stored as Delta.
-- IF NOT EXISTS makes this a no-op when the table is already present.
CREATE TABLE IF NOT EXISTS silver.providers (
  ProviderID     STRING,
  FirstName      STRING,
  LastName       STRING,
  Specialization STRING,
  DeptID         STRING,
  NPI            BIGINT,   -- BIGINT and LONG are the same Spark SQL type
  datasource     STRING,
  is_quarantined BOOLEAN   -- populated by the transformation cell in this notebook
)
USING DELTA;

In [0]:
# Build the silver-layer provider set in a single projection:
#   1. drop rows that are exact duplicates across every column,
#   2. cast NPI to a 64-bit integer (bigint),
#   3. flag rows that are missing ProviderID or DeptID as quarantined,
#   4. emit the columns in the same order as the silver.providers DDL.
df_final = (
    df_merged
    .distinct()
    .select(
        "ProviderID",
        "FirstName",
        "LastName",
        "Specialization",
        "DeptID",
        f.col("NPI").cast("bigint").alias("NPI"),
        "datasource",
        # isNull() always yields True/False (never NULL), so the OR of the two
        # checks is already the boolean flag and needs no when/otherwise wrapper.
        (f.col("ProviderID").isNull() | f.col("DeptID").isNull()).alias("is_quarantined"),
    )
)



In [0]:
# Replace the contents of the silver.providers Delta table with df_final.
# NOTE(review): overwriteSchema=true allows this write to replace the table's
# existing schema with df_final's schema -- confirm that is intended given the
# explicit CREATE TABLE definition earlier in this notebook.
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("silver.providers")
)

In [0]:
# Preview the transformed provider rows (the df_final DataFrame, not a re-read of the table).
display(df_final)