In [0]:
# Catalog and schema that the notebook's table names and volume paths are built from.
catalog_name = "capstone_aimie_dbk"
schema_name = "medisure"

# Storage paths, all rooted at the schema's directory under /Volumes.
volume_root = f"/Volumes/{catalog_name}/{schema_name}"
input_path = f"{volume_root}/inputs"
schem_path = f"{volume_root}/schem"
bronze_path = f"{schem_path}/bronze"

from pyspark.sql.functions import col, explode, expr

In [0]:
def _read_table(table_name):
    """Return the table `table_name` from the configured catalog and schema as a DataFrame."""
    return spark.table(f"{catalog_name}.{schema_name}.{table_name}")


# Bronze inputs for the silver transformations below.
bronze_claims_batch_df = _read_table("bronze_claims_batch")
bronze_claims_stream_df = _read_table("bronze_claims_stream")
bronze_diagnosis_df = _read_table("bronze_diagnosis_ref")
bronze_members_df = _read_table("bronze_members")
bronze_providers_df = _read_table("bronze_providers")

In [0]:
# Silver batch claims: typed columns, one output row per
# (claim, ICD-10 code, CPT code) combination.
#
# Rows missing any claim key are dropped and duplicate source records are
# collapsed BEFORE the code lists are exploded. De-duplicating on the claim
# keys after the explode would reduce every claim back to a single,
# arbitrarily chosen code pair and silently discard the other codes.
#
# NOTE(review): Amount and the other claim-level columns repeat on every
# exploded row; downstream aggregations should work per distinct ClaimID.
# NOTE(review): explode() emits no row for a null array, so a claim whose
# ICD10Codes or CPTCodes is null does not appear in the output — confirm
# that this is intended.
silver_claims_batch_df = (
    bronze_claims_batch_df
    .dropna(subset=["ClaimID", "MemberID", "ProviderID"])
    .dropDuplicates(["ClaimID", "MemberID", "ProviderID"])
    .withColumn("ClaimDate", col("ClaimDate").cast("date"))
    .withColumn("IngestTimestamp", col("IngestTimestamp").cast("timestamp"))
    # Each explode multiplies the row count, so together they produce the
    # cross product of a claim's ICD-10 codes and CPT codes.
    .withColumn("ICD10Code", explode(expr("split(ICD10Codes, ',')")))
    .withColumn("CPTCode", explode(expr("split(CPTCodes, ',')")))
    .select(
        col("ClaimID"),
        col("MemberID"),
        col("ProviderID"),
        col("ClaimDate"),
        col("Amount"),
        col("Status"),
        # The single exploded codes are published under the original plural
        # column names.
        col("ICD10Code").alias("ICD10Codes"),
        col("CPTCode").alias("CPTCodes"),
        col("IngestTimestamp")
    )
)

# Silver stream claims: ClaimDate typed as a date, IngestTimestamp derived from
# EventTimestamp, rows without a complete claim key removed, and one row kept
# per (ClaimID, MemberID, ProviderID).
silver_claims_stream_df = (
    bronze_claims_stream_df
    .select(
        "ClaimID",
        "MemberID",
        "ProviderID",
        col("ClaimDate").cast("date").alias("ClaimDate"),
        "Amount",
        "Status",
        "ICD10Codes",
        "CPTCodes",
        col("EventTimestamp").cast("timestamp").alias("IngestTimestamp"),
    )
    # Dropping rows with a null key already guarantees all three keys are
    # non-null, so no further not-null filter is needed afterwards.
    .na.drop(how="any", subset=["ClaimID", "MemberID", "ProviderID"])
    .dropDuplicates(["ClaimID", "MemberID", "ProviderID"])
)

# Silver diagnosis reference: one row per Code, restricted to rows that have
# both a Code and a Description.
#
# The null checks run before de-duplication. If the Description filter ran
# afterwards, de-duplication could keep a row with a null Description for a
# Code and the filter would then remove that Code entirely, even though
# another row for the same Code carried a usable Description.
silver_diagnosis_df = (
    bronze_diagnosis_df
    .dropna(subset=["Code"])
    .filter(col("Description").isNotNull())
    .dropDuplicates(["Code"])
)

# Silver members: the three date-like columns are cast to date, rows without a
# MemberID are removed, and one row is kept per MemberID.
members_typed_df = bronze_members_df
for date_column in ("DOB", "EffectiveDate", "LastUpdated"):
    members_typed_df = members_typed_df.withColumn(date_column, col(date_column).cast("date"))

# Dropping null MemberIDs first means every surviving row has a MemberID, so
# no separate not-null filter is required after de-duplication.
silver_members_df = (
    members_typed_df
    .na.drop(subset=["MemberID"])
    .dropDuplicates(["MemberID"])
)

# Silver providers: flattened to one row per (provider, specialty, location),
# with the location struct's Address/City/State promoted to top-level columns.
#
# Rows without a ProviderID are dropped and duplicate provider records are
# collapsed BEFORE the arrays are exploded. De-duplicating on ProviderID after
# the explode would keep only one arbitrarily chosen specialty/location per
# provider and discard the rest.
#
# NOTE(review): the output has several rows per ProviderID, so joins on
# ProviderID fan out; consumers needing one row per provider must aggregate.
# NOTE(review): explode() emits no row for a null or empty array, so a provider
# with no Specialties or no Locations does not appear in the output — confirm
# that this is intended.
silver_providers_df = (
    bronze_providers_df
    .dropna(subset=["ProviderID"])
    .dropDuplicates(["ProviderID"])
    .withColumn("LastVerified", col("LastVerified").cast("date"))
    # Two explodes yield the cross product of specialties and locations.
    .withColumn("Specialties", explode(col("Specialties")))
    .withColumn("Location", explode(col("Locations")))
    .select(
        col("ProviderID"),
        col("Name").alias("ProviderName"),
        col("Specialties"),
        col("Location.Address").alias("Address"),
        col("Location.City").alias("City"),
        col("Location.State").alias("State"),
        col("IsActive").alias("IsActiveFlag"),
        col("TIN"),
        col("LastVerified")
    )
)

# Persist every silver DataFrame as a Delta table, replacing previous contents.
# Table names are fully qualified with catalog_name and schema_name — the same
# namespace the bronze tables are read from — so the write target does not
# depend on whichever catalog happens to be current in the session.
silver_outputs = {
    "silver_claims_batch": silver_claims_batch_df,
    "silver_claims_stream": silver_claims_stream_df,
    "silver_diagnosis_ref": silver_diagnosis_df,
    "silver_members": silver_members_df,
    "silver_providers": silver_providers_df,
}

for silver_table, silver_df in silver_outputs.items():
    (
        silver_df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable(f"{catalog_name}.{schema_name}.{silver_table}")
    )

In [0]:
from pyspark.sql.functions import col, when, udf, lit

# Tag each claims DataFrame with the pipeline it came from.
claims_batch_with_source = silver_claims_batch_df.withColumn("Source", lit("Batch"))
claims_stream_with_source = silver_claims_stream_df.withColumn("Source", lit("Stream"))

# Stack batch and stream claims. unionByName matches columns by name, so both
# inputs must expose the same set of column names.
combined_claims_df = claims_batch_with_source.unionByName(claims_stream_with_source)

# Write the combined claims as a Delta table, replacing previous contents. The
# name is fully qualified with catalog_name and schema_name so the target does
# not depend on the session's current catalog.
(
    combined_claims_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(f"{catalog_name}.{schema_name}.silver_claims_transform")
)