In [0]:
import dlt
from pyspark.sql.functions import col,regexp_replace,when,explode_outer,from_json
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

In [0]:
# Bronze → View
@dlt.view(
    comment="Intermediate cleaned members data view",
    name="silver_members_view",
)
def silver_members_view():
    """Expose ``bronze_members`` with a single row kept per ``MemberID``.

    Returns:
        DataFrame read from the ``bronze_members`` dataset after
        ``dropDuplicates`` on the ``MemberID`` column.
    """
    bronze_members = dlt.read("bronze_members")
    deduplicated_members = bronze_members.dropDuplicates(["MemberID"])
    return deduplicated_members

# View → Silver Table
@dlt.table(
    name="silver_members",
    comment="Cleaned members data with quality checks",
    table_properties={"quality": "silver", "pipelines.autoOptimize.managed": "true"},
)
@dlt.expect_or_drop("memberid_not_null", "MemberID IS NOT NULL")
def silver_members():
    """Materialize ``silver_members_view`` as the ``silver_members`` table.

    The ``memberid_not_null`` expectation drops every row whose ``MemberID``
    is NULL; the rows themselves are passed through from the view unchanged.

    Returns:
        DataFrame read from ``silver_members_view``.
    """
    members = dlt.read("silver_members_view")
    return members


In [0]:
import dlt
from pyspark.sql.functions import col, when

# Bronze → View
@dlt.view(
    name="silver_diagnosis_ref_view",
    comment="Intermediate cleaned diagnosis reference data view"
)
def silver_diagnosis_ref_view():
    """Normalize the ``Code`` key of ``bronze_diagnosis_ref`` and de-duplicate on it.

    Normalization rules, evaluated in order (first match wins):

    1. ``<alnum>.<alnum>`` (exactly one dot between two alphanumeric runs):
       the dot is removed, e.g. ``"E11.9"`` -> ``"E119"``.
    2. All-digit strings: round-tripped through double -> int -> string,
       which strips leading zeros, e.g. ``"007"`` -> ``"7"``.
    3. Anything else (including NULL) is left untouched.

    De-duplication runs *after* normalization so that raw values which
    collapse to the same cleaned code (``"E11.9"`` and ``"E119"``) cannot
    both reach the silver table as duplicate keys.

    Returns:
        DataFrame with the same columns as ``bronze_diagnosis_ref``, ``Code``
        normalized, and one row kept per normalized ``Code``.
    """
    code = col("Code")

    # NOTE(review): digits are alphanumeric, so a numeric decimal such as
    # "250.00" is caught by the first branch (-> "25000") and never reaches
    # the double/int cast; only dot-less digit strings hit the second branch.
    # The branch order is kept as-is because swapping it would truncate
    # values like "250.01" to "250" -- confirm the intended behavior.
    normalized_code = (
        when(
            code.rlike("^[A-Za-z0-9]+\\.[A-Za-z0-9]+$"),
            regexp_replace(code, "\\.", "")
        )
        .when(
            code.rlike("^[0-9]+(\\.[0-9]+)?$"),
            code.cast("double").cast("int").cast("string")
        )
        .otherwise(code)
    )

    return (
        dlt.read("bronze_diagnosis_ref")
        .withColumn("Code", normalized_code)
        # De-duplicate on the cleaned key; rows sharing a raw Code also share
        # the cleaned Code, so this covers the raw duplicates as well.
        .dropDuplicates(["Code"])
    )


# View → Silver Table with expectation
@dlt.table(
    name="silver_diagnosis_ref",
    comment="Cleaned diagnosis_ref data with Code enforced as integer format if decimal",
    table_properties={"quality": "silver", "pipelines.autoOptimize.managed": "true"},
)
@dlt.expect_or_drop("Code_not_null", "Code IS NOT NULL")
def silver_diagnosis_ref():
    """Materialize ``silver_diagnosis_ref_view`` as the ``silver_diagnosis_ref`` table.

    The ``Code_not_null`` expectation drops every row whose ``Code`` is NULL;
    the rows themselves are passed through from the view unchanged.

    Returns:
        DataFrame read from ``silver_diagnosis_ref_view``.
    """
    diagnosis_ref = dlt.read("silver_diagnosis_ref_view")
    return diagnosis_ref


In [0]:

# Schema passed to from_json for the providers' "Locations" column:
# an array of structs whose fields (Address, City, State) are nullable strings.
location_schema = ArrayType(
    StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ("Address", "City", "State")
        ]
    )
)

# The expectation is placed under @dlt.table, matching the other silver tables
# in this notebook and the decorator order used in the DLT expectations docs.
@dlt.table(
    name="silver_providers",
    comment="Normalized provider directory with flattened locations and specialties",
    table_properties={"quality": "silver"}
)
@dlt.expect_or_drop("providerid_not_null", "ProviderID IS NOT NULL")
def silver_providers():
    """Flatten ``bronze_providers`` to one row per provider, location and specialty.

    Steps:
        1. Parse the ``Locations`` column with ``location_schema`` and the
           ``Specialties`` column as an array of strings (both via
           ``from_json``, i.e. they are expected to hold JSON text).
        2. Explode both arrays with ``explode_outer`` so providers whose array
           is NULL or empty are kept, with NULL location / specialty fields.
           Exploding both yields every location x specialty combination.
        3. Project the provider attributes plus the flattened location fields.
        4. De-duplicate on the provider, the full location and the specialty.

    Rows whose ``ProviderID`` is NULL are dropped by the
    ``providerid_not_null`` expectation.

    Returns:
        DataFrame with columns ``ProviderID``, ``Name``, ``TIN``, ``IsActive``,
        ``LastVerified``, ``Address``, ``City``, ``State`` and ``Specialty``.
    """
    providers = dlt.read("bronze_providers")

    # Parse JSON strings into array<struct> / array<string>
    parsed = (
        providers
        .withColumn("Locations", from_json(col("Locations"), location_schema))
        .withColumn("Specialties", from_json(col("Specialties"), ArrayType(StringType())))
    )

    # Explode parsed arrays and keep only the flattened output columns
    flattened = (
        parsed
        .withColumn("location", explode_outer("Locations"))
        .withColumn("Specialty", explode_outer("Specialties"))
        .select(
            "ProviderID",
            "Name",
            "TIN",
            "IsActive",
            "LastVerified",
            col("location.Address").alias("Address"),
            col("location.City").alias("City"),
            col("location.State").alias("State"),
            "Specialty"
        )
    )

    # Key on the whole location (Address, City, State): keying on Address alone
    # would merge locations that share a street address in different cities
    # or states, keeping an arbitrary City/State for the surviving row.
    return flattened.dropDuplicates(
        ["ProviderID", "Address", "City", "State", "Specialty"]
    )
