In [0]:
import dlt
from pyspark.sql.functions import col, when

# Silver-layer Delta folders, keyed by dataset name. Each folder lives directly
# under the container root and is named after its dataset, so the mapping is
# derived from the list of names instead of being spelled out path by path.
datasets = {
    dataset: f"abfss://silver@ahmedolympicsdatalake.dfs.core.windows.net/{dataset}/"
    for dataset in ("coaches", "athletes", "events", "nocs")
}

def _build_expectations(dtypes, table_name):
    """Build the data-quality rules for one table from its column types.

    Args:
        dtypes: Iterable of ``(column_name, dtype_string)`` pairs, in the
            shape returned by ``DataFrame.dtypes``.
        table_name: Prefix that keeps expectation names unique per table.

    Returns:
        A ``(drop_rules, warn_rules)`` tuple of
        ``{expectation_name: SQL constraint}`` dicts. ``drop_rules`` holds one
        ``IS NOT NULL`` rule per column; ``warn_rules`` holds one non-empty
        rule per string column.
    """
    drop_rules = {}
    warn_rules = {}
    for c, dtype in dtypes:
        # Backtick-quote the identifier so column names containing spaces,
        # dashes or reserved words still form a valid SQL constraint.
        quoted = "`" + c.replace("`", "``") + "`"
        drop_rules[f"{table_name}_{c}_not_null"] = f"{quoted} IS NOT NULL"
        if dtype == "string":
            warn_rules[f"{table_name}_{c}_not_empty"] = f"LENGTH({quoted}) > 0"
    return drop_rules, warn_rules


def clean_and_expect(df, table_name):
    """Deduplicate a DataFrame and replace null strings with "unknown".

    Expectations are intentionally not declared here: the ``dlt.expect*``
    functions are decorators, so calling them inside a plain function and
    discarding the result attaches nothing to the table. They are applied to
    the table function in ``_register_clean_table`` instead.

    Args:
        df: DataFrame to clean.
        table_name: Not used by the cleaning itself; kept so the existing
            two-argument call signature continues to work.

    Returns:
        A DataFrame with the same columns, in the same order, with exact
        duplicate rows removed and nulls in string columns replaced by the
        literal "unknown". Non-string columns are passed through unchanged.
    """
    # Drop duplicates
    df = df.dropDuplicates()

    # Replace nulls in string columns with "unknown"
    return df.select([
        when(col(c).isNull(), "unknown").otherwise(col(c)).alias(c)
        if dtype == "string" else col(c)
        for c, dtype in df.dtypes
    ])


def _register_clean_table(table_name, path):
    """Register one DLT managed table that cleans the Delta folder at ``path``.

    Args:
        table_name: Name of the DLT table to create; also the prefix of its
            expectation names.
        path: Location of the source Delta table.
    """
    # Expectations are decorator arguments, so they must exist before the
    # table function runs. The source schema is therefore read here, while the
    # pipeline is being defined. Cleaning does not change column names or
    # types, so the source schema also describes the cleaned output.
    source_dtypes = spark.read.format("delta").load(path).dtypes
    drop_rules, warn_rules = _build_expectations(source_dtypes, table_name)

    # The rules are evaluated against the rows returned by the function below,
    # i.e. after cleaning. Null strings have already become "unknown" by then,
    # so the not_null rules can only drop rows for non-string columns; the
    # string ones are kept so every column reports a not_null metric.
    @dlt.table(name=f"{table_name}", comment=f"Cleaned {table_name} table in Gold layer")
    @dlt.expect_all_or_drop(drop_rules)
    @dlt.expect_all(warn_rules)
    def _auto_clean_table():
        df = spark.read.format("delta").load(path)
        return clean_and_expect(df, table_name)


# Loop over datasets and register DLT managed tables. Each table is created
# through a factory call so that its own name and path are captured, rather
# than whatever the loop variables hold when the pipeline finally runs.
for table_name, path in datasets.items():
    _register_clean_table(table_name, path)


Name,Type
code,string
country,string
country_long,string
tag,string
note,string
