In [0]:
# Libs
from pyspark.sql.functions import col, count, when, isnan, lit, current_date
from pyspark.sql.types import *

In [0]:
# Data lake connection: read the storage account access key from the
# Databricks secret scope and register it in the Spark session configuration.
account_key_conf = "fs.azure.account.key.datalakeetlproject.dfs.core.windows.net"
blobAccessKey = dbutils.secrets.get(scope="myscope", key="accesskey")
spark.conf.set(account_key_conf, blobAccessKey)

In [0]:
# Gold layer paths (one Delta table per dimension / fact)
path_gold_company = "abfss://gold@datalakeetlproject.dfs.core.windows.net/dim_company"
path_gold_date = "abfss://gold@datalakeetlproject.dfs.core.windows.net/dim_date"
path_gold_fact = "abfss://gold@datalakeetlproject.dfs.core.windows.net/fact_quote"

# Load every Gold table as a DataFrame, in the same order as the paths above
df_company, df_date, df_fact = (
    spark.read.load(gold_path, format="delta")
    for gold_path in (path_gold_company, path_gold_date, path_gold_fact)
)


In [0]:
def count_nulls(df):
    """Count missing values per column of a Spark DataFrame.

    A value is treated as missing when it is NULL, when it is an empty string
    in a string column, or when it is NaN in a double/float column.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        A single-row DataFrame with one column per input column (same names
        and order), each holding the number of missing values in that column.
    """
    def _missing_condition(field):
        # Choose the type-specific checks in Python so the Spark expression
        # only ever combines Column objects. Plain Python booleans inside
        # `|` / `&` are rejected by the classic (py4j) Column operators, and
        # comparing a non-string column with "" forces an implicit cast.
        type_name = field.dataType.simpleString()
        condition = col(field.name).isNull()
        if type_name == "string":
            condition = condition | (col(field.name) == "")
        elif type_name in ("double", "float"):
            condition = condition | isnan(col(field.name))
        return condition

    return df.select([
        # when() without otherwise() yields NULL for rows that do not match,
        # and count() ignores NULLs, so only the missing values are counted.
        count(when(_missing_condition(field), field.name)).alias(field.name)
        for field in df.schema.fields
    ])

In [0]:
def validate_dataset(df, dataset_name, critical_columns=None):
    """Run null-count data quality checks on a dataset.

    Prints the total record count and the per-column missing-value counts,
    builds a one-row quality report, and raises when a critical column
    contains missing values.

    Args:
        df: Spark DataFrame to validate.
        dataset_name: Label used in the printed messages and in the report.
        critical_columns: Names of columns that must not contain missing
            values. Defaults to no critical columns.

    Returns:
        The quality report DataFrame: the per-column missing-value counts plus
        `total_records`, `dataset_name` and `validation_date` columns. It is
        returned (not written anywhere) so the caller can persist it.

    Raises:
        ValueError: If a critical column does not exist in `df`.
        Exception: If a critical column contains missing values.
    """
    # None instead of a mutable default list; behaves the same for callers.
    critical_columns = list(critical_columns) if critical_columns else []

    print(f"Starting data quality validation for: {dataset_name}")

    # Fail fast on misconfigured column names, before running any Spark job.
    missing_columns = [name for name in critical_columns if name not in df.columns]
    if missing_columns:
        raise ValueError(
            f"Critical columns {missing_columns} not found in dataset '{dataset_name}'"
        )

    # count() triggers a full Spark job, so compute it once and reuse it.
    total_records = df.count()
    print(f"Total records: {total_records}")

    # Null counts
    null_counts_df = count_nulls(df)
    print("Null counts per column:")
    null_counts_df.show(truncate=False)

    # Build the data quality report (returned to the caller at the end)
    quality_report = (
        null_counts_df
        .withColumn("total_records", lit(total_records))
        .withColumn("dataset_name", lit(dataset_name))
        .withColumn("validation_date", current_date())
    )

    # Raise an exception if critical columns contain nulls
    row = null_counts_df.first()
    for col_name in critical_columns:
        if row[col_name] > 0:
            raise Exception(f"Validation failed: column '{col_name}' in dataset '{dataset_name}' contains {row[col_name]} null values!")

    print(f"Validation for {dataset_name} completed successfully!")
    return quality_report

In [0]:
# Validate each Gold table in turn: (DataFrame, dataset label, critical columns).
# An exception raised by any validation stops the remaining ones.
gold_checks = [
    (df_company, "dim_company", ["ticker", "company_name"]),
    (df_date, "dim_date", ["date"]),
    (df_fact, "fact_quote", ["ticker", "date", "open", "close", "high", "low", "volume"]),
]
for gold_df, gold_name, required_columns in gold_checks:
    validate_dataset(gold_df, gold_name, critical_columns=required_columns)

print("All Gold layer datasets validated successfully!")