In [0]:
import dlt
from pyspark.sql.functions import *

@dlt.table(
  name="raw_data",
  comment="The raw data from the wind farm volume",
  table_properties={
    "quality": "bronze"
  }
)
def read_raw_data():
    """Load the CSV files from the wind farm source volume as the bronze table.

    The first line of each file is treated as a header. No schema is
    supplied and no schema inference is requested, so the reader is left
    at its defaults for column typing.

    Returns:
        A DataFrame over the CSV files found under the source volume path.
    """
    source_path = "/Volumes/wind_farm/default/source_data/"
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.load(source_path)


Name,Type
timestamp,string
turbine_id,string
wind_speed,string
wind_direction,string
power_output,string


In [0]:
@dlt.table(
  name="transformed_data",
  comment="Transformed data with necessary transformations",
  table_properties={
    "quality": "silver"
  }
)
@dlt.expect_or_drop("valid_timestamp", "timestamp IS NOT NULL")
@dlt.expect_or_drop("timestamp_format", "CAST(timestamp AS TIMESTAMP) IS NOT NULL")
@dlt.expect_or_drop("valid_turbine_id", "turbine_id IS NOT NULL")
@dlt.expect_or_drop("turbine_id_is_integer", "CAST(turbine_id AS INTEGER) IS NOT NULL")
@dlt.expect_or_drop("valid_wind_speed", "wind_speed IS NOT NULL")
@dlt.expect_or_drop("wind_speed_is_double", "CAST(wind_speed AS DOUBLE) IS NOT NULL")
@dlt.expect_or_drop("valid_wind_direction", "wind_direction IS NOT NULL")
@dlt.expect_or_drop("wind_direction_is_integer", "CAST(wind_direction AS INTEGER) IS NOT NULL")
@dlt.expect_or_drop("wind_direction_range", "CAST(wind_direction AS INTEGER) <= 360")
@dlt.expect_or_drop("valid_power_output", "power_output IS NOT NULL")
@dlt.expect_or_drop("power_output_is_double", "CAST(power_output AS DOUBLE) IS NOT NULL")
def transform_data():
    """Project the raw_data table into typed columns for the silver table.

    Each of the five source columns is cast to its target type and keeps
    its original name. The expect_or_drop decorators above declare the
    conditions a row must satisfy to be kept.

    Returns:
        A DataFrame with columns timestamp, turbine_id, wind_speed,
        wind_direction and power_output, in that order, cast to the types
        listed in ``target_types``.
    """
    # Insertion order of the mapping defines the output column order.
    target_types = {
        "timestamp": "timestamp",
        "turbine_id": "integer",
        "wind_speed": "double",
        "wind_direction": "integer",
        "power_output": "double",
    }
    typed_columns = [
        col(column_name).cast(type_name).alias(column_name)
        for column_name, type_name in target_types.items()
    ]
    source = dlt.read("raw_data")
    return source.select(*typed_columns)

@dlt.view(
  name="bad_data",
  comment="Data that did not meet the expectations"
)
def bad_data():
    """Return the raw_data rows that fail at least one silver expectation.

    Every predicate below is the negation of one expect_or_drop condition
    declared on the transformed_data table, so a row appears here exactly
    when one of those conditions does not hold for it. The predicates are
    evaluated against the untyped raw_data columns.

    Returns:
        A DataFrame with the same columns as raw_data, restricted to the
        rows matching any failure condition.
    """
    failure_conditions = [
        "timestamp IS NULL",
        "CAST(timestamp AS TIMESTAMP) IS NULL",
        "turbine_id IS NULL",
        # Previously missing: rows with a non-integer turbine_id are dropped
        # from transformed_data, so they must be surfaced here as well.
        "CAST(turbine_id AS INTEGER) IS NULL",
        "wind_speed IS NULL",
        "CAST(wind_speed AS DOUBLE) IS NULL",
        "wind_direction IS NULL",
        "CAST(wind_direction AS INTEGER) IS NULL",
        "CAST(wind_direction AS INTEGER) > 360",
        "power_output IS NULL",
        "CAST(power_output AS DOUBLE) IS NULL",
    ]
    return dlt.read("raw_data").filter(" OR ".join(failure_conditions))


Name,Type
timestamp,string
turbine_id,string
wind_speed,string
wind_direction,string
power_output,string
