In [0]:
# Notebook setup cell: install the `databricks-dlt` package with a pip shell
# escape (`--quiet` suppresses most of pip's output).
!pip install --quiet databricks-dlt 
# Restart the notebook's Python process via the Databricks `dbutils` utility
# (presumably so the freshly installed package becomes importable — confirm).
dbutils.library.restartPython()
# NOTE(review): `dlt` is imported again at the top of the next cell.
import dlt


In [0]:
import dlt
from pyspark.sql.functions import concat, col, array, from_unixtime

@dlt.table(
    name="turbine_training_dataset",
    comment="Hourly sensor stats, used to describe signal and detect anomalies"
)
def turbine_training_dataset():
    """Join hourly sensor stats with turbine data and historical status.

    Reads the ``sensor_silver``, ``turbine_bronze`` and
    ``historical_turbine_bronze`` datasets and joins them:

    * sensors (alias ``m``) to turbines (alias ``t``) on ``turbine_id``;
    * the result to historical status (alias ``s``) where the turbine ids
      match and ``m.hourly_timestamp`` lies strictly between
      ``from_unixtime(s.start_time)`` and ``from_unixtime(s.end_time)``.

    Returns:
        A DataFrame whose first two columns are ``composite_key``
        (``"<turbine_id>-<start_time>"``) and ``sensor_vector`` (an array of
        the six ``std_sensor_A`` .. ``std_sensor_F`` columns), followed by
        the pass-through columns of the three inputs. ``_rescued_data`` is
        dropped from every input, and a column name is emitted only once
        (the first input that provides it wins).
    """
    # Imported locally so this block does not depend on edits elsewhere.
    from pyspark.sql.functions import lit

    sensor_hourly = dlt.read("sensor_silver")
    turbine = dlt.read("turbine_bronze")
    historical_turbine_status = dlt.read("historical_turbine_bronze")

    # (join alias, source frame, column names that must not be passed through)
    sources = (
        ("m", sensor_hourly, {"_rescued_data", "turbine_id"}),
        ("t", turbine, {"_rescued_data"}),
        ("s", historical_turbine_status, {"_rescued_data"}),
    )

    # Qualify every pass-through column with its join alias so that names
    # shared by several inputs (e.g. `turbine_id`, present in both `t` and
    # `s`) are not ambiguous, and skip names that were already selected so
    # the resulting table has no duplicate column names.
    emitted = {"composite_key", "sensor_vector"}
    passthrough = []
    for alias, frame, excluded in sources:
        for name in frame.columns:
            if name in excluded or name in emitted:
                continue
            emitted.add(name)
            # Backticks keep names with dots/special characters intact.
            passthrough.append(col(f"{alias}.`{name}`"))

    status_covers_hour = (
        (col("m.turbine_id") == col("s.turbine_id"))
        & (from_unixtime(col("s.start_time")) < col("m.hourly_timestamp"))
        & (from_unixtime(col("s.end_time")) > col("m.hourly_timestamp"))
    )

    return (
        sensor_hourly.alias("m")
        .join(turbine.alias("t"), "turbine_id")
        .join(historical_turbine_status.alias("s"), status_covers_hour)
        .select(
            # A bare "-" would be resolved as a column named "-"; the
            # separator has to be wrapped in lit() to be a string literal.
            concat(col("t.turbine_id"), lit("-"), col("s.start_time")).alias("composite_key"),
            array(
                col("std_sensor_A"), col("std_sensor_B"), col("std_sensor_C"),
                col("std_sensor_D"), col("std_sensor_E"), col("std_sensor_F")
            ).alias("sensor_vector"),
            *passthrough,
        )
    )