# Automated STEDI ETL Pipeline

This notebook automates the ETL process that curates raw STEDI sensor data into a clean Silver dataset for machine learning. It is designed to run as a Databricks Job.


In [0]:
%sql
-- Cleaned projection of the raw device messages.
--   * `date` is exposed under the name event_time.
--   * distance has every 'cm' occurrence removed and is cast to INT.
--   * source_label is the constant 'device' for every row.
CREATE OR REPLACE TEMP VIEW device_messages_clean AS
SELECT
  msg.date AS event_time,
  msg.device_id,
  msg.sensor_type,
  -- strip the unit text first so the remaining value can be cast to INT
  CAST(
    REGEXP_REPLACE(msg.distance, 'cm', '') AS INT
  ) AS distance_cm,
  'device' AS source_label
FROM workspace.bronze.device_messages_raw AS msg;


In [0]:
%sql
-- Narrow projection of the raw rapid step tests: only the device identifier
-- and the start/stop bounds of each test are kept.
CREATE OR REPLACE TEMP VIEW step_tests_clean AS
SELECT
  tst.device_id,
  tst.start_time,
  tst.stop_time
FROM workspace.bronze.rapid_step_tests_raw AS tst;


In [0]:
%sql
-- Labels every cleaned device message as 'step' or 'no_step'.
--
-- A message is a 'step' reading when its event_time falls inside the
-- [start_time, stop_time] window (inclusive on both ends) of a step test
-- recorded for the same device. All other messages are 'no_step' and carry
-- NULL start_time / stop_time.
--
-- The time-window predicate lives in the ON clause on purpose: joining on
-- device_id alone pairs each message with EVERY test of that device, which
-- repeats the message once per test and labels the same reading both 'step'
-- and 'no_step'. With the window in the join condition a message is only
-- paired with a test that actually contains it, and the LEFT JOIN still keeps
-- messages that match no test.
--
-- NOTE(review): if two tests for one device overlap in time, a message inside
-- the overlap still yields one row per matching test -- confirm whether the
-- raw data can contain overlapping tests.
CREATE OR REPLACE TEMP VIEW final_df AS
SELECT
  d.event_time,
  d.device_id,
  d.sensor_type,
  d.distance_cm,
  d.source_label,
  s.start_time,
  s.stop_time,
  CASE
    -- s.device_id is non-NULL only when a containing test was found
    WHEN s.device_id IS NOT NULL
      THEN 'step'
    ELSE 'no_step'
  END AS step_label
FROM device_messages_clean AS d
LEFT JOIN step_tests_clean AS s
  ON d.device_id = s.device_id
  AND d.event_time BETWEEN s.start_time AND s.stop_time;


In [0]:
%sql
-- Persists the labelled view as the table labeled_step_test, replacing any
-- previous version on each run.
-- Columns are listed explicitly (instead of SELECT *) so the table schema is
-- pinned here and does not change silently if final_df gains or reorders
-- columns.
-- The table name is unqualified, so it is created in the session's current
-- catalog and schema.
CREATE OR REPLACE TABLE labeled_step_test AS
SELECT
  event_time,
  device_id,
  sensor_type,
  distance_cm,
  source_label,
  start_time,
  stop_time,
  step_label
FROM final_df;


In [0]:
%sql
-- Validation: show up to 50 rows whose step_label is missing or is anything
-- other than the two expected values. An empty result means the label column
-- is clean.
SELECT *
FROM labeled_step_test
WHERE
  step_label IS NULL
  OR step_label NOT IN ('step', 'no_step')
LIMIT 50;


In [0]:
%sql
-- Row count per source_label in the persisted table.
-- The count column is named and the output is ordered so that results are
-- readable and stable from one job run to the next.
SELECT
  source_label,
  COUNT(*) AS row_count
FROM labeled_step_test
GROUP BY source_label
ORDER BY source_label;


In [0]:
%sql
-- Validation: show up to 50 rows whose source_label is missing or is not one
-- of the accepted values. An empty result means the column is clean.
-- NOTE(review): the cleaning view in this notebook only ever emits 'device';
-- 'step' is accepted here as well -- confirm whether another source is
-- expected to write that value.
SELECT *
FROM labeled_step_test
WHERE
  source_label IS NULL
  OR source_label NOT IN ('device', 'step')
LIMIT 50;


In [0]:
%sql
-- Row count per step_label in the persisted table (class balance check).
-- The count column is named and the output is ordered so that results are
-- readable and stable from one job run to the next.
SELECT
  step_label,
  COUNT(*) AS row_count
FROM labeled_step_test
GROUP BY step_label
ORDER BY step_label;
