## 97 - Data Validation Tests: Ensuring Data Quality Across Layers

**Purpose**  
This notebook validates the integrity and quality of key datasets in the Silver and Gold layers before analytics or ML modeling.

### ✅ Tests Included
- **Missing values**: No nulls in critical columns: `event_ts` and `vehicle_id` (GTFS-RT), `temperature` and `forecast_time` (weather)
- **Reasonable ranges**:
  - Temperature between -50°F and 150°F
  - Latitude/longitude within Seattle bounds
- **Primary key uniqueness**:
  - No duplicate `(vehicle_id, event_ts)` in GTFS-RT
- **Optional**: Temporal logic on the Gold joined table (e.g., `forecast_time ≥ event_ts`)

Each test displays PASS/FAIL status and row counts for invalid records, supporting early anomaly detection before downstream tasks.


In [0]:
# Data Quality & Validation Layer for Seattle Transit & Weather Project
from pyspark.sql import functions as F
import datetime as dt

In [0]:
# Reporting state shared by the validation cells.
# Calendar date in ISO-8601 form (YYYY-MM-DD), stamped on the stakeholder message.
REPORT_DATE = str(dt.date.today())
# Findings accumulate here: every failing check appends its own section.
stakeholders_text = ""


In [0]:
# ✅ Load Silver and Gold Delta Tables
def _read_delta(path):
    """Return the Delta table stored at ``path`` as a Spark DataFrame."""
    return spark.read.load(path, format="delta")


# Silver layer: real-time vehicle feed, weather, and static stop reference data.
# NOTE(review): the stops path pins one snapshot folder (2025-05-21) — confirm
# this is the snapshot the validation is meant to cover.
df_rt = _read_delta("dbfs:/silver/gtfs_rt")
df_weather = _read_delta("dbfs:/silver/weather")
df_stops = _read_delta("dbfs:/silver/gtfs_static/2025-05-21/stops")

# Gold layer: GTFS-RT joined with weather.
df_gold = _read_delta("dbfs:/gold/gtfs_rt_weather_joined")

In [0]:
# Test 1: Null Checks
#
# Every critical column is checked the same way, so the shared steps live in
# one helper. The per-column result variables (null_*, count_*, flag_*) are
# still assigned at notebook scope, exactly as before.


def _null_check(df, column, source_label):
    """Find rows of ``df`` whose ``column`` is NULL and describe them.

    Args:
        df: Spark DataFrame to inspect.
        column: Name of the column that must not contain NULLs.
        source_label: Human-readable dataset name used in the messages.

    Returns:
        A ``(null_rows, null_count, section)`` tuple: the filtered DataFrame,
        its row count, and the text to append to the stakeholder summary
        (an empty string when no NULLs were found).
    """
    null_rows = df.filter(F.col(column).isNull())
    null_count = null_rows.count()
    if null_count == 0:
        return null_rows, null_count, ""

    headline = f"Null {column} in {source_label}: {null_count}"
    print(headline)
    # Only a 5-row sample is brought to the driver for the report.
    sample = null_rows.limit(5).toPandas().to_string(index=False)
    return null_rows, null_count, f"{headline}\nSample rows:\n{sample}\n\n"


print("\n🔍 Test 1: Nulls in Critical Columns")

# GTFS-RT: event timestamp and vehicle identifier.
null_event_ts, count_event_ts, _section = _null_check(df_rt, "event_ts", "GTFS RT")
flag_event_ts = int(count_event_ts > 0)
stakeholders_text += _section

null_vehicle_id, count_vehicle_id, _section = _null_check(df_rt, "vehicle_id", "GTFS RT")
flag_vehicle_id = int(count_vehicle_id > 0)
stakeholders_text += _section

# Weather: temperature reading and forecast timestamp.
null_temp, count_temp, _section = _null_check(df_weather, "temperature", "Weather")
flag_temp = int(count_temp > 0)
stakeholders_text += _section

null_forecast_time, count_forecast_time, _section = _null_check(df_weather, "forecast_time", "Weather")
flag_forecast_time = int(count_forecast_time > 0)
stakeholders_text += _section

# Overall verdict for this test: PASS only when every column is clean.
if not any([flag_event_ts, flag_vehicle_id, flag_temp, flag_forecast_time]):
    print("✅ PASS: No nulls found in critical columns.")
else:
    print("\n🔔 Issues found — see stakeholders_text summary below.")



In [0]:
# Test 2: Value Ranges
#
# Three independent range checks: weather temperature, GTFS-RT vehicle
# positions, and static stop positions. The GTFS-RT box (47..48, -123..-121)
# is wider than the stop box (47.1..47.9, -122.6..-121.5).
# NOTE(review): a NULL operand makes these comparisons evaluate to NULL, and
# such rows are presumably dropped by filter(), so missing coordinates would
# not be reported here — confirm that is intended.
print("\n🔍 Test 2: Value Ranges")

# --- Temperature outside [-50, 150] (°F according to the notebook intro) ---
temperature = F.col("temperature")
too_cold = temperature < -50
too_hot = temperature > 150
temp_outliers_df = df_weather.filter(too_cold | too_hot)
count_temp_outliers = temp_outliers_df.count()
flag_temp_range = 1 if count_temp_outliers > 0 else 0
if flag_temp_range:
    temp_headline = f"Unrealistic temperatures: {count_temp_outliers}"
    print(temp_headline)
    temp_sample = temp_outliers_df.limit(5).toPandas().to_string(index=False)
    stakeholders_text += f"{temp_headline}\nSample rows:\n{temp_sample}\n\n"

# --- GTFS-RT vehicle positions outside the Seattle bounding box ---
rt_lat = F.col("latitude")
rt_lon = F.col("longitude")
rt_lat_out = (rt_lat < 47) | (rt_lat > 48)
rt_lon_out = (rt_lon < -123) | (rt_lon > -121)
rt_coord_outliers_df = df_rt.filter(rt_lat_out | rt_lon_out)
count_rt_coord_outliers = rt_coord_outliers_df.count()
flag_rt_coords = 1 if count_rt_coord_outliers > 0 else 0
if flag_rt_coords:
    rt_headline = f"GTFS RT records outside Seattle bounds: {count_rt_coord_outliers}"
    print(rt_headline)
    rt_sample = rt_coord_outliers_df.limit(5).toPandas().to_string(index=False)
    stakeholders_text += f"{rt_headline}\nSample rows:\n{rt_sample}\n\n"

# --- Static stops: flag any stop that is not inside both coordinate ranges ---
stop_lat_ok = F.col("location.stop_lat").between(47.1, 47.9)
stop_lon_ok = F.col("location.stop_lon").between(-122.6, -121.5)
stop_coord_outliers_df = df_stops.filter(~(stop_lat_ok & stop_lon_ok))
count_static_coord_outliers = stop_coord_outliers_df.count()
flag_static_coords = 1 if count_static_coord_outliers > 0 else 0
if flag_static_coords:
    stops_headline = f"Static stops with out-of-range lat/lon: {count_static_coord_outliers}"
    print(stops_headline)
    stops_sample = stop_coord_outliers_df.limit(5).toPandas().to_string(index=False)
    stakeholders_text += f"{stops_headline}\nSample rows:\n{stops_sample}\n\n"

# Overall verdict for this test.
if flag_temp_range or flag_rt_coords or flag_static_coords:
    print("\n🔔 Issues found — see stakeholders_text summary below.")
else:
    print("✅ PASS: All values fall within expected geographic and temperature ranges.")



In [0]:
# Test 3: Duplicate Primary Keys
print("\n🔍 Test 3: Duplicate Primary Keys")

# One row per (vehicle_id, event_ts) key that occurs more than once in GTFS-RT,
# most-repeated keys first.
key_counts_df = df_rt.groupBy("vehicle_id", "event_ts").count()
repeated_keys_df = key_counts_df.filter(F.col("count") > 1)
duplicates_rt_df = repeated_keys_df.orderBy(F.desc("count"))

# This counts duplicated keys (groups), not the surplus rows behind them.
count_duplicates = duplicates_rt_df.count()
flag_duplicates = 1 if count_duplicates > 0 else 0

if flag_duplicates == 0:
    print("✅ PASS: No duplicate primary key records found in GTFS RT.")
else:
    print(f"Duplicate vehicle_id + event_ts in GTFS RT: {count_duplicates}")
    # Sample of the worst offenders for the stakeholder summary.
    duplicates_sample = duplicates_rt_df.limit(5).toPandas().to_string(index=False)
    stakeholders_text += (
        f"Duplicate GTFS RT records by vehicle_id + event_ts: {count_duplicates}\n"
        f"Sample rows:\n{duplicates_sample}\n\n"
    )


In [0]:
# Render the duplicated (vehicle_id, event_ts) keys and their counts for manual inspection.
# NOTE(review): `.display()` is presumably provided by the Databricks notebook
# runtime rather than open-source PySpark — confirm before running elsewhere.
duplicates_rt_df.display()

In [0]:
# Test 4: Temporal order in the Gold table
print("\n🔍 Test 4: forecast_time Before event_ts")

# A Gold row breaks the rule when its event_ts is later than its forecast_time.
# NOTE(review): rows where either timestamp is NULL presumably compare to NULL
# and are not selected here — confirm they are covered elsewhere.
invalid_order_df = df_gold.filter(F.col("event_ts") > F.col("forecast_time"))
invalid_order_count = invalid_order_df.count()
flag_invalid_order = 1 if invalid_order_count > 0 else 0

if flag_invalid_order == 0:
    print("✅ PASS: All gold records have forecast_time after or equal to event_ts.")
else:
    order_headline = f"Gold rows with forecast_time before event_ts: {invalid_order_count}"
    print(order_headline)
    # Only the key and the two timestamps are sampled for the report.
    order_columns_df = invalid_order_df.select("vehicle_id", "event_ts", "forecast_time")
    order_sample = order_columns_df.limit(5).toPandas().to_string(index=False)
    stakeholders_text += f"{order_headline}\nSample rows:\n{order_sample}\n\n"


In [0]:
# Assemble the stakeholder notification in `final_text`.
#
# The message is built from a list of unindented lines instead of an indented
# triple-quoted template. The template form embedded the source indentation
# in the text: greeting and sign-off were indented by four spaces, and the
# first finding was appended to a whitespace-only line while the remaining
# findings started at column 0.
if stakeholders_text:
    report_lines = [
        "Dear Stakeholders:",
        f"Data quality issues found on {REPORT_DATE}:",
        "",
        # Each finding already ends with blank lines; trim the trailing ones
        # so the spacing before the closing paragraph is uniform. Leading
        # spaces are kept because the sample tables rely on them for alignment.
        stakeholders_text.rstrip("\n"),
        "",
        "Please review the samples and address the above issues before proceeding with analytics or modeling.",
        "Best regards,",
        "Data Engineering Team",
    ]
else:
    report_lines = [
        "Dear Stakeholders:",
        f"No data quality issues found on {REPORT_DATE}.",
        "The datasets passed all validation checks.",
        "Best regards,",
        "Data Engineering Team",
    ]

# Leading and trailing newlines keep the message visually separated from
# neighbouring output, as the original template did.
final_text = "\n" + "\n".join(report_lines) + "\n"

In [0]:
# Write the assembled stakeholder message to the cell output.
print(final_text)