# 01 - Ingest Vehicle Positions

**Version**: 2.0.0

**Purpose**: Fetch real-time vehicle positions from the STM GTFS-RT API and save each response as a raw protobuf file.

**Pipeline Flow**:
1. Call STM API
2. Save raw .pb file to Volume
3. Log ingestion status

**Output**: Raw protobuf files in `/Volumes/workspace/stm_raw/vehicle_positions_pb/date=YYYY-MM-DD/hour=HH/`

**Run Frequency**: Every 5 minutes (scheduled job)

**Note**: This notebook does NOT parse the protobuf. Parsing is done in `10_Bronze_Vehicle_Positions`.

## Cell 1: Configuration

In [0]:
import urllib.request
import urllib.error
import uuid
import time
import os
from datetime import datetime
from pyspark.sql.functions import lit, current_timestamp

# -----------------------------------------------------------------------------
# VERSION
# -----------------------------------------------------------------------------
NOTEBOOK_VERSION = "2.0.0"

# -----------------------------------------------------------------------------
# WIDGETS
# -----------------------------------------------------------------------------
# Declares the "env" text widget (default "dev") and reads its current value.
dbutils.widgets.text("env", "dev", "Environment")

ENV = dbutils.widgets.get("env")

# -----------------------------------------------------------------------------
# CATALOG
# -----------------------------------------------------------------------------
CATALOG = "workspace"
spark.sql(f"USE CATALOG {CATALOG}")

# -----------------------------------------------------------------------------
# API CONFIGURATION
# -----------------------------------------------------------------------------
# The API key is read from the "stm-secrets" secret scope, never hard-coded.
API_KEY = dbutils.secrets.get(scope="stm-secrets", key="api-key")
API_URL = "https://api.stm.info/pub/od/gtfs-rt/ic/v2/vehiclePositions"

# -----------------------------------------------------------------------------
# PATHS
# -----------------------------------------------------------------------------
VOLUME_PATH = "/Volumes/workspace/stm_raw/vehicle_positions_pb"
LOG_TABLE = "stm_bronze.ingestion_log"

# -----------------------------------------------------------------------------
# RUN IDENTIFIERS
# -----------------------------------------------------------------------------
BATCH_ID = str(uuid.uuid4())
CURRENT_EPOCH = int(time.time())

# Derive the partition date and hour from the SAME instant as CURRENT_EPOCH.
# Calling time.strftime() without a time tuple re-reads the clock on every
# call, so a run that straddles an hour/midnight boundary could otherwise pair
# the old date with the new hour (or disagree with the epoch in FILE_NAME).
# NOTE(review): this is the driver's local time zone, as before - confirm that
# downstream readers expect local rather than UTC partitions.
_RUN_LOCALTIME = time.localtime(CURRENT_EPOCH)
DATE_STR = time.strftime("%Y-%m-%d", _RUN_LOCALTIME)
HOUR_STR = time.strftime("%H", _RUN_LOCALTIME)
FILE_NAME = f"stm_positions_{CURRENT_EPOCH}.pb"

# -----------------------------------------------------------------------------
# STATUS
# -----------------------------------------------------------------------------
print(f"[INFO] Notebook: 01_Ingest_Vehicle_Positions v{NOTEBOOK_VERSION}")
print(f"[INFO] Environment: {ENV}")
print(f"[INFO] Batch ID: {BATCH_ID}")
print(f"[INFO] Target: {VOLUME_PATH}/date={DATE_STR}/hour={HOUR_STR}/")

[INFO] Notebook: 01_Ingest_Vehicle_Positions v2.0.0
[INFO] Environment: dev
[INFO] Batch ID: 2896d6f4-a053-4424-8391-7f6867f87493
[INFO] Target: /Volumes/workspace/stm_raw/vehicle_positions_pb/date=2026-01-29/hour=00/


## Cell 2: Helper Functions

In [0]:
def _sql_string_literal(value):
    """
    Render ``value`` as a single-quoted Spark SQL string literal.

    Backslashes and single quotes are backslash-escaped so that arbitrary text
    (exception messages, file paths) cannot terminate the literal early.
    Assumes the default Spark parser setting
    (spark.sql.parser.escapedStringLiterals=false), under which ``\\`` is the
    escape character inside string literals.
    """
    # Escape backslashes first so the backslashes added for quotes are not doubled.
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def log_ingestion(file_name, file_path, status, error_message=None):
    """
    Log ingestion attempt to the ingestion_log table.

    Inserts one row into LOG_TABLE for the current BATCH_ID with the fixed
    source name 'vehicle_positions', then prints a confirmation line.

    Args:
        file_name: Name of the raw file this attempt refers to.
        file_path: Full path of the saved file, or a placeholder when nothing
            was written.
        status: Outcome label for the attempt (callers pass "SUCCESS" or "FAILED").
        error_message: Optional failure description; stored as NULL when
            missing or empty.

    Raises:
        Whatever spark.sql raises if the INSERT cannot be executed.
    """
    # Error messages routinely contain quotes (e.g. "[Errno 2] ... '/path'"),
    # so every interpolated value is escaped rather than pasted in verbatim.
    error_literal = _sql_string_literal(error_message) if error_message else "NULL"

    # Values are positional; the NULL entries are columns this notebook does
    # not populate (see the LOG_TABLE schema for their meaning).
    spark.sql(f"""
        INSERT INTO {LOG_TABLE}
        VALUES (
            {_sql_string_literal(BATCH_ID)},
            'vehicle_positions',
            {_sql_string_literal(file_name)},
            {_sql_string_literal(file_path)},
            NULL,
            {_sql_string_literal(status)},
            {error_literal},
            current_timestamp(),
            current_timestamp(),
            NULL
        )
    """)

    print(f"[INFO] Logged: {status} - {file_name}")


def fetch_api_data():
    """
    Download the current feed from the STM GTFS-RT endpoint.

    Sends a single GET to API_URL, authenticating with API_KEY in the
    "apiKey" header and asking for a protobuf payload.

    Returns:
        The full response body as bytes.

    Raises:
        Any exception raised by urllib while opening or reading the response
        (not caught here).
    """
    api_request = urllib.request.Request(
        API_URL,
        headers={
            "apiKey": API_KEY,
            "Accept": "application/x-protobuf",
        },
    )

    # 30-second timeout passed to urlopen so a stalled connection cannot
    # block the run indefinitely.
    with urllib.request.urlopen(api_request, timeout=30) as api_response:
        payload = api_response.read()

    return payload

## Cell 3: Main Ingestion

In [0]:
# Main ingestion flow: fetch the feed, write it to the partitioned landing
# directory, and record the outcome in the ingestion log. Any failure is
# logged as FAILED and then re-raised so the notebook run itself fails.
start_time = time.time()
file_path = None  # stays None until the landing path has been computed

try:
    # =========================================================================
    # STEP 1: FETCH FROM API
    # =========================================================================
    print(f"[INFO] Fetching from API: {API_URL}")

    raw_data = fetch_api_data()
    data_size_kb = len(raw_data) / 1024

    print(f"[INFO] Received {data_size_kb:.2f} KB")

    # Validate response (basic check): a body under 100 bytes is treated as an
    # API error rather than a real feed.
    if len(raw_data) < 100:
        raise ValueError(f"Response too small ({len(raw_data)} bytes), possible API error")

    # =========================================================================
    # STEP 2: SAVE TO VOLUME
    # =========================================================================
    # Create directory structure: date=YYYY-MM-DD/hour=HH/
    landing_dir = f"{VOLUME_PATH}/date={DATE_STR}/hour={HOUR_STR}"
    os.makedirs(landing_dir, exist_ok=True)

    file_path = f"{landing_dir}/{FILE_NAME}"

    # Write raw bytes directly (FUSE path)
    with open(file_path, "wb") as f:
        f.write(raw_data)

    print(f"[INFO] Saved: {file_path}")

    # =========================================================================
    # STEP 3: LOG SUCCESS
    # =========================================================================
    duration = time.time() - start_time
    log_ingestion(FILE_NAME, file_path, "SUCCESS")

    print(f"[INFO] Ingestion complete | Duration: {duration:.2f}s | Size: {data_size_kb:.2f} KB")

except Exception as e:
    # One handler for every failure; the message format still depends on the
    # exception type. HTTPError is a subclass of URLError, so it is checked first.
    if isinstance(e, urllib.error.HTTPError):
        error_msg = f"HTTP Error {e.code}: {e.reason}"
    elif isinstance(e, urllib.error.URLError):
        error_msg = f"URL Error: {e.reason}"
    else:
        error_msg = str(e)

    print(f"[ERROR] {error_msg}")

    # Failure logging is best-effort: if the log write itself fails, report it
    # but do not let it replace the original error as the cause of the run's
    # failure.
    try:
        log_ingestion(FILE_NAME, file_path or "not_saved", "FAILED", error_msg)
    except Exception as log_error:
        print(f"[WARN] Could not write failure to ingestion log: {log_error}")

    # Re-raise the original exception so the scheduled job is marked as failed.
    raise

[INFO] Fetching from API: https://api.stm.info/pub/od/gtfs-rt/ic/v2/vehiclePositions
[INFO] Received 55.99 KB
[INFO] Saved: /Volumes/workspace/stm_raw/vehicle_positions_pb/date=2026-01-29/hour=00/stm_positions_1769647020.pb
[INFO] Logged: SUCCESS - stm_positions_1769647020.pb
[INFO] Ingestion complete | Duration: 0.68s | Size: 55.99 KB


## Cell 4: Verify Output

In [0]:
# Sanity check: show the newest files in the partition this run wrote to.
partition_dir = f"{VOLUME_PATH}/date={DATE_STR}/hour={HOUR_STR}"
separator = "-" * 60

print(f"[INFO] Files in {partition_dir}/:")
print(separator)

try:
    # File names embed the epoch, so reverse-lexicographic order puts the
    # most recent file first.
    files = sorted(os.listdir(partition_dir), reverse=True)

    # Show at most the five newest files with their sizes.
    for entry_name in files[:5]:
        entry_size_kb = os.path.getsize(f"{partition_dir}/{entry_name}") / 1024
        print(f"  {entry_name} ({entry_size_kb:.2f} KB)")

    hidden_count = len(files) - 5
    if hidden_count > 0:
        print(f"  ... and {hidden_count} more files")

    print(separator)
    print(f"[INFO] Total files today (hour {HOUR_STR}): {len(files)}")

except FileNotFoundError:
    # The partition directory does not exist (or a file vanished mid-listing).
    print("[WARN] No files found in partition")

[INFO] Files in /Volumes/workspace/stm_raw/vehicle_positions_pb/date=2026-01-29/hour=00/:
------------------------------------------------------------
  stm_positions_1769647020.pb (55.99 KB)
------------------------------------------------------------
[INFO] Total files today (hour 00): 1
