In [6]:
%load_ext autoreload
%autoreload 2
import os 
os.chdir("/Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance")
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
from datetime import datetime
from tqdm import tqdm  # Used in load_speed_data function


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
def _normalize_date(value: str) -> str:
    """Return a date string in compact ``YYYYMMDD`` form.

    Accepts either ``YYYYMMDD`` or ``YYYY-MM-DD``.

    Raises:
        ValueError: if ``value`` is in neither format or is not a real
            calendar date.
    """
    text = value.strip()
    if len(text) == 10 and text[4] == '-' and text[7] == '-':
        text = text[:4] + text[5:7] + text[8:]
    if len(text) != 8 or not (text.isascii() and text.isdigit()):
        raise ValueError(f"Unrecognized date {value!r}; expected YYYYMMDD or YYYY-MM-DD")
    # strptime rejects impossible dates such as 20241340.
    datetime.strptime(text, '%Y%m%d')
    return text


def find_speed_files(directory: str, start_date: str, end_date: str) -> list[str]:
    """Find all bus speed parquet files in the directory within a date range.

    The date is taken from the last ``_``-separated part of the file name
    (e.g. ``bus_speeds_20241204.parquet`` or ``bus_speeds_2024-12-04.parquet``).
    File dates and both bounds are normalised to ``YYYYMMDD`` before being
    compared, so dashed and compact dates can be mixed freely; comparing the
    raw strings would order ``'20250110'`` after ``'2025-02-06'``.

    Args:
        directory: Directory to scan (not recursive).
        start_date: Inclusive lower bound, ``YYYYMMDD`` or ``YYYY-MM-DD``.
        end_date: Inclusive upper bound, ``YYYYMMDD`` or ``YYYY-MM-DD``.

    Returns:
        Paths of the matching files, sorted by file name so the result does
        not depend on the directory listing order. ``stops.parquet`` and
        files whose name does not end in a valid date are skipped.

    Raises:
        ValueError: if ``start_date`` or ``end_date`` is not a valid date.
    """
    start_key = _normalize_date(start_date)
    end_key = _normalize_date(end_date)

    speed_files = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.parquet') or filename == 'stops.parquet':
            continue
        try:
            # Extract date from bus_speeds_20241204.parquet
            file_key = _normalize_date(filename.split('_')[-1].split('.')[0])
        except ValueError:
            # Not a dated speed file; ignore it.
            continue
        if start_key <= file_key <= end_key:
            speed_files.append(os.path.join(directory, filename))

    print(f"\nFound {len(speed_files)} speed files between {start_date} and {end_date} in {directory}")
    return speed_files
    

def load_speed_parquets(file_paths: List[str], max_workers: int = 4) -> pd.DataFrame:
    """Load and combine multiple speed parquet files in parallel.

    Files are read in chunks of five; within a chunk the reads run on a
    thread pool. Results are collected in submission order, so each error
    message names the file that actually failed and the row order of the
    combined frame is the same on every run.

    Args:
        file_paths: Paths of the parquet files to read.
        max_workers: Maximum number of reader threads per chunk.

    Returns:
        All non-empty frames concatenated with a fresh index, or an empty
        DataFrame when nothing could be loaded. Files that fail to read are
        reported on stdout and skipped.
    """
    speed_data_frames = []
    chunk_size = 5
    total_chunks = (len(file_paths) + chunk_size - 1) // chunk_size

    print("Loading speed parquets:")
    for chunk_number, start in enumerate(range(0, len(file_paths), chunk_size), start=1):
        chunk_files = file_paths[start:start + chunk_size]
        print(f"\nProcessing chunk {chunk_number}/{total_chunks}")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(pd.read_parquet, path) for path in chunk_files]

            # Walk the futures alongside the paths they were created from.
            # Pairing as_completed() output with chunk_files would attach
            # results to paths by completion order, not by identity.
            for file_path, future in zip(chunk_files, futures):
                try:
                    df = future.result()
                except Exception as e:
                    # Best effort: report the unreadable file and keep going.
                    print(f"Error reading file {os.path.basename(file_path)}: {e}")
                    continue
                if not df.empty:
                    speed_data_frames.append(df)

    successful_loads = len(speed_data_frames)
    print(f"Successfully loaded {successful_loads} speed parquets")

    if speed_data_frames:
        combined_data = pd.concat(speed_data_frames, ignore_index=True)
        print(f"Combined data shape: {combined_data.shape}")
        return combined_data
    return pd.DataFrame()

## Aggregation for chart

In [8]:
def calculate_hourly_speeds(speed_data: pd.DataFrame) -> pd.DataFrame:
    """Compute the average speed per route, weekday and hour.

    The speed of a group is its total distance divided by its total time
    (not the mean of per-segment speeds), converted to miles per hour and
    rounded to two decimals.

    Args:
        speed_data: Frame with columns ``route_id``, ``weekday``, ``hour``,
            ``segment_length`` (treated as feet) and ``time_elapsed``
            (treated as seconds).

    Returns:
        A new frame with columns ``route_id``, ``weekday``, ``hour`` and
        ``average_speed_mph``, one row per group.
    """
    feet_per_mile = 5280
    seconds_per_hour = 3600
    group_keys = ['route_id', 'weekday', 'hour']

    # Total distance and total time for every (route, weekday, hour) group.
    totals = (
        speed_data
        .groupby(group_keys)
        .agg({'segment_length': 'sum', 'time_elapsed': 'sum'})
        .reset_index()
    )

    miles_travelled = totals['segment_length'] / feet_per_mile
    hours_elapsed = totals['time_elapsed'] / seconds_per_hour
    totals['average_speed_mph'] = miles_travelled / hours_elapsed

    # Keep only the reporting columns; round for readability.
    result = totals[group_keys + ['average_speed_mph']].copy()
    result['average_speed_mph'] = result['average_speed_mph'].round(2)

    print(f"\nGenerated {len(result)} hourly speed records")
    print(f"Routes analyzed: {result['route_id'].nunique()}")

    return result


def batch_process_hourly_speed(
    directory_paths: List[str], 
    start_date: str, 
    end_date: str, 
    output_path: str,
    max_workers: int = 4
) -> None:
    """
    Build one hourly-speed parquet from the speed files of several directories.

    Directories are visited one after another; the files inside each
    directory are read in parallel by ``load_speed_parquets``. All loaded
    rows are combined, aggregated with ``calculate_hourly_speeds`` and
    written to ``output_path``. Nothing is written when no data is found.

    Args:
        directory_paths: List of directories containing speed files
        start_date: Inclusive start of the date window, passed through to
            ``find_speed_files``; use the same date format as the file names
        end_date: Inclusive end of the date window, same format as start_date
        output_path: Path to save processed speed data; missing parent
            directories are created
        max_workers: Number of parallel workers used when reading files
    """
    print(f"\nProcessing {len(directory_paths)} directories from {start_date} to {end_date}")
    speed_data_frames = []

    for directory_path in directory_paths:
        speed_files = find_speed_files(directory_path, start_date, end_date)
        if not speed_files:
            continue
        # Forward max_workers so the caller's setting actually takes effect.
        speed_data = load_speed_parquets(speed_files, max_workers=max_workers)
        if not speed_data.empty:
            speed_data_frames.append(speed_data)

    if not speed_data_frames:
        print("\nNo speed data found in any directory")
        return

    # Combine all speed data
    combined_speeds = pd.concat(speed_data_frames, ignore_index=True)

    # Calculate hourly averages
    hourly_speeds = calculate_hourly_speeds(combined_speeds)

    # Save results. dirname() is '' for a bare file name, and
    # os.makedirs('') raises FileNotFoundError, so only create a parent
    # directory when the path actually has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    hourly_speeds.to_parquet(output_path, index=False)
    print(f"\nSaved hourly speeds to: {output_path}")

In [9]:
# Usage: build the hourly speed table for the control window.
raw_speeds_dir = "/Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds"

# dir_lists holds the full path of every sub-directory directly inside
# data/raw-speeds (plain files are ignored).
with os.scandir(raw_speeds_dir) as raw_speed_entries:
    dir_lists = [entry.path for entry in raw_speed_entries if entry.is_dir()]

batch_process_hourly_speed(
    directory_paths=dir_lists,
    start_date='2024-12-03',
    end_date='2025-01-04',
    output_path='./data/chart-speeds/control_speeds.parquet',
)


Processing 12 directories from 2024-12-03 to 2025-01-04

Found 24 speed files between 2024-12-03 and 2025-01-04 in /Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds/mdb-513-202412120015
Loading speed parquets:

Processing chunk 1/5

Processing chunk 2/5

Processing chunk 3/5

Processing chunk 4/5

Processing chunk 5/5
Successfully loaded 24 speed parquets
Combined data shape: (239889, 12)

Found 0 speed files between 2024-12-03 and 2025-01-04 in /Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds/mdb-514-202502170029

Found 0 speed files between 2024-12-03 and 2025-01-04 in /Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds/mdb-513-202501230024

Found 23 speed files between 2024-12-03 and 2025-01-04 in /Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds/mdb-512-202412120015
Loading speed parquets:

Processing chunk 1/5

Processing c

In [10]:
# Second run over the same directories (dir_lists comes from the previous
# cell), this time for the 2025-01-05 .. 2025-02-06 window; the result is
# written to treatment_speeds.parquet next to the control output.
batch_process_hourly_speed(
    directory_paths=dir_lists,
    start_date='2025-01-05',
    end_date='2025-02-06',
    output_path='./data/chart-speeds/treatment_speeds.parquet'
)


Processing 12 directories from 2025-01-05 to 2025-02-06

Found 0 speed files between 2025-01-05 and 2025-02-06 in /Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds/mdb-513-202412120015

Found 0 speed files between 2025-01-05 and 2025-02-06 in /Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds/mdb-514-202502170029

Found 14 speed files between 2025-01-05 and 2025-02-06 in /Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds/mdb-513-202501230024
Loading speed parquets:

Processing chunk 1/3

Processing chunk 2/3

Processing chunk 3/3
Successfully loaded 14 speed parquets
Combined data shape: (158100, 12)

Found 0 speed files between 2025-01-05 and 2025-02-06 in /Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance/data/raw-speeds/mdb-512-202412120015

Found 0 speed files between 2025-01-05 and 2025-02-06 in /Users/luohy/Documents/Projects/bus-observator