In [19]:
%load_ext autoreload
%autoreload 2
import os 
# changes the current working directory of your Jupyter Notebook (or Python script) to root directory
os.chdir("/Users/luohy/Documents/Projects/bus-observatory/gtfs-realtime-performance")
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List
from datetime import datetime
from tqdm import tqdm  # Used in load_speed_data function
import json


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
def find_speed_parquets(directory: str, start_date: str, end_date: str) -> list[str]:
    """Find all bus speed parquet files in the directory within date range"""
    speed_files = []
    for filename in os.listdir(directory):
        if filename.endswith('.parquet') and filename != 'stops.parquet':
            try:
                # Extract date from bus_speeds_20241204.parquet
                file_date = filename.split('_')[-1].split('.')[0]
                if start_date <= file_date <= end_date:
                    file_path = os.path.join(directory, filename)
                    speed_files.append(file_path)
            except (ValueError, IndexError):
                continue
    print(f"Found {len(speed_files)} speed parquets between {start_date} and {end_date} in {directory}")
    return speed_files
    

def load_speed_parquet_data(file_paths: List[str], max_workers: int = 4) -> pd.DataFrame:
    """Load and combine multiple speed parquet files in parallel"""
    speed_data_frames = []
    chunk_size = 5
    
    for i in range(0, len(file_paths), chunk_size):
        chunk_files = file_paths[i:i + chunk_size]
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(pd.read_parquet, path) for path in chunk_files]
            
            for future, file_path in zip(as_completed(futures), chunk_files):
                try:
                    df = future.result()
                    if not df.empty:
                        speed_data_frames.append(df)
                except Exception as e:
                    print(f"Error reading file {os.path.basename(file_path)}: {e}")
    
    successful_loads = len(speed_data_frames)
    print(f"Loaded {successful_loads} speed parquets")
    
    if speed_data_frames:
        combined_data = pd.concat(speed_data_frames, ignore_index=True)
        print(f"Combined speed data shape: {combined_data.shape}")
        return combined_data
    return pd.DataFrame()

## Aggregation for chart

In [27]:
def calculate_hourly_speeds(speed_data: pd.DataFrame) -> pd.DataFrame:
    """Calculate hourly speeds for each route on each weekday"""
    
    # Group and sum distances and times
    hourly_totals = speed_data.groupby(['route_id', 'weekday', 'hour']).agg({
        'segment_length': 'sum',  # total distance in feet
        'time_elapsed': 'sum',    # total time in seconds
    }).reset_index()
    
    # Calculate average speeds
    hourly_totals['average_speed_mph'] = (
        (hourly_totals['segment_length'] / 5280) /  # convert feet to miles
        (hourly_totals['time_elapsed'] / 3600)      # convert seconds to hours
    )
    
    # Select and format final results
    hourly_speeds = hourly_totals[[
        'route_id', 
        'weekday', 
        'hour', 
        'average_speed_mph'
    ]].copy()
    
    # Round speeds for readability
    hourly_speeds['average_speed_mph'] = hourly_speeds['average_speed_mph'].round(2)

    print(f"\nGenerated {len(hourly_speeds)} hourly speed records")
    print(f"Routes analyzed: {hourly_speeds['route_id'].nunique()}")
    
    return hourly_speeds


def batch_process_hourly_speed(
    directory_paths: List[str], 
    start_date: str, 
    end_date: str, 
    output_path: str,
    max_workers: int = 4
) -> None:
    """
    Process speed data from multiple directories in parallel
    
    Args:
        directory_paths: List of directories containing speed files
        start_date: Start date in YYYYMMDD format
        end_date: End date in YYYYMMDD format
        output_path: Path to save processed speed data
        max_workers: Number of parallel workers
    """
    print(f"\nProcessing {len(directory_paths)} directories from {start_date} to {end_date}")
    speed_data_frames = []

    for directory_path in directory_paths:
        speed_files = find_speed_parquets(directory_path, start_date, end_date)
        if speed_files:
            speed_data = load_speed_parquet_data(speed_files)
            if not speed_data.empty:
                speed_data_frames.append(speed_data)

    if speed_data_frames:
        # Combine all speed data
        combined_speeds = pd.concat(speed_data_frames, ignore_index=True)
        
        # Calculate hourly averages
        hourly_speeds = calculate_hourly_speeds(combined_speeds)
        
        # Save results
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        hourly_speeds.to_parquet(output_path, index=False)
        print(f"\nSaved hourly speeds to: {output_path}")
    else:
        print("\nNo speed data found in any directory")

### Usage

In [28]:
raw_speeds_dir = "data/raw-speeds"
# dir_lists is what inside data/raw-speeds
dir_lists = [
        os.path.join(raw_speeds_dir, d) 
        for d in os.listdir(raw_speeds_dir) 
        if os.path.isdir(os.path.join(raw_speeds_dir, d))
    ]

In [30]:
batch_process_hourly_speed(
    directory_paths=dir_lists,
    start_date='2024-12-03',
    end_date='2025-01-04',
    output_path='data/chart-speeds/control_speeds.parquet'
)

batch_process_hourly_speed(
    directory_paths=dir_lists,
    start_date='2025-01-05',
    end_date='2025-02-06',
    output_path='data/chart-speeds/treatment_speeds.parquet'
)


Processing 12 directories from 2024-12-03 to 2025-01-04
Found 24 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-513-202412120015
Loaded 24 speed parquets
Combined speed data shape: (239889, 12)
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-514-202502170029
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-513-202501230024
Found 23 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-512-202412120015
Loaded 23 speed parquets
Combined speed data shape: (1272, 12)
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-512-202502170011
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-514-202501020130
Found 9 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-513-202409090026
Loaded 9 speed parquets
Combined speed data shape: (89201, 12)
Found 29 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-s

## Aggregation for Map

In [7]:
import geopandas as gpd
import os
from typing import List

def get_directories_by_mdb(mdb_id: str) -> List[str] :
    # for folder in data/raw-speeds, if folder name like mdb-513.., append it to directory_paths
    directory_paths = []
    for folder in os.listdir("data/raw-speeds"):
        if folder.startswith(mdb_id):
            directory_paths.append(os.path.join("data/raw-speeds", folder))
    return directory_paths

def merge_segment_files(directory_paths: List[str]) -> gpd.GeoDataFrame:
    """
    Compare segment.geojson files across different feed directories
    
    Args:
        directory_paths: List of directory paths containing segment files
        
    Returns:
        GeoDataFrame containing merged segment data
    """
    segment_gdfs = []
    
    print("\nCollecting segment files...")
    # Collect all segment files
    for directory in directory_paths:
        segment_path = os.path.join(directory, 'segments.geojson')
        if os.path.exists(segment_path):
            try:
                gdf = gpd.read_file(segment_path)
                segment_gdfs.append(gdf)
            except Exception as e:
                print(f"Error reading segment file in {os.path.basename(directory)}: {e}")
    
    if not segment_gdfs:
        raise ValueError("No segment files found")
    
    if len(segment_gdfs) == 1:
        print("Only one segment file found, using it as reference")
        return segment_gdfs[0]
    
    # Compare contents
    reference_gdf = segment_gdfs[0]
    all_match = all(gdf.equals(reference_gdf) for gdf in segment_gdfs[1:])
    
    if all_match:
        merged_gdf = reference_gdf
    
    # If files differ, merge them
    merged_gdf = pd.concat(segment_gdfs).drop_duplicates()
    print(f"Final merged segments shape: {merged_gdf.shape}")
    
    return merged_gdf


def calculate_rush_hour_speeds(speed_data: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate average speeds during morning and evening rush hours.
    
    Args:
        speed_data: DataFrame containing speed data with columns:
                   route_id, trip_id, shape_id, stop_id, prev_stop_id, 
                   segment_length, time_elapsed, speed_mph, route_id,
                   datetime_nyc, date, weekday, hour
    
    Returns:
        DataFrame with rush hour speed metrics: 
        route_id, trip_id, shape_id, stop_id, prev_stop_id, 
        weekday, rush_hour, avg_speed_mph
    """
    
    # Input validation
    required_columns = ['trip_id', 'shape_id', 'stop_id', 'prev_stop_id', 
                       'segment_length', 'time_elapsed', 'speed_mph', 'route_id',
                       'datetime_nyc', 'date', 'weekday', 'hour']
    missing_cols = [col for col in required_columns if col not in speed_data.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    
    # Define rush hours
    speed_data['rush_hour'] = 'non_rush'
    morning_mask = (speed_data['hour'] >= 7) & (speed_data['hour'] < 10)
    evening_mask = (speed_data['hour'] >= 16) & (speed_data['hour'] < 19)
    speed_data.loc[morning_mask, 'rush_hour'] = 'morning_rush'
    speed_data.loc[evening_mask, 'rush_hour'] = 'evening_rush'

    # Group and aggregate
    grouped = speed_data.groupby(['route_id', 'stop_id', 'prev_stop_id', 'weekday', 'rush_hour'])
    agg_data = grouped.agg(
        total_distance=('segment_length', 'sum'),
        total_time=('time_elapsed', 'sum'),
    ).reset_index()
    
    # Calculate speeds (feet/sec to mph)
    agg_data['avg_speed_mph'] = (agg_data['total_distance'] / 5280) / (agg_data['total_time'] / 3600)

    # Keep only the columns we need
    result = agg_data[['route_id', 'stop_id', 'prev_stop_id', 
                       'weekday', 'rush_hour', 'avg_speed_mph']]
    
    print(f"\nFinal rush hour speeds shape: {result.shape}")
    return result

def calculate_speeds_difference(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the difference in average speeds between two dataframes.
    
    Args:
        df1: First DataFrame with speed data
        df2: Second DataFrame with speed data
        
    Returns:
        DataFrame with speed differences between df1 and df2
    """
    # Ensure both dataframes have the same columns
    key_columns = ['route_id', 'stop_id', 'prev_stop_id', 'weekday', 'rush_hour']
    
    # Add suffixes to distinguish the dataframes when merging
    merged_df = pd.merge(
        df1, df2, 
        on=key_columns,
        how='inner',
        suffixes=('_df1', '_df2')
    )
    
    # Calculate the difference in speeds
    merged_df['avg_speed_diff'] = merged_df['avg_speed_mph_df1'] - merged_df['avg_speed_mph_df2']
    
    # Select only the columns we need for the result
    result_df = merged_df[key_columns + ['avg_speed_diff']]
    
    return result_df


def match_speeds_with_segments(speeds_diff: pd.DataFrame, segments_gdf: gpd.GeoDataFrame, route_id: str, weekday: int) -> gpd.GeoDataFrame:
    """
    Match speed differences with geographic segments for visualization.
    
    Args:
        speeds_diff: DataFrame with speed differences
        segments_gdf: GeoDataFrame with segment geometries
        route_id: Route ID to filter by
        weekday: Day of week to filter by (0-6, where 0 is Monday)
        
    Returns:
        GeoDataFrame with speed differences and segment geometries
    """
    # Filter speeds by route_id and weekday
    filtered_speeds = speeds_diff[(speeds_diff['route_id'] == route_id) & 
                                 (speeds_diff['weekday'] == weekday)]
    
    # Join segments to speeds on stop_id and prev_stop_id
    matched_segments = pd.merge(
        filtered_speeds,
        segments_gdf,
        on=['stop_id', 'prev_stop_id'],
        how='inner'
    )
    
    # Convert to GeoDataFrame if it's a regular DataFrame
    if not isinstance(matched_segments, gpd.GeoDataFrame):
        matched_segments = gpd.GeoDataFrame(matched_segments, geometry=segments_gdf.geometry.name)
    
    # Convert segment_gdf to web mercator projection (EPSG:3857) required for contextily
    matched_segments = matched_segments.to_crs(epsg=3857)
    print("\nFinal matched segments shape: ", matched_segments.shape)

    return matched_segments


In [14]:
def process_speed_comparison(
    mdb_id: str,
    route_ids: List[str],
    control_start_date: str,
    control_end_date: str,
    treatment_start_date: str,
    treatment_end_date: str,
    output_dir: str = 'data/map-speeds'
) -> None:
    """
    Process and compare bus speeds between control and treatment periods.
    
    Args:
        mdb_id: str, ID of the MDB (e.g., "mdb-513")
        route_ids: List[str], list of route IDs to process
        control_start_date: str, start date for control period (YYYY-MM-DD)
        control_end_date: str, end date for control period (YYYY-MM-DD)
        treatment_start_date: str, start date for treatment period (YYYY-MM-DD)
        treatment_end_date: str, end date for treatment period (YYYY-MM-DD)
        output_dir: str, directory to save output files (default: 'data/map-speeds')
    """
    # Get directories and load segments
    directory_paths = get_directories_by_mdb(mdb_id)
    segments_gdf = merge_segment_files(directory_paths)

    # Save the merged segments to a geojson file
    segments_gdf.to_file(os.path.join('data/map-segments', f'{mdb_id}_merged_segments.geojson'), driver='GeoJSON')

    # Initialize DataFrames
    control_data = pd.DataFrame()
    treatment_data = pd.DataFrame()

    # Load and combine data from all directories
    for directory in directory_paths:   
        print(f"\nProcessing directory: {directory}")
        
        # Load control period data
        control_speed_files = find_speed_parquets(directory, control_start_date, control_end_date)
        control_speed_data = load_speed_parquet_data(control_speed_files)
        control_data = pd.concat([control_data, control_speed_data], ignore_index=True)

        # Load treatment period data
        treatment_speed_files = find_speed_parquets(directory, treatment_start_date, treatment_end_date)
        treatment_speed_data = load_speed_parquet_data(treatment_speed_files)
        treatment_data = pd.concat([treatment_data, treatment_speed_data], ignore_index=True)

    print(f"\nControl data shape: {control_data.shape}")
    print(f"Treatment data shape: {treatment_data.shape}")

    # Calculate rush hour speeds
    control_rush_hour_speeds = calculate_rush_hour_speeds(control_data)  
    treatment_rush_hour_speeds = calculate_rush_hour_speeds(treatment_data)

    # Calculate speed differences
    speeds_diff = calculate_speeds_difference(control_rush_hour_speeds, treatment_rush_hour_speeds)

    # Process each route and weekday
    os.makedirs(output_dir, exist_ok=True)
    
    for route_id in route_ids:
        # Define output file path
        output_file = os.path.join(output_dir, f'{mdb_id}_{route_id}_speed_diff.parquet')
        
        # Filter speeds by route_id
        route_speeds_diff = speeds_diff[speeds_diff['route_id'] == route_id]
        
        if os.path.exists(output_file):
            # Append to existing file
            existing_data = pd.read_parquet(output_file)
            combined_data = pd.concat([existing_data, route_speeds_diff], ignore_index=True)
            combined_data.to_parquet(output_file, engine="pyarrow")
            print(f"Appended data to {output_file}")
        else:
            # Create new file
            route_speeds_diff.to_parquet(output_file, engine="pyarrow", index=False)
            print(f"Created new file: {output_file}")


### Usage

In [15]:
mdb_id = "mdb-513"
route_ids = ["M50", "M102"]

process_speed_comparison(
        mdb_id=mdb_id,
        route_ids=route_ids,
        control_start_date="2024-12-03",
        control_end_date="2025-01-04",
        treatment_start_date="2024-01-05",
        treatment_end_date="2025-02-06"
    )


Collecting segment files...
Final merged segments shape: (16545, 11)

Processing directory: data/raw-speeds/mdb-513-202412120015
Found 24 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-513-202412120015
Loaded 24 speed parquets
Combined speed data shape: (239889, 12)
Found 24 speed parquets between 2024-01-05 and 2025-02-06 in data/raw-speeds/mdb-513-202412120015
Loaded 24 speed parquets
Combined speed data shape: (239889, 12)

Processing directory: data/raw-speeds/mdb-513-202501230024
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-513-202501230024
Loaded 0 speed parquets
Found 14 speed parquets between 2024-01-05 and 2025-02-06 in data/raw-speeds/mdb-513-202501230024
Loaded 14 speed parquets
Combined speed data shape: (158100, 12)

Processing directory: data/raw-speeds/mdb-513-202409090026
Found 9 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-513-202409090026
Loaded 9 speed parquets
Combined speed data

In [16]:
mdb_id = "mdb-512"
route_ids = ["B39"]

process_speed_comparison(
        mdb_id=mdb_id,
        route_ids=route_ids,
        control_start_date="2024-12-03",
        control_end_date="2025-01-04",
        treatment_start_date="2024-01-05",
        treatment_end_date="2025-02-06"
    )


Collecting segment files...
Final merged segments shape: (42670, 11)

Processing directory: data/raw-speeds/mdb-512-202412120015
Found 23 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-512-202412120015
Loaded 23 speed parquets
Combined speed data shape: (1272, 12)
Found 23 speed parquets between 2024-01-05 and 2025-02-06 in data/raw-speeds/mdb-512-202412120015
Loaded 23 speed parquets
Combined speed data shape: (1272, 12)

Processing directory: data/raw-speeds/mdb-512-202502170011
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-512-202502170011
Loaded 0 speed parquets
Found 0 speed parquets between 2024-01-05 and 2025-02-06 in data/raw-speeds/mdb-512-202502170011
Loaded 0 speed parquets

Processing directory: data/raw-speeds/mdb-512-202408290005
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-512-202408290005
Loaded 0 speed parquets
Found 2 speed parquets between 2024-01-05 and 2025-02-06 in data/

In [17]:
mdb_id = "mdb-514"
route_ids = ["SIM24", "SIM4X"]

process_speed_comparison(
        mdb_id=mdb_id,
        route_ids=route_ids,
        control_start_date="2024-12-03",
        control_end_date="2025-01-04",
        treatment_start_date="2024-01-05",
        treatment_end_date="2025-02-06"
    )


Collecting segment files...
Final merged segments shape: (20500, 11)

Processing directory: data/raw-speeds/mdb-514-202502170029
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-514-202502170029
Loaded 0 speed parquets
Found 0 speed parquets between 2024-01-05 and 2025-02-06 in data/raw-speeds/mdb-514-202502170029
Loaded 0 speed parquets

Processing directory: data/raw-speeds/mdb-514-202501020130
Found 0 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-514-202501020130
Loaded 0 speed parquets
Found 28 speed parquets between 2024-01-05 and 2025-02-06 in data/raw-speeds/mdb-514-202501020130
Loaded 28 speed parquets
Combined speed data shape: (9341, 12)

Processing directory: data/raw-speeds/mdb-514-202412120006
Found 29 speed parquets between 2024-12-03 and 2025-01-04 in data/raw-speeds/mdb-514-202412120006
Loaded 29 speed parquets
Combined speed data shape: (7966, 12)
Found 31 speed parquets between 2024-01-05 and 2025-02-06 in data

# Post Processing

In [3]:
# aggregation

import os
import glob
import geopandas as gpd

# Get all geojson files in the data/map-segments directory
geojson_files = glob.glob('data/map-segments/*.geojson')
print(geojson_files)

# Initialize an empty GeoDataFrame to store the merged data
merged_gdf = None

# Loop through all geojson files and merge them
for geojson_file in geojson_files:
    
    # Read the geojson file
    gdf = gpd.read_file(geojson_file)
    
    # If this is the first file, initialize the merged GeoDataFrame
    if merged_gdf is None:
        merged_gdf = gdf
    else:
        # Concatenate with the merged GeoDataFrame
        merged_gdf = gpd.GeoDataFrame(pd.concat([merged_gdf, gdf], ignore_index=True))

# Remove duplicate segments based on geometry and other key attributes
# Assuming segments with same prev_stop_id and stop_id are duplicates
if merged_gdf is not None and not merged_gdf.empty:
    merged_gdf = merged_gdf.drop_duplicates(subset=['prev_stop_id', 'stop_id', 'geometry'])

    # Save the merged and deduplicated GeoDataFrame to a new geojson file
    merged_gdf.to_file('data/map-segments/merged_segments.geojson', driver='GeoJSON')
    
    print(f"Merged {len(geojson_files)} files into data/map-segments/merged_segments.geojson")
    print(f"Final merged file contains {len(merged_gdf)} segments")
else:
    print("No geojson files found or all files were empty")


['data/map-segments/mdb-513_merged_segments.geojson', 'data/map-segments/mdb-514_merged_segments.geojson', 'data/map-segments/mdb-512_merged_segments.geojson']
Merged 3 files into data/map-segments/merged_segments.geojson
Final merged file contains 10373 segments


In [4]:
# Get all parquet files in the data/map-speeds directory
parquet_files = glob.glob('data/map-speeds/*.parquet')
print(f"Found {len(parquet_files)} parquet files: {parquet_files}")

# Initialize an empty DataFrame to store the merged data
merged_df = None

# Loop through all parquet files and merge them
for parquet_file in parquet_files:
    # Read the parquet file
    df = pd.read_parquet(parquet_file)
    
    # Remove geometry column if it exists
    if 'geometry' in df.columns:
        df = df.drop(columns=['geometry'])
    
    # If this is the first file, initialize the merged DataFrame
    if merged_df is None:
        merged_df = df
    else:
        # Concatenate with the merged DataFrame
        merged_df = pd.concat([merged_df, df], ignore_index=True)

# Remove duplicate segments based on key attributes
if merged_df is not None and not merged_df.empty:
    merged_df = merged_df.drop_duplicates(subset=['prev_stop_id', 'stop_id'])
    
    # Save the merged and deduplicated DataFrame to a new parquet file
    merged_df.to_parquet('data/map-speeds/merged_speeds_diff.parquet')
    
    print(f"Merged {len(parquet_files)} files into data/map-speeds/merged_speeds_diff.parquet")
    print(f"Final merged file contains {len(merged_df)} segments")
else:
    print("No parquet files found or all files were empty")

Found 5 parquet files: ['data/map-speeds/SIM4X_speed_diff_segments.parquet', 'data/map-speeds/B39_speed_diff_segments.parquet', 'data/map-speeds/SIM24_speed_diff_segments.parquet', 'data/map-speeds/M102_speed_diff_segments.parquet', 'data/map-speeds/M50_speed_diff_segments.parquet']
Merged 5 files into data/map-speeds/merged_speeds_diff.parquet
Final merged file contains 207 segments


In [32]:
# Function to check for segments with different coordinates for the same pair of (prev_stop_id, stop_id)
def remove_duplicate_segments(mdb_id, route_id):
    
    # Read the geojson file
    file_path = f'data/map-segments/{mdb_id}_merged_segments.geojson'
    segments_gdf = gpd.read_file(file_path)
    
    # Group by prev_stop_id and stop_id
    grouped = segments_gdf.groupby(['prev_stop_id', 'stop_id'])
    
    # Find groups with more than one entry
    duplicates = grouped.filter(lambda x: len(x) > 1)
    
    # If duplicates exist, keep only the first record for each group
    if not duplicates.empty:
        print(f"Found {len(duplicates)} duplicate segments. Keeping only the first record for each group.")
        
        # Get the indices of the first occurrence of each group
        idx = grouped.apply(lambda x: x.index[0])
        
        # Select only the first record for each group
        deduplicated_gdf = segments_gdf.loc[idx]
        
        # Keep only the specified columns
        columns_to_keep = ['stop_id', 'stop_name', 'prev_stop_id', 'prev_stop_name', 
                           'projected_position', 'prev_projected_position', 'segment_length', 'geometry']
        deduplicated_gdf = deduplicated_gdf[columns_to_keep]
        
        # Save the deduplicated GeoDataFrame
        output_path = f'data/map-segments/{mdb_id}_{route_id}_unique_segments.geojson'
        deduplicated_gdf.to_file(output_path, driver='GeoJSON')
        
    else:
        print(f"No duplicate segments found in {file_path}")

In [33]:
remove_duplicate_segments('mdb-512', 'B39')
remove_duplicate_segments('mdb-513', 'M50')
remove_duplicate_segments('mdb-513', 'M102')
remove_duplicate_segments('mdb-514', 'SIM4X')
remove_duplicate_segments('mdb-514', 'SIM24')

Found 42656 duplicate segments. Keeping only the first record for each group.


  idx = grouped.apply(lambda x: x.index[0])


Found 16539 duplicate segments. Keeping only the first record for each group.


  idx = grouped.apply(lambda x: x.index[0])


Found 16539 duplicate segments. Keeping only the first record for each group.


  idx = grouped.apply(lambda x: x.index[0])


Found 20481 duplicate segments. Keeping only the first record for each group.


  idx = grouped.apply(lambda x: x.index[0])


Found 20481 duplicate segments. Keeping only the first record for each group.


  idx = grouped.apply(lambda x: x.index[0])
