### Configurations

In [21]:

# Directory expected to contain the original Fall 2018 GTFS Recap text files.
# Previously used path, kept for reference:
#   '../1--Original_2018_GTFS_Recap_Data/GTFS_Recap_-_Fall_2018-20250626T135819Z-1-001/GTFS_Recap_-_Fall_2018'
original_gtfs_dir = "../1--Original_2018_GTFS_Recap_Data/GTFS_Recap_-_Fall_2018"

# Directory where the updated GTFS files will be saved.
new_gtfs_files_dir = "../4--Updated_GTFS_Files"

### GTFS Verification - October 2nd, 2018

In [22]:
import pandas as pd
import os

# Count the routes that are active / inactive on one service date in the
# ORIGINAL 2018 feed, by joining calendar.txt -> trips.txt -> routes.txt.

# Service date to verify, in the YYYYMMDD form used by GTFS calendar.txt
target_date_str = '20181002'
target_date_datetime = pd.to_datetime(target_date_str)
# Human-readable form of the date, reused in every status message below
target_date_label = target_date_datetime.strftime('%Y-%m-%d')

print(f"Analyzing GTFS data for the target date: {target_date_label}\n")

try:
    # --- 1. Load calendar.txt ---
    calendar_path = os.path.join(original_gtfs_dir, 'calendar.txt')
    if not os.path.exists(calendar_path):
        raise FileNotFoundError(f"'{calendar_path}' not found. Please ensure the GTFS files are in the correct directory.")
    # ID columns are read as strings so numeric-looking IDs are not parsed as
    # integers and keep matching the same IDs in the other GTFS files.
    calendar_df = pd.read_csv(calendar_path, dtype={'service_id': str})
    print(f"Loaded calendar.txt with {len(calendar_df)} entries.")

    # --- 2. Load routes.txt ---
    routes_path = os.path.join(original_gtfs_dir, 'routes.txt')
    if not os.path.exists(routes_path):
        raise FileNotFoundError(f"'{routes_path}' not found. Please ensure the GTFS files are in the correct directory.")
    routes_df = pd.read_csv(routes_path, dtype={'route_id': str})
    print(f"Loaded routes.txt with {len(routes_df)} entries.")

    # --- 3. Determine active service IDs for the target date ---
    # calendar.txt has one 0/1 column per weekday ('monday' ... 'sunday').
    # day_name() yields the English name by default, independent of the
    # process locale, so it always matches those column names.
    day_of_week = target_date_datetime.day_name().lower()

    # A service is active when it runs on that weekday and the target date lies
    # inside [start_date, end_date]. Dates are compared as YYYYMMDD strings,
    # for which lexicographic order equals chronological order.
    # NOTE(review): calendar_dates.txt (added/removed service exceptions) is not
    # consulted here - confirm this is acceptable for the chosen date.
    active_services_on_day = calendar_df[
        (calendar_df[day_of_week] == 1) &
        (calendar_df['start_date'].astype(str) <= target_date_str) &
        (calendar_df['end_date'].astype(str) >= target_date_str)
    ]

    active_service_ids = active_services_on_day['service_id'].unique()
    print(f"\nFound {len(active_service_ids)} active service IDs on {target_date_label}.")

    # --- 4. Load trips.txt to link service_id to route_id ---
    trips_path = os.path.join(original_gtfs_dir, 'trips.txt')
    if not os.path.exists(trips_path):
        raise FileNotFoundError(f"'{trips_path}' not found. This file is necessary to link services to routes.")
    trips_df = pd.read_csv(trips_path, dtype={'route_id': str, 'service_id': str})
    print(f"Loaded trips.txt with {len(trips_df)} entries.")

    # Keep only the trips that run under one of the active services
    active_trips_df = trips_df[trips_df['service_id'].isin(active_service_ids)]
    active_route_ids_on_date = active_trips_df['route_id'].unique()
    print(f"Found {len(active_route_ids_on_date)} unique route IDs active on {target_date_label}.")

    # --- 5. Identify routes active on the target date ---
    # One vectorized membership mask is shared by the active and inactive splits
    is_active_route = routes_df['route_id'].isin(active_route_ids_on_date)
    active_routes_on_date_df = routes_df[is_active_route]
    num_active_routes = len(active_routes_on_date_df)

    print(f"\n--- Routes Active on {target_date_label} ---")
    if num_active_routes > 0:
        print(f"Number of routes active: {num_active_routes}")
    else:
        print("No routes found to be active on this date based on the provided GTFS data.")

    # --- 6. Identify routes NOT active on the target date ---
    all_route_ids = routes_df['route_id'].unique()
    inactive_routes_on_date_df = routes_df[~is_active_route]
    inactive_route_ids_on_date = inactive_routes_on_date_df['route_id'].unique().tolist()
    num_inactive_routes = len(inactive_routes_on_date_df)

    print(f"\n--- Routes NOT Active on {target_date_label} ---")
    if num_inactive_routes > 0:
        print(f"Number of routes not active: {num_inactive_routes}")
    else:
        print("All routes found to be active on this date based on the provided GTFS data.")

except FileNotFoundError as e:
    print(f"Error: {e}")
    # Name the directory that was actually searched
    print(f"Please ensure the '{original_gtfs_dir}' directory exists and contains 'calendar.txt', 'routes.txt', and 'trips.txt'.")
except Exception as e:
    # Include the exception type: e.g. a bare KeyError message is only the key
    print(f"An unexpected error occurred: {type(e).__name__}: {e}")


Analyzing GTFS data for the target date: 2018-10-02

Loaded calendar.txt with 119 entries.
Loaded routes.txt with 207 entries.

Found 20 active service IDs on 2018-10-02.
Loaded trips.txt with 70793 entries.
Found 190 unique route IDs active on 2018-10-02.

--- Routes Active on 2018-10-02 ---
Number of routes active: 190

--- Routes NOT Active on 2018-10-02 ---
Number of routes not active: 17


### GTFS Verification - October 10th, 2018

In [23]:
import pandas as pd
import os

# Count the routes that are active / inactive on one service date in the
# ORIGINAL 2018 feed, by joining calendar.txt -> trips.txt -> routes.txt.

# Service date to verify, in the YYYYMMDD form used by GTFS calendar.txt
target_date_str = '20181010'
target_date_datetime = pd.to_datetime(target_date_str)
# Human-readable form of the date, reused in every status message below
target_date_label = target_date_datetime.strftime('%Y-%m-%d')

print(f"Analyzing GTFS data for the target date: {target_date_label}\n")

try:
    # --- 1. Load calendar.txt ---
    calendar_path = os.path.join(original_gtfs_dir, 'calendar.txt')
    if not os.path.exists(calendar_path):
        raise FileNotFoundError(f"'{calendar_path}' not found. Please ensure the GTFS files are in the correct directory.")
    # ID columns are read as strings so numeric-looking IDs are not parsed as
    # integers and keep matching the same IDs in the other GTFS files.
    calendar_df = pd.read_csv(calendar_path, dtype={'service_id': str})
    print(f"Loaded calendar.txt with {len(calendar_df)} entries.")

    # --- 2. Load routes.txt ---
    routes_path = os.path.join(original_gtfs_dir, 'routes.txt')
    if not os.path.exists(routes_path):
        raise FileNotFoundError(f"'{routes_path}' not found. Please ensure the GTFS files are in the correct directory.")
    routes_df = pd.read_csv(routes_path, dtype={'route_id': str})
    print(f"Loaded routes.txt with {len(routes_df)} entries.")

    # --- 3. Determine active service IDs for the target date ---
    # calendar.txt has one 0/1 column per weekday ('monday' ... 'sunday').
    # day_name() yields the English name by default, independent of the
    # process locale, so it always matches those column names.
    day_of_week = target_date_datetime.day_name().lower()

    # A service is active when it runs on that weekday and the target date lies
    # inside [start_date, end_date]. Dates are compared as YYYYMMDD strings,
    # for which lexicographic order equals chronological order.
    # NOTE(review): calendar_dates.txt (added/removed service exceptions) is not
    # consulted here - confirm this is acceptable for the chosen date.
    active_services_on_day = calendar_df[
        (calendar_df[day_of_week] == 1) &
        (calendar_df['start_date'].astype(str) <= target_date_str) &
        (calendar_df['end_date'].astype(str) >= target_date_str)
    ]

    active_service_ids = active_services_on_day['service_id'].unique()
    print(f"\nFound {len(active_service_ids)} active service IDs on {target_date_label}.")

    # --- 4. Load trips.txt to link service_id to route_id ---
    trips_path = os.path.join(original_gtfs_dir, 'trips.txt')
    if not os.path.exists(trips_path):
        raise FileNotFoundError(f"'{trips_path}' not found. This file is necessary to link services to routes.")
    trips_df = pd.read_csv(trips_path, dtype={'route_id': str, 'service_id': str})
    print(f"Loaded trips.txt with {len(trips_df)} entries.")

    # Keep only the trips that run under one of the active services
    active_trips_df = trips_df[trips_df['service_id'].isin(active_service_ids)]
    active_route_ids_on_date = active_trips_df['route_id'].unique()
    print(f"Found {len(active_route_ids_on_date)} unique route IDs active on {target_date_label}.")

    # --- 5. Identify routes active on the target date ---
    # One vectorized membership mask is shared by the active and inactive splits
    is_active_route = routes_df['route_id'].isin(active_route_ids_on_date)
    active_routes_on_date_df = routes_df[is_active_route]
    num_active_routes = len(active_routes_on_date_df)

    print(f"\n--- Routes Active on {target_date_label} ---")
    if num_active_routes > 0:
        print(f"Number of routes active: {num_active_routes}")
    else:
        print("No routes found to be active on this date based on the provided GTFS data.")

    # --- 6. Identify routes NOT active on the target date ---
    all_route_ids = routes_df['route_id'].unique()
    inactive_routes_on_date_df = routes_df[~is_active_route]
    inactive_route_ids_on_date = inactive_routes_on_date_df['route_id'].unique().tolist()
    num_inactive_routes = len(inactive_routes_on_date_df)

    print(f"\n--- Routes NOT Active on {target_date_label} ---")
    if num_inactive_routes > 0:
        print(f"Number of routes not active: {num_inactive_routes}")
    else:
        print("All routes found to be active on this date based on the provided GTFS data.")

except FileNotFoundError as e:
    print(f"Error: {e}")
    # Name the directory that was actually searched
    print(f"Please ensure the '{original_gtfs_dir}' directory exists and contains 'calendar.txt', 'routes.txt', and 'trips.txt'.")
except Exception as e:
    # Include the exception type: e.g. a bare KeyError message is only the key
    print(f"An unexpected error occurred: {type(e).__name__}: {e}")


Analyzing GTFS data for the target date: 2018-10-10

Loaded calendar.txt with 119 entries.
Loaded routes.txt with 207 entries.

Found 32 active service IDs on 2018-10-10.
Loaded trips.txt with 70793 entries.
Found 191 unique route IDs active on 2018-10-10.

--- Routes Active on 2018-10-10 ---
Number of routes active: 191

--- Routes NOT Active on 2018-10-10 ---
Number of routes not active: 16


### Nearly identical route counts for the two October 2018 dates (190 vs. 191 active routes) indicate that any date in between the two should work when converting to 2024

### Verifying October 2nd of 2024 currently has no active routes with 2018 GTFS Data

In [24]:
import pandas as pd
import os

# Count the routes that are active / inactive on one service date in the
# ORIGINAL 2018 feed, by joining calendar.txt -> trips.txt -> routes.txt.
# This cell checks a 2024 date against the unmodified 2018 files.

# Service date to verify, in the YYYYMMDD form used by GTFS calendar.txt
target_date_str = '20241002'
target_date_datetime = pd.to_datetime(target_date_str)
# Human-readable form of the date, reused in every status message below
target_date_label = target_date_datetime.strftime('%Y-%m-%d')

print(f"Analyzing GTFS data for the target date: {target_date_label}\n")

try:
    # --- 1. Load calendar.txt ---
    calendar_path = os.path.join(original_gtfs_dir, 'calendar.txt')
    if not os.path.exists(calendar_path):
        raise FileNotFoundError(f"'{calendar_path}' not found. Please ensure the GTFS files are in the correct directory.")
    # ID columns are read as strings so numeric-looking IDs are not parsed as
    # integers and keep matching the same IDs in the other GTFS files.
    calendar_df = pd.read_csv(calendar_path, dtype={'service_id': str})
    print(f"Loaded calendar.txt with {len(calendar_df)} entries.")

    # --- 2. Load routes.txt ---
    routes_path = os.path.join(original_gtfs_dir, 'routes.txt')
    if not os.path.exists(routes_path):
        raise FileNotFoundError(f"'{routes_path}' not found. Please ensure the GTFS files are in the correct directory.")
    routes_df = pd.read_csv(routes_path, dtype={'route_id': str})
    print(f"Loaded routes.txt with {len(routes_df)} entries.")

    # --- 3. Determine active service IDs for the target date ---
    # calendar.txt has one 0/1 column per weekday ('monday' ... 'sunday').
    # day_name() yields the English name by default, independent of the
    # process locale, so it always matches those column names.
    day_of_week = target_date_datetime.day_name().lower()

    # A service is active when it runs on that weekday and the target date lies
    # inside [start_date, end_date]. Dates are compared as YYYYMMDD strings,
    # for which lexicographic order equals chronological order.
    # NOTE(review): calendar_dates.txt (added/removed service exceptions) is not
    # consulted here - confirm this is acceptable for the chosen date.
    active_services_on_day = calendar_df[
        (calendar_df[day_of_week] == 1) &
        (calendar_df['start_date'].astype(str) <= target_date_str) &
        (calendar_df['end_date'].astype(str) >= target_date_str)
    ]

    active_service_ids = active_services_on_day['service_id'].unique()
    print(f"\nFound {len(active_service_ids)} active service IDs on {target_date_label}.")

    # --- 4. Load trips.txt to link service_id to route_id ---
    trips_path = os.path.join(original_gtfs_dir, 'trips.txt')
    if not os.path.exists(trips_path):
        raise FileNotFoundError(f"'{trips_path}' not found. This file is necessary to link services to routes.")
    trips_df = pd.read_csv(trips_path, dtype={'route_id': str, 'service_id': str})
    print(f"Loaded trips.txt with {len(trips_df)} entries.")

    # Keep only the trips that run under one of the active services
    active_trips_df = trips_df[trips_df['service_id'].isin(active_service_ids)]
    active_route_ids_on_date = active_trips_df['route_id'].unique()
    print(f"Found {len(active_route_ids_on_date)} unique route IDs active on {target_date_label}.")

    # --- 5. Identify routes active on the target date ---
    # One vectorized membership mask is shared by the active and inactive splits
    is_active_route = routes_df['route_id'].isin(active_route_ids_on_date)
    active_routes_on_date_df = routes_df[is_active_route]
    num_active_routes = len(active_routes_on_date_df)

    print(f"\n--- Routes Active on {target_date_label} ---")
    if num_active_routes > 0:
        print(f"Number of routes active: {num_active_routes}")
    else:
        print("No routes found to be active on this date based on the provided GTFS data.")

    # --- 6. Identify routes NOT active on the target date ---
    all_route_ids = routes_df['route_id'].unique()
    inactive_routes_on_date_df = routes_df[~is_active_route]
    inactive_route_ids_on_date = inactive_routes_on_date_df['route_id'].unique().tolist()
    num_inactive_routes = len(inactive_routes_on_date_df)

    print(f"\n--- Routes NOT Active on {target_date_label} ---")
    if num_inactive_routes > 0:
        print(f"Number of routes not active: {num_inactive_routes}")
    else:
        print("All routes found to be active on this date based on the provided GTFS data.")

except FileNotFoundError as e:
    print(f"Error: {e}")
    # Name the directory that was actually searched
    print(f"Please ensure the '{original_gtfs_dir}' directory exists and contains 'calendar.txt', 'routes.txt', and 'trips.txt'.")
except Exception as e:
    # Include the exception type: e.g. a bare KeyError message is only the key
    print(f"An unexpected error occurred: {type(e).__name__}: {e}")


Analyzing GTFS data for the target date: 2024-10-02

Loaded calendar.txt with 119 entries.
Loaded routes.txt with 207 entries.

Found 0 active service IDs on 2024-10-02.
Loaded trips.txt with 70793 entries.
Found 0 unique route IDs active on 2024-10-02.

--- Routes Active on 2024-10-02 ---
No routes found to be active on this date based on the provided GTFS data.

--- Routes NOT Active on 2024-10-02 ---
Number of routes not active: 207


# **Run [2--gtfs_file_data_updater.ipynb]**

## Verify October 2nd, 2024 date to be working with updated GTFS Data

In [25]:
import pandas as pd
import os

# Count the routes that are active / inactive on one service date in the
# UPDATED feed (new_gtfs_files_dir), by joining calendar.txt -> trips.txt ->
# routes.txt.

# Service date to verify, in the YYYYMMDD form used by GTFS calendar.txt
target_date_str = '20241002'
target_date_datetime = pd.to_datetime(target_date_str)
# Human-readable form of the date, reused in every status message below
target_date_label = target_date_datetime.strftime('%Y-%m-%d')

print(f"Analyzing GTFS data for the target date: {target_date_label}\n")

try:
    # --- 1. Load calendar.txt ---
    calendar_path = os.path.join(new_gtfs_files_dir, 'calendar.txt')
    if not os.path.exists(calendar_path):
        raise FileNotFoundError(f"'{calendar_path}' not found. Please ensure the GTFS files are in the correct directory.")
    # ID columns are read as strings so numeric-looking IDs are not parsed as
    # integers and keep matching the same IDs in the other GTFS files.
    calendar_df = pd.read_csv(calendar_path, dtype={'service_id': str})
    print(f"Loaded calendar.txt with {len(calendar_df)} entries.")

    # --- 2. Load routes.txt ---
    routes_path = os.path.join(new_gtfs_files_dir, 'routes.txt')
    if not os.path.exists(routes_path):
        raise FileNotFoundError(f"'{routes_path}' not found. Please ensure the GTFS files are in the correct directory.")
    routes_df = pd.read_csv(routes_path, dtype={'route_id': str})
    print(f"Loaded routes.txt with {len(routes_df)} entries.")

    # --- 3. Determine active service IDs for the target date ---
    # calendar.txt has one 0/1 column per weekday ('monday' ... 'sunday').
    # day_name() yields the English name by default, independent of the
    # process locale, so it always matches those column names.
    day_of_week = target_date_datetime.day_name().lower()

    # A service is active when it runs on that weekday and the target date lies
    # inside [start_date, end_date]. Dates are compared as YYYYMMDD strings,
    # for which lexicographic order equals chronological order.
    # NOTE(review): calendar_dates.txt (added/removed service exceptions) is not
    # consulted here - confirm this is acceptable for the chosen date.
    active_services_on_day = calendar_df[
        (calendar_df[day_of_week] == 1) &
        (calendar_df['start_date'].astype(str) <= target_date_str) &
        (calendar_df['end_date'].astype(str) >= target_date_str)
    ]

    active_service_ids = active_services_on_day['service_id'].unique()
    print(f"\nFound {len(active_service_ids)} active service IDs on {target_date_label}.")

    # --- 4. Load trips.txt to link service_id to route_id ---
    trips_path = os.path.join(new_gtfs_files_dir, 'trips.txt')
    if not os.path.exists(trips_path):
        raise FileNotFoundError(f"'{trips_path}' not found. This file is necessary to link services to routes.")
    trips_df = pd.read_csv(trips_path, dtype={'route_id': str, 'service_id': str})
    print(f"Loaded trips.txt with {len(trips_df)} entries.")

    # Keep only the trips that run under one of the active services
    active_trips_df = trips_df[trips_df['service_id'].isin(active_service_ids)]
    active_route_ids_on_date = active_trips_df['route_id'].unique()
    print(f"Found {len(active_route_ids_on_date)} unique route IDs active on {target_date_label}.")

    # --- 5. Identify routes active on the target date ---
    # One vectorized membership mask is shared by the active and inactive splits
    is_active_route = routes_df['route_id'].isin(active_route_ids_on_date)
    active_routes_on_date_df = routes_df[is_active_route]
    num_active_routes = len(active_routes_on_date_df)

    print(f"\n--- Routes Active on {target_date_label} ---")
    if num_active_routes > 0:
        print(f"Number of routes active: {num_active_routes}")
    else:
        print("No routes found to be active on this date based on the provided GTFS data.")

    # --- 6. Identify routes NOT active on the target date ---
    all_route_ids = routes_df['route_id'].unique()
    inactive_routes_on_date_df = routes_df[~is_active_route]
    inactive_route_ids_on_date = inactive_routes_on_date_df['route_id'].unique().tolist()
    num_inactive_routes = len(inactive_routes_on_date_df)

    print(f"\n--- Routes NOT Active on {target_date_label} ---")
    if num_inactive_routes > 0:
        print(f"Number of routes not active: {num_inactive_routes}")
    else:
        print("All routes found to be active on this date based on the provided GTFS data.")

except FileNotFoundError as e:
    print(f"Error: {e}")
    # Name the directory that was actually searched
    print(f"Please ensure the '{new_gtfs_files_dir}' directory exists and contains 'calendar.txt', 'routes.txt', and 'trips.txt'.")
except Exception as e:
    # Include the exception type: e.g. a bare KeyError message is only the key
    print(f"An unexpected error occurred: {type(e).__name__}: {e}")


Analyzing GTFS data for the target date: 2024-10-02

Loaded calendar.txt with 119 entries.
Loaded routes.txt with 207 entries.

Found 20 active service IDs on 2024-10-02.
Loaded trips.txt with 70793 entries.
Found 190 unique route IDs active on 2024-10-02.

--- Routes Active on 2024-10-02 ---
Number of routes active: 190

--- Routes NOT Active on 2024-10-02 ---
Number of routes not active: 17


### Verify that all stops with missing latitude/longitude have been removed

In [26]:
import pandas as pd
import os

# Read-only check: report how many rows of the updated stops.txt still lack a
# latitude or longitude. Nothing is written back to disk by this cell.
print(f"Verifying that stops.txt in '{new_gtfs_files_dir}' has no stops with missing latitude/longitude.\n")

try:
    # Define the path to stops.txt
    stops_path = os.path.join(new_gtfs_files_dir, 'stops.txt')

    if not os.path.exists(stops_path):
        raise FileNotFoundError(f"'{stops_path}' not found. Please ensure the GTFS files are in the correct directory.")

    # Load stops.txt
    stops_df = pd.read_csv(stops_path)
    print(f"Loaded stops.txt with {len(stops_df)} entries.")

    # A stop counts as incomplete when EITHER coordinate is NaN (missing)
    missing_coordinate_mask = stops_df['stop_lat'].isna() | stops_df['stop_lon'].isna()
    stops_with_missing_latlong = stops_df[missing_coordinate_mask]
    num_missing = len(stops_with_missing_latlong)

    if num_missing > 0:
        print(f"\nFound {num_missing} stops with missing latitude or longitude; stops.txt still needs cleaning.")
    else:
        print("\nNo stops with missing latitude or longitude were found.")

except FileNotFoundError as e:
    print(f"Error: {e}")
    # Name the directory that was actually searched
    print(f"Please ensure the '{new_gtfs_files_dir}' directory exists and contains 'stops.txt'.")
except Exception as e:
    # Include the exception type: e.g. a bare KeyError message is only the key
    print(f"An unexpected error occurred: {type(e).__name__}: {e}")



Attempting to remove stops with missing latitude/longitude from stops.txt in '../4--Updated_GTFS_Files'.

Loaded stops.txt with 8649 entries.

No stops with missing latitude or longitude were found. No changes made to stops.txt.
