In [4]:
# !pip install pyrosm
# !pip install folium
# !pip install geojson
# !pip install r5py
import datetime
import geopandas as gpd
import pandas as pd
import datetime
import time 
import r5py
import os

### 1. Building transportation network using OSM and GTFS data

In [3]:

# Ensure the file paths are correct
OSM_path = "/Users/max/Desktop/Transit_Dashboard/data/OSM_data/Toronto.osm.pbf" #"../../data/OSM_data/Toronto.osm.pbf"
GTFS_path = "/Users/max/Desktop/Transit_Dashboard/data/GTFS_data/raw/latest_feed_version_2024-10-22.zip"  #"../../data/GTFS_data/raw/latest_feed_version_2024-10-22.zip"


# Make the R5 transport network using OSM and GTFS 
network_start = time.time()
transport_network = r5py.TransportNetwork(OSM_path, [GTFS_path])
network_total_time = time.time()-network_start
print("Transportion network building time", network_total_time, "seconds." )

transport_modes = [
    r5py.TransportMode.TRANSIT,
    r5py.TransportMode.WALK,
]



Transportion network building time 58.49365305900574 seconds.


### 2. Computing travel time matrix (Between origins and destinations)

In [4]:
import geopandas as gpd
import time
import r5py
import datetime

def compute_travel_time_matrix(origins_file_path, destinations_file_path, transport_network, output_path, origin_id_col='', destination_id_col='', origin_is_point=True, destination_is_point=True):
    '''
    This function computes the travel time matrix between origins and destinations using a transport network.
    
    Parameters:
    origins_file (str): Path to the GeoJSON file containing the origin points or polygons.
    destinations_file (str): Path to the GeoJSON file containing the destination points or polygons.
    transport_network (TransportNetwork): The transport network object (R5 or similar).
    output_file (str): Path to save the resulting travel time matrix CSV file.
    origin_id_col (str, optional): The name of the column in the origins file that serves as the ID.
    destination_id_col (str, optional): The name of the column in the destinations file that serves as the ID.
    origin_is_point (bool, optional): Whether the origins are points. If False, the centroid will be used. Defaults to True.
    destination_is_point (bool, optional): Whether the destinations are points. If False, the centroid will be used. Defaults to True.
    
    Returns:
    travel_time_matrix (DataFrame): Matrix with columns: origin_id, destination_id, and travel time in minutes.
    '''
    
    # Load the GeoJSON files
    raw_origins = gpd.read_file(origins_file_path)
    raw_destinations = gpd.read_file(destinations_file_path)
    
    # Process origins based on the origin_is_point flag
    if not origin_is_point:
        origins = raw_origins.copy()
        origins["geometry"] = origins.geometry.centroid
    else:
        origins = raw_origins
    
    # Process destinations based on the destination_is_point flag
    if not destination_is_point:
        destinations = raw_destinations.copy()
        destinations["geometry"] = destinations.geometry.centroid
    else:
        destinations = raw_destinations
    
    # Check if the origin ID column exists, if not, raise an error or rename it
    if "id" not in origins.columns:
        if origin_id_col not in origins.columns:
            raise ValueError(f"Origin ID column '{origin_id_col}' not found in the origins file.")
        origins['id'] = origins[origin_id_col]
    
    # Check if the destination ID column exists, if not, raise an error or rename it
    if "id" not in destinations.columns:
        if destination_id_col not in destinations.columns:
            raise ValueError(f"Destination ID column '{destination_id_col}' not found in the destinations file.")
        destinations['id'] = destinations[destination_id_col]
    
    # Start timer
    start = time.time()
    
    # Compute travel time matrix
    travel_time_matrix = r5py.TravelTimeMatrixComputer(
        transport_network,
        origins=origins,
        destinations=destinations,
        transport_modes=[r5py.TransportMode.TRANSIT],
        departure=datetime.datetime(2024, 10, 21, 7, 0, 0),
        departure_time_window=datetime.timedelta(hours=2),
    ).compute_travel_times()
    
    # End timer
    end = time.time()
    running_time = end - start
    print(f"Running time: {running_time} seconds")
    
    # Save the results to a CSV file
    travel_time_matrix.to_csv(output_path, index=False)
    
    return travel_time_matrix


2.1 Sample usages of compute_travel_time_matrix

In [10]:
# Path to shape file
SHAPE_PATH = "../../../data/boundaries_visualization/lct_000b21a_e/lct_000b21a_e.shp"
gdf = gpd.read_file(SHAPE_PATH)

toronto_gdf = gdf[gdf['CTUID'].str.startswith('535000')]
central_toronto_gdf = toronto_gdf[toronto_gdf['CTUID'].between('5350001', '5350379')]
# central_toronto_gdf['CTUID'] = central_toronto_gdf['CTUID'].astype(str)
# central_toronto_gdf = central_toronto_gdf.to_crs('EPSG:4326')
OUTPUT_PATH = "../../../data/census_tract_data/toronto_ct_boundaries.geojson"
central_toronto_gdf.to_file(OUTPUT_PATH, driver='GeoJSON')

In [11]:
#  census tract to healthcare facilities 
compute_travel_time_matrix(
    origins_file_path="../../../data/census_tract_data/toronto_ct_boundaries.geojson",
    destinations_file_path="../../../data/key_destination_data/Healthcare.geojson",
    transport_network=transport_network,
    output_path="../results/TTM_CT_hospitals.csv",
    origin_id_col='CTUID',  # Specify the correct column name for origin ID if it's different
    destination_id_col='ADDRESS_POINT_ID',  # Specify the correct column name for destination ID if it's different
    origin_is_point= False,
    destination_is_point=True,
)

Running time: 3.182096004486084 seconds


Unnamed: 0,from_id,to_id,travel_time
0,5350001.00,7792696,78.0
1,5350001.00,9638452,119.0
2,5350001.00,54248,
3,5350001.00,6362828,
4,5350001.00,9035631,
...,...,...,...
1095,5350007.02,14120064,
1096,5350007.02,3256200,
1097,5350007.02,9942216,
1098,5350007.02,3232727,


In [None]:
# Census tract to census tract
compute_travel_time_matrix(
    origins_file_path="../../../data/census_tract_data/boundaries_centroid_combined_data.geojson",
    destinations_file_path="../../../data/census_tract_data/boundaries_centroid_combined_data.geojson",
    transport_network=transport_network,
    output_path="../results/TTM_CT_CT.csv",
    origin_id_col='CTUID',  # Specify the correct column name for origin ID if it's different
    destination_id_col='CTUID',
    origin_is_point= False,
    destination_is_point=False, 
)

Running time: 159.29474210739136 seconds


Unnamed: 0,from_id,to_id,travel_time
0,5350128.04,5350128.04,0.0
1,5350128.04,5350363.06,80.0
2,5350128.04,5350363.07,82.0
3,5350128.04,5350378.23,84.0
4,5350128.04,5350378.24,85.0
...,...,...,...
339884,5350017.02,5350210.04,80.0
339885,5350017.02,5350062.03,37.0
339886,5350017.02,5350062.04,32.0
339887,5350017.02,5350017.01,20.0


### 3. Filtering destinations from Travel time matrix based on travel time

In [9]:
import pandas as pd
import os

def get_destinations(raw_travel_time_matrix_path, output_path, top_n=None, threshold=None):
    
    """
    Processes a travel time matrix to find either the top N closest destinations for each origin
    or all destinations within a specified travel time threshold.

    Parameters:
    raw_travel_time_matrix_path (str): Path to the input CSV file containing the travel time matrix.
    output_path (str): Path to save the output CSV file with the filtered destinations.
    top_n (int, optional): The number of closest destinations to select for each origin.
    threshold (float or list of floats, optional): The maximum travel time to include destinations. If set, overrides top_n.

    Returns:
    None
    """
    # Load the travel time matrix CSV file
    travel_time_matrix = pd.read_csv(raw_travel_time_matrix_path)
    
    # Filter out rows where from_id == to_id (self-loops)
    data = travel_time_matrix[travel_time_matrix['from_id'] != travel_time_matrix['to_id']]
    
    if isinstance(threshold, list):
        for t in threshold:
            # Filter destinations within the specified travel time threshold
            filtered_destinations = data[data['travel_time'] <= t]
            # Sort the result by 'from_id' and 'travel_time' for clarity
            filtered_destinations = filtered_destinations[['from_id', 'to_id', 'travel_time']].sort_values(['from_id', 'travel_time'])
            # Generate a specific output path for each threshold
            specific_output_path = output_path.replace('.csv', f'_threshold_{t}.csv')
            # Save the result to a CSV file without including the index
            filtered_destinations.to_csv(specific_output_path, index=False)
            print(f"Generated {specific_output_path}")
            display(filtered_destinations.head())
    elif threshold is not None:
        # Filter destinations within the specified travel time threshold
        filtered_destinations = data[data['travel_time'] <= threshold]
        # Sort the result by 'from_id' and 'travel_time' for clarity
        filtered_destinations = filtered_destinations[['from_id', 'to_id', 'travel_time']].sort_values(['from_id', 'travel_time'])
        # Save the result to a CSV file without including the index
        filtered_destinations.to_csv(output_path, index=False)
        print(f"Generated {output_path}")
        display(filtered_destinations.head())
    elif top_n is not None:
        # Sort by travel time, group by 'from_id', and take the top N by travel time
        filtered_destinations = (
            data.sort_values('travel_time')
            .groupby('from_id')
            .head(top_n)
        )
        # Sort the result by 'from_id' and 'travel_time' for clarity
        filtered_destinations = filtered_destinations[['from_id', 'to_id', 'travel_time']].sort_values(['from_id', 'travel_time'])
        # Save the result to a CSV file without including the index
        # Generate a specific output path for each threshold
        specific_output_path = output_path.replace('.csv', f'_Top_{top_n}.csv')
        filtered_destinations.to_csv(specific_output_path, index=False)
        print(f"Generated {specific_output_path}")
        display(filtered_destinations.head())
    else:
        raise ValueError("Either top_n or threshold must be specified.")
    


3.1 Get destinations within travel time threshold

In [28]:
# Get the hospitals within 10, 20, 30 minutes of travel time
raw_travel_time_matrix_path = "../results/TTM_CT_hospitals.csv"
output_path = "../results/TTM_CT_hospitals.csv"
thresholds = [10, 20, 30]

get_destinations(raw_travel_time_matrix_path, output_path, threshold=thresholds)

Generated ../results/TTM_CT_hospitals_threshold_10.csv


Unnamed: 0,from_id,to_id,travel_time
1228,5350006.0,8168605,1.0
42928,5350007.01,8168605,4.0
1324,5350009.0,20232082,3.0
524,5350010.01,20232082,8.0
1520,5350015.0,773891,10.0


Generated ../results/TTM_CT_hospitals_threshold_20.csv


Unnamed: 0,from_id,to_id,travel_time
918,5350003.0,10757635,17.0
1028,5350004.0,8168605,16.0
1018,5350004.0,10757635,17.0
1128,5350005.0,8168605,13.0
1124,5350005.0,20232082,18.0


Generated ../results/TTM_CT_hospitals_threshold_30.csv


Unnamed: 0,from_id,to_id,travel_time
918,5350003.0,10757635,17.0
928,5350003.0,8168605,30.0
1028,5350004.0,8168605,16.0
1018,5350004.0,10757635,17.0
1024,5350004.0,20232082,22.0


In [19]:
# Get the other census tract within 10, 20, 30 minutes of travel time
raw_travel_time_matrix_path = "../results/TTM_CT_CT.csv"
output_path = "../results/TTM_CT_CT_Jobs.csv"
thresholds = [10, 20, 30]

get_destinations(raw_travel_time_matrix_path, output_path, threshold=thresholds)

Generated ../results/TTM_CT_CT_Jobs_threshold_10.csv


Unnamed: 0,from_id,to_id,travel_time
7425,5350006.0,5350007.01,2.0
250119,5350007.01,5350006.0,2.0
7619,5350009.0,5350043.0,9.0
7584,5350009.0,5350010.01,10.0
2928,5350010.01,5350009.0,10.0


Generated ../results/TTM_CT_CT_Jobs_threshold_20.csv


Unnamed: 0,from_id,to_id,travel_time
5291,5350003.0,5350049.0,19.0
5842,5350004.0,5350006.0,14.0
6247,5350004.0,5350047.04,14.0
5873,5350004.0,5350048.0,15.0
6259,5350004.0,5350007.01,16.0


Generated ../results/TTM_CT_CT_Jobs_threshold_30.csv


Unnamed: 0,from_id,to_id,travel_time
5291,5350003.0,5350049.0,19.0
5290,5350003.0,5350048.0,22.0
5257,5350003.0,5350004.0,24.0
5258,5350003.0,5350005.0,28.0
5259,5350003.0,5350006.0,28.0


3.2 Get closest 3 destinations for each origin

In [11]:
# Get the closest 3 hospitals of each census tract
raw_travel_time_matrix_path = "../results/TTM_CT_hospitals.csv"
output_path = "../results/TTM_CT_hospitals.csv"
top_n = 3

get_destinations(raw_travel_time_matrix_path, output_path, top_n=top_n)

Generated ../results/TTM_CT_hospitals_Top_3.csv


Unnamed: 0,from_id,to_id,travel_time
1,5350001.0,6388009,73.0
0,5350001.0,773891,73.0
2,5350001.0,7792695,75.0
3,5350002.0,7792696,
4,5350002.0,9638452,


### 4. Generating Fake Job data for each census tract

In [34]:
import geopandas as gpd
import pandas as pd
import numpy as np

# Read the GeoJSON file to get the CTUID values
geojson_path = "../../../data/census_tract_data/toronto_ct_centroids1.geojson"
gdf = gpd.read_file(geojson_path)

# Extract CTUID values
ctuid_values = gdf['CTUID']

# Generate random numbers for each column
np.random.seed(42)  # For reproducibility
data = {
    'CTUID': ctuid_values,
    'Manufacturing_and_Warehousing': np.random.randint(0, 1000, size=len(ctuid_values)),
    'Retail': np.random.randint(0, 1000, size=len(ctuid_values)),
    'Service': np.random.randint(0, 1000, size=len(ctuid_values)),
    'Office': np.random.randint(0, 1000, size=len(ctuid_values)),
    'Institutional': np.random.randint(0, 1000, size=len(ctuid_values)),
    'Community_and_Entertainment': np.random.randint(0, 1000, size=len(ctuid_values))
}

# Create a DataFrame
df = pd.DataFrame(data)

# Calculate the total_jobs column as the sum of the other columns
df['total_jobs'] = (
    df['Manufacturing_and_Warehousing'] +
    df['Retail'] +
    df['Service'] +
    df['Office'] +
    df['Institutional'] +
    df['Community_and_Entertainment']
)

# Save the DataFrame to a CSV file
output_path = "../results/random_jobs_data.csv"
df.to_csv(output_path, index=False)

# Print the DataFrame without the index
print(df.to_string(index=False))

     CTUID  Manufacturing_and_Warehousing  Retail  Service  Office  Institutional  Community_and_Entertainment  total_jobs
5350128.04                            102     991       50     977            284                          776        3180
5350363.06                            435     883      402       6            715                            2        2443
5350363.07                            860     349      788     329            675                           84        3085
5350378.23                            270      46        4     774            762                          793        2649
5350378.24                            106     866      465     672            226                          327        2662
5350010.01                             71     822      603     150            947                          815        3408
5350010.02                            700     935      681      84            380                          449        3229
5350001.00      