In [42]:
import geopandas as gpd
import cudf
import cuspatial
from shapely.geometry import Point
import pandas as pd
import os
from tqdm import tqdm
import cugraph
import pickle
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv


In [43]:
# Define file paths
# Every input and output lives under one data directory. Set the GNN_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original location is used, so all paths below keep their old values.
DATA_DIR = os.environ.get("GNN_DATA_DIR", "/home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data")

# Neighborhood polygons (read in Stage 1 as the "neighborhoods" layer)
LANDUSE_NDVI_PATH = os.path.join(DATA_DIR, "neighborhoods_with_ndvi_numerical.geojson")
# Building footprints
OSM_BUILDINGS_PATH = os.path.join(DATA_DIR, "Taipei_Buildings_fulldata.geojson")
# Road segments
OSM_ROADS_PATH = os.path.join(DATA_DIR, "taipei_segments_cleaned_verified.geoparquet")
# Land layer; Stage 1 keeps only rows whose subtype is 'tree'
OSM_TREES_PATH = os.path.join(DATA_DIR, "taipei_land.geoparquet")
# Infrastructure layer; Stage 1 keeps only stop_position / bus_stop points
OSM_TRANSIT_PATH = os.path.join(DATA_DIR, "taipei_infrastructure.geoparquet")
# Urban masterplan polygons
URBAN_MASTERPLAN_PATH = os.path.join(DATA_DIR, "Taipei_urban_masterplan.geojson")
# Output folders: per-neighborhood subgraph pickles and model checkpoints
SUBGRAPH_DIR = os.path.join(DATA_DIR, "subgraphs")
CHECKPOINT_DIR = os.path.join(DATA_DIR, "checkpoints")

# Create the output folders up front (no error if they are already there)
for _out_dir in (SUBGRAPH_DIR, CHECKPOINT_DIR):
    os.makedirs(_out_dir, exist_ok=True)

def validate_geometries(gdf, name):
    """Drop rows with an invalid geometry and renumber the index from zero.

    Args:
        gdf: Frame whose ``geometry.is_valid`` mask selects the rows to keep.
        name: Label used in the two progress messages.

    Returns:
        The filtered frame with a fresh index (old labels are discarded).
    """
    print(f"Validating geometries for {name}...")
    valid_mask = gdf.geometry.is_valid
    cleaned = gdf[valid_mask].reset_index(drop=True)
    print(f"{name} after validation: {len(cleaned)} rows")
    return cleaned


Stage 1: Data Loading and Preprocessing
This stage loads geospatial data, validates geometries, and prepares it for graph construction. It uses GPU-accelerated libraries (cudf, cuspatial) alongside geopandas for efficiency.

In [44]:
def _align_points_to(points_gdf, target_crs, name):
    """Return points_gdf in target_crs, reprojecting only when both CRSs are known and differ."""
    if points_gdf.crs is None or target_crs is None or points_gdf.crs == target_crs:
        return points_gdf
    print(f"Reprojecting {name} to the neighborhoods CRS for point-in-polygon counting...")
    return points_gdf.to_crs(target_crs)


def load_and_prepare_data():
    """Load every input layer, drop invalid geometries, and count trees/transit stops per neighborhood.

    Reads the six files named by the module-level ``*_PATH`` constants, passes
    each through ``validate_geometries``, adds a ``length_m`` column to the
    roads, and adds ``tree_count`` / ``transit_count`` columns to the
    neighborhoods using cuspatial point-in-polygon tests.

    Returns:
        dict with the keys 'neighborhoods', 'buildings', 'roads', 'trees',
        'transit' and 'urban_masterplan', each mapping to a GeoDataFrame.
        Roads are returned in EPSG:3826 when their source CRS is geographic;
        the other layers keep the CRS they were read with.
    """
    print("Stage 1: Loading and preparing data...")
    with tqdm(total=6, desc="Loading files") as pbar:
        neighborhoods_gdf = gpd.read_file(LANDUSE_NDVI_PATH, encoding='utf-8-sig')
        neighborhoods_gdf = validate_geometries(neighborhoods_gdf, "neighborhoods")
        pbar.update(1)

        buildings_gdf = gpd.read_file(OSM_BUILDINGS_PATH, encoding='utf-8-sig')
        buildings_gdf = validate_geometries(buildings_gdf, "buildings")
        pbar.update(1)

        roads_gdf = gpd.read_parquet(OSM_ROADS_PATH)
        roads_gdf = validate_geometries(roads_gdf, "roads")
        # Ensure CRS is projected for meter-based lengths. A layer without CRS
        # metadata cannot be reprojected, so warn instead of failing on None.
        if roads_gdf.crs is None:
            print("Warning: roads_gdf has no CRS; length_m is expressed in the layer's native units.")
        elif roads_gdf.crs.is_geographic:
            print("Reprojecting roads_gdf to EPSG:3826 for length calculations in meters...")
            roads_gdf = roads_gdf.to_crs('EPSG:3826')
        roads_gdf['length_m'] = roads_gdf.geometry.length
        print(f"roads_gdf columns after adding length_m: {roads_gdf.columns.tolist()}")
        pbar.update(1)

        trees_gdf = gpd.read_parquet(OSM_TREES_PATH)
        trees_gdf = trees_gdf[trees_gdf['subtype'] == 'tree']
        trees_gdf = validate_geometries(trees_gdf, "trees")
        pbar.update(1)

        transit_gdf = gpd.read_parquet(OSM_TRANSIT_PATH)
        transit_gdf = transit_gdf[transit_gdf['class'].isin(['stop_position', 'bus_stop'])]
        transit_gdf = transit_gdf[transit_gdf.geometry.geom_type == 'Point']
        transit_gdf = validate_geometries(transit_gdf, "transit")
        pbar.update(1)

        urban_masterplan_gdf = gpd.read_file(URBAN_MASTERPLAN_PATH)
        urban_masterplan_gdf = validate_geometries(urban_masterplan_gdf, "urban_masterplan")
        pbar.update(1)

    # Point-in-polygon only makes sense when points and polygons share a
    # coordinate system, so count on copies aligned to the neighborhoods' CRS.
    # The returned tree/transit layers themselves are left untouched.
    trees_for_count = _align_points_to(trees_gdf, neighborhoods_gdf.crs, "trees")
    transit_for_count = _align_points_to(transit_gdf, neighborhoods_gdf.crs, "transit")

    # Convert geometries to cuspatial for GPU-accelerated spatial operations
    neighborhoods_geo = cuspatial.from_geopandas(neighborhoods_gdf['geometry'])
    trees_geo = cuspatial.from_geopandas(trees_for_count['geometry'])
    transit_geo = cuspatial.from_geopandas(transit_for_count['geometry'])

    # Calculate tree_count and transit_count for each neighborhood.
    # validate_geometries reset the index, so position idx is also the label.
    neighborhoods_gdf['tree_count'] = 0
    neighborhoods_gdf['transit_count'] = 0
    for idx in range(len(neighborhoods_gdf)):
        neighborhood_geom = neighborhoods_geo.iloc[idx]
        if neighborhood_geom.is_valid:
            # Create a GeoSeries with a single polygon for this neighborhood
            neighborhood_geo = cuspatial.GeoSeries([neighborhood_geom])
            # Count trees within the neighborhood; int() turns the 0-d GPU
            # result into a plain Python integer before it is stored.
            trees_mask = cuspatial.point_in_polygon(trees_geo, neighborhood_geo)
            neighborhoods_gdf.at[idx, 'tree_count'] = int(trees_mask.sum().values[0])
            # Count transit points within the neighborhood
            transit_mask = cuspatial.point_in_polygon(transit_geo, neighborhood_geo)
            neighborhoods_gdf.at[idx, 'transit_count'] = int(transit_mask.sum().values[0])
        else:
            print(f"Warning: Invalid geometry at index {idx} in neighborhoods_geo")

    return {
        'neighborhoods': neighborhoods_gdf,
        'buildings': buildings_gdf,
        'roads': roads_gdf,
        'trees': trees_gdf,
        'transit': transit_gdf,
        'urban_masterplan': urban_masterplan_gdf
    }

Stage 2: Graph Construction
This stage constructs a graph network for the city, creating subgraphs for each neighborhood and a road network using cugraph.

In [45]:
def _project_layer(gdf, target_crs):
    """Return gdf in target_crs; unchanged when either CRS is unknown or they already match."""
    if target_crs is None or gdf.crs is None or gdf.crs == target_crs:
        return gdf
    return gdf.to_crs(target_crs)


def _prepare_metric_layers(data):
    """Put every layer used for spatial filtering into one shared, projected CRS.

    Returns (neighborhoods, buildings, roads, trees, transit). The input frames
    in ``data`` are not modified; reprojected copies are returned where needed.
    """
    neighborhoods_gdf = data['neighborhoods']
    work_crs = neighborhoods_gdf.crs
    if work_crs is not None and work_crs.is_geographic:
        # Buffering in degrees is meaningless; use the same metric CRS that
        # Stage 1 uses for road lengths.
        work_crs = 'EPSG:3826'
    return (
        _project_layer(neighborhoods_gdf, work_crs),
        _project_layer(data['buildings'], work_crs),
        _project_layer(data['roads'], work_crs),
        _project_layer(data['trees'], work_crs),
        _project_layer(data['transit'], work_crs),
    )


def build_graph(data):
    """Build graph network with subgraphs for each neighborhood.

    For every neighborhood row a node table is assembled containing the
    neighborhood itself plus the buildings, road endpoints, trees and transit
    stops found around it (200 units of the working CRS; meters when the
    layers are reprojected to EPSG:3826). Each result is cached as a pickle in
    SUBGRAPH_DIR and reused on later runs.

    Args:
        data: dict with 'neighborhoods', 'buildings', 'roads', 'trees' and
            'transit' GeoDataFrames, as returned by load_and_prepare_data.

    Returns:
        (subgraphs, road_network_nodes): ``subgraphs`` maps LIE_NAME to
        ``{'nodes': cudf.DataFrame, 'edges': cudf.DataFrame or None}``;
        ``road_network_nodes`` is the list of road-node dicts created in this
        call.

    Notes:
        * Subgraphs are keyed (and cached) by LIE_NAME, so rows sharing a name
          share a single entry.
        * Subgraphs loaded from the cache are used as stored and contribute
          nothing to ``road_network_nodes``; delete the cache files to rebuild.
        * No edges are generated yet, so 'edges' is always None.
    """
    print("Stage 2: Building graph network...")
    subgraphs = {}
    road_network_nodes = []

    neighborhoods_gdf = data['neighborhoods']
    metric_layers = None  # prepared lazily on the first cache miss

    for idx in tqdm(range(len(neighborhoods_gdf)), desc="Building subgraphs"):
        lie_name = neighborhoods_gdf['LIE_NAME'].iloc[idx]
        subgraph_path = os.path.join(SUBGRAPH_DIR, f"subgraph_{lie_name}.pkl")

        if os.path.exists(subgraph_path):
            # NOTE: pickle.load can execute arbitrary code; only load cache
            # files that this notebook wrote itself.
            with open(subgraph_path, 'rb') as f:
                subgraphs[lie_name] = pickle.load(f)
            continue

        if metric_layers is None:
            metric_layers = _prepare_metric_layers(data)
        neighborhoods_metric, buildings_gdf, roads_gdf, trees_gdf, transit_gdf = metric_layers

        all_nodes = []
        all_edges = []

        # Neighborhood node
        all_nodes.append({
            'vertex': f"neighborhood_{lie_name}",
            'type': 'neighborhood',
            'lie_name': lie_name,
            'population': neighborhoods_gdf['2024population'].iloc[idx],
            'land_use_residential_percent': neighborhoods_gdf['land_use_residential_percent'].iloc[idx],
            'land_use_commercial_percent': neighborhoods_gdf['land_use_commercial_percent'].iloc[idx],
            'land_use_education_percent': neighborhoods_gdf['land_use_education_percent'].iloc[idx],
            'ndvi_mean': neighborhoods_gdf['ndvi_mean'].iloc[idx],
            'tree_count': neighborhoods_gdf['tree_count'].iloc[idx],
            'transit_count': neighborhoods_gdf['transit_count'].iloc[idx]
        })

        # Buffer and spatial filtering (all layers share the working CRS here)
        buffer_distance = 200
        neigh_geom = neighborhoods_metric.geometry.iloc[idx]
        neigh_buffer = neigh_geom.buffer(buffer_distance)

        relevant_buildings = buildings_gdf[buildings_gdf.geometry.within(neigh_buffer)]
        relevant_roads = roads_gdf[roads_gdf.geometry.intersects(neigh_buffer)]
        relevant_trees = trees_gdf[trees_gdf.geometry.within(neigh_buffer)]
        relevant_transit = transit_gdf[transit_gdf.geometry.within(neigh_buffer)]

        # Building nodes
        for b_idx, building_type, area_m2 in zip(relevant_buildings.index,
                                                 relevant_buildings['building'],
                                                 relevant_buildings['area_m2']):
            all_nodes.append({
                'vertex': f"building_{b_idx}",
                'type': 'building',
                'building_type': building_type,
                'area_m2': area_m2
            })

        # Road nodes: one node per endpoint of every LineString segment. The
        # attributes are taken from the same row as the geometry, so they do
        # not depend on index labels matching row positions.
        for r_idx, road_class, length_m, geom in zip(relevant_roads.index,
                                                     relevant_roads['class'],
                                                     relevant_roads['length_m'],
                                                     relevant_roads.geometry):
            if geom.geom_type != 'LineString':
                continue
            for endpoint in ('start', 'end'):
                road_data = {
                    'vertex': f"road_{endpoint}_{r_idx}",
                    'type': 'road',
                    'road_class': road_class,
                    'length_m': length_m
                }
                all_nodes.append(road_data)
                road_network_nodes.append(road_data)

        # Tree nodes
        for t_idx in relevant_trees.index:
            all_nodes.append({'vertex': f"tree_{t_idx}", 'type': 'tree'})

        # Transit nodes
        for t_idx, transit_class in zip(relevant_transit.index, relevant_transit['class']):
            all_nodes.append({
                'vertex': f"transit_{t_idx}",
                'type': 'transit',
                'class': transit_class
            })

        # Edges (simplified for brevity)
        # Add edges based on spatial proximity (e.g., neighborhood to buildings, roads, etc.)
        # This can be expanded as needed

        nodes_df = cudf.DataFrame(all_nodes)
        edges_df = cudf.DataFrame(all_edges) if all_edges else None
        subgraph_data = {'nodes': nodes_df, 'edges': edges_df}
        subgraphs[lie_name] = subgraph_data

        with open(subgraph_path, 'wb') as f:
            pickle.dump(subgraph_data, f)

    return subgraphs, road_network_nodes

Stage 3: Walkability Calculation (Rule-Based)
This stage computes a rule-based walkability score for each neighborhood from its land-use mix, mean NDVI, tree count, and transit-stop count.

In [46]:
def compute_walkability_scores(nodes_df):
    """Compute rule-based walkability scores for neighborhood nodes.

    The score is written to the ``walkability_rule`` column of ``nodes_df``
    (created with 0.0 when missing). Only rows whose ``type`` is
    'neighborhood' receive a computed score; all other rows keep their
    current value.

    Score = land-use term + NDVI term + tree term + transit term, where
      * land-use = (0.4*residential% + 0.3*commercial% + 0.2*education%) / 100
      * NDVI     = ndvi_mean * 0.5 * 0.4
      * tree     = min(tree_count / 100, 1) * 0.2
      * transit  = min(transit_count / 20, 1) * 0.2
    and the sum is clamped to the range [0, 1].

    Missing inputs are treated as 0 for every feature, so one absent value
    no longer turns the whole score into NaN.

    Args:
        nodes_df: Node table (pandas- or cudf-style DataFrame) with a 'type'
            column and, for neighborhood rows, the six feature columns.

    Returns:
        The same ``nodes_df`` object, modified in place.
    """
    neighborhood_mask = nodes_df['type'] == 'neighborhood'

    if 'walkability_rule' not in nodes_df.columns:
        nodes_df['walkability_rule'] = 0.0

    neighborhood_df = nodes_df[neighborhood_mask]
    if len(neighborhood_df) == 0:
        return nodes_df

    def _feature(column):
        # Treat a missing measurement as "none of it" rather than poisoning the score
        return neighborhood_df[column].fillna(0.0)

    residential = _feature('land_use_residential_percent')
    commercial = _feature('land_use_commercial_percent')
    education = _feature('land_use_education_percent')
    ndvi = _feature('ndvi_mean')
    tree_count = _feature('tree_count')
    transit_count = _feature('transit_count')

    land_use_score = (residential * 0.4 + commercial * 0.3 + education * 0.2) / 100
    ndvi_score = ndvi * 0.5
    tree_score = (tree_count / 100).clip(upper=1.0) * 0.2
    transit_score = (transit_count / 20).clip(upper=1.0) * 0.2

    # Negative NDVI values could otherwise drive the score below zero
    walkability = (land_use_score + ndvi_score * 0.4 + tree_score + transit_score).clip(lower=0.0, upper=1.0)

    nodes_df.loc[neighborhood_mask, 'walkability_rule'] = walkability

    return nodes_df

def calculate_walkability(subgraphs, neighborhoods_gdf):
    """Score every subgraph with the rule-based model and mirror the result onto the neighborhoods.

    Args:
        subgraphs: Mapping of LIE_NAME to a dict holding a 'nodes' table.
        neighborhoods_gdf: Frame with a 'LIE_NAME' column; its
            'walkability_rule' column is filled for matching rows.

    Returns:
        The same ``subgraphs`` mapping, with each 'nodes' table replaced by
        its scored version.
    """
    print("Stage 3: Calculating rule-based walkability scores...")
    progress = tqdm(subgraphs.items(), desc="Calculating walkability")
    for lie_name, subgraph_data in progress:
        scored_nodes = compute_walkability_scores(subgraph_data['nodes'])
        subgraph_data['nodes'] = scored_nodes

        is_neighborhood = scored_nodes['type'] == 'neighborhood'
        rule_scores = scored_nodes[is_neighborhood]['walkability_rule']
        if rule_scores.empty:
            # Nothing to copy over for a subgraph without a neighborhood node
            continue

        matching_rows = neighborhoods_gdf['LIE_NAME'] == lie_name
        neighborhoods_gdf.loc[matching_rows, 'walkability_rule'] = rule_scores.iloc[0]

    return subgraphs

Stage 4: GNN Model Integration
This stage integrates a GNN model to predict walkability scores, enhancing the rule-based approach with machine learning using PyTorch Geometric.

In [47]:
class GCNWalkability(torch.nn.Module):
    """Two-layer GCN mapping node features to a walkability output.

    Layout: GCNConv(in -> hidden), ReLU, GCNConv(hidden -> out).
    """
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Run both convolutions over ``x`` using the connectivity in ``edge_index``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

def prepare_gnn_data(subgraphs):
    """Prepare graph data for GNN training.

    Builds one ``torch_geometric.data.Data`` object per subgraph:
      * ``x``: the six numeric node features (float), with missing or
        non-numeric entries replaced by 0;
      * ``edge_index``: the 'src'/'dst' columns of the edge table as a
        2 x E long tensor, or an empty 2 x 0 tensor when there are no edges;
      * ``y``: the rule-based ``walkability_rule`` label per node, with
        missing values replaced by 0 so a single NaN cannot turn the training
        loss into NaN.

    Args:
        subgraphs: Mapping of LIE_NAME to ``{'nodes': cudf.DataFrame,
            'edges': cudf.DataFrame or None}``. The node table must already
            contain the 'walkability_rule' column.

    Returns:
        list of Data objects, in the iteration order of ``subgraphs``.
    """
    feature_cols = ['land_use_residential_percent', 'land_use_commercial_percent',
                    'land_use_education_percent', 'ndvi_mean', 'tree_count', 'transit_count']

    data_list = []
    for lie_name, subgraph_data in subgraphs.items():
        nodes_df = subgraph_data['nodes'].to_pandas()
        edges = subgraph_data['edges']
        edges_df = edges.to_pandas() if edges is not None else pd.DataFrame()

        # Node features: coerce to numbers first so object-dtype columns and
        # missing values both end up as plain floats.
        features = nodes_df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)
        x = torch.tensor(features.to_numpy(dtype='float32'), dtype=torch.float)

        # Edge index
        if not edges_df.empty:
            edge_index = torch.tensor(edges_df[['src', 'dst']].to_numpy().T, dtype=torch.long)
        else:
            edge_index = torch.empty((2, 0), dtype=torch.long)  # Empty edge index

        # Labels (rule-based walkability), NaN-safe like the features above
        labels = pd.to_numeric(nodes_df['walkability_rule'], errors='coerce').fillna(0.0)
        y = torch.tensor(labels.to_numpy(dtype='float32'), dtype=torch.float)

        data_list.append(Data(x=x, edge_index=edge_index, y=y))

    return data_list

def train_gnn_model(data_list, epochs=200, lr=0.01, hidden_channels=64, in_channels=None):
    """Train the GNN model.

    Fits a ``GCNWalkability`` network with Adam and mean-squared error,
    taking one optimizer step per graph per epoch.

    Args:
        data_list: Sequence of graph objects exposing ``x``, ``edge_index``
            and ``y`` (as produced by prepare_gnn_data).
        epochs: Number of passes over ``data_list``.
        lr: Adam learning rate.
        hidden_channels: Width of the hidden GCN layer.
        in_channels: Number of input features per node. When None it is read
            from the first graph's ``x``; with an empty ``data_list`` it
            falls back to 6.

    Returns:
        The trained model (untrained when ``data_list`` is empty).
    """
    print("Stage 4: Training GNN model...")
    if in_channels is None:
        # Follow the data instead of duplicating the feature count here
        in_channels = data_list[0].x.shape[1] if len(data_list) > 0 else 6

    model = GCNWalkability(in_channels=in_channels, hidden_channels=hidden_channels, out_channels=1)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.MSELoss()

    model.train()
    skipped_steps = 0
    for epoch in tqdm(range(epochs), desc="Training GNN"):
        for data in data_list:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index)
            loss = criterion(out, data.y.view(-1, 1))
            if not torch.isfinite(loss):
                # A NaN/inf loss would overwrite every weight with NaN
                skipped_steps += 1
                continue
            loss.backward()
            optimizer.step()

    if skipped_steps:
        print(f"Warning: skipped {skipped_steps} training steps with a non-finite loss")

    return model

def predict_walkability(subgraphs, model):
    """Predict walkability scores using the trained GNN.

    Runs the model on every subgraph and stores the per-node prediction in a
    new ``walkability_gnn`` column of that subgraph's node table.

    Args:
        subgraphs: Mapping of LIE_NAME to ``{'nodes': ..., 'edges': ...}``.
        model: Trained network called as ``model(x, edge_index)``.

    Returns:
        The same ``subgraphs`` mapping with updated 'nodes' tables.
    """
    # Inference should run in evaluation mode; remember the caller's mode so
    # it can be restored afterwards.
    was_training = model.training
    model.eval()
    try:
        for lie_name, subgraph_data in subgraphs.items():
            data = prepare_gnn_data({lie_name: subgraph_data})[0]

            with torch.no_grad():
                pred = model(data.x, data.edge_index)

            nodes_df = subgraph_data['nodes'].to_pandas()
            # .cpu() keeps this working if the model ever lives on a GPU
            nodes_df['walkability_gnn'] = pred.detach().cpu().numpy().flatten()
            subgraph_data['nodes'] = cudf.from_pandas(nodes_df)
    finally:
        model.train(was_training)
    return subgraphs

Stage 5: Interactive Visualization
This stage creates an interactive Kepler.gl map, allowing users to visualize and modify the urban environment.

In [48]:
from keplergl import KeplerGl

# Mapbox token: read from the MAPBOX_ACCESS_TOKEN environment variable so the
# real token never has to be saved inside the notebook. The placeholder below
# is used when the variable is not set.
MAPBOX_ACCESS_TOKEN = os.environ.get("MAPBOX_ACCESS_TOKEN", "your_mapbox_access_token_here")

def create_interactive_map(subgraphs, data, output_path=None):
    """Generate an interactive Kepler.gl map.

    Copies each subgraph's neighborhood-level ``walkability_rule`` and
    ``walkability_gnn`` values onto the neighborhood polygons (reprojected to
    EPSG:4326), loads them into a Kepler.gl map and saves it as HTML.

    Args:
        subgraphs: Mapping of LIE_NAME to ``{'nodes': ..., 'edges': ...}``
            whose node tables carry both walkability columns.
        data: dict containing the 'neighborhoods' GeoDataFrame.
        output_path: Where to write the HTML file. Defaults to
            'walkability_map_city_level.html' in the parent folder of
            SUBGRAPH_DIR.
    """
    print("Stage 5: Generating interactive Kepler.gl map...")
    neighborhoods_gdf = data['neighborhoods'].to_crs('EPSG:4326')

    # Update neighborhoods with walkability scores
    for lie_name, subgraph_data in subgraphs.items():
        nodes_df = subgraph_data['nodes'].to_pandas()
        neigh_rows = nodes_df[nodes_df['type'] == 'neighborhood']
        if neigh_rows.empty:
            # Same guard as in calculate_walkability: nothing to copy
            print(f"Warning: subgraph {lie_name} has no neighborhood node; skipping")
            continue
        neigh_data = neigh_rows.iloc[0]
        row_mask = neighborhoods_gdf['LIE_NAME'] == lie_name
        neighborhoods_gdf.loc[row_mask, 'walkability_rule'] = neigh_data['walkability_rule']
        neighborhoods_gdf.loc[row_mask, 'walkability_gnn'] = neigh_data['walkability_gnn']

    geojson_data = neighborhoods_gdf.to_json()

    map_1 = KeplerGl(height=600, width=800, mapbox_api_access_token=MAPBOX_ACCESS_TOKEN)
    map_1.add_data(data=geojson_data, name="Neighborhoods")

    if output_path is None:
        # The subgraph folder sits directly inside the data directory
        output_path = os.path.join(os.path.dirname(SUBGRAPH_DIR), "walkability_map_city_level.html")
    map_1.save_to_html(file_name=output_path)
    print(f"Interactive Kepler.gl map saved at {output_path}")

Main Execution
This section ties all stages together in a main function.

In [49]:
def main():
    """Run Stages 1-5 in order; on any error print the message and its traceback."""
    try:
        # Stage 1: read and clean the input layers
        layers = load_and_prepare_data()

        # Stage 2: node tables per neighborhood
        subgraphs, road_network_nodes = build_graph(layers)

        # Stage 3: rule-based scores
        subgraphs = calculate_walkability(subgraphs, layers['neighborhoods'])

        # Stage 4: fit the GNN on the rule-based labels, then predict
        graph_samples = prepare_gnn_data(subgraphs)
        trained_model = train_gnn_model(graph_samples)
        subgraphs = predict_walkability(subgraphs, trained_model)

        # Stage 5: export the map
        create_interactive_map(subgraphs, layers)

        print("Analysis completed successfully.")
    except Exception as exc:
        print(f"Error during analysis: {exc}")
        import traceback
        print(traceback.format_exc())

# Entry point: run the full pipeline when this cell (or the exported script)
# is executed directly rather than imported.
if __name__ == "__main__":
    main()

Stage 1: Loading and preparing data...


Loading files:   0%|          | 0/6 [00:00<?, ?it/s]

Validating geometries for neighborhoods...
neighborhoods after validation: 456 rows


Loading files:  33%|███▎      | 2/6 [00:01<00:03,  1.22it/s]

Validating geometries for buildings...
buildings after validation: 74306 rows
Validating geometries for roads...
roads after validation: 81444 rows
roads_gdf columns after adding length_m: ['class', 'geometry', 'length_m']
Validating geometries for trees...
trees after validation: 3399 rows


Loading files:  83%|████████▎ | 5/6 [00:01<00:00,  2.98it/s]

Validating geometries for transit...
transit after validation: 6844 rows


Loading files: 100%|██████████| 6/6 [00:02<00:00,  2.30it/s]

Validating geometries for urban_masterplan...
urban_masterplan after validation: 15392 rows





Stage 2: Building graph network...


Building subgraphs: 100%|██████████| 456/456 [00:57<00:00,  7.89it/s]


Stage 3: Calculating rule-based walkability scores...


Calculating walkability: 100%|██████████| 454/454 [00:38<00:00, 11.90it/s]


Stage 4: Training GNN model...


Training GNN: 100%|██████████| 200/200 [01:50<00:00,  1.81it/s]


Stage 5: Generating interactive Kepler.gl map...
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/walkability_map_city_level.html!
Interactive Kepler.gl map saved at /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/walkability_map_city_level.html
Analysis completed successfully.
