Cell 0: CUDA Version check

In [310]:
import cudf, cugraph
print(cudf.__version__, cugraph.__version__)

25.02.02 25.02.00


Cell 1: Imports ,Global Constants

In [311]:
# Standard library imports
import os
import logging
import hashlib
import json
import time

# Third-party imports
import geopandas as gpd
import pandas as pd
import cudf
import cugraph
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, BatchNorm
from scipy.stats import pearsonr, spearmanr
from shapely import make_valid
from shapely.errors import GEOSException
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
from tqdm import tqdm
from keplergl import KeplerGl

# Plotting configuration
%matplotlib inline
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Noto Sans CJK TC', 'Noto Serif CJK TC', 'Noto Sans Mono CJK TC', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# Directory and file paths
BASE_DIR = "/home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data"
LANDUSE_NDVI_PATH = os.path.join(BASE_DIR, "neighborhoods_with_ndvi_numerical_corrected.geojson")
OSM_BUILDINGS_PATH = os.path.join(BASE_DIR, "Taipei_Buildings_fulldata.geojson")
OSM_ROADS_PATH = os.path.join(BASE_DIR, "taipei_segments_cleaned_verified.geoparquet")
OSM_TREES_PATH = os.path.join(BASE_DIR, "taipei_land.geoparquet")
OSM_TRANSIT_PATH = os.path.join(BASE_DIR, "taipei_infrastructure.geoparquet")
URBAN_MASTERPLAN_PATH = os.path.join(BASE_DIR, "Taipei_urban_masterplan.geojson")
ACCIDENTS_PATH = os.path.join(BASE_DIR, "2023_accidents.geojson")
POPULATION_PATH = os.path.join(BASE_DIR, "population_corrected.json")
SUBGRAPH_DIR = os.path.join(BASE_DIR, "subgraphs")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
INTERSECTION_CACHE_PATH = os.path.join(BASE_DIR, "neighborhoods_with_intersections.geoparquet")
GRAPH_NODES_CACHE_PATH = os.path.join(BASE_DIR, "graph_nodes.parquet")
GRAPH_EDGES_CACHE_PATH = os.path.join(BASE_DIR, "graph_edges.parquet")
GRAPH_NODE_ID_CACHE_PATH = os.path.join(BASE_DIR, "graph_node_id_to_index.json")
GRAPH_DATA_HASH_PATH = os.path.join(BASE_DIR, "graph_data_hash.txt")

# Ensure directories exist
os.makedirs(SUBGRAPH_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Constants for spatial analysis
BUFFER_DISTANCE = 10  # Meters, buffer distance for border sharing of accidents (tunable based on spatial resolution)
MIN_ROAD_LENGTH = 10  # Meters, minimum road length to avoid inflated accident density (tunable based on dataset)

# Land use category priorities for area assignment
CATEGORY_PRIORITY = {
    'City_Open_Area': 10,
    'Pedestrian': 9,
    'Public_Transportation': 8,
    'Amenity': 7,
    'Education': 6,
    'Medical': 5,
    'Commercial': 4,
    'Residential': 3,
    'Natural': 2,
    'Road': 1,
    'River': 1,
    'Infrastructure': 1,
    'Government': 1,
    'Special_Zone': 1,
    'Military': 1,
    'Industrial': 1,
    'Agriculture': 1
}

# Weights for land use diversity in walkability scoring
land_use_weights = {
    'city_open_area': 0.8,
    'commercial': 0.7,
    'infrastructure': 0.4,
    'government': 0.5,
    'public_transportation': 0.8,
    'education': 0.7,
    'medical': 0.6,
    'amenity': 0.8,
    'road': 0.3,
    'pedestrian': 1.0,
    'natural': 0.7,
    'special_zone': 0.4,
    'river': 0.7,
    'military': 0.2,
    'residential': 0.6,
    'industrial': 0.3,
    'agriculture': 0.4
}

Cell 2: Utility Functions

In [312]:
import logging
import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

def compute_road_type_accident_correlation(roads_gdf, neighborhoods_gdf, accidents_gdf):
    """
    Compute correlation between OSM road types and accident density (accidents per km of road length).
    Uses road class as a proxy for width, with ordinal ranking based on OSM hierarchy.
    Generates bar, box, and scatter plots for visualization.
    """
    logging.info("Computing correlation between road types and accident density...")
    
    # Validate input data
    validate_data(roads_gdf, ['class', 'geometry', 'length_m'], "roads_gdf")
    validate_data(neighborhoods_gdf, ['LIE_NAME', 'geometry'], "neighborhoods_gdf")
    validate_data(accidents_gdf, ['geometry'], "accidents_gdf")
    
    # Make local copies for roads and accidents to avoid modifying originals
    roads_gdf_local = roads_gdf.copy()
    accidents_gdf_local = accidents_gdf.copy()
    
    # Add unique identifier to accidents
    accidents_gdf_local['accident_id'] = range(len(accidents_gdf_local))
    
    # Define width ranking
    width_ranking = {
        'motorway': 5, 'trunk': 5, 'primary': 4, 'secondary': 4, 'tertiary': 3,
        'residential': 3, 'living_street': 3, 'service': 2, 'track': 2,
        'path': 1, 'footway': 1, 'cycleway': 1, 'steps': 1, 'pedestrian': 1,
        'unclassified': 0, 'bridleway': 0, 'unknown': 0
    }
    roads_gdf_local['width_rank'] = roads_gdf_local['class'].map(width_ranking).fillna(0).astype(int)
    
    # Buffer wider roads for accident assignment
    roads_gdf_buffered = roads_gdf_local.copy()
    roads_gdf_buffered['geometry'] = roads_gdf_buffered.apply(
        lambda row: row['geometry'].buffer(5) if row['width_rank'] >= 4 else row['geometry'], axis=1
    )
    
    logging.info("Assigning accidents to nearest road...")
    accidents_gdf_local['geometry'] = accidents_gdf_local['geometry'].apply(fix_geometry)
    accidents_gdf_local = accidents_gdf_local[accidents_gdf_local['geometry'].is_valid & ~accidents_gdf_local['geometry'].is_empty]
    
    if accidents_gdf_local.empty:
        logging.warning("No valid accidents after geometry fixing.")
        return pd.DataFrame()
    
    # Assign accidents to nearest road
    nearest = gpd.sjoin_nearest(
        accidents_gdf_local,
        roads_gdf_buffered[['geometry', 'class', 'width_rank']],
        how='left',
        distance_col='distance'
    )
    nearest['weighted_distance'] = nearest['distance'] / (nearest['width_rank'].replace(0, 1) ** 2)
    nearest = nearest.sort_values('weighted_distance').drop_duplicates(subset=['accident_id'], keep='first')
    
    matched_accidents = nearest[['accident_id', 'index_right']].copy()
    matched_accidents.columns = ['accident_id', 'road_idx']
    matched_accidents = matched_accidents.dropna(subset=['road_idx'])
    matched_accidents['road_idx'] = matched_accidents['road_idx'].astype(int)
    
    logging.info(f"Matched {len(matched_accidents)} accidents out of {len(accidents_gdf_local)}")
    
    # Reassign accidents from footway/cycleway to wider roads if possible
    footway_cycleway_accidents = matched_accidents[
        matched_accidents['road_idx'].isin(
            roads_gdf_local[roads_gdf_local['class'].isin(['footway', 'cycleway'])].index
        )
    ]
    if not footway_cycleway_accidents.empty:
        logging.info(f"Reassigning {len(footway_cycleway_accidents)} accidents from footway/cycleway...")
        accidents_to_reassign = accidents_gdf_local[accidents_gdf_local['accident_id'].isin(footway_cycleway_accidents['accident_id'])].copy()
        wider_roads = roads_gdf_buffered[roads_gdf_buffered['width_rank'] >= 4]
        if not wider_roads.empty:
            reassigned = gpd.sjoin_nearest(
                accidents_to_reassign,
                wider_roads[['geometry', 'class']],
                how='left',
                max_distance=10
            )
            reassigned_matches = reassigned[['accident_id', 'index_right']].copy()
            reassigned_matches.columns = ['accident_id', 'road_idx']
            reassigned_matches = reassigned_matches.dropna(subset=['road_idx'])
            reassigned_matches['road_idx'] = reassigned_matches['road_idx'].astype(int)
            matched_accidents = matched_accidents[~matched_accidents['accident_id'].isin(reassigned_matches['accident_id'])]
            matched_accidents = pd.concat([matched_accidents, reassigned_matches], ignore_index=True)
            logging.info(f"Reassigned {len(reassigned_matches)} accidents to wider roads")
    
    # Count accidents per road
    accident_counts = matched_accidents.groupby('road_idx').size().reindex(roads_gdf_local.index, fill_value=0)
    roads_gdf_local['accident_count'] = accident_counts
    
    logging.info(f"Accidents by road type:\n{roads_gdf_local.groupby('class')['accident_count'].sum()}")
    
    # Filter roads by minimum length
    roads_gdf_local = roads_gdf_local[roads_gdf_local['length_m'] >= MIN_ROAD_LENGTH]
    
    # Calculate accident density
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_count'] / (roads_gdf_local['length_m'] / 1000)
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_density'].fillna(0).replace([np.inf, -np.inf], 0)
    
    # Adjust density by width rank
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_density'] * (roads_gdf_local['width_rank'].replace(0, 1) / 5)
    
    logging.info(f"Road type counts:\n{roads_gdf_local['class'].value_counts()}")
    print(f"Road type counts:\n{roads_gdf_local['class'].value_counts()}")
    
    # Summarize by road type
    summary = roads_gdf_local.groupby('class').agg({
        'length_m': 'sum',
        'accident_count': 'sum',
        'accident_density': 'mean',
        'width_rank': 'first'
    }).reset_index()
    
    summary = summary[summary['length_m'] > 1000]
    summary = summary[summary['width_rank'] > 0]
    
    print("\n--- Road Type Accident Density Summary ---")
    print(summary[['class', 'length_m', 'accident_count', 'accident_density', 'width_rank']].round(2))
    
    if len(summary) >= 2:
        corr, p_value = spearmanr(summary['width_rank'], summary['accident_density'])
        logging.info(f"Spearman's correlation between road width rank and accident density: {corr:.3f} (p-value: {p_value:.3f})")
        print(f"Spearman's correlation: {corr:.3f} (p-value: {p_value:.3f})")
    else:
        logging.warning("Insufficient road types for correlation analysis.")
        print("Insufficient road types for correlation analysis.")
    
    # Compute average road accident density per neighborhood
    logging.info("Computing average road accident density per neighborhood...")
    logging.info(f"Roads bounds: {roads_gdf_local.total_bounds}")
    logging.info(f"Neighborhoods bounds: {neighborhoods_gdf.total_bounds}")
    
    roads_gdf_with_idx = roads_gdf_local[['geometry', 'class', 'length_m', 'width_rank', 'accident_density']].reset_index()
    neighborhoods_gdf_with_idx = neighborhoods_gdf[['geometry', 'LIE_NAME']].reset_index()
    
    logging.info(f"Roads DataFrame shape before join: {roads_gdf_with_idx.shape}")
    logging.info(f"Neighborhoods DataFrame shape before join: {neighborhoods_gdf_with_idx.shape}")
    
    # Perform spatial join
    road_neighborhoods = gpd.sjoin(
        roads_gdf_with_idx,
        neighborhoods_gdf_with_idx,
        how='left',
        predicate='intersects'
    )
    logging.info(f"Road-neighborhood join resulted in {len(road_neighborhoods)} matches with columns: {road_neighborhoods.columns.tolist()}")
    
    if road_neighborhoods['LIE_NAME'].isna().all():
        logging.warning("No roads intersect with neighborhoods. Checking for geometry validity and CRS mismatch...")
        # Check CRS
        logging.info(f"Roads CRS: {roads_gdf_with_idx.crs}")
        logging.info(f"Neighborhoods CRS: {neighborhoods_gdf_with_idx.crs}")
        # Check geometry validity
        logging.info(f"Roads geometry validity: {roads_gdf_with_idx.geometry.is_valid.all()}")
        logging.info(f"Neighborhoods geometry validity: {neighborhoods_gdf_with_idx.geometry.is_valid.all()}")
        
        # Fix geometries if necessary
        roads_gdf_with_idx['geometry'] = roads_gdf_with_idx['geometry'].apply(fix_geometry)
        neighborhoods_gdf_with_idx['geometry'] = neighborhoods_gdf_with_idx['geometry'].apply(fix_geometry)
        
        # Reproject if CRS mismatch
        target_crs = 'EPSG:3826'
        if roads_gdf_with_idx.crs != target_crs:
            logging.info(f"Reprojecting roads to {target_crs}")
            roads_gdf_with_idx = roads_gdf_with_idx.to_crs(target_crs)
        if neighborhoods_gdf_with_idx.crs != target_crs:
            logging.info(f"Reprojecting neighborhoods to {target_crs}")
            neighborhoods_gdf_with_idx = neighborhoods_gdf_with_idx.to_crs(target_crs)
        
        # Retry spatial join
        road_neighborhoods = gpd.sjoin(
            roads_gdf_with_idx,
            neighborhoods_gdf_with_idx,
            how='left',
            predicate='intersects'
        )
        logging.info(f"Retried road-neighborhood join resulted in {len(road_neighborhoods)} matches")
        
        if road_neighborhoods['LIE_NAME'].isna().all():
            logging.warning("Still no intersections found. Buffering geometries...")
            roads_buffered = roads_gdf_with_idx.copy()
            neighborhoods_buffered = neighborhoods_gdf_with_idx.copy()
            roads_buffered['geometry'] = roads_buffered['geometry'].buffer(50)
            neighborhoods_buffered['geometry'] = neighborhoods_buffered['geometry'].buffer(50)
            
            road_neighborhoods = gpd.sjoin(
                roads_buffered,
                neighborhoods_buffered,
                how='left',
                predicate='intersects'
            )
            logging.info(f"Buffered road-neighborhood join resulted in {len(road_neighborhoods)} matches")
            
            if road_neighborhoods['LIE_NAME'].isna().all():
                logging.error("No intersections found even after buffering. Assigning default 0.")
                neighborhoods_gdf['avg_road_accident_density'] = 0
            else:
                avg_accident_density = road_neighborhoods.groupby('LIE_NAME')['accident_density'].mean()
                avg_accident_density = avg_accident_density.reindex(neighborhoods_gdf['LIE_NAME'], fill_value=0)
                neighborhoods_gdf['avg_road_accident_density'] = avg_accident_density.fillna(0)
        else:
            avg_accident_density = road_neighborhoods.groupby('LIE_NAME')['accident_density'].mean()
            avg_accident_density = avg_accident_density.reindex(neighborhoods_gdf['LIE_NAME'], fill_value=0)
            neighborhoods_gdf['avg_road_accident_density'] = avg_accident_density.fillna(0)
    else:
        avg_accident_density = road_neighborhoods.groupby('LIE_NAME')['accident_density'].mean()
        avg_accident_density = avg_accident_density.reindex(neighborhoods_gdf['LIE_NAME'], fill_value=0)
        neighborhoods_gdf['avg_road_accident_density'] = avg_accident_density.fillna(0)
    
    assigned_count = sum(~neighborhoods_gdf['avg_road_accident_density'].isna())
    logging.info(f"Assigned avg_road_accident_density to {assigned_count} neighborhoods")
    logging.info(f"Avg road accident density stats:\n{neighborhoods_gdf['avg_road_accident_density'].describe()}")
    
    # Generate plots
    plt.figure(figsize=(12, 6))
    summary_sorted = summary.sort_values('width_rank', ascending=False)
    sns.barplot(data=summary_sorted, x='class', y='accident_density', hue='width_rank', dodge=False)
    plt.xlabel('Road Type')
    plt.ylabel('Mean Accident Density (Accidents per km)')
    plt.title('Mean Accident Density by Road Type')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Width Rank')
    plt.tight_layout()
    bar_path = os.path.join(BASE_DIR, 'road_type_accident_bar.png')
    plt.savefig(bar_path)
    plt.close()
    logging.info(f"Bar chart saved to {bar_path}")
    print(f"Bar chart saved to {bar_path}")
    
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=roads_gdf_local[roads_gdf_local['class'].isin(summary['class'])], 
                x='class', y='accident_density', hue='width_rank', dodge=False)
    plt.xlabel('Road Type')
    plt.ylabel('Accident Density (Accidents per km)')
    plt.title('Distribution of Accident Density by Road Type')
    plt.xticks(rotation=45, ha='right')
    plt.yscale('log')
    plt.legend(title='Width Rank')
    plt.tight_layout()
    box_path = os.path.join(BASE_DIR, 'road_type_accident_box.png')
    plt.savefig(box_path)
    plt.close()
    logging.info(f"Box chart saved to {box_path}")
    print(f"Box chart saved to {box_path}")
    
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=summary, x='width_rank', y='accident_density', 
                    size='length_m', sizes=(50, 500), hue='class', style='class', alpha=0.7)
    z = np.polyfit(summary['width_rank'], summary['accident_density'], 1)
    p = np.poly1d(z)
    plt.plot(summary['width_rank'], p(summary['width_rank']), "r--", alpha=0.5)
    plt.xlabel('Road Width Rank (1=Path, 5=Motorway)')
    plt.ylabel('Mean Accident Density (Accidents per km)')
    plt.title('Road Type vs. Accident Density')
    plt.yscale('log')
    plt.grid(True)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    scatter_path = os.path.join(BASE_DIR, 'road_type_accident_scatter.png')
    plt.savefig(scatter_path)
    plt.close()
    logging.info(f"Scatter plot saved to {scatter_path}")
    print(f"Scatter plot saved to {scatter_path}")
    
    top_types = summary.nlargest(3, 'accident_density')[['class', 'accident_density']]
    logging.info(f"Top 3 road types by accident density:\n{top_types.round(2)}")
    print(f"Top 3 road types by accident density:\n{top_types.round(2)}")
    
    return summary

Cell 3: Walkability Computation Functions

In [313]:
def compute_walkability_components(neighborhoods_gdf, sample_size=5):
    """
    Compute walkability components for a sample of neighborhoods.
    
    Args:
        neighborhoods_gdf (gpd.GeoDataFrame): GeoDataFrame of neighborhoods.
        sample_size (int): Number of neighborhoods to sample.
    
    Returns:
        pd.DataFrame: DataFrame with walkability components for sampled neighborhoods.
    """
    if not all(col in neighborhoods_gdf.columns for col in ['ndvi_mean', 'tree_count', 'transit_count', 'intersection_density', 'accident_count', 'area_km2', 'avg_road_accident_density', 'elderly_percentage']):
        logging.error("Required columns missing in neighborhoods_gdf for walkability computation.")
        raise KeyError("Missing required columns in neighborhoods_gdf.")
    
    sample_gdf = neighborhoods_gdf.sample(min(sample_size, len(neighborhoods_gdf)), random_state=42)
    
    components = {
        'LIE_NAME': [],
        'land_use_diversity': [],
        'green_space_score': [],
        'transit_score': [],
        'road_connectivity': [],
        'safety_score': [],
        'elderly_accessibility': [],
        'pedestrian_infrastructure_score': [],
        'walkability_score': [],
        'walkability_category': []
    }
    
    ndvi_min, ndvi_max = neighborhoods_gdf['ndvi_mean'].min(), neighborhoods_gdf['ndvi_mean'].max()
    tree_min, tree_max = neighborhoods_gdf['tree_count'].min(), neighborhoods_gdf['tree_count'].max()
    transit_min, transit_max = neighborhoods_gdf['transit_count'].min(), neighborhoods_gdf['transit_count'].max()
    intersection_density_min = neighborhoods_gdf['intersection_density'].min()
    intersection_density_max = neighborhoods_gdf['intersection_density'].max()
    accident_count_min, accident_count_max = neighborhoods_gdf['accident_count'].min(), neighborhoods_gdf['accident_count'].max()
    accident_density_max = neighborhoods_gdf['avg_road_accident_density'].max() if not neighborhoods_gdf['avg_road_accident_density'].isna().all() else 1.0
    pedestrian_road_max = neighborhoods_gdf.get('pedestrian_road_density', pd.Series(0)).max() or 1.0
    
    accident_count_density = neighborhoods_gdf['accident_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6)
    accident_count_density_max = accident_count_density.max() if accident_count_density.max() > 0 else 1.0
    
    for idx, row in sample_gdf.iterrows():
        land_use_cols = [f"land_use_{category.lower()}_percent" for category in CATEGORY_PRIORITY.keys()]
        land_use_values = [row.get(col, 0.1) / 100 for col in land_use_cols]  # Default to 0.1 (10%) if missing
        logging.debug(f"LIE_NAME: {row['LIE_NAME']}, Land use values: {land_use_values}")
        land_use_weights_list = [land_use_weights.get(category.lower(), 1.0) for category in CATEGORY_PRIORITY.keys()]
        weighted_values = [p * w for p, w in zip(land_use_values, land_use_weights_list) if p > 0]
        if not weighted_values:
            weighted_values = [1.0 / len(land_use_values)] * len(land_use_values)
        total = sum(weighted_values)
        if total > 0:
            weighted_values = [v / total for v in weighted_values]
            entropy = -np.sum([p * np.log2(p + 1e-10) for p in weighted_values])
            max_entropy = np.log2(len(weighted_values))
            land_use_diversity = entropy / max_entropy if max_entropy > 0 else 0.5
        else:
            land_use_diversity = 0.5
        
        ndvi_normalized = ((row['ndvi_mean'] - ndvi_min) / (ndvi_max - ndvi_min + 1e-6)) if (ndvi_max - ndvi_min) > 0 else 0
        tree_density = row['tree_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        tree_density_max = (neighborhoods_gdf['tree_count'] / neighborhoods_gdf['area_km2']).replace(0, 1e-6).max()
        tree_density_normalized = (tree_density / (tree_density_max + 1e-6)) if tree_density_max > 0 else 0
        open_area = row.get('land_use_city_open_area_percent', 0.1) / 100  # Default to 0.1 if missing
        green_space_score = (0.4 * ndvi_normalized + 0.3 * tree_density_normalized + 0.3 * open_area)
        
        transit_raw = (row['transit_count'] - transit_min) / (transit_max - transit_min + 1e-6) if (transit_max - transit_min) > 0 else 0
        transit_score = transit_raw
        
        intersection_density = row['intersection_density']
        intersection_density_normalized = (intersection_density - intersection_density_min) / (intersection_density_max - intersection_density_min + 1e-6) if (intersection_density_max - intersection_density_min) > 0 else 0
        road_connectivity = intersection_density_normalized
        
        accident_count_density = row['accident_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        accident_count_density = min(accident_count_density, accident_count_density_max * 0.5)
        safety_score_count = 1 - (accident_count_density / (accident_count_density_max + 1e-6)) if accident_count_density_max > 0 else 1
        accident_density = row['avg_road_accident_density'] if pd.notna(row['avg_road_accident_density']) else 0
        safety_score_roads = 1 - (accident_density / (accident_density_max + 1e-6)) if accident_density_max > 0 else 1
        pedestrian_roads = row.get('pedestrian_road_density', 0.0)
        pedestrian_roads_safety = min(pedestrian_roads / (pedestrian_road_max + 1e-6), 1.0) if pedestrian_road_max > 0 else 0.1
        safety_score = (0.3 * safety_score_count + 0.3 * safety_score_roads + 0.4 * pedestrian_roads_safety) if row['accident_count'] != 0 else (0.5 * safety_score_roads + 0.5 * pedestrian_roads_safety)
        safety_score = np.clip(safety_score, 0, 1)
        
        elderly_percentage = row['elderly_percentage'] / 100
        medical_access = row.get('land_use_medical_percent', 0.1) / 100  # Default to 0.1 if missing
        elderly_accessibility = 0.3 * elderly_percentage + 0.4 * medical_access + 0.3 * pedestrian_roads_safety
        elderly_accessibility = np.clip(elderly_accessibility, 0, 1)
        
        pedestrian_road_score = pedestrian_roads / (pedestrian_road_max + 1e-6) if pedestrian_road_max > 0 else 0
        amenity_access = row.get('land_use_amenity_percent', 0.1) / 100  # Default to 0.1 if missing
        pedestrian_infrastructure_score = 0.5 * pedestrian_road_score + 0.5 * amenity_access
        pedestrian_infrastructure_score = np.clip(pedestrian_infrastructure_score, 0, 1)
        
        base_score = (
            0.35 * land_use_diversity +
            0.35 * green_space_score +
            0.15 * transit_score +
            0.10 * road_connectivity +
            0.05 * pedestrian_infrastructure_score
        )
        safety_modifier = 0.7 + 0.3 * safety_score
        elderly_modifier = 0.9 + 0.1 * elderly_accessibility
        walkability_score = base_score * safety_modifier * elderly_modifier
        walkability_score = np.clip(walkability_score, 0, 1)
        
        if pd.isna(walkability_score):
            logging.warning(f"Walkability score is NaN for {row['LIE_NAME']}. Setting to 0.")
            walkability_score = 0
            category = 'low'
        elif walkability_score < 0.33:
            category = 'low'
        elif walkability_score < 0.66:
            category = 'medium'
        else:
            category = 'high'
        
        components['LIE_NAME'].append(row['LIE_NAME'])
        components['land_use_diversity'].append(land_use_diversity)
        components['green_space_score'].append(green_space_score)
        components['transit_score'].append(transit_score)
        components['road_connectivity'].append(road_connectivity)
        components['safety_score'].append(safety_score)
        components['elderly_accessibility'].append(elderly_accessibility)
        components['pedestrian_infrastructure_score'].append(pedestrian_infrastructure_score)
        components['walkability_score'].append(walkability_score)
        components['walkability_category'].append(category)
    
    return pd.DataFrame(components)

def compute_walkability_components_all(neighborhoods_df, data):
    """
    Compute walkability components for all neighborhoods.
    
    Args:
        neighborhoods_df (gpd.GeoDataFrame): GeoDataFrame of neighborhoods.
        data (dict): Dictionary containing roads and other datasets.
    
    Returns:
        pd.DataFrame: DataFrame with walkability components for all neighborhoods.
    """
    if neighborhoods_df.empty:
        logging.error("neighborhoods_df is empty.")
        raise ValueError("neighborhoods_df cannot be empty.")
    if 'roads' not in data or data['roads'].empty:
        logging.error("Roads data is missing or empty.")
        raise ValueError("Roads data cannot be empty.")
    
    required_cols = ['LIE_NAME', 'geometry', 'ndvi_mean', 'tree_count', 'transit_count', 'intersection_density', 
                     'accident_count', 'area_km2', 'avg_road_accident_density', 'elderly_percentage']
    missing_cols = [col for col in required_cols if col not in neighborhoods_df.columns]
    if missing_cols:
        logging.error(f"Missing columns in neighborhoods_df: {missing_cols}")
        raise KeyError(f"Missing columns in neighborhoods_df: {missing_cols}")
    
    for col in required_cols:
        if col == 'geometry':
            null_count = neighborhoods_df[col].isna().sum()
            invalid_count = (~neighborhoods_df[col].is_valid).sum()
            logging.info(f"Geometry column: {null_count} nulls, {invalid_count} invalid geometries.")
        else:
            null_count = neighborhoods_df[col].isna().sum()
            zero_count = (neighborhoods_df[col] == 0).sum()
            logging.info(f"{col}: {null_count} nulls, {zero_count} zeros.")
    
    land_use_cols = [f"land_use_{category.lower()}_percent" for category in CATEGORY_PRIORITY.keys()]
    for col in land_use_cols:
        if col not in neighborhoods_df.columns:
            logging.warning(f"Land use column {col} not found in neighborhoods_df. Using default value 0.1.")
            neighborhoods_df[col] = 0.1  # Default value for missing land use columns
        else:
            logging.info(f"{col} stats:\n{neighborhoods_df[col].describe()}")
    
    neighborhoods_df = neighborhoods_df.copy()
    neighborhoods_df = neighborhoods_df.to_crs('EPSG:3826')
    roads_df = data['roads'].copy()
    roads_df = roads_df.to_crs('EPSG:3826')
    
    neighborhoods_df['geometry'] = neighborhoods_df['geometry'].apply(fix_geometry)
    invalid_geoms = neighborhoods_df[~neighborhoods_df.geometry.is_valid]
    if not invalid_geoms.empty:
        logging.warning(f"Found {len(invalid_geoms)} invalid geometries in neighborhoods_df after fixing.")
        neighborhoods_df = neighborhoods_df[neighborhoods_df.geometry.is_valid]
        if neighborhoods_df.empty:
            logging.error("All geometries in neighborhoods_df are invalid after fixing.")
            raise ValueError("No valid geometries in neighborhoods_df.")
    
    roads_df['geometry'] = roads_df['geometry'].apply(fix_geometry)
    invalid_roads = roads_df[~roads_df.geometry.is_valid]
    if not invalid_roads.empty:
        logging.warning(f"Found {len(invalid_roads)} invalid geometries in roads_df after fixing.")
        roads_df = roads_df[roads_df.geometry.is_valid]
        if roads_df.empty:
            logging.error("All geometries in roads_df are invalid after fixing.")
            raise ValueError("No valid geometries in roads_df.")
    
    pedestrian_road_types = ['footway', 'pedestrian', 'cycleway']
    pedestrian_roads_df = roads_df[roads_df['class'].isin(pedestrian_road_types)].copy()
    logging.info(f"Filtered {len(pedestrian_roads_df)} roads of types {pedestrian_road_types} out of {len(roads_df)} total roads.")
    
    overlap = check_spatial_overlap(neighborhoods_df, pedestrian_roads_df, "neighborhoods", "pedestrian_roads")
    pedestrian_roads = gpd.sjoin(
        neighborhoods_df[['geometry', 'LIE_NAME']],
        pedestrian_roads_df[['geometry', 'length_m']],
        how='left', predicate='intersects'
    )
    if not overlap or pedestrian_roads['length_m'].isna().all():
        logging.warning("No spatial overlap or matches between neighborhoods and pedestrian roads. Buffering geometries...")
        neighborhoods_buffered = neighborhoods_df.copy()
        neighborhoods_buffered['geometry'] = neighborhoods_buffered['geometry'].buffer(50)
        pedestrian_roads = gpd.sjoin(
            neighborhoods_buffered[['geometry', 'LIE_NAME']],
            pedestrian_roads_df[['geometry', 'length_m']],
            how='left', predicate='intersects'
        )
    
    logging.info(f"Pedestrian roads join resulted in {len(pedestrian_roads)} matches.")
    pedestrian_roads['length_m'] = pedestrian_roads['length_m'].fillna(0)
    pedestrian_road_lengths = pedestrian_roads.groupby('LIE_NAME')['length_m'].sum()
    pedestrian_road_lengths = pedestrian_road_lengths.reindex(neighborhoods_df['LIE_NAME'], fill_value=0)
    
    neighborhoods_df['pedestrian_road_density'] = (pedestrian_road_lengths / 1000) / neighborhoods_df['area_km2'].replace(0, 1e-6)
    logging.info(f"Pedestrian road density stats:\n{neighborhoods_df['pedestrian_road_density'].describe()}")
    
    components = {
        'LIE_NAME': [],
        'land_use_diversity': [],
        'green_space_score': [],
        'transit_score': [],
        'road_connectivity': [],
        'safety_score': [],
        'elderly_accessibility': [],
        'pedestrian_infrastructure_score': [],
        'walkability_score': [],
        'walkability_category': []
    }
    
    ndvi_min, ndvi_max = neighborhoods_df['ndvi_mean'].min(), neighborhoods_df['ndvi_mean'].max()
    tree_min, tree_max = neighborhoods_df['tree_count'].min(), neighborhoods_df['tree_count'].max()
    transit_min, transit_max = neighborhoods_df['transit_count'].min(), neighborhoods_df['transit_count'].max()
    intersection_density_min = neighborhoods_df['intersection_density'].min()
    intersection_density_max = neighborhoods_df['intersection_density'].max()
    accident_count_min, accident_count_max = neighborhoods_df['accident_count'].min(), neighborhoods_df['accident_count'].max()
    accident_density_max = neighborhoods_df['avg_road_accident_density'].max() if not neighborhoods_df['avg_road_accident_density'].isna().all() else 1.0
    pedestrian_road_max = neighborhoods_df['pedestrian_road_density'].max() if neighborhoods_df['pedestrian_road_density'].max() > 0 else 1.0
    
    accident_count_density = neighborhoods_df['accident_count'] / neighborhoods_df['area_km2'].replace(0, 1e-6)
    accident_count_density_max = accident_count_density.max() if accident_count_density.max() > 0 else 1.0
    
    for idx, row in tqdm(neighborhoods_df.iterrows(), total=len(neighborhoods_df), desc="Computing walkability scores"):
        land_use_cols = [f"land_use_{category.lower()}_percent" for category in CATEGORY_PRIORITY.keys()]
        land_use_values = [row.get(col, 0.1) / 100 for col in land_use_cols]  # Default to 0.1 (10%) if missing
        logging.debug(f"LIE_NAME: {row['LIE_NAME']}, Land use values: {land_use_values}")
        land_use_weights_list = [land_use_weights.get(category.lower(), 1.0) for category in CATEGORY_PRIORITY.keys()]
        weighted_values = [p * w for p, w in zip(land_use_values, land_use_weights_list) if p > 0]
        if not weighted_values:
            weighted_values = [1.0 / len(land_use_values)] * len(land_use_values)
        total = sum(weighted_values)
        if total > 0:
            weighted_values = [v / total for v in weighted_values]
            entropy = -np.sum([p * np.log2(p + 1e-10) for p in weighted_values])
            max_entropy = np.log2(len(weighted_values))
            land_use_diversity = entropy / max_entropy if max_entropy > 0 else 0.5
        else:
            land_use_diversity = 0.5
        
        ndvi_normalized = ((row['ndvi_mean'] - ndvi_min) / (ndvi_max - ndvi_min + 1e-6)) if (ndvi_max - ndvi_min) > 0 else 0
        tree_density = row['tree_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        tree_density_max = (neighborhoods_df['tree_count'] / neighborhoods_df['area_km2']).replace(0, 1e-6).max()
        tree_density_normalized = (tree_density / (tree_density_max + 1e-6)) if tree_density_max > 0 else 0
        open_area = row.get('land_use_city_open_area_percent', 0.1) / 100  # Default to 0.1 if missing
        green_space_score = (0.4 * ndvi_normalized + 0.3 * tree_density_normalized + 0.3 * open_area)
        
        transit_raw = (row['transit_count'] - transit_min) / (transit_max - transit_min + 1e-6) if (transit_max - transit_min) > 0 else 0
        transit_score = transit_raw
        
        intersection_density = row['intersection_density']
        intersection_density_normalized = (intersection_density - intersection_density_min) / (intersection_density_max - intersection_density_min + 1e-6) if (intersection_density_max - intersection_density_min) > 0 else 0
        road_connectivity = intersection_density_normalized
        
        accident_count_density = row['accident_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        accident_count_density = min(accident_count_density, accident_count_density_max * 0.5)
        safety_score_count = 1 - (accident_count_density / (accident_count_density_max + 1e-6)) if accident_count_density_max > 0 else 1
        accident_density = row['avg_road_accident_density'] if pd.notna(row['avg_road_accident_density']) else 0
        safety_score_roads = 1 - (accident_density / (accident_density_max + 1e-6)) if accident_density_max > 0 else 1
        pedestrian_roads = row.get('pedestrian_road_density', 0.0)
        pedestrian_roads_safety = min(pedestrian_roads / (pedestrian_road_max + 1e-6), 1.0) if pedestrian_road_max > 0 else 0.1
        safety_score = (0.3 * safety_score_count + 0.3 * safety_score_roads + 0.4 * pedestrian_roads_safety) if row['accident_count'] != 0 else (0.5 * safety_score_roads + 0.5 * pedestrian_roads_safety)
        safety_score = np.clip(safety_score, 0, 1)
        
        elderly_percentage = row['elderly_percentage'] / 100
        medical_access = row.get('land_use_medical_percent', 0.1) / 100  # Default to 0.1 if missing
        elderly_accessibility = 0.3 * elderly_percentage + 0.4 * medical_access + 0.3 * pedestrian_roads_safety
        elderly_accessibility = np.clip(elderly_accessibility, 0, 1)
        
        pedestrian_road_score = pedestrian_roads / (pedestrian_road_max + 1e-6) if pedestrian_road_max > 0 else 0
        amenity_access = row.get('land_use_amenity_percent', 0.1) / 100  # Default to 0.1 if missing
        pedestrian_infrastructure_score = 0.5 * pedestrian_road_score + 0.5 * amenity_access
        pedestrian_infrastructure_score = np.clip(pedestrian_infrastructure_score, 0, 1)
        
        base_score = (
            0.35 * land_use_diversity +
            0.35 * green_space_score +
            0.15 * transit_score +
            0.10 * road_connectivity +
            0.05 * pedestrian_infrastructure_score
        )
        safety_modifier = 0.7 + 0.3 * safety_score
        elderly_modifier = 0.9 + 0.1 * elderly_accessibility
        walkability_score = base_score * safety_modifier * elderly_modifier
        walkability_score = np.clip(walkability_score, 0, 1)
        
        if pd.isna(walkability_score):
            logging.warning(f"Walkability score is NaN for {row['LIE_NAME']}. Setting to 0.")
            walkability_score = 0
            category = 'low'
        elif walkability_score < 0.33:
            category = 'low'
        elif walkability_score < 0.66:
            category = 'medium'
        else:
            category = 'high'
        
        components['LIE_NAME'].append(row['LIE_NAME'])
        components['land_use_diversity'].append(land_use_diversity)
        components['green_space_score'].append(green_space_score)
        components['transit_score'].append(transit_score)
        components['road_connectivity'].append(road_connectivity)
        components['safety_score'].append(safety_score)
        components['elderly_accessibility'].append(elderly_accessibility)
        components['pedestrian_infrastructure_score'].append(pedestrian_infrastructure_score)
        components['walkability_score'].append(walkability_score)
        components['walkability_category'].append(category)
    
    df_components = pd.DataFrame(components)
    logging.info(f"Land use diversity distribution:\n{df_components['land_use_diversity'].describe()}")
    logging.info(f"Green space score distribution:\n{df_components['green_space_score'].describe()}")
    logging.info(f"Transit score distribution:\n{df_components['transit_score'].describe()}")
    logging.info(f"Road connectivity distribution:\n{df_components['road_connectivity'].describe()}")
    logging.info(f"Safety score distribution:\n{df_components['safety_score'].describe()}")
    logging.info(f"Elderly accessibility distribution:\n{df_components['elderly_accessibility'].describe()}")
    logging.info(f"Pedestrian infrastructure score distribution:\n{df_components['pedestrian_infrastructure_score'].describe()}")
    logging.info(f"Walkability score distribution:\n{df_components['walkability_score'].describe()}")
    
    return df_components

Cell 4 Main Data Loading and Processing

In [314]:
import logging
import os
import json
from tqdm import tqdm
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from shapely import make_valid

def load_and_prepare_data():
    logging.info("Stage 1: Loading and preparing data...")
    
    # Define file paths and their corresponding keys
    data_files = {
        'neighborhoods': LANDUSE_NDVI_PATH,
        'buildings': OSM_BUILDINGS_PATH,
        'roads': OSM_ROADS_PATH,
        'trees': OSM_TREES_PATH,
        'transit': OSM_TRANSIT_PATH,
        'urban_masterplan': URBAN_MASTERPLAN_PATH,
        'accidents': ACCIDENTS_PATH,
        'population': POPULATION_PATH
    }
    
    data = {}
    
    # Load data with progress bar
    for key, path in tqdm(data_files.items(), desc="Loading files"):
        try:
            if key == 'population':
                with open(path, 'r') as f:
                    data[key] = pd.DataFrame(json.load(f))
                # Log columns of population_df to debug missing columns
                logging.info(f"Columns in population_df after loading: {list(data[key].columns)}")
            elif path.endswith('.geoparquet'):
                data[key] = gpd.read_parquet(path)
            else:
                data[key] = gpd.read_file(path)
            logging.info(f"Loaded {key} with shape {data[key].shape}")
        except Exception as e:
            logging.error(f"Failed to load {key} from {path}: {e}")
            raise
    
    # Log columns of neighborhoods_gdf to debug missing 'area_km2'
    neighborhoods_gdf = data['neighborhoods']
    logging.info(f"Columns in neighborhoods_gdf after loading: {list(neighborhoods_gdf.columns)}")
    
    # Ensure all GeoDataFrames are in the same CRS
    target_crs = 'EPSG:3826'
    for key in ['neighborhoods', 'buildings', 'roads', 'trees', 'transit', 'urban_masterplan', 'accidents']:
        if key in data and isinstance(data[key], gpd.GeoDataFrame):
            if data[key].crs != target_crs:
                data[key] = data[key].to_crs(target_crs)
                logging.info(f"Converted {key} to CRS {target_crs}")
    
    # Fix geometries in all GeoDataFrames
    for key in ['neighborhoods', 'buildings', 'roads', 'trees', 'transit', 'urban_masterplan', 'accidents']:
        if key in data and isinstance(data[key], gpd.GeoDataFrame):
            data[key]['geometry'] = data[key]['geometry'].apply(fix_geometry)
            invalid_geoms = data[key][~data[key].geometry.is_valid]
            if not invalid_geoms.empty:
                logging.warning(f"Found {len(invalid_geoms)} invalid geometries in {key} after fixing.")
                data[key] = data[key][data[key].geometry.is_valid]
    
    # Compute intersections for neighborhoods
    logging.info("Computing intersections for neighborhoods...")
    roads_gdf = data['roads']
    
    # Log columns of roads_gdf to debug missing 'length_m'
    logging.info(f"Columns in roads_gdf after loading: {list(roads_gdf.columns)}")
    
    # Extract endpoints from road segments
    logging.info("Extracting endpoints from road segments...")
    endpoints = []
    road_indices = []
    for idx, row in tqdm(roads_gdf.iterrows(), total=len(roads_gdf), desc="Extracting endpoints"):
        geom = row['geometry']
        if geom.geom_type == 'LineString':
            coords = list(geom.coords)
            start_point = Point(coords[0])
            end_point = Point(coords[-1])
            if start_point.is_valid and end_point.is_valid:
                endpoints.extend([start_point, end_point])
                road_indices.extend([idx, idx])
        elif geom.geom_type == 'MultiLineString':
            for line in geom.geoms:
                coords = list(line.coords)
                start_point = Point(coords[0])
                end_point = Point(coords[-1])
                if start_point.is_valid and end_point.is_valid:
                    endpoints.extend([start_point, end_point])
                    road_indices.extend([idx, idx])
    
    if not endpoints:
        logging.warning("No valid endpoints extracted from road segments. Using fallback method for intersections.")
        neighborhoods_gdf = data['neighborhoods']
        road_neighborhoods = gpd.sjoin(roads_gdf[['geometry']], neighborhoods_gdf[['geometry', 'LIE_NAME']], how='left', predicate='intersects')
        intersection_counts = road_neighborhoods.groupby('index_right').size()
        neighborhoods_gdf['intersection_count'] = intersection_counts.reindex(neighborhoods_gdf.index, fill_value=0)
    else:
        endpoints_gdf = gpd.GeoDataFrame({'geometry': endpoints, 'road_idx': road_indices}, crs=target_crs)
        
        # Create a spatial index for endpoints
        endpoints_sindex = endpoints_gdf.sindex
        
        # Cluster endpoints to identify intersections (points shared by 3 or more roads)
        logging.info("Building endpoint-to-road mapping...")
        endpoint_to_roads = {}
        for idx, point in tqdm(endpoints_gdf.iterrows(), total=len(endpoints_gdf), desc="Building endpoint-to-road mapping"):
            point_geom = point['geometry']
            road_idx = point['road_idx']
            point_key = (round(point_geom.x, 6), round(point_geom.y, 6))  # Round to avoid floating-point precision issues
            if point_key not in endpoint_to_roads:
                endpoint_to_roads[point_key] = set()
            endpoint_to_roads[point_key].add(road_idx)
        
        logging.info("Identifying intersections...")
        intersections = []
        for point_key, road_ids in tqdm(endpoint_to_roads.items(), desc="Identifying intersections"):
            if len(road_ids) >= 3:  # Intersection if shared by 3 or more roads
                intersections.append(Point(point_key))
        
        if not intersections:
            logging.warning("No intersections found using endpoint clustering. Using fallback method.")
            neighborhoods_gdf = data['neighborhoods']
            road_neighborhoods = gpd.sjoin(roads_gdf[['geometry']], neighborhoods_gdf[['geometry', 'LIE_NAME']], how='left', predicate='intersects')
            intersection_counts = road_neighborhoods.groupby('index_right').size()
            neighborhoods_gdf['intersection_count'] = intersection_counts.reindex(neighborhoods_gdf.index, fill_value=0)
        else:
            intersections_gdf = gpd.GeoDataFrame({'geometry': intersections}, crs=target_crs)
            
            # Count intersections per neighborhood
            logging.info("Counting intersections per neighborhood...")
            neighborhoods_gdf = data['neighborhoods']
            intersections_joined = gpd.sjoin(intersections_gdf, neighborhoods_gdf[['geometry', 'LIE_NAME']], how='left', predicate='intersects')
            intersection_counts = intersections_joined.groupby('index_right').size()
            neighborhoods_gdf['intersection_count'] = intersection_counts.reindex(neighborhoods_gdf.index, fill_value=0)
    
    # Compute or verify area_km2
    if 'area_km2' not in neighborhoods_gdf.columns:
        logging.warning("'area_km2' column missing in neighborhoods_gdf. Computing from geometry...")
        # Compute area in square meters, then convert to square kilometers
        neighborhoods_gdf['area_m2'] = neighborhoods_gdf['geometry'].area
        neighborhoods_gdf['area_km2'] = neighborhoods_gdf['area_m2'] / 1_000_000  # Convert m² to km²
        logging.info(f"Computed area_km2 stats:\n{neighborhoods_gdf['area_km2'].describe()}")
    else:
        logging.info(f"area_km2 already present. Stats:\n{neighborhoods_gdf['area_km2'].describe()}")
    
    # Compute intersection density
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6)
    logging.info(f"Intersection count stats:\n{neighborhoods_gdf['intersection_count'].describe()}")
    logging.info(f"Intersection density stats:\n{neighborhoods_gdf['intersection_density'].describe()}")
    
    # Cache the result
    try:
        neighborhoods_gdf.to_parquet(INTERSECTION_CACHE_PATH)
        logging.info(f"Saved neighborhoods with intersections to {INTERSECTION_CACHE_PATH}")
    except Exception as e:
        logging.error(f"Failed to save neighborhoods with intersections: {e}")
    
    data['neighborhoods'] = neighborhoods_gdf
    
    # Compute tree count per neighborhood
    logging.info("Computing tree count per neighborhood...")
    trees_gdf = data['trees']
    trees_joined = gpd.sjoin(trees_gdf[['geometry']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    tree_counts = trees_joined.groupby('index_right').size()
    data['neighborhoods']['tree_count'] = tree_counts.reindex(data['neighborhoods'].index, fill_value=0)
    
    # Compute transit count per neighborhood
    logging.info("Computing transit count per neighborhood...")
    transit_gdf = data['transit']
    transit_joined = gpd.sjoin(transit_gdf[['geometry']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    transit_counts = transit_joined.groupby('index_right').size()
    data['neighborhoods']['transit_count'] = transit_counts.reindex(data['neighborhoods'].index, fill_value=0)
    
    # Compute accident count per neighborhood
    logging.info("Computing accident count per neighborhood...")
    accidents_gdf = data['accidents']
    accidents_buffered = accidents_gdf.copy()
    accidents_buffered['geometry'] = accidents_buffered['geometry'].buffer(BUFFER_DISTANCE)
    accidents_joined = gpd.sjoin(accidents_buffered[['geometry']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    accident_counts = accidents_joined.groupby('index_right').size()
    data['neighborhoods']['accident_count'] = accident_counts.reindex(data['neighborhoods'].index, fill_value=0)
    
    # Compute road density per neighborhood
    logging.info("Computing road density per neighborhood...")
    roads_gdf = data['roads']
    
    # Log columns of roads_gdf to debug missing 'length_m'
    logging.info(f"Columns in roads_gdf before computing road density: {list(roads_gdf.columns)}")
    
    # Compute length_m if missing
    if 'length_m' not in roads_gdf.columns:
        logging.warning("'length_m' column missing in roads_gdf. Computing from geometry...")
        roads_gdf['length_m'] = roads_gdf['geometry'].length  # Length in meters (since CRS is EPSG:3826)
        logging.info(f"Computed length_m stats:\n{roads_gdf['length_m'].describe()}")
    
    roads_joined = gpd.sjoin(roads_gdf[['geometry', 'length_m']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    road_lengths = roads_joined.groupby('index_right')['length_m'].sum()
    data['neighborhoods']['road_density'] = road_lengths.reindex(data['neighborhoods'].index, fill_value=0) / (data['neighborhoods']['area_km2'] * 1000)
    logging.info(f"Road density stats:\n{data['neighborhoods']['road_density'].describe()}")
    
    # Merge population data
    logging.info("Merging population data...")
    population_df = data['population']
    population_df['LIE_NAME'] = population_df['LIE_NAME'].astype(str).str.strip()
    data['neighborhoods']['LIE_NAME'] = data['neighborhoods']['LIE_NAME'].astype(str).str.strip()
    
    # Check for possible column names for total_population and elderly_percentage
    expected_cols = ['total_population', 'elderly_percentage']
    population_cols = list(population_df.columns)
    missing_cols = [col for col in expected_cols if col not in population_cols]
    
    if missing_cols:
        logging.warning(f"Expected columns {missing_cols} not found in population_df. Attempting to find alternatives...")
        # Possible alternative names
        total_pop_alt = None
        elderly_alt = None
        for col in population_cols:
            col_lower = col.lower()
            if 'population' in col_lower and total_pop_alt is None:
                total_pop_alt = col
                logging.info(f"Found alternative for total_population: {col}")
            if 'elderly' in col_lower and elderly_alt is None:
                elderly_alt = col
                logging.info(f"Found alternative for elderly_percentage: {col}")
        
        # Rename columns if alternatives are found
        if total_pop_alt:
            population_df = population_df.rename(columns={total_pop_alt: 'total_population'})
        else:
            logging.warning("No alternative found for total_population. Setting to 0.")
            population_df['total_population'] = 0
        if elderly_alt:
            population_df = population_df.rename(columns={elderly_alt: 'elderly_percentage'})
        else:
            logging.warning("No alternative found for elderly_percentage. Setting to 0.")
            population_df['elderly_percentage'] = 0
    
    # Perform the merge
    data['neighborhoods'] = data['neighborhoods'].merge(
        population_df[['LIE_NAME', 'total_population', 'elderly_percentage']],
        on='LIE_NAME',
        how='left'
    )
    
    # Compute land use percentages
    logging.info("Computing land use percentages for neighborhoods...")
    urban_masterplan_gdf = data['urban_masterplan']
    print_percentage_calculation(data['neighborhoods'], urban_masterplan_gdf, sample_size=3)
    
    for idx, row in data['neighborhoods'].iterrows():
        neighborhood_geom = row['geometry']
        if not neighborhood_geom.is_valid:
            continue
        relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
        if relevant_masterplan.empty:
            continue
        
        temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs='EPSG:3826')
        intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
        if intersected.empty:
            continue
        
        intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
        intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty]
        if intersected.empty:
            continue
        
        intersected['priority'] = intersected['Category'].map(CATEGORY_PRIORITY)
        intersected = intersected.sort_values(by='priority', ascending=False)
        
        total_area = intersected.geometry.union_all().area
        remaining_geom = intersected.geometry.union_all()
        category_areas = {}
        for category in intersected['Category'].unique():
            category_rows = intersected[intersected['Category'] == category]
            category_geom = category_rows.geometry.union_all()
            try:
                category_area_geom = category_geom.intersection(remaining_geom)
                category_area = category_area_geom.area
                category_areas[category] = category_area
                remaining_geom = remaining_geom.difference(category_area_geom)
            except Exception as e:
                logging.warning(f"Topology error for category {category} in neighborhood {row['LIE_NAME']}: {e}")
                category_areas[category] = 0.0
        
        for category in CATEGORY_PRIORITY.keys():
            area = category_areas.get(category, 0.0)
            percentage = (area / total_area * 100) if total_area > 0 else 0.0
            data['neighborhoods'].at[idx, f'land_use_{category.lower()}_percent'] = percentage
    
    # Fill NaN values in land use percentages
    for category in CATEGORY_PRIORITY.keys():
        col = f'land_use_{category.lower()}_percent'
        data['neighborhoods'][col] = data['neighborhoods'][col].fillna(0.0)
    
    # Fill NaN values in other columns
    for col in ['intersection_count', 'intersection_density', 'tree_count', 'transit_count', 'accident_count', 'road_density', 'total_population', 'elderly_percentage']:
        data['neighborhoods'][col] = data['neighborhoods'][col].fillna(0)
    
    # Print data structure summary
    print_data_structure(data)
    
    logging.info("Finished loading and preparing data.")
    return data

Cell 5 compute_intersection_counts

In [315]:
def compute_intersection_counts(neighborhoods_gdf, roads_gdf):
    logging.info("Computing intersection counts for neighborhoods...")
    
    # Extract endpoints from road segments
    def get_endpoints(line):
        if line is None or line.is_empty:
            return []
        coords = list(line.coords)
        return [Point(coords[0]), Point(coords[-1])]
    
    endpoints = []
    for idx, row in tqdm(roads_gdf.iterrows(), total=len(roads_gdf), desc="Extracting endpoints"):
        points = get_endpoints(row['geometry'])
        for point in points:
            endpoints.append({'geometry': point, 'road_idx': idx})
    
    endpoints_gdf = gpd.GeoDataFrame(endpoints, crs='EPSG:3826')
    
    # Build a mapping of endpoints to road indices
    endpoint_to_roads = {}
    for idx, row in tqdm(endpoints_gdf.iterrows(), total=len(endpoints_gdf), desc="Building endpoint-to-road mapping"):
        point = row['geometry']
        road_idx = row['road_idx']
        point_tuple = (point.x, point.y)
        if point_tuple not in endpoint_to_roads:
            endpoint_to_roads[point_tuple] = set()
        endpoint_to_roads[point_tuple].add(road_idx)
    
    # Identify intersections (endpoints shared by 3 or more roads)
    intersections = []
    for point_tuple, road_indices in tqdm(endpoint_to_roads.items(), desc="Identifying intersections"):
        if len(road_indices) >= 3:  # Intersection if 3 or more roads share the endpoint
            intersections.append({'geometry': Point(point_tuple)})
    
    if not intersections:
        logging.warning("No intersections found. Setting intersection counts to 0.")
        neighborhoods_gdf['intersection_count'] = 0
        neighborhoods_gdf['intersection_density'] = 0.0
        return neighborhoods_gdf
    
    intersections_gdf = gpd.GeoDataFrame(intersections, crs='EPSG:3826')
    
    # Spatial join to count intersections per neighborhood
    intersection_counts = gpd.sjoin(
        neighborhoods_gdf[['geometry', 'LIE_NAME']],
        intersections_gdf,
        how='left',
        predicate='contains'
    )
    intersection_counts = intersection_counts.groupby('LIE_NAME').size().reindex(neighborhoods_gdf['LIE_NAME'], fill_value=0)
    neighborhoods_gdf['intersection_count'] = intersection_counts
    
    # Compute intersection density (intersections per km²)
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_count'] / neighborhoods_gdf['area_km2']
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_density'].replace([np.inf, -np.inf], 0).fillna(0)
    
    logging.info(f"Intersection count stats: {neighborhoods_gdf['intersection_count'].describe()}")
    logging.info(f"Intersection density stats: {neighborhoods_gdf['intersection_density'].describe()}")
    
    return neighborhoods_gdf

def build_graph(data, force_recompute=False):
    logging.info("Stage 2: Building the graph...")
    
    # Compute data hash to check if graph needs recomputing
    data_hash = compute_data_hash(data)
    cached_hash = None
    if os.path.exists(GRAPH_DATA_HASH_PATH):
        with open(GRAPH_DATA_HASH_PATH, 'r') as f:
            cached_hash = f.read().strip()
    
    if not force_recompute and cached_hash == data_hash and all(
        os.path.exists(path) for path in [GRAPH_NODES_CACHE_PATH, GRAPH_EDGES_CACHE_PATH, GRAPH_NODE_ID_CACHE_PATH]
    ):
        logging.info("Data unchanged. Loading graph from cache...")
        nodes_df = cudf.read_parquet(GRAPH_NODES_CACHE_PATH)
        edges_df = cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
        with open(GRAPH_NODE_ID_CACHE_PATH, 'r') as f:
            node_id_to_index = json.load(f)
        G = cugraph.Graph()
        G.from_cudf_edgelist(
            edges_df,
            source='src',
            destination='dst',
            edge_attr='weight'
        )
        G._nodes = nodes_df
        logging.info("Graph loaded from cache.")
        return G
    
    neighborhoods_gdf = data['neighborhoods'].copy()
    buildings_gdf = data['buildings'].copy()
    roads_gdf = data['roads'].copy()
    trees_gdf = data['trees'].copy()
    transit_gdf = data['transit'].copy()
    
    # Create nodes for neighborhoods, buildings, roads, trees, and transit
    nodes = []
    node_id_to_index = {}
    current_idx = 0
    
    # Neighborhood nodes
    for idx, row in neighborhoods_gdf.iterrows():
        node_id = f"neighborhood_{row['LIE_NAME']}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'neighborhood',
            'LIE_NAME': row['LIE_NAME'],
            'geometry': row['geometry'].centroid,
            'ndvi_mean': row['ndvi_mean'],
            'total_population': row['total_population'],
            'elderly_percentage': row['elderly_percentage'],
            'area_km2': row['area_km2']
        })
        current_idx += 1
    
    # Building nodes
    for idx, row in buildings_gdf.iterrows():
        node_id = f"building_{idx}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'building',
            'LIE_NAME': None,
            'geometry': row['geometry'].centroid,
            'building_type': row['building'],
            'area_m2': row['area_m2']
        })
        current_idx += 1
    
    # Road nodes
    for idx, row in roads_gdf.iterrows():
        node_id = f"road_{idx}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'road',
            'LIE_NAME': None,
            'geometry': row['geometry'].centroid,
            'class': row['class'],
            'length_m': row['length_m']
        })
        current_idx += 1
    
    # Tree nodes
    for idx, row in trees_gdf.iterrows():
        node_id = f"tree_{idx}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'tree',
            'LIE_NAME': None,
            'geometry': row['geometry']
        })
        current_idx += 1
    
    # Transit nodes
    for idx, row in transit_gdf.iterrows():
        node_id = f"transit_{idx}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'transit',
            'LIE_NAME': None,
            'geometry': row['geometry'],
            'class': row['class']
        })
        current_idx += 1
    
    nodes_df = pd.DataFrame(nodes)
    nodes_gdf = gpd.GeoDataFrame(nodes_df, geometry='geometry', crs='EPSG:3826')
    nodes_df = cudf.from_pandas(nodes_df.drop(columns=['geometry']))
    
    # Create edges based on spatial proximity
    edges = []
    nodes_gdf_sindex = nodes_gdf.sindex
    
    # Neighborhood-to-neighborhood edges (shared borders)
    logging.info("Creating neighborhood-to-neighborhood edges...")
    for idx1, row1 in neighborhoods_gdf.iterrows():
        geom1 = row1['geometry']
        node_idx1 = node_id_to_index[f"neighborhood_{row1['LIE_NAME']}"]
        possible_matches = list(nodes_gdf_sindex.query(geom1, predicate='intersects'))
        for idx2 in possible_matches:
            row2 = nodes_gdf.iloc[idx2]
            if row2['type'] != 'neighborhood':
                continue
            if row1['LIE_NAME'] == row2['LIE_NAME']:
                continue
            geom2 = neighborhoods_gdf[neighborhoods_gdf['LIE_NAME'] == row2['LIE_NAME']]['geometry'].iloc[0]
            if geom1.intersects(geom2):
                node_idx2 = node_id_to_index[f"neighborhood_{row2['LIE_NAME']}"]
                edges.append({
                    'src': node_idx1,
                    'dst': node_idx2,
                    'weight': 1.0
                })
    
    # Other edges (neighborhood to building, road, tree, transit)
    logging.info("Creating edges between neighborhoods and other entities...")
    for idx, row in tqdm(nodes_gdf.iterrows(), total=len(nodes_gdf), desc="Creating edges"):
        if row['type'] == 'neighborhood':
            continue
        geom = row['geometry']
        possible_matches = list(neighborhoods_gdf.sindex.query(geom, predicate='contains'))
        for match_idx in possible_matches:
            neighborhood = neighborhoods_gdf.iloc[match_idx]
            if neighborhood['geometry'].contains(geom):
                node_idx1 = node_id_to_index[row['node_id']]
                node_idx2 = node_id_to_index[f"neighborhood_{neighborhood['LIE_NAME']}"]
                weight = 1.0
                if row['type'] == 'transit':
                    weight = 2.0  # Higher weight for transit nodes
                edges.append({
                    'src': node_idx1,
                    'dst': node_idx2,
                    'weight': weight
                })
                edges.append({
                    'src': node_idx2,
                    'dst': node_idx1,
                    'weight': weight
                })
    
    edges_df = cudf.DataFrame(edges)
    
    # Build the graph
    G = cugraph.Graph()
    G.from_cudf_edgelist(
        edges_df,
        source='src',
        destination='dst',
        edge_attr='weight'
    )
    G._nodes = nodes_df
    
    # Cache the graph
    nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
    edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
    with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
        json.dump(node_id_to_index, f)
    with open(GRAPH_DATA_HASH_PATH, 'w') as f:
        f.write(data_hash)
    
    logging.info("Graph construction completed.")
    return G

def prepare_gnn_data(G):
    logging.info("Stage 3: Preparing data for GNN...")
    
    nodes_df = G._nodes.to_pandas()
    edges_df = G.edgelist.edgelist_df.to_pandas()
    
    # Create node features
    feature_columns = [
        'ndvi_mean', 'total_population', 'elderly_percentage', 'area_km2',
        'area_m2', 'length_m'
    ]
    features = []
    for idx, row in nodes_df.iterrows():
        node_features = []
        for col in feature_columns:
            value = row.get(col, 0.0)
            if pd.isna(value):
                value = 0.0
            node_features.append(value)
        
        # One-hot encode node type
        node_type = row['type']
        type_encoding = [0] * 5  # 5 types: neighborhood, building, road, tree, transit
        type_mapping = {
            'neighborhood': 0,
            'building': 1,
            'road': 2,
            'tree': 3,
            'transit': 4
        }
        type_idx = type_mapping.get(node_type, 0)
        type_encoding[type_idx] = 1
        node_features.extend(type_encoding)
        
        features.append(node_features)
    
    feature_matrix = np.array(features, dtype=np.float32)
    
    # Normalize numerical features
    numerical_features = feature_matrix[:, :len(feature_columns)]
    means = numerical_features.mean(axis=0)
    stds = numerical_features.std(axis=0)
    stds[stds == 0] = 1  # Avoid division by zero
    numerical_features = (numerical_features - means) / stds
    feature_matrix[:, :len(feature_columns)] = numerical_features
    
    # Create edge indices for PyG
    edge_index = torch.tensor(
        np.array([edges_df['src'].values, edges_df['dst'].values]),
        dtype=torch.long
    )
    edge_attr = torch.tensor(edges_df['weight'].values, dtype=torch.float)
    
    # Create target (walkability score) for neighborhood nodes
    y = np.zeros(len(nodes_df), dtype=np.float32)
    if 'walkability_score' in nodes_df.columns:
        walkability_scores = nodes_df['walkability_score'].fillna(0).values
        mask = nodes_df['type'] == 'neighborhood'
        y[mask] = walkability_scores[mask]
    else:
        logging.warning("Walkability scores not found in nodes_df. Setting targets to 0.")
    
    y = torch.tensor(y, dtype=torch.float)
    
    node_type_mapping = {
        'neighborhood': 0,
        'building': 1,
        'road': 2,
        'tree': 3,
        'transit': 4
    }
    node_type = nodes_df['type'].map(node_type_mapping).fillna(-1).astype(int).values
    node_type = torch.tensor(node_type, dtype=torch.long)
    
    data = Data(
        x=torch.tensor(feature_matrix, dtype=torch.float),
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=y,
        node_type=node_type
    )
    
    logging.info("GNN data prepared.")
    return data

Cell 6: Graph Construction (build_graph)

In [316]:
from multiprocessing import Pool, cpu_count

def compute_neighborhood_neighborhood_edges(args):
    idx, row, neighborhoods_gdf, neighborhood_sindex = args
    edges = []
    geom = row['geometry']
    possible_matches_index = list(neighborhood_sindex.intersection(geom.bounds))
    for other_idx in possible_matches_index:
        if other_idx != idx:
            other_row = neighborhoods_gdf.iloc[other_idx]
            other_geom = other_row['geometry']
            try:
                if geom.buffer(1e-3).intersects(other_geom.buffer(1e-3)) or geom.buffer(1e-3).touches(other_geom.buffer(1e-3)):
                    src = f"neighborhood_{idx}"
                    dst = f"neighborhood_{other_idx}"
                    edges.append({'src': src, 'dst': dst})
            except Exception as e:
                logging.warning(f"Error checking intersection between neighborhood {idx} and {other_idx}: {e}")
    return edges

def compute_neighborhood_building_edges(args):
    idx, row, buildings_gdf, building_sindex = args
    edges = []
    geom = row['geometry']
    possible_matches_index = list(building_sindex.intersection(geom.bounds))
    for building_idx in possible_matches_index:
        building_row = buildings_gdf.iloc[building_idx]
        building_geom = building_row['geometry']
        try:
            if geom.buffer(1e-3).intersects(building_geom.buffer(1e-3)):
                src = f"neighborhood_{idx}"
                dst = f"building_{building_idx}"
                edges.append({'src': src, 'dst': dst})
        except Exception as e:
            logging.warning(f"Error checking intersection between neighborhood {idx} and building {building_idx}: {e}")
    return edges

def compute_neighborhood_road_edges(args):
    idx, row, roads_gdf, road_sindex = args
    edges = []
    geom = row['geometry']
    possible_matches_index = list(road_sindex.intersection(geom.bounds))
    for road_idx in possible_matches_index:
        road_row = roads_gdf.iloc[road_idx]
        road_geom = road_row['geometry']
        try:
            if geom.buffer(1e-3).intersects(road_geom.buffer(1e-3)):
                src = f"neighborhood_{idx}"
                dst = f"road_{road_idx}"
                edges.append({'src': src, 'dst': dst})
        except Exception as e:
            logging.warning(f"Error checking intersection between neighborhood {idx} and road {road_idx}: {e}")
    return edges



In [317]:
def build_graph(data, force_recompute=False):
    logging.info("Stage 2: Building city graph...")
    current_hash = compute_data_hash(data)
    nodes_df = None
    edges_df = None
    node_id_to_vertex = {}
    vertex_to_index = {}

    if not force_recompute and os.path.exists(GRAPH_NODES_CACHE_PATH) and os.path.exists(GRAPH_EDGES_CACHE_PATH):
        try:
            with open(GRAPH_DATA_HASH_PATH, 'r') as f:
                cached_hash = f.read()
            if cached_hash == current_hash:
                logging.info("Data hash matches cached hash. Loading graph from cache...")
                nodes_df = cudf.read_parquet(GRAPH_NODES_CACHE_PATH)
                edges_df = cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
                with open(GRAPH_NODE_ID_CACHE_PATH, 'r') as f:
                    node_id_to_vertex = json.load(f)
                G = cugraph.Graph()
                G._nodes = nodes_df
                if not edges_df.empty:
                    G.from_cudf_edgelist(edges_df, source='src', destination='dst')
                logging.info(f"Loaded graph from cache: {len(nodes_df)} nodes, {len(edges_df)} edges")
                return G
        except Exception as e:
            logging.warning(f"Failed to load cached graph: {e}. Recomputing graph...")

    logging.info("Constructing graph nodes...")
    neighborhoods_gdf = data['neighborhoods']
    buildings_gdf = data['buildings']
    roads_gdf = data['roads']

    # Compute area_m2 for buildings if not present
    if 'area_m2' not in buildings_gdf.columns:
        logging.warning("'area_m2' column missing in buildings_gdf. Computing from geometry...")
        buildings_gdf['area_m2'] = buildings_gdf['geometry'].area
        logging.info(f"Computed area_m2 stats:\n{buildings_gdf['area_m2'].describe()}")

    # Create nodes with both vertex IDs and integer indices
    logging.info("Adding neighborhood nodes...")
    neighborhood_nodes = []
    for idx, row in tqdm(neighborhoods_gdf.iterrows(), total=len(neighborhoods_gdf), desc="Neighborhood nodes"):
        vertex = f"neighborhood_{idx}"
        node_id_to_vertex[idx] = vertex
        vertex_to_index[vertex] = idx  # Map vertex to integer index
        neighborhood_nodes.append({
            'index': idx,  # Integer index for GNN
            'vertex': vertex,  # String vertex ID
            'type': 'neighborhood',
            'LIE_NAME': row['LIE_NAME'],
            'ndvi_mean': row['ndvi_mean'],
            'tree_count': row['tree_count'],
            'transit_count': row['transit_count'],
            'accident_count': row['accident_count'],
            'road_density': row['road_density'],
            'intersection_density': row['intersection_density'],
            'total_population': row['total_population'],
            'elderly_percentage': row['elderly_percentage'],
            **{f'land_use_{category.lower()}_percent': row.get(f'land_use_{category.lower()}_percent', 0.0) for category in CATEGORY_PRIORITY.keys()}
        })

    logging.info("Adding building nodes...")
    building_nodes = []
    for idx, row in tqdm(buildings_gdf.iterrows(), total=len(buildings_gdf), desc="Building nodes"):
        offset = len(neighborhoods_gdf)
        vertex = f"building_{idx}"
        node_id_to_vertex[idx + offset] = vertex
        vertex_to_index[vertex] = idx + offset
        building_nodes.append({
            'index': idx + offset,
            'vertex': vertex,
            'type': 'building',
            'building': row['building'] if pd.notna(row['building']) else 'unknown',
            'area_m2': row['area_m2']
        })

    logging.info("Adding road nodes...")
    road_nodes = []
    for idx, row in tqdm(roads_gdf.iterrows(), total=len(roads_gdf), desc="Road nodes"):
        offset = len(neighborhoods_gdf) + len(buildings_gdf)
        vertex = f"road_{idx}"
        node_id_to_vertex[idx + offset] = vertex
        vertex_to_index[vertex] = idx + offset
        road_nodes.append({
            'index': idx + offset,
            'vertex': vertex,
            'type': 'road',
            'class': row['class'] if pd.notna(row['class']) else 'unknown',
            'length_m': row['length_m']
        })

    # Combine all nodes
    nodes = neighborhood_nodes + building_nodes + road_nodes
    nodes_df = cudf.DataFrame(nodes)

    # Convert GeoDataFrames to cudf for GPU-accelerated operations
    logging.info("Converting GeoDataFrames to cudf for GPU processing...")
    neighborhoods_cudf = cudf.from_pandas(neighborhoods_gdf.drop(columns=['geometry']))
    buildings_cudf = cudf.from_pandas(buildings_gdf.drop(columns=['geometry']))
    roads_cudf = cudf.from_pandas(roads_gdf.drop(columns=['geometry']))

    # Extract bounding box coordinates as separate columns, ensuring scalar values
    logging.info("Extracting bounding box coordinates...")
    # Apply fix_geometry to ensure all geometries are valid
    neighborhoods_gdf['geometry'] = neighborhoods_gdf['geometry'].apply(fix_geometry)
    buildings_gdf['geometry'] = buildings_gdf['geometry'].apply(fix_geometry)
    roads_gdf['geometry'] = roads_gdf['geometry'].apply(fix_geometry)

    # Extract bounds and convert to scalar floats
    bounds_df = neighborhoods_gdf['geometry'].apply(lambda geom: pd.Series(geom.bounds, index=['min_x', 'min_y', 'max_x', 'max_y'])).astype(float).fillna(0.0)
    neighborhoods_cudf['min_x'] = cudf.Series(bounds_df['min_x'].values, dtype='float64')
    neighborhoods_cudf['min_y'] = cudf.Series(bounds_df['min_y'].values, dtype='float64')
    neighborhoods_cudf['max_x'] = cudf.Series(bounds_df['max_x'].values, dtype='float64')
    neighborhoods_cudf['max_y'] = cudf.Series(bounds_df['max_y'].values, dtype='float64')

    bounds_df = buildings_gdf['geometry'].apply(lambda geom: pd.Series(geom.bounds, index=['min_x', 'min_y', 'max_x', 'max_y'])).astype(float).fillna(0.0)
    buildings_cudf['min_x'] = cudf.Series(bounds_df['min_x'].values, dtype='float64')
    buildings_cudf['min_y'] = cudf.Series(bounds_df['min_y'].values, dtype='float64')
    buildings_cudf['max_x'] = cudf.Series(bounds_df['max_x'].values, dtype='float64')
    buildings_cudf['max_y'] = cudf.Series(bounds_df['max_y'].values, dtype='float64')

    bounds_df = roads_gdf['geometry'].apply(lambda geom: pd.Series(geom.bounds, index=['min_x', 'min_y', 'max_x', 'max_y'])).astype(float).fillna(0.0)
    roads_cudf['min_x'] = cudf.Series(bounds_df['min_x'].values, dtype='float64')
    roads_cudf['min_y'] = cudf.Series(bounds_df['min_y'].values, dtype='float64')
    roads_cudf['max_x'] = cudf.Series(bounds_df['max_x'].values, dtype='float64')
    roads_cudf['max_y'] = cudf.Series(bounds_df['max_y'].values, dtype='float64')

    # Log the data types to debug
    logging.info(f"neighborhoods_cudf['min_x'] dtype: {neighborhoods_cudf['min_x'].dtype}")
    logging.info(f"buildings_cudf['min_x'] dtype: {buildings_cudf['min_x'].dtype}")
    logging.info(f"roads_cudf['min_x'] dtype: {roads_cudf['min_x'].dtype}")

    # Create edges using integer indices
    logging.info("Creating edges using GPU-accelerated spatial joins...")
    edges = []

    # Neighborhood-Neighborhood edges
    logging.info("Computing neighborhood-neighborhood edges...")
    for i in tqdm(range(len(neighborhoods_cudf)), desc="Neighborhood-Neighborhood edges"):
        row = neighborhoods_cudf.iloc[i]
        # Extract scalar bounding box coordinates and convert to Python scalar
        geom_min_x = float(row['min_x'].values[0])
        geom_min_y = float(row['min_y'].values[0])
        geom_max_x = float(row['max_x'].values[0])
        geom_max_y = float(row['max_y'].values[0])
        # Log the type of geom_min_x for debugging
        logging.debug(f"geom_min_x type: {type(geom_min_x)}, value: {geom_min_x}")
        # Find potential matches based on bounding box overlap using direct scalar comparisons
        matches = neighborhoods_cudf[
            ~((geom_max_x < neighborhoods_cudf['min_x']) |
              (geom_min_x > neighborhoods_cudf['max_x']) |
              (geom_max_y < neighborhoods_cudf['min_y']) |
              (geom_min_y > neighborhoods_cudf['max_y'])) &
            (neighborhoods_cudf.index != i)
        ]
        for j in matches.index.values_host:
            src_vertex = f"neighborhood_{i}"
            dst_vertex = f"neighborhood_{j}"
            src = vertex_to_index[src_vertex]
            dst = vertex_to_index[dst_vertex]
            edges.append({'src': src, 'dst': dst})

    # Neighborhood-Building edges
    logging.info("Computing neighborhood-building edges...")
    for i in tqdm(range(len(neighborhoods_cudf)), desc="Neighborhood-Building edges"):
        row = neighborhoods_cudf.iloc[i]
        geom_min_x = float(row['min_x'].values[0])
        geom_min_y = float(row['min_y'].values[0])
        geom_max_x = float(row['max_x'].values[0])
        geom_max_y = float(row['max_y'].values[0])
        matches = buildings_cudf[
            ~((geom_max_x < buildings_cudf['min_x']) |
              (geom_min_x > buildings_cudf['max_x']) |
              (geom_max_y < buildings_cudf['min_y']) |
              (geom_min_y > buildings_cudf['max_y']))
        ]
        for j in matches.index.values_host:
            src_vertex = f"neighborhood_{i}"
            dst_vertex = f"building_{j}"
            src = vertex_to_index[src_vertex]
            dst = vertex_to_index[dst_vertex]
            edges.append({'src': src, 'dst': dst})

    # Neighborhood-Road edges
    logging.info("Computing neighborhood-road edges...")
    for i in tqdm(range(len(neighborhoods_cudf)), desc="Neighborhood-Road edges"):
        row = neighborhoods_cudf.iloc[i]
        geom_min_x = float(row['min_x'].values[0])
        geom_min_y = float(row['min_y'].values[0])
        geom_max_x = float(row['max_x'].values[0])
        geom_max_y = float(row['max_y'].values[0])
        matches = roads_cudf[
            ~((geom_max_x < roads_cudf['min_x']) |
              (geom_min_x > roads_cudf['max_x']) |
              (geom_max_y < roads_cudf['min_y']) |
              (geom_min_y > roads_cudf['max_y']))
        ]
        for j in matches.index.values_host:
            src_vertex = f"neighborhood_{i}"
            dst_vertex = f"road_{j}"
            src = vertex_to_index[src_vertex]
            dst = vertex_to_index[dst_vertex]
            edges.append({'src': src, 'dst': dst})

    edges_df = cudf.DataFrame(edges)
    logging.info(f"Created {len(edges_df)} total edges")

    # Validate edges
    valid_indices = set(nodes_df['index'].to_pandas())
    if edges_df.empty:
        logging.warning("No edges created. Graph will have nodes but no edges.")
    else:
        edges_df = edges_df[edges_df['src'].isin(valid_indices) & edges_df['dst'].isin(valid_indices)]
        logging.info(f"After validation, {len(edges_df)} edges remain")
        if not edges_df.empty:
            logging.info(f"Sample edges after validation:\n{edges_df.head().to_pandas()}")

    # Create the graph
    G = cugraph.Graph()
    G._nodes = nodes_df
    if not edges_df.empty:
        G.from_cudf_edgelist(edges_df, source='src', destination='dst')
    else:
        logging.warning("No valid edges created. Graph will have nodes but no edges.")

    # Save graph data to cache
    logging.info("Saving graph data to cache...")
    try:
        nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
        edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
        with open(GRAPH_DATA_HASH_PATH, 'w') as f:
            f.write(current_hash)
        with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
            json.dump(node_id_to_vertex, f)
        logging.info("Successfully saved graph data to cache.")
    except Exception as e:
        logging.error(f"Failed to save graph data to cache: {e}")

    logging.info(f"City graph constructed: {len(nodes_df)} nodes, {len(edges_df)} edges")
    return G

Cell 7: Rule-Based Walkability Scores (compute_walkability_scores)

In [318]:
def compute_walkability_scores(G, data):
    """
    Compute walkability scores for neighborhood nodes in the graph and assign them.
    
    Args:
        G (cugraph.Graph): The city graph with nodes and edges.
        data (dict): Dictionary containing roads and other datasets.
    
    Returns:
        cugraph.Graph: Updated graph with walkability scores assigned to neighborhood nodes.
    """
    logging.info("Computing walkability scores for neighborhoods...")
    nodes_df = G._nodes.to_pandas()
    
    walkability_components = compute_walkability_components_all(data['neighborhoods'], data)
    
    logging.info(f"Number of neighborhood nodes in nodes_df: {len(nodes_df[nodes_df['type'] == 'neighborhood'])}")
    logging.info(f"Number of entries in walkability_components: {len(walkability_components)}")
    logging.info(f"Sample LIE_NAME in nodes_df: {nodes_df[nodes_df['type'] == 'neighborhood']['LIE_NAME'].head().tolist()}")
    logging.info(f"Sample LIE_NAME in walkability_components: {walkability_components['LIE_NAME'].head().tolist()}")
    
    nodes_df = nodes_df.merge(
        walkability_components[['LIE_NAME', 'walkability_score', 'walkability_category']],
        on='LIE_NAME',
        how='left'
    )
    
    unmatched = nodes_df[(nodes_df['type'] == 'neighborhood') & (nodes_df['walkability_score'].isna())]
    if len(unmatched) > 0:
        logging.warning(f"Found {len(unmatched)} neighborhood nodes without walkability scores. Filling with 0.")
        nodes_df.loc[nodes_df['type'] == 'neighborhood', 'walkability_score'] = nodes_df['walkability_score'].fillna(0)
        nodes_df.loc[nodes_df['type'] == 'neighborhood', 'walkability_category'] = nodes_df['walkability_category'].fillna('low')
    
    G._nodes = cudf.from_pandas(nodes_df)
    
    logging.info("Finished computing walkability scores.")
    return G

Cell 8 prepare_gnn_data

In [319]:
def prepare_gnn_data(G):
    logging.info("Preparing data for GNN training...")
    nodes_df = G._nodes
    edges_df = G.edgelist.edgelist_df if G.edgelist else cudf.DataFrame()

    numerical_features = [
        'ndvi_mean', 'tree_count', 'transit_count', 'accident_count', 
        'road_density', 'intersection_density', 'total_population', 
        'elderly_percentage', 'area_m2', 'length_m', 'avg_road_accident_density', 
        'pedestrian_road_density'
    ] + [f'land_use_{cat.lower()}_percent' for cat in CATEGORY_PRIORITY.keys()]

    building_types = nodes_df[nodes_df['type'] == 'building']['building'].to_pandas().unique()
    road_classes = nodes_df[nodes_df['type'] == 'road']['class'].to_pandas().unique()
    categorical_features = (
        [f'building_{bt}' for bt in building_types if pd.notna(bt)] +
        [f'road_class_{rc}' for rc in road_classes if pd.notna(rc)]
    )

    all_features = numerical_features + categorical_features

    features_list = []
    labels_list = []
    node_types = []

    for node_type in tqdm(['neighborhood', 'building', 'road'], desc="Normalizing features by node type"):
        subset = nodes_df[nodes_df['type'] == node_type].to_pandas()
        if subset.empty:
            logging.warning(f"No nodes of type {node_type} found. Skipping.")
            continue

        subset_features = pd.DataFrame(0.0, index=subset.index, columns=all_features)

        if node_type == 'neighborhood':
            for col in numerical_features:
                if col in subset.columns:
                    subset_features[col] = subset[col].astype(float).fillna(0)
                else:
                    logging.warning(f"Column {col} missing in neighborhood nodes. Setting to 0.")
        elif node_type == 'building':
            if 'area_m2' in subset.columns:
                subset_features['area_m2'] = subset['area_m2'].astype(float).fillna(0)
        else:  # road
            if 'length_m' in subset.columns:
                subset_features['length_m'] = subset['length_m'].astype(float).fillna(0)

        if node_type == 'building':
            for bt in building_types:
                if pd.notna(bt):
                    subset_features[f'building_{bt}'] = (subset['building'] == bt).astype(float)
        elif node_type == 'road':
            for rc in road_classes:
                if pd.notna(rc):
                    subset_features[f'road_class_{rc}'] = (subset['class'] == rc).astype(float)

        # Z-score normalization for numerical features
        for col in numerical_features:
            if col in subset_features.columns and subset_features[col].std() > 0:
                subset_features[col] = (
                    (subset_features[col] - subset_features[col].mean()) / subset_features[col].std()
                ).fillna(0)
            else:
                logging.debug(f"Column {col} has zero variance or is missing for {node_type}. Setting to 0.")

        logging.info(f"Node type {node_type}: {len(subset)} nodes, feature shape: {subset_features.shape}")

        features_list.append(subset_features.values)

        if node_type == 'neighborhood':
            labels = subset['walkability_score'].astype(float).fillna(0).values
            labels_list.append(labels[:, None])  # Shape [n, 1]
        else:
            labels_list.append(np.zeros((len(subset), 1)))

        node_types.extend([node_type] * len(subset))

    try:
        features = np.vstack(features_list)
    except ValueError as e:
        logging.error(f"Failed to stack features: {e}")
        raise

    labels = np.vstack(labels_list)
    features_tensor = torch.tensor(features, dtype=torch.float)
    labels_tensor = torch.tensor(labels, dtype=torch.float)

    if not edges_df.empty:
        edge_index = torch.tensor(edges_df[['src', 'dst']].to_pandas().values.T, dtype=torch.long)
        logging.info(f"Edge index created with {edge_index.shape[1]} edges")
        max_index = nodes_df['index'].max()
        if edge_index.max() > max_index or edge_index.min() < 0:
            logging.warning(f"Edge indices out of bounds: min={edge_index.min()}, max={edge_index.max()}, expected max={max_index}")
            edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)
        logging.warning("No edges found in graph.")

    data = Data(
        x=features_tensor,
        edge_index=edge_index,
        y=labels_tensor
    )

    data.node_types = node_types

    logging.info(f"Prepared GNN data: {features_tensor.shape[0]} nodes, {edge_index.shape[1]} edges")
    logging.info(f"Feature matrix shape: {features_tensor.shape}")
    logging.info(f"Label tensor shape: {labels_tensor.shape}")
    return data

Cell 9: WalkabilityGNN, train_gnn_model, predict_walkability

In [320]:
class GNNWalkabilityPredictor(torch.nn.Module):
    def __init__(self, num_features):
        super(GNNWalkabilityPredictor, self).__init__()
        self.conv1 = GATConv(num_features, 64, heads=2, concat=True)
        self.bn1 = BatchNorm(64 * 2)
        self.conv2 = GATConv(64 * 2, 32, heads=1, concat=True)
        self.bn2 = BatchNorm(32)
        self.fc1 = torch.nn.Linear(32, 16)
        self.fc2 = torch.nn.Linear(16, 1)
        self.dropout = torch.nn.Dropout(0.3)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        if edge_index.numel() > 0:
            x = self.conv1(x, edge_index)
            x = self.bn1(x)
            x = F.relu(x)
            x = self.dropout(x)
            x = self.conv2(x, edge_index)
            x = self.bn2(x)
            x = F.relu(x)
        else:
            logging.warning("No edges in the graph. Using linear layer for node features only.")
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return torch.sigmoid(x)

def train_gnn_model(data_gnn):
    logging.info("Stage 4: Training GNN model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_gnn = data_gnn.to(device)
    
    neighborhood_mask = np.array([t == 'neighborhood' for t in data_gnn.node_types])
    train_indices = np.where(neighborhood_mask)[0]
    
    if len(train_indices) == 0:
        logging.error("No neighborhood nodes found for training.")
        raise ValueError("No neighborhood nodes found for training.")
    
    np.random.seed(42)
    train_idx = np.random.choice(train_indices, size=int(0.8 * len(train_indices)), replace=False)
    val_idx = np.setdiff1d(train_indices, train_idx)
    
    train_mask = torch.zeros(data_gnn.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data_gnn.num_nodes, dtype=torch.bool)
    train_mask[train_idx] = True
    val_mask[val_idx] = True
    data_gnn.train_mask = train_mask
    data_gnn.val_mask = val_mask
    
    neighborhood_labels = data_gnn.y[neighborhood_mask].cpu().numpy()
    logging.info(f"Target (walkability_score) distribution for neighborhood nodes:\n{pd.Series(neighborhood_labels.flatten()).describe()}")
    
    model = GNNWalkabilityPredictor(num_features=data_gnn.x.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)
    criterion = torch.nn.MSELoss()
    
    best_val_loss = float('inf')
    best_model_path = os.path.join(CHECKPOINT_DIR, 'best_gnn_model.pth')
    epochs = 300
    patience = 20
    early_stop_counter = 0
    
    for epoch in tqdm(range(epochs), desc="Training epochs"):
        model.train()
        optimizer.zero_grad()
        out = model(data_gnn)
        loss = criterion(out[data_gnn.train_mask], data_gnn.y[data_gnn.train_mask])
        loss.backward()
        optimizer.step()
        
        model.eval()
        with torch.no_grad():
            val_out = model(data_gnn)
            val_loss = criterion(val_out[data_gnn.val_mask], data_gnn.y[data_gnn.val_mask])
        
        scheduler.step(val_loss)
        
        if epoch % 10 == 0:
            logging.info(f"Epoch {epoch}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                logging.info(f"Early stopping at epoch {epoch}")
                break
    
    model.load_state_dict(torch.load(best_model_path))
    logging.info("Finished training GNN model.")
    return model

def predict_walkability(G, model):
    """
    Predict walkability scores using the trained GNN model.
    
    Args:
        G (cugraph.Graph): The city graph with nodes and edges.
        model (GNNWalkabilityPredictor): The trained GNN model.
    
    Returns:
        cugraph.Graph: Updated graph with GNN-predicted walkability scores.
    """
    logging.info("Predicting walkability with GNN...")
    nodes_df = G._nodes.to_pandas()
    
    data = prepare_gnn_data(G)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    data = data.to(device)
    
    model.eval()
    with torch.no_grad():
        predictions = model(data).squeeze()
    
    predictions = predictions.cpu().numpy()
    predictions = np.clip(predictions, 0, 1)
    
    nodes_df['walkability_gnn'] = np.nan
    nodes_df.loc[nodes_df['type'] == 'neighborhood', 'walkability_gnn'] = predictions[nodes_df['type'] == 'neighborhood']
    
    G._nodes = cudf.from_pandas(nodes_df)
    logging.info(f"Walkability GNN stats after prediction:\n{nodes_df[nodes_df['type'] == 'neighborhood']['walkability_gnn'].describe()}")
    return G

Cell 10: Interactive Map Generation (create_interactive_map)

In [321]:
def create_interactive_map(G, data):
    """Generate an interactive Kepler.gl map to visualize walkability scores."""
    logging.info("Generating interactive Kepler.gl map...")
    
    nodes_df = G._nodes.to_pandas()
    neighborhoods_gdf = data['neighborhoods'].copy()

    # Standardize LIE_NAME for merging
    nodes_df['LIE_NAME'] = nodes_df['LIE_NAME'].astype(str).str.strip()
    neighborhoods_gdf['LIE_NAME'] = neighborhoods_gdf['LIE_NAME'].astype(str).str.strip()

    # Filter for neighborhood nodes and select necessary columns
    neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood'][['LIE_NAME', 'walkability_score', 'walkability_gnn', 'walkability_category']]

    # Log for debugging
    nodes_lie_names = set(neighborhood_nodes['LIE_NAME'])
    gdf_lie_names = set(neighborhoods_gdf['LIE_NAME'])
    logging.info(f"Neighborhood nodes count: {len(neighborhood_nodes)}")
    logging.info(f"Neighborhoods_gdf count: {len(neighborhoods_gdf)}")
    logging.info(f"Sample LIE_NAME in nodes_df: {list(nodes_lie_names)[:5]}")
    logging.info(f"Sample LIE_NAME in neighborhoods_gdf: {list(gdf_lie_names)[:5]}")
    logging.info(f"Common LIE_NAMEs: {len(nodes_lie_names & gdf_lie_names)}")
    logging.info(f"Nodes LIE_NAMEs not in GDF: {list(nodes_lie_names - gdf_lie_names)}")
    logging.info(f"GDF LIE_NAMEs not in nodes: {list(gdf_lie_names - nodes_lie_names)}")
    logging.info(f"Nodes nulls: {neighborhood_nodes.isna().sum().to_dict()}")
    logging.info(f"GDF geometry nulls: {neighborhoods_gdf['geometry'].isna().sum()}")

    # Merge data
    map_data = neighborhoods_gdf[['LIE_NAME', 'geometry']].merge(
        neighborhood_nodes,
        on='LIE_NAME',
        how='left'
    )

    # Drop duplicates in-place
    map_data.drop_duplicates(subset=['LIE_NAME'], keep='first', inplace=True)

    # Log merge results and score distributions
    logging.info(f"Merged map_data rows: {len(map_data)}")
    logging.info(f"Walkability score nulls: {map_data['walkability_score'].isna().sum()}")
    logging.info(f"Walkability GNN nulls: {map_data['walkability_gnn'].isna().sum()}")
    logging.info(f"Walkability score distribution in map_data:\n{map_data['walkability_score'].describe()}")
    logging.info(f"Walkability GNN distribution in map_data:\n{map_data['walkability_gnn'].describe()}")
    logging.info(f"Walkability category distribution in map_data:\n{map_data['walkability_category'].value_counts()}")

    # Fill NaN values
    map_data['walkability_score'] = map_data['walkability_score'].fillna(0)
    map_data['walkability_gnn'] = map_data['walkability_gnn'].fillna(0)
    map_data['walkability_category'] = map_data['walkability_category'].fillna('low')

    # Convert to GeoDataFrame and transform CRS
    map_data = gpd.GeoDataFrame(map_data, geometry='geometry', crs='EPSG:3826')
    map_data['geometry'] = map_data['geometry'].to_crs('EPSG:4326')
    
    # Prepare data for Kepler.gl
    kepler_data = {
        'neighborhoods': map_data[['LIE_NAME', 'walkability_score', 'walkability_gnn', 'walkability_category', 'geometry']].to_json()
    }

    # Kepler.gl configuration
    config = {
        "version": "v1",
        "config": {
            "visState": {
                "layers": [
                    {
                        "id": "neighborhoods",
                        "type": "geojson",
                        "config": {
                            "dataId": "neighborhoods",
                            "label": "Neighborhoods",
                            "color": [18, 147, 154],
                            "columns": {
                                "geojson": "geometry"
                            },
                            "isVisible": True,
                            "visConfig": {
                                "opacity": 0.7,
                                "strokeOpacity": 0.9,
                                "thickness": 1,
                                "strokeColor": [255, 255, 255],
                                "colorRange": {
                                    "name": "Global Warming",
                                    "type": "sequential",
                                    "colors": [
                                        "#5A1846", "#900C3F", "#C70039",
                                        "#E3611C", "#F1920E", "#FFC107"
                                    ]
                                },
                                "strokeColorRange": {
                                    "name": "Global Warming",
                                    "type": "sequential",
                                    "colors": [
                                        "#5A1846", "#900C3F", "#C70039",
                                        "#E3611C", "#F1920E", "#FFC107"
                                    ]
                                },
                                "colorField": {
                                    "name": "walkability_gnn",
                                    "type": "real"
                                },
                                "colorScale": "quantile"
                            }
                        },
                        "visualChannels": {
                            "colorField": {
                                "name": "walkability_gnn",
                                "type": "real"
                            },
                            "colorScale": "quantile"
                        }
                    }
                ],
                "interactionConfig": {
                    "tooltip": {
                        "fieldsToShow": {
                            "neighborhoods": [
                                {"name": "LIE_NAME", "format": None},
                                {"name": "walkability_score", "format": "{:.3f}"},
                                {"name": "walkability_gnn", "format": "{:.3f}"},
                                {"name": "walkability_category", "format": None}
                            ]
                        },
                        "enabled": True
                    }
                }
            },
            "mapState": {
                "latitude": 25.0330,
                "longitude": 121.5654,
                "zoom": 11
            },
            "mapStyle": {
                "styleType": "dark"
            }
        }
    }

    map_1 = KeplerGl(height=800, data=kepler_data, config=config)
    map_path = os.path.join(BASE_DIR, 'taipei_walkability_map.html')
    map_1.save_to_html(file_name=map_path)
    logging.info(f"Interactive map generated and saved as {map_path}")
    print(f"Map saved to {map_path}!")

Cell 11: Main Execution (main)

In [322]:
def main(force_recompute_graph=False):
    """Main execution pipeline for the analysis."""
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    os.makedirs(SUBGRAPH_DIR, exist_ok=True)
    logging.info(f"Ensured subgraph directory exists: {SUBGRAPH_DIR}")

    # Track timing for each step
    timings = {}
    
    try:
        # Step 1: Load and prepare data
        start_time = time.time()
        print("Starting load_and_prepare_data...")
        data = load_and_prepare_data()
        timings['load_and_prepare_data'] = time.time() - start_time

        # Step 2: Compute road type accident correlation
        start_time = time.time()
        print("Starting compute_road_type_accident_correlation...")
        road_accident_summary = compute_road_type_accident_correlation(
            data['roads'], data['neighborhoods'], data['accidents']
        )
        timings['compute_road_type_accident_correlation'] = time.time() - start_time

        # Step 3: Build graph
        start_time = time.time()
        print("Starting build_graph...")
        G = build_graph(data, force_recompute=force_recompute_graph)
        timings['build_graph'] = time.time() - start_time

        # Validate edge counts
        edge_count = G.edgelist.edgelist_df.shape[0] if G.edgelist else 0
        logging.info(f"Graph edge count: {edge_count}")
        if edge_count == 0:
            logging.warning("Graph has no edges. GNN will not utilize graph structure.")

        # Step 4: Compute walkability scores
        start_time = time.time()
        print("Starting compute_walkability_scores...")
        G = compute_walkability_scores(G, data)
        timings['compute_walkability_scores'] = time.time() - start_time

        # Step 5: Prepare GNN data
        start_time = time.time()
        print("Starting prepare_gnn_data...")
        data_gnn = prepare_gnn_data(G)
        timings['prepare_gnn_data'] = time.time() - start_time

        # Step 6: Train GNN model
        start_time = time.time()
        print("Starting train_gnn_model...")
        model = train_gnn_model(data_gnn)
        timings['train_gnn_model'] = time.time() - start_time

        # Step 7: Predict walkability
        start_time = time.time()
        print("Starting predict_walkability...")
        G = predict_walkability(G, model)
        timings['predict_walkability'] = time.time() - start_time

        # Step 8: Create interactive map
        start_time = time.time()
        print("Starting create_interactive_map...")
        create_interactive_map(G, data)
        timings['create_interactive_map'] = time.time() - start_time

        # Final validation: Check walkability scores
        nodes_df = G._nodes.to_pandas()
        neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood']
        walkability_score_stats = neighborhood_nodes['walkability_score'].describe()
        walkability_gnn_stats = neighborhood_nodes['walkability_gnn'].describe()
        walkability_category_dist = neighborhood_nodes['walkability_category'].value_counts()
        non_zero_walkability = (neighborhood_nodes['walkability_score'] > 0).sum()
        non_zero_walkability_gnn = (neighborhood_nodes['walkability_gnn'] > 0).sum()
        
        logging.info("Final validation - Walkability scores in neighborhood nodes:")
        logging.info(f"Walkability score distribution:\n{walkability_score_stats}")
        logging.info(f"Walkability GNN distribution:\n{walkability_gnn_stats}")
        logging.info(f"Walkability category distribution:\n{walkability_category_dist}")
        logging.info(f"Number of neighborhood nodes with non-zero walkability_score: {non_zero_walkability}/{len(neighborhood_nodes)}")
        logging.info(f"Number of neighborhood nodes with non-zero walkability_gnn: {non_zero_walkability_gnn}/{len(neighborhood_nodes)}")

        # Check for low variation in walkability scores
        if walkability_score_stats['std'] < 0.05:
            logging.warning("Walkability scores have low variation (std < 0.05). Components may need adjustment.")
        if walkability_gnn_stats['std'] < 0.05:
            logging.warning("GNN predictions have low variation (std < 0.05). Check edge creation and model training.")

        # Compute correlation between walkability_score and walkability_gnn
        corr, p_value = pearsonr(neighborhood_nodes['walkability_score'], neighborhood_nodes['walkability_gnn'])
        logging.info(f"Correlation between walkability_score and walkability_gnn: {corr:.2f} (p-value: {p_value:.2f})")
        if corr < 0.5:
            logging.warning("Low correlation between walkability_score and walkability_gnn. GNN predictions may not align well with rule-based scores.")

        # Log timing summary
        logging.info("Processing complete. Timing summary:")
        for step, duration in timings.items():
            logging.info(f"{step}: {duration:.2f} seconds")
        
        print("Pipeline completed successfully.")
        print(G.edgelist.edgelist_df.to_pandas().head())

    except Exception as e:
        logging.error(f"Pipeline failed with error: {str(e)}")
        raise

if __name__ == "__main__":
    main(force_recompute_graph=True)

2025-04-23 11:48:50,546 - INFO - Ensured subgraph directory exists: /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/subgraphs
2025-04-23 11:48:50,548 - INFO - Stage 1: Loading and preparing data...


Starting load_and_prepare_data...


Loading files:   0%|          | 0/8 [00:00<?, ?it/s]2025-04-23 11:48:50,670 - INFO - Loaded neighborhoods with shape (456, 57)
Loading files:  12%|█▎        | 1/8 [00:00<00:00,  8.24it/s]2025-04-23 11:48:52,003 - INFO - Loaded buildings with shape (74306, 9)
Loading files:  25%|██▌       | 2/8 [00:01<00:05,  1.20it/s]2025-04-23 11:48:52,071 - INFO - Loaded roads with shape (81444, 2)
2025-04-23 11:48:52,097 - INFO - Loaded trees with shape (5019, 12)
2025-04-23 11:48:52,179 - INFO - Loaded transit with shape (29892, 11)
Loading files:  62%|██████▎   | 5/8 [00:01<00:00,  3.59it/s]2025-04-23 11:48:52,849 - INFO - Loaded urban_masterplan with shape (15521, 15)
Loading files:  75%|███████▌  | 6/8 [00:02<00:00,  2.64it/s]2025-04-23 11:48:53,643 - INFO - Loaded accidents with shape (56133, 8)
Loading files:  88%|████████▊ | 7/8 [00:03<00:00,  2.04it/s]2025-04-23 11:48:53,645 - INFO - Columns in population_df after loading: ['LIE_NAME', 'Total_Population', 'Elderly_Percentage']
2025-04-23 11:


--- Percentage Calculation Process ---

Neighborhood: 板溪里 (Index: 373)
Total unique master plan area: 63777.59 m²
Area of City_Open_Area (priority 10): 478.13 m²
Area of Education (priority 6): 8173.36 m²
Area of Commercial (priority 4): 23017.42 m²
Area of Residential (priority 3): 32108.69 m²

Percentages:
City_Open_Area: 0.75%
Education: 12.82%
Commercial: 36.09%
Residential: 50.34%
Sum of percentages: 100.00%

Neighborhood: 芝山里 (Index: 39)
Total unique master plan area: 1061285.95 m²
Area of Education (priority 6): 80442.15 m²
Area of Residential (priority 3): 196204.41 m²
Area of Natural (priority 2): 775753.11 m²
Area of River (priority 1): 2754.93 m²
Area of Government (priority 1): 6131.35 m²

Percentages:
Education: 7.58%
Residential: 18.49%
Natural: 73.10%
River: 0.26%
Government: 0.58%
Sum of percentages: 100.00%

Neighborhood: 和平里 (Index: 340)
Total unique master plan area: 98073.11 m²
Area of City_Open_Area (priority 10): 4608.53 m²
Area of Public_Transportation (priority

2025-04-23 11:49:24,266 - INFO - Finished loading and preparing data.
2025-04-23 11:49:24,321 - INFO - Computing correlation between road types and accident density...



--- Data Structure Summary ---

Dataset: neighborhoods
Shape: (456, 67)
Columns and Data Types:
LIE_NAME                            object
SECT_NAME                           object
2024population                       int32
land_use_city_open_area_count        int32
land_use_city_open_area_area_m2    float64
                                    ...   
transit_count                        int64
accident_count                       int64
road_density                       float64
total_population                     int64
elderly_percentage                 float64
Length: 67, dtype: object
Missing values (total): 0
Missing values per column:
LIE_NAME                           0
SECT_NAME                          0
2024population                     0
land_use_city_open_area_count      0
land_use_city_open_area_area_m2    0
                                  ..
transit_count                      0
accident_count                     0
road_density                       0
total_population  

2025-04-23 11:49:24,837 - INFO - Assigning accidents to nearest road...
2025-04-23 11:49:29,861 - INFO - Matched 56133 accidents out of 56133
2025-04-23 11:49:29,866 - INFO - Reassigning 4991 accidents from footway/cycleway...
2025-04-23 11:49:29,976 - INFO - Reassigned 1429 accidents to wider roads
2025-04-23 11:49:29,983 - INFO - Accidents by road type:
class
bridleway            0
cycleway           247
footway           3315
living_street       79
motorway           109
path                86
pedestrian          84
primary           6535
residential      10110
secondary        16180
service           5011
steps               47
tertiary          9135
track                8
trunk             2493
unclassified      1665
unknown             66
Name: accident_count, dtype: int64
2025-04-23 11:49:29,990 - INFO - Road type counts:
class
service          21204
footway          16755
residential      14861
tertiary          5113
secondary         3869
path              3610
steps          

Road type counts:
class
service          21204
footway          16755
residential      14861
tertiary          5113
secondary         3869
path              3610
steps             2968
unclassified      1894
primary           1209
cycleway           825
track              716
trunk              593
motorway           313
pedestrian         297
living_street      264
unknown             54
bridleway            2
Name: count, dtype: int64

--- Road Type Accident Density Summary ---
            class    length_m  accident_count  accident_density  width_rank
1        cycleway   263682.07             245              0.26           1
2         footway  1767503.72            3235              0.73           1
3   living_street    23929.78              77              1.58           3
4        motorway   215317.82             109              2.74           5
5            path   720602.60              84              0.09           1
6      pedestrian    31830.26              82              

2025-04-23 11:49:30,324 - INFO - Road-neighborhood join resulted in 85340 matches with columns: ['index_left', 'geometry', 'class', 'length_m', 'width_rank', 'accident_density', 'index_right0', 'index_right', 'LIE_NAME']
2025-04-23 11:49:30,329 - INFO - Assigned avg_road_accident_density to 0 neighborhoods
2025-04-23 11:49:30,332 - INFO - Avg road accident density stats:
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: avg_road_accident_density, dtype: float64
2025-04-23 11:49:30,471 - INFO - Bar chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_bar.png


Bar chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_bar.png


2025-04-23 11:49:30,922 - INFO - Box chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_box.png


Box chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_box.png


2025-04-23 11:49:31,194 - INFO - Scatter plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_scatter.png
2025-04-23 11:49:31,197 - INFO - Top 3 road types by accident density:
        class  accident_density
7     primary             44.39
9   secondary             36.32
14      trunk             11.69
2025-04-23 11:49:31,198 - INFO - Stage 2: Building city graph...
2025-04-23 11:49:31,199 - INFO - Dataset neighborhoods column types:
LIE_NAME                            object
SECT_NAME                           object
2024population                       int32
land_use_city_open_area_count        int32
land_use_city_open_area_area_m2    float64
                                    ...   
accident_count                       int64
road_density                       float64
total_population                     int64
elderly_percentage                 float64
avg_road_accident_density          float64
Length: 68, dtype: object
2025-04-23 11:49:31,207 - I

Scatter plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_scatter.png
Top 3 road types by accident density:
        class  accident_density
7     primary             44.39
9   secondary             36.32
14      trunk             11.69
Starting build_graph...


Neighborhood nodes: 100%|██████████| 456/456 [00:00<00:00, 22439.17it/s]
2025-04-23 11:49:31,262 - INFO - Adding building nodes...
Building nodes: 100%|██████████| 74306/74306 [00:01<00:00, 47930.66it/s]
2025-04-23 11:49:32,814 - INFO - Adding road nodes...
Road nodes: 100%|██████████| 81444/81444 [00:01<00:00, 47950.44it/s]
2025-04-23 11:49:35,680 - INFO - Converting GeoDataFrames to cudf for GPU processing...
2025-04-23 11:49:35,827 - INFO - Extracting bounding box coordinates...
2025-04-23 11:49:46,911 - INFO - neighborhoods_cudf['min_x'] dtype: float64
2025-04-23 11:49:46,912 - INFO - buildings_cudf['min_x'] dtype: float64
2025-04-23 11:49:46,912 - INFO - roads_cudf['min_x'] dtype: float64
2025-04-23 11:49:46,913 - INFO - Creating edges using GPU-accelerated spatial joins...
2025-04-23 11:49:46,913 - INFO - Computing neighborhood-neighborhood edges...
Neighborhood-Neighborhood edges: 100%|██████████| 456/456 [00:30<00:00, 15.07it/s] 
2025-04-23 11:50:17,173 - INFO - Computing neigh

Starting compute_walkability_scores...


2025-04-23 11:51:24,643 - INFO - Filtered 20977 roads of types ['footway', 'pedestrian', 'cycleway'] out of 81444 total roads.
2025-04-23 11:51:24,644 - INFO - Checking spatial overlap between neighborhoods and pedestrian_roads...
2025-04-23 11:51:24,964 - INFO - neighborhoods bounds: [ 296266.05303084 2761514.89561711  317197.26073793 2789176.16901603]
2025-04-23 11:51:24,965 - INFO - pedestrian_roads bounds: [ 295216.88848867 2758756.21684514  314125.37860428 2787925.43931427]
2025-04-23 11:51:24,965 - INFO - Bounding boxes overlap: True
2025-04-23 11:51:24,974 - INFO - Sample intersection check: 337 intersections found out of 10 samples.
2025-04-23 11:51:25,026 - INFO - Pedestrian roads join resulted in 20772 matches.
2025-04-23 11:51:25,032 - INFO - Pedestrian road density stats:
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: pedestrian_road_density, dtype: float64
Computing walkability scores: 100%|██████████| 456/456 

Starting prepare_gnn_data...


2025-04-23 11:51:26,157 - INFO - Node type neighborhood: 456 nodes, feature shape: (456, 127)
2025-04-23 11:51:26,499 - INFO - Node type building: 74306 nodes, feature shape: (74306, 127)
Normalizing features by node type:  67%|██████▋   | 2/3 [00:00<00:00,  4.39it/s]2025-04-23 11:51:26,769 - INFO - Node type road: 81444 nodes, feature shape: (81444, 127)
Normalizing features by node type: 100%|██████████| 3/3 [00:00<00:00,  4.09it/s]
2025-04-23 11:51:26,872 - INFO - Edge index created with 270909 edges
2025-04-23 11:51:26,881 - INFO - Prepared GNN data: 156206 nodes, 270909 edges
2025-04-23 11:51:26,882 - INFO - Feature matrix shape: torch.Size([156206, 127])
2025-04-23 11:51:26,883 - INFO - Label tensor shape: torch.Size([156206, 1])
2025-04-23 11:51:26,887 - INFO - Stage 4: Training GNN model...


Starting train_gnn_model...


2025-04-23 11:51:27,098 - INFO - Target (walkability_score) distribution for neighborhood nodes:
count    456.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
dtype: float64
Training epochs:   0%|          | 0/300 [00:00<?, ?it/s]2025-04-23 11:51:27,445 - INFO - Epoch 0, Train Loss: 0.3208, Val Loss: 0.2735
Training epochs:   3%|▎         | 10/300 [00:01<00:29,  9.68it/s]2025-04-23 11:51:28,460 - INFO - Epoch 10, Train Loss: 0.0084, Val Loss: 0.0028
Training epochs:   7%|▋         | 20/300 [00:02<00:25, 10.80it/s]2025-04-23 11:51:29,367 - INFO - Epoch 20, Train Loss: 0.0003, Val Loss: 0.0000
Training epochs:  10%|█         | 30/300 [00:03<00:24, 11.12it/s]2025-04-23 11:51:30,265 - INFO - Epoch 30, Train Loss: 0.0004, Val Loss: 0.0000
Training epochs:  13%|█▎        | 40/300 [00:03<00:23, 11.27it/s]2025-04-23 11:51:31,151 - INFO - Epoch 40, Train Loss: 0.0000, Val Loss: 0.0000
Training epochs:  17%|█▋        | 50/300 [00:04<00:20

Starting predict_walkability...


2025-04-23 11:51:32,125 - INFO - Node type neighborhood: 456 nodes, feature shape: (456, 127)
2025-04-23 11:51:32,415 - INFO - Node type building: 74306 nodes, feature shape: (74306, 127)
Normalizing features by node type:  67%|██████▋   | 2/3 [00:00<00:00,  5.88it/s]2025-04-23 11:51:32,605 - INFO - Node type road: 81444 nodes, feature shape: (81444, 127)
Normalizing features by node type: 100%|██████████| 3/3 [00:00<00:00,  5.65it/s]
2025-04-23 11:51:32,685 - INFO - Edge index created with 270909 edges
2025-04-23 11:51:32,688 - INFO - Prepared GNN data: 156206 nodes, 270909 edges
2025-04-23 11:51:32,688 - INFO - Feature matrix shape: torch.Size([156206, 127])
2025-04-23 11:51:32,689 - INFO - Label tensor shape: torch.Size([156206, 1])
2025-04-23 11:51:32,982 - INFO - Walkability GNN stats after prediction:
count    4.560000e+02
mean     4.135814e-04
std      1.464419e-03
min      1.241403e-07
25%      1.656487e-05
50%      7.764637e-05
75%      2.750272e-04
max      1.713020e-02
Name:

Starting create_interactive_map...
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


2025-04-23 11:51:33,187 - INFO - Interactive map generated and saved as /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html
2025-04-23 11:51:33,287 - INFO - Final validation - Walkability scores in neighborhood nodes:
2025-04-23 11:51:33,290 - INFO - Walkability score distribution:
count    456.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: walkability_score, dtype: float64
2025-04-23 11:51:33,291 - INFO - Walkability GNN distribution:
count    4.560000e+02
mean     4.135814e-04
std      1.464419e-03
min      1.241403e-07
25%      1.656487e-05
50%      7.764637e-05
75%      2.750272e-04
max      1.713020e-02
Name: walkability_gnn, dtype: float64
2025-04-23 11:51:33,292 - INFO - Walkability category distribution:
walkability_category
low    456
Name: count, dtype: int64
2025-04-23 11:51:33,293 - INFO - Number of neighborhood nodes with non-zero walkability_score: 0/456
2025-04-23 11:51:33,29

Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html!
Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html!
Pipeline completed successfully.
   src  dst
0    0    1
1    0    2
2    0    3
3    0    4
4    0    5
