Cell 0: CUDA Version check

In [25]:
import cudf, cugraph
print(cudf.__version__, cugraph.__version__)

25.02.02 25.02.00


Cell 1: Imports ,Global Constants

In [26]:
# Standard library imports
import os
import logging
import hashlib
import json
import time

# Third-party imports
import geopandas as gpd
import pandas as pd
import cudf
import cugraph
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, BatchNorm
from scipy.stats import pearsonr, spearmanr
from shapely import make_valid
from shapely.errors import GEOSException
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
from tqdm import tqdm
from keplergl import KeplerGl

# Plotting configuration
%matplotlib inline
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Noto Sans CJK TC', 'Noto Serif CJK TC', 'Noto Sans Mono CJK TC', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# Directory and file paths
BASE_DIR = "/home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data"
LANDUSE_NDVI_PATH = os.path.join(BASE_DIR, "neighborhoods_with_ndvi_numerical_corrected.geojson")
OSM_BUILDINGS_PATH = os.path.join(BASE_DIR, "Taipei_Buildings_fulldata.geojson")
OSM_ROADS_PATH = os.path.join(BASE_DIR, "taipei_segments_cleaned_verified.geoparquet")
OSM_TREES_PATH = os.path.join(BASE_DIR, "taipei_land.geoparquet")
OSM_TRANSIT_PATH = os.path.join(BASE_DIR, "taipei_infrastructure.geoparquet")
URBAN_MASTERPLAN_PATH = os.path.join(BASE_DIR, "Taipei_urban_masterplan.geojson")
ACCIDENTS_PATH = os.path.join(BASE_DIR, "2023_accidents.geojson")
POPULATION_PATH = os.path.join(BASE_DIR, "population_corrected.json")
SUBGRAPH_DIR = os.path.join(BASE_DIR, "subgraphs")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
INTERSECTION_CACHE_PATH = os.path.join(BASE_DIR, "neighborhoods_with_intersections.geoparquet")
GRAPH_NODES_CACHE_PATH = os.path.join(BASE_DIR, "graph_nodes.parquet")
GRAPH_EDGES_CACHE_PATH = os.path.join(BASE_DIR, "graph_edges.parquet")
GRAPH_NODE_ID_CACHE_PATH = os.path.join(BASE_DIR, "graph_node_id_to_index.json")
GRAPH_DATA_HASH_PATH = os.path.join(BASE_DIR, "graph_data_hash.txt")

# Ensure directories exist
os.makedirs(SUBGRAPH_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Constants for spatial analysis
BUFFER_DISTANCE = 10  # Meters, buffer distance for border sharing of accidents (tunable based on spatial resolution)
MIN_ROAD_LENGTH = 10  # Meters, minimum road length to avoid inflated accident density (tunable based on dataset)

# Land use category priorities for area assignment
CATEGORY_PRIORITY = {
    'City_Open_Area': 10,
    'Pedestrian': 9,
    'Public_Transportation': 8,
    'Amenity': 7,
    'Education': 6,
    'Medical': 5,
    'Commercial': 4,
    'Residential': 3,
    'Natural': 2,
    'Road': 1,
    'River': 1,
    'Infrastructure': 1,
    'Government': 1,
    'Special_Zone': 1,
    'Military': 1,
    'Industrial': 1,
    'Agriculture': 1
}

# Weights for land use diversity in walkability scoring
land_use_weights = {
    'city_open_area': 0.8,
    'commercial': 0.7,
    'infrastructure': 0.4,
    'government': 0.5,
    'public_transportation': 0.8,
    'education': 0.7,
    'medical': 0.6,
    'amenity': 0.8,
    'road': 0.3,
    'pedestrian': 1.0,
    'natural': 0.7,
    'special_zone': 0.4,
    'river': 0.7,
    'military': 0.2,
    'residential': 0.6,
    'industrial': 0.3,
    'agriculture': 0.4
}

Cell 2: Utility Functions

In [27]:
def print_data_structure(data_dict):
    """Print a detailed summary of the data structure for each dataset."""
    print("\n--- Data Structure Summary ---")
    for key, df in data_dict.items():
        if isinstance(df, (gpd.GeoDataFrame, pd.DataFrame, cudf.DataFrame)):
            df = df.to_pandas() if isinstance(df, cudf.DataFrame) else df
            print(f"\nDataset: {key}")
            print(f"Shape: {df.shape}")
            print(f"Columns and Data Types:\n{df.dtypes}")
            print(f"Missing values (total): {df.isnull().sum().sum()}")
            print(f"Missing values per column:\n{df.isnull().sum()}")
            if 'LIE_NAME' in df.columns:
                print(f"Unique LIE_NAME: {df['LIE_NAME'].nunique()}")
            if 'class' in df.columns and key == 'roads':
                print(f"Road class counts:\n{df['class'].value_counts()}")
            # Print sample data for better debugging
            print(f"Sample data (first 2 rows):\n{df.head(2)}")
    print("--- End of Data Structure Summary ---\n")

def fix_geometry(geom, buffer_size=1e-5):
    """Fix invalid geometries with logging for debugging."""
    if geom is None or geom.is_empty:
        logging.debug("Geometry is None or empty, returning a default Point(0,0).")
        return Point(0, 0)  # Fallback to a default point
    try:
        geom = make_valid(geom)
        if not geom.is_valid:
            logging.debug(f"Geometry invalid, applying buffer with size {buffer_size}: {geom.bounds}")
            geom = geom.buffer(buffer_size)
            geom = make_valid(geom)
        if not geom.is_valid:
            logging.warning(f"Geometry remains invalid after fixing: {geom.bounds}. Returning default Point(0,0).")
            return Point(0, 0)  # Fallback to a default point
        return geom
    except Exception as e:
        logging.error(f"Error fixing geometry: {e}. Returning default Point(0,0).")
        return Point(0, 0)

def print_percentage_calculation(neighborhoods_gdf, urban_masterplan_gdf, sample_size=3):
    """Print the land use percentage calculation process for a sample of neighborhoods."""
    print("\n--- Percentage Calculation Process ---")
    sample_neighborhoods = neighborhoods_gdf.sample(min(sample_size, len(neighborhoods_gdf)), random_state=42)
    
    for idx, row in sample_neighborhoods.iterrows():
        lie_name = row['LIE_NAME']
        print(f"\nNeighborhood: {lie_name} (Index: {idx})")
        
        neighborhood_geom = fix_geometry(row['geometry'])
        if not neighborhood_geom.is_valid:
            print(f"Neighborhood geometry is invalid after fixing: {lie_name}")
            continue
        
        relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
        if relevant_masterplan.empty:
            print("No master plan polygons intersect with this neighborhood.")
            continue
        
        temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs='EPSG:3826')
        intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
        if intersected.empty:
            print("No valid intersections after overlay.")
            continue
        
        intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
        intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty]
        if intersected.empty:
            print("No valid geometries after fixing intersected polygons.")
            continue
        
        intersected['priority'] = intersected['Category'].map(CATEGORY_PRIORITY)
        intersected = intersected.sort_values(by='priority', ascending=False)
        
        total_area_geom = intersected.geometry.union_all()
        total_area = total_area_geom.area
        print(f"Total unique master plan area: {total_area:.2f} m²")
        
        remaining_geom = total_area_geom
        category_areas = {}
        for category in intersected['Category'].unique():
            category_rows = intersected[intersected['Category'] == category]
            category_geom = category_rows.geometry.union_all()
            try:
                category_area_geom = category_geom.intersection(remaining_geom)
                category_area = category_area_geom.area
                category_areas[category] = category_area
                print(f"Area of {category} (priority {CATEGORY_PRIORITY.get(category, 0)}): {category_area:.2f} m²")
                remaining_geom = remaining_geom.difference(category_area_geom)
            except GEOSException as e:
                print(f"Topology error for category {category}: {e}")
                category_areas[category] = 0.0
        
        print("\nPercentages:")
        total_percentage = 0.0
        for category, area in category_areas.items():
            percentage = (area / total_area * 100) if total_area > 0 else 0.0
            total_percentage += percentage
            print(f"{category}: {percentage:.2f}%")
        print(f"Sum of percentages: {total_percentage:.2f}%")
    print("--- End of Percentage Calculation Process ---\n")

def compute_data_hash(data_dict):
    """Compute a hash of the data for caching purposes."""
    hasher = hashlib.sha256()
    for key, df in data_dict.items():
        if isinstance(df, (gpd.GeoDataFrame, pd.DataFrame, cudf.DataFrame)):
            df = df.to_pandas() if isinstance(df, cudf.DataFrame) else df
            hasher.update(str(df.shape).encode('utf-8'))
            hasher.update(str(sorted(df.columns)).encode('utf-8'))
            
            # Log column types for debugging
            logging.info(f"Dataset {key} column types:\n{df.dtypes}")
            
            # Create a copy of the sample and convert non-serializable types
            sample_df = df.head(5).copy()
            # Drop geometry column if present, as it's not JSON serializable
            if 'geometry' in sample_df.columns:
                sample_df = sample_df.drop(columns=['geometry'])
            # Convert NumPy types to Python types
            for col in sample_df.columns:
                sample_df[col] = sample_df[col].apply(
                    lambda x: x.tolist() if isinstance(x, np.ndarray) else
                              float(x) if isinstance(x, (np.floating, np.integer)) else x
                )
            try:
                sample = sample_df.to_json()
                hasher.update(sample.encode('utf-8'))
            except Exception as e:
                logging.error(f"Failed to serialize sample for dataset {key}: {e}")
                # Fallback: Convert to string representation
                sample = str(sample_df.to_dict())
                hasher.update(sample.encode('utf-8'))
    return hasher.hexdigest()

def check_spatial_overlap(gdf1, gdf2, label1="gdf1", label2="gdf2"):
    """Check for spatial overlap between two GeoDataFrames and log the results."""
    logging.info(f"Checking spatial overlap between {label1} and {label2}...")
    gdf1 = gdf1.copy()
    gdf2 = gdf2.copy()
    
    # Ensure CRS matches
    if gdf1.crs != gdf2.crs:
        logging.warning(f"CRS mismatch between {label1} ({gdf1.crs}) and {label2} ({gdf2.crs}). Aligning to {gdf1.crs}...")
        gdf2 = gdf2.to_crs(gdf1.crs)
    
    # Validate geometries
    gdf1['geometry'] = gdf1['geometry'].apply(fix_geometry)
    gdf2['geometry'] = gdf2['geometry'].apply(fix_geometry)
    
    # Compute bounding boxes
    gdf1_bounds = gdf1.total_bounds
    gdf2_bounds = gdf2.total_bounds
    logging.info(f"{label1} bounds: {gdf1_bounds}")
    logging.info(f"{label2} bounds: {gdf2_bounds}")
    
    # Check if bounding boxes overlap
    bounds_overlap = not (gdf1_bounds[2] < gdf2_bounds[0] or  # gdf1 max_x < gdf2 min_x
                         gdf1_bounds[0] > gdf2_bounds[2] or  # gdf1 min_x > gdf2 max_x
                         gdf1_bounds[3] < gdf2_bounds[1] or  # gdf1 max_y < gdf2 min_y
                         gdf1_bounds[1] > gdf2_bounds[3])    # gdf1 min_y > gdf2 max_y
    logging.info(f"Bounding boxes overlap: {bounds_overlap}")
    
    # Perform a sample intersection check
    sample_size = min(10, len(gdf1), len(gdf2))
    if sample_size > 0:
        sample_gdf1 = gdf1.sample(sample_size, random_state=42)
        intersects = gpd.sjoin(sample_gdf1, gdf2, how='inner', predicate='intersects')
        logging.info(f"Sample intersection check: {len(intersects)} intersections found out of {sample_size} samples.")
    
    return bounds_overlap

def compute_road_type_accident_correlation(roads_gdf, neighborhoods_gdf, accidents_gdf):
    """
    Compute correlation between OSM road types and accident density (accidents per km of road length).
    Uses road class as a proxy for width, with ordinal ranking based on OSM hierarchy.
    Generates bar, box, and scatter plots for visualization.
    """
    logging.info("Computing correlation between road types and accident density...")
    
    # Validate inputs
    if roads_gdf.empty or neighborhoods_gdf.empty or accidents_gdf.empty:
        logging.error("One or more input GeoDataFrames are empty.")
        raise ValueError("Input GeoDataFrames cannot be empty.")
    
    # Define ordinal width proxy based on OSM highway hierarchy
    width_ranking = {
        'motorway': 5, 'trunk': 5, 'primary': 4, 'secondary': 4, 'tertiary': 3,
        'residential': 3, 'living_street': 3, 'service': 2, 'track': 2,
        'path': 1, 'footway': 1, 'cycleway': 1, 'steps': 1, 'pedestrian': 1,
        'unclassified': 0, 'bridleway': 0, 'unknown': 0
    }
    
    # Assign width rank to roads
    roads_gdf = roads_gdf.copy()
    roads_gdf['width_rank'] = roads_gdf['class'].map(width_ranking).fillna(0).astype(int)
    
    # Buffer wider roads to increase their likelihood of capturing accidents
    roads_gdf_buffered = roads_gdf.copy()
    roads_gdf_buffered['geometry'] = roads_gdf_buffered.apply(
        lambda row: row['geometry'].buffer(5) if row['width_rank'] >= 4 else row['geometry'], axis=1
    )
    
    # Assign accidents to the nearest road using sjoin_nearest
    logging.info("Assigning accidents to nearest road...")
    accidents_gdf = accidents_gdf.copy()
    accidents_gdf['geometry'] = accidents_gdf['geometry'].apply(fix_geometry)
    accidents_gdf = accidents_gdf[accidents_gdf['geometry'].is_valid & ~accidents_gdf['geometry'].is_empty]
    
    if accidents_gdf.empty:
        logging.warning("No valid accidents after geometry fixing.")
        return pd.DataFrame()
    
    # Use sjoin_nearest with weighted distance based on width_rank
    nearest = gpd.sjoin_nearest(
        accidents_gdf,
        roads_gdf_buffered[['geometry', 'class', 'width_rank']],
        how='left',
        distance_col='distance'
    )
    # Adjust distance by width_rank to favor wider roads
    nearest['weighted_distance'] = nearest['distance'] / (nearest['width_rank'].replace(0, 1) ** 2)
    # Drop duplicates based on the accident index (DataFrame index)
    nearest = nearest.sort_values('weighted_distance').drop_duplicates(keep='first')
    
    matched_accidents = nearest[['index_right']].copy().reset_index()
    matched_accidents.columns = ['accident_idx', 'road_idx']
    matched_accidents = matched_accidents.dropna(subset=['road_idx'])
    matched_accidents['road_idx'] = matched_accidents['road_idx'].astype(int)
    
    logging.info(f"Matched {len(matched_accidents)} accidents out of {len(accidents_gdf)}")
    
    # Reassign accidents from footway/cycleway to nearby wider roads
    footway_cycleway_accidents = matched_accidents[
        matched_accidents['road_idx'].isin(
            roads_gdf[roads_gdf['class'].isin(['footway', 'cycleway'])].index
        )
    ]
    if not footway_cycleway_accidents.empty:
        logging.info(f"Reassigning {len(footway_cycleway_accidents)} accidents from footway/cycleway...")
        accidents_to_reassign = accidents_gdf.loc[footway_cycleway_accidents['accident_idx']].copy()
        wider_roads = roads_gdf_buffered[roads_gdf_buffered['width_rank'] >= 4]
        if not wider_roads.empty:
            reassigned = gpd.sjoin_nearest(
                accidents_to_reassign,
                wider_roads[['geometry', 'class']],
                how='left',
                max_distance=10  # Only reassign if within 10 meters
            )
            reassigned_matches = pd.DataFrame({
                'accident_idx': reassigned.index,
                'road_idx': reassigned['index_right']
            }).copy()
            reassigned_matches = reassigned_matches.dropna(subset=['road_idx'])
            reassigned_matches['road_idx'] = reassigned_matches['road_idx'].astype(int)
            matched_accidents = matched_accidents[~matched_accidents['accident_idx'].isin(reassigned_matches['accident_idx'])]
            matched_accidents = pd.concat([matched_accidents, reassigned_matches], ignore_index=True)
            logging.info(f"Reassigned {len(reassigned_matches)} accidents to wider roads")
    
    # Count accidents per road segment
    accident_counts = matched_accidents.groupby('road_idx').size().reindex(roads_gdf.index, fill_value=0)
    roads_gdf['accident_count'] = accident_counts
    
    # Log the distribution of accidents by road type
    accident_summary = roads_gdf.groupby('class')['accident_count'].sum()
    logging.info(f"Accidents by road type:\n{accident_summary}")
    
    # Filter short roads to avoid inflated accident density
    roads_gdf = roads_gdf[roads_gdf['length_m'] >= MIN_ROAD_LENGTH]
    
    # Compute accident density (accidents per km)
    roads_gdf['accident_density'] = roads_gdf['accident_count'] / (roads_gdf['length_m'] / 1000)
    roads_gdf['accident_density'] = roads_gdf['accident_density'].fillna(0).replace([np.inf, -np.inf], 0)
    
    # Normalize density by width_rank to reduce impact of narrow roads
    roads_gdf['accident_density'] = roads_gdf['accident_density'] * (roads_gdf['width_rank'].replace(0, 1) / 5)
    
    # Log road type counts
    logging.info(f"Road type counts:\n{roads_gdf['class'].value_counts()}")
    print(f"Road type counts:\n{roads_gdf['class'].value_counts()}")
    
    # Aggregate by road class for summary
    summary = roads_gdf.groupby('class').agg({
        'length_m': 'sum',
        'accident_count': 'sum',
        'accident_density': 'mean',
        'width_rank': 'first'
    }).reset_index()
    
    # Filter out classes with negligible data
    summary = summary[summary['length_m'] > 1000]  # At least 1km total length
    summary = summary[summary['width_rank'] > 0]   # Exclude unknown, unclassified, bridleway
    
    # Log summary
    print("\n--- Road Type Accident Density Summary ---")
    print(summary[['class', 'length_m', 'accident_count', 'accident_density', 'width_rank']].round(2))
    
    # Compute Spearman's rank correlation
    if len(summary) >= 2:
        corr, p_value = spearmanr(summary['width_rank'], summary['accident_density'])
        logging.info(f"Spearman's correlation between road width rank and accident density: {corr:.3f} (p-value: {p_value:.3f})")
        print(f"Spearman's correlation: {corr:.3f} (p-value: {p_value:.3f})")
    else:
        logging.warning("Insufficient road types for correlation analysis.")
        print("Insufficient road types for correlation analysis.")
    
    # Compute average accident density per neighborhood for walkability
    logging.info("Computing average road accident density per neighborhood...")
    road_neighborhoods = gpd.sjoin(
        roads_gdf[['geometry', 'class', 'length_m', 'width_rank', 'accident_density']], 
        neighborhoods_gdf[['geometry', 'LIE_NAME']], 
        how='left', predicate='intersects'
    )
    avg_accident_density = road_neighborhoods.groupby('LIE_NAME')['accident_density'].mean()
    avg_accident_density = avg_accident_density.reindex(neighborhoods_gdf['LIE_NAME'], fill_value=0)
    neighborhoods_gdf['avg_road_accident_density'] = avg_accident_density.fillna(0)
    
    # Visualize with multiple charts
    # 1. Bar Chart: Mean accident density by road type
    plt.figure(figsize=(12, 6))
    summary_sorted = summary.sort_values('width_rank', ascending=False)
    sns.barplot(data=summary_sorted, x='class', y='accident_density', hue='width_rank', dodge=False)
    plt.xlabel('Road Type')
    plt.ylabel('Mean Accident Density (Accidents per km)')
    plt.title('Mean Accident Density by Road Type')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Width Rank')
    plt.tight_layout()
    bar_path = os.path.join(BASE_DIR, 'road_type_accident_bar.png')
    plt.savefig(bar_path)
    plt.close()
    logging.info(f"Bar chart saved to {bar_path}")
    print(f"Bar chart saved to {bar_path}")
    
    # 2. Box Plot: Distribution of accident density by road type
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=roads_gdf[roads_gdf['class'].isin(summary['class'])], 
                x='class', y='accident_density', hue='width_rank', dodge=False)
    plt.xlabel('Road Type')
    plt.ylabel('Accident Density (Accidents per km)')
    plt.title('Distribution of Accident Density by Road Type')
    plt.xticks(rotation=45, ha='right')
    plt.yscale('log')  # Log scale for skewed data
    plt.legend(title='Width Rank')
    plt.tight_layout()
    box_path = os.path.join(BASE_DIR, 'road_type_accident_box.png')
    plt.savefig(box_path)
    plt.close()
    logging.info(f"Box chart saved to {box_path}")
    print(f"Box chart saved to {box_path}")
    
    # 3. Scatter Plot: Accident density vs. width rank with trend line
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=summary, x='width_rank', y='accident_density', 
                    size='length_m', sizes=(50, 500), hue='class', style='class', alpha=0.7)
    z = np.polyfit(summary['width_rank'], summary['accident_density'], 1)
    p = np.poly1d(z)
    plt.plot(summary['width_rank'], p(summary['width_rank']), "r--", alpha=0.5)
    plt.xlabel('Road Width Rank (1=Path, 5=Motorway)')
    plt.ylabel('Mean Accident Density (Accidents per km)')
    plt.title('Road Type vs. Accident Density')
    plt.yscale('log')  # Log scale for skewed data
    plt.grid(True)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    scatter_path = os.path.join(BASE_DIR, 'road_type_accident_scatter.png')
    plt.savefig(scatter_path)
    plt.close()
    logging.info(f"Scatter plot saved to {scatter_path}")
    print(f"Scatter plot saved to {scatter_path}")
    
    # Log top accident-prone road types
    top_types = summary.nlargest(3, 'accident_density')[['class', 'accident_density']]
    logging.info(f"Top 3 road types by accident density:\n{top_types.round(2)}")
    print(f"Top 3 road types by accident density:\n{top_types.round(2)}")
    
    return summary

Cell 3: Walkability Computation Functions

In [28]:
def compute_walkability_components(neighborhoods_gdf, sample_size=5):
    """
    Compute walkability components for a sample of neighborhoods.
    
    Args:
        neighborhoods_gdf (gpd.GeoDataFrame): GeoDataFrame of neighborhoods.
        sample_size (int): Number of neighborhoods to sample.
    
    Returns:
        pd.DataFrame: DataFrame with walkability components for sampled neighborhoods.
    """
    if not all(col in neighborhoods_gdf.columns for col in ['ndvi_mean', 'tree_count', 'transit_count', 'intersection_density', 'accident_count', 'area_km2', 'avg_road_accident_density', 'elderly_percentage']):
        logging.error("Required columns missing in neighborhoods_gdf for walkability computation.")
        raise KeyError("Missing required columns in neighborhoods_gdf.")
    
    sample_gdf = neighborhoods_gdf.sample(min(sample_size, len(neighborhoods_gdf)), random_state=42)
    
    components = {
        'LIE_NAME': [],
        'land_use_diversity': [],
        'green_space_score': [],
        'transit_score': [],
        'road_connectivity': [],
        'safety_score': [],
        'elderly_accessibility': [],
        'pedestrian_infrastructure_score': [],
        'walkability_score': [],
        'walkability_category': []
    }
    
    # Precompute normalization constants
    ndvi_min, ndvi_max = neighborhoods_gdf['ndvi_mean'].min(), neighborhoods_gdf['ndvi_mean'].max()
    tree_min, tree_max = neighborhoods_gdf['tree_count'].min(), neighborhoods_gdf['tree_count'].max()
    transit_min, transit_max = neighborhoods_gdf['transit_count'].min(), neighborhoods_gdf['transit_count'].max()
    intersection_density_min = neighborhoods_gdf['intersection_density'].min()
    intersection_density_max = neighborhoods_gdf['intersection_density'].max()
    accident_count_min, accident_count_max = neighborhoods_gdf['accident_count'].min(), neighborhoods_gdf['accident_count'].max()
    accident_density_max = neighborhoods_gdf['avg_road_accident_density'].max()
    pedestrian_road_max = neighborhoods_gdf.get('pedestrian_road_density', pd.Series(0)).max()
    
    # Compute accident_count_density_max for safety score
    accident_count_density = neighborhoods_gdf['accident_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6)
    accident_count_density_max = accident_count_density.max() if accident_count_density.max() > 0 else 1.0  # Avoid division by zero
    
    for idx, row in sample_gdf.iterrows():
        # Land Use Diversity (weighted by walkability desirability)
        land_use_cols = [f"land_use_{category.lower()}_percent" for category in CATEGORY_PRIORITY.keys()]
        land_use_values = [row.get(col, 0.0) / 100 for col in land_use_cols if col in row]
        land_use_weights_list = [land_use_weights[category.lower()] for category in CATEGORY_PRIORITY.keys()]
        weighted_values = [p * w for p, w in zip(land_use_values, land_use_weights_list) if p > 0]
        if weighted_values:
            total = sum(weighted_values)
            if total > 0:
                weighted_values = [v / total for v in weighted_values]
                entropy = -np.sum([p * np.log2(p + 1e-10) for p in weighted_values])
                max_entropy = np.log2(len(weighted_values))
                land_use_diversity = entropy / max_entropy if max_entropy > 0 else 0
            else:
                land_use_diversity = 0
        else:
            land_use_diversity = 0
        
        # Green Space and Comfort (use linear scaling for better variation)
        ndvi_normalized = ((row['ndvi_mean'] - ndvi_min) / (ndvi_max - ndvi_min + 1e-6)) if (ndvi_max - ndvi_min) > 0 else 0
        tree_density = row['tree_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        tree_density_max = (neighborhoods_gdf['tree_count'] / neighborhoods_gdf['area_km2']).replace(0, 1e-6).max()
        tree_density_normalized = (tree_density / (tree_density_max + 1e-6)) if tree_density_max > 0 else 0
        open_area = row.get('land_use_city_open_area_percent', 0.0) / 100
        green_space_score = (0.4 * ndvi_normalized + 0.3 * tree_density_normalized + 0.3 * open_area)
        
        # Transit Accessibility (use linear scaling for better variation)
        transit_raw = (row['transit_count'] - transit_min) / (transit_max - transit_min + 1e-6) if (transit_max - transit_min) > 0 else 0
        transit_score = transit_raw
        
        # Road Connectivity (use linear scaling for better variation)
        intersection_density = row['intersection_density']
        intersection_density_normalized = (intersection_density - intersection_density_min) / (intersection_density_max - intersection_density_min + 1e-6) if (intersection_density_max - intersection_density_min) > 0 else 0
        road_connectivity = intersection_density_normalized
        
        # Safety Score (adjust weights to balance contribution)
        accident_count_density = row['accident_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        accident_count_density = min(accident_count_density, accident_count_density_max * 0.5)
        safety_score_count = 1 - (accident_count_density / (accident_count_density_max + 1e-6)) if accident_count_density_max > 0 else 1
        accident_density = row['avg_road_accident_density']
        safety_score_roads = 1 - (accident_density / (accident_density_max + 1e-6)) if accident_density_max > 0 else 1
        pedestrian_roads = row.get('pedestrian_road_density', 0.0)
        pedestrian_roads_safety = pedestrian_roads / (pedestrian_road_max + 1e-6) if pedestrian_road_max > 0 else 0
        safety_score = (0.3 * safety_score_count + 0.3 * safety_score_roads + 0.4 * pedestrian_roads_safety) if row['accident_count'] != 0 else (0.5 * safety_score_roads + 0.5 * pedestrian_roads_safety)
        
        # Elderly Accessibility (adjust weights)
        elderly_percentage = row['elderly_percentage'] / 100
        medical_access = row.get('land_use_medical_percent', 0.0) / 100
        elderly_accessibility = 0.3 * elderly_percentage + 0.4 * medical_access + 0.3 * pedestrian_roads_safety
        
        # Pedestrian Infrastructure (use linear scaling for better variation)
        pedestrian_road_score = pedestrian_roads / (pedestrian_road_max + 1e-6) if pedestrian_road_max > 0 else 0
        amenity_access = row.get('land_use_amenity_percent', 0.0) / 100
        pedestrian_infrastructure_score = 0.5 * pedestrian_road_score + 0.5 * amenity_access
        
        # Walkability Score with adjusted weights to increase variation
        base_score = (
            0.25 * land_use_diversity +  # Increase weight
            0.25 * green_space_score +   # Increase weight
            0.15 * transit_score +
            0.15 * road_connectivity +
            0.20 * pedestrian_infrastructure_score  # Reduce weight to balance
        )
        safety_modifier = 0.7 + 0.3 * safety_score  # Reduce impact to increase variation
        elderly_modifier = 1 + elderly_accessibility * 0.1  # Reduce impact to increase variation
        walkability_score = base_score * safety_modifier * elderly_modifier
        walkability_score = np.clip(walkability_score, 0, 1)
        
        # Categorize walkability score
        if pd.isna(walkability_score):
            category = 'low'
        elif walkability_score < 0.33:
            category = 'low'
        elif walkability_score < 0.66:
            category = 'medium'
        else:
            category = 'high'
        
        components['LIE_NAME'].append(row['LIE_NAME'])
        components['land_use_diversity'].append(land_use_diversity)
        components['green_space_score'].append(green_space_score)
        components['transit_score'].append(transit_score)
        components['road_connectivity'].append(road_connectivity)
        components['safety_score'].append(safety_score)
        components['elderly_accessibility'].append(elderly_accessibility)
        components['pedestrian_infrastructure_score'].append(pedestrian_infrastructure_score)
        components['walkability_score'].append(walkability_score)
        components['walkability_category'].append(category)
    
    return pd.DataFrame(components)

def compute_walkability_components_all(neighborhoods_df, data):
    """
    Compute walkability components for all neighborhoods.
    
    Args:
        neighborhoods_df (gpd.GeoDataFrame): GeoDataFrame of neighborhoods.
        data (dict): Dictionary containing roads and other datasets.
    
    Returns:
        pd.DataFrame: DataFrame with walkability components for all neighborhoods.
    """
    # Validate inputs
    if neighborhoods_df.empty:
        logging.error("neighborhoods_df is empty.")
        raise ValueError("neighborhoods_df cannot be empty.")
    if 'roads' not in data or data['roads'].empty:
        logging.error("Roads data is missing or empty.")
        raise ValueError("Roads data cannot be empty.")
    
    required_cols = ['LIE_NAME', 'geometry', 'ndvi_mean', 'tree_count', 'transit_count', 'intersection_density', 
                     'accident_count', 'area_km2', 'avg_road_accident_density', 'elderly_percentage']
    missing_cols = [col for col in required_cols if col not in neighborhoods_df.columns]
    if missing_cols:
        logging.error(f"Missing columns in neighborhoods_df: {missing_cols}")
        raise KeyError(f"Missing columns in neighborhoods_df: {missing_cols}")
    
    # Check for non-null values in critical columns
    for col in required_cols:
        if col == 'geometry':
            null_count = neighborhoods_df[col].isna().sum()
            invalid_count = (~neighborhoods_df[col].is_valid).sum()
            logging.info(f"Geometry column: {null_count} nulls, {invalid_count} invalid geometries.")
        else:
            null_count = neighborhoods_df[col].isna().sum()
            zero_count = (neighborhoods_df[col] == 0).sum()
            logging.info(f"{col}: {null_count} nulls, {zero_count} zeros.")
    
    # Ensure CRS matches
    neighborhoods_df = neighborhoods_df.copy()
    neighborhoods_df = neighborhoods_df.to_crs('EPSG:3826')
    roads_df = data['roads'].copy()
    roads_df = roads_df.to_crs('EPSG:3826')
    
    # Validate geometries
    neighborhoods_df['geometry'] = neighborhoods_df['geometry'].apply(fix_geometry)
    invalid_geoms = neighborhoods_df[~neighborhoods_df.geometry.is_valid]
    if not invalid_geoms.empty:
        logging.warning(f"Found {len(invalid_geoms)} invalid geometries in neighborhoods_df after fixing.")
        neighborhoods_df = neighborhoods_df[neighborhoods_df.geometry.is_valid]
        if neighborhoods_df.empty:
            logging.error("All geometries in neighborhoods_df are invalid after fixing.")
            raise ValueError("No valid geometries in neighborhoods_df.")
    
    roads_df['geometry'] = roads_df['geometry'].apply(fix_geometry)
    invalid_roads = roads_df[~roads_df.geometry.is_valid]
    if not invalid_roads.empty:
        logging.warning(f"Found {len(invalid_roads)} invalid geometries in roads_df after fixing.")
        roads_df = roads_df[roads_df.geometry.is_valid]
        if roads_df.empty:
            logging.error("All geometries in roads_df are invalid after fixing.")
            raise ValueError("No valid geometries in roads_df.")
    
    # Compute pedestrian road density
    pedestrian_road_types = ['footway', 'pedestrian', 'cycleway']
    pedestrian_roads_df = roads_df[roads_df['class'].isin(pedestrian_road_types)]
    logging.info(f"Filtered {len(pedestrian_roads_df)} roads of types {pedestrian_road_types} out of {len(roads_df)} total roads.")
    
    # Check spatial overlap
    overlap = check_spatial_overlap(neighborhoods_df, pedestrian_roads_df, "neighborhoods", "pedestrian_roads")
    if not overlap:
        logging.warning("No spatial overlap between neighborhoods and pedestrian roads. Buffering geometries to find matches...")
        neighborhoods_buffered = neighborhoods_df.copy()
        neighborhoods_buffered['geometry'] = neighborhoods_buffered['geometry'].buffer(50)  # Buffer by 50 meters
        pedestrian_roads = gpd.sjoin(
            pedestrian_roads_df,
            neighborhoods_buffered[['geometry', 'LIE_NAME']],
            how='left',
            predicate='intersects'
        )
    else:
        pedestrian_roads = gpd.sjoin(
            pedestrian_roads_df,
            neighborhoods_df[['geometry', 'LIE_NAME']],
            how='left',
            predicate='intersects'
        )
    
    logging.info(f"Pedestrian roads join resulted in {len(pedestrian_roads)} matches.")
    pedestrian_lengths = pedestrian_roads.groupby('index_right')['length_m'].sum().reindex(neighborhoods_df.index, fill_value=0)
    neighborhoods_df['pedestrian_road_density'] = pedestrian_lengths / (neighborhoods_df['area_km2'] * 1000)
    logging.info(f"Pedestrian road density stats:\n{neighborhoods_df['pedestrian_road_density'].describe()}")
    
    # Precompute normalization constants for raw features
    ndvi_min, ndvi_max = neighborhoods_df['ndvi_mean'].min(), neighborhoods_df['ndvi_mean'].max()
    tree_min, tree_max = neighborhoods_df['tree_count'].min(), neighborhoods_df['tree_count'].max()
    transit_min, transit_max = neighborhoods_df['transit_count'].min(), neighborhoods_df['transit_count'].max()
    intersection_density_min = neighborhoods_df['intersection_density'].min()
    intersection_density_max = neighborhoods_df['intersection_density'].max()
    accident_count_min, accident_count_max = neighborhoods_df['accident_count'].min(), neighborhoods_df['accident_count'].max()
    accident_density_max = neighborhoods_df['avg_road_accident_density'].max()
    pedestrian_road_max = neighborhoods_df['pedestrian_road_density'].max()
    
    # Compute accident_count_density_max for safety score
    accident_count_density = neighborhoods_df['accident_count'] / neighborhoods_df['area_km2'].replace(0, 1e-6)
    accident_count_density_max = accident_count_density.max() if accident_count_density.max() > 0 else 1.0  # Avoid division by zero
    
    components = {
        'LIE_NAME': [],
        'land_use_diversity': [],
        'green_space_score': [],
        'transit_score': [],
        'road_connectivity': [],
        'safety_score': [],
        'elderly_accessibility': [],
        'pedestrian_infrastructure_score': [],
        'walkability_score': [],
        'walkability_category': []
    }
    
    for idx, row in tqdm(neighborhoods_df.iterrows(), total=len(neighborhoods_df), desc="Computing walkability scores"):
        # Land Use Diversity (weighted by walkability desirability)
        land_use_cols = [f"land_use_{category.lower()}_percent" for category in CATEGORY_PRIORITY.keys()]
        land_use_values = [row.get(col, 0.0) / 100 for col in land_use_cols if col in row]
        land_use_weights_list = [land_use_weights[category.lower()] for category in CATEGORY_PRIORITY.keys()]
        weighted_values = [p * w for p, w in zip(land_use_values, land_use_weights_list) if p > 0]
        if weighted_values:
            total = sum(weighted_values)
            if total > 0:
                weighted_values = [v / total for v in weighted_values]
                entropy = -np.sum([p * np.log2(p + 1e-10) for p in weighted_values])
                max_entropy = np.log2(len(weighted_values))
                land_use_diversity = entropy / max_entropy if max_entropy > 0 else 0
            else:
                land_use_diversity = 0
        else:
            land_use_diversity = 0
        
        # Green Space and Comfort (use linear scaling for better variation)
        ndvi_normalized = ((row['ndvi_mean'] - ndvi_min) / (ndvi_max - ndvi_min + 1e-6)) if (ndvi_max - ndvi_min) > 0 else 0
        tree_density = row['tree_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        tree_density_max = (neighborhoods_df['tree_count'] / neighborhoods_df['area_km2']).replace(0, 1e-6).max()
        tree_density_normalized = (tree_density / (tree_density_max + 1e-6)) if tree_density_max > 0 else 0
        open_area = row.get('land_use_city_open_area_percent', 0.0) / 100
        green_space_score = (0.4 * ndvi_normalized + 0.3 * tree_density_normalized + 0.3 * open_area)
        
        # Transit Accessibility (use linear scaling for better variation)
        transit_raw = (row['transit_count'] - transit_min) / (transit_max - transit_min + 1e-6) if (transit_max - transit_min) > 0 else 0
        transit_score = transit_raw
        
        # Road Connectivity (use linear scaling for better variation)
        intersection_density = row['intersection_density']
        intersection_density_normalized = (intersection_density - intersection_density_min) / (intersection_density_max - intersection_density_min + 1e-6) if (intersection_density_max - intersection_density_min) > 0 else 0
        road_connectivity = intersection_density_normalized
        
        # Safety Score (adjust weights to balance contribution)
        accident_count_density = row['accident_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        accident_count_density = min(accident_count_density, accident_count_density_max * 0.5)
        safety_score_count = 1 - (accident_count_density / (accident_count_density_max + 1e-6)) if accident_count_density_max > 0 else 1
        accident_density = row['avg_road_accident_density']
        safety_score_roads = 1 - (accident_density / (accident_density_max + 1e-6)) if accident_density_max > 0 else 1
        pedestrian_roads = row.get('pedestrian_road_density', 0.0)
        pedestrian_roads_safety = pedestrian_roads / (pedestrian_road_max + 1e-6) if pedestrian_road_max > 0 else 0
        safety_score = (0.3 * safety_score_count + 0.3 * safety_score_roads + 0.4 * pedestrian_roads_safety) if row['accident_count'] != 0 else (0.5 * safety_score_roads + 0.5 * pedestrian_roads_safety)
        
        # Elderly Accessibility (adjust weights)
        elderly_percentage = row['elderly_percentage'] / 100
        medical_access = row.get('land_use_medical_percent', 0.0) / 100
        elderly_accessibility = 0.3 * elderly_percentage + 0.4 * medical_access + 0.3 * pedestrian_roads_safety
        
        # Pedestrian Infrastructure (use linear scaling for better variation)
        pedestrian_road_score = pedestrian_roads / (pedestrian_road_max + 1e-6) if pedestrian_road_max > 0 else 0
        amenity_access = row.get('land_use_amenity_percent', 0.0) / 100
        pedestrian_infrastructure_score = 0.5 * pedestrian_road_score + 0.5 * amenity_access
        
        # Walkability Score with adjusted weights to increase variation
        base_score = (
            0.25 * land_use_diversity +  # Increase weight
            0.25 * green_space_score +   # Increase weight
            0.15 * transit_score +
            0.15 * road_connectivity +
            0.20 * pedestrian_infrastructure_score  # Reduce weight to balance
        )
        safety_modifier = 0.7 + 0.3 * safety_score  # Reduce impact to increase variation
        elderly_modifier = 1 + elderly_accessibility * 0.1  # Reduce impact to increase variation
        walkability_score = base_score * safety_modifier * elderly_modifier
        walkability_score = np.clip(walkability_score, 0, 1)
        
        # Categorize walkability score
        if pd.isna(walkability_score):
            category = 'low'
        elif walkability_score < 0.33:
            category = 'low'
        elif walkability_score < 0.66:
            category = 'medium'
        else:
            category = 'high'
        
        components['LIE_NAME'].append(row['LIE_NAME'])
        components['land_use_diversity'].append(land_use_diversity)
        components['green_space_score'].append(green_space_score)
        components['transit_score'].append(transit_score)
        components['road_connectivity'].append(road_connectivity)
        components['safety_score'].append(safety_score)
        components['elderly_accessibility'].append(elderly_accessibility)
        components['pedestrian_infrastructure_score'].append(pedestrian_infrastructure_score)
        components['walkability_score'].append(walkability_score)
        components['walkability_category'].append(category)
    
    result_df = pd.DataFrame(components)
    logging.info(f"Walkability score distribution:\n{result_df['walkability_score'].describe()}")
    logging.info(f"Walkability category distribution:\n{result_df['walkability_category'].value_counts()}")
    return result_df

def compute_walkability_scores(G, data):
    """Compute walkability scores for all neighborhoods in the graph."""
    logging.info("Computing walkability scores for all neighborhoods...")
    
    neighborhoods_gdf = data['neighborhoods']
    walkability_df = compute_walkability_components_all(neighborhoods_gdf, data)
    
    nodes_df = G._nodes.to_pandas()
    nodes_df = nodes_df.merge(
        walkability_df[['LIE_NAME', 'land_use_diversity', 'green_space_score', 'transit_score', 
                       'road_connectivity', 'safety_score', 'elderly_accessibility', 
                       'pedestrian_infrastructure_score', 'walkability_score', 'walkability_category']],
        on='LIE_NAME',
        how='left'
    )
    
    # Fill NaN values for non-neighborhood nodes
    for col in ['land_use_diversity', 'green_space_score', 'transit_score', 'road_connectivity', 
                'safety_score', 'elderly_accessibility', 'pedestrian_infrastructure_score', 
                'walkability_score', 'walkability_category']:
        nodes_df[col] = nodes_df[col].fillna(0 if col != 'walkability_category' else 'low')
    
    G._nodes = cudf.from_pandas(nodes_df)
    logging.info("Finished computing walkability scores.")
    return G

Cell 4 Main Data Loading and Processing

In [29]:
import logging
import os
import json
from tqdm import tqdm
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from shapely import make_valid

def load_and_prepare_data():
    logging.info("Stage 1: Loading and preparing data...")
    
    # Define file paths and their corresponding keys
    data_files = {
        'neighborhoods': LANDUSE_NDVI_PATH,
        'buildings': OSM_BUILDINGS_PATH,
        'roads': OSM_ROADS_PATH,
        'trees': OSM_TREES_PATH,
        'transit': OSM_TRANSIT_PATH,
        'urban_masterplan': URBAN_MASTERPLAN_PATH,
        'accidents': ACCIDENTS_PATH,
        'population': POPULATION_PATH
    }
    
    data = {}
    
    # Load data with progress bar
    for key, path in tqdm(data_files.items(), desc="Loading files"):
        try:
            if key == 'population':
                with open(path, 'r') as f:
                    data[key] = pd.DataFrame(json.load(f))
                # Log columns of population_df to debug missing columns
                logging.info(f"Columns in population_df after loading: {list(data[key].columns)}")
            elif path.endswith('.geoparquet'):
                data[key] = gpd.read_parquet(path)
            else:
                data[key] = gpd.read_file(path)
            logging.info(f"Loaded {key} with shape {data[key].shape}")
        except Exception as e:
            logging.error(f"Failed to load {key} from {path}: {e}")
            raise
    
    # Log columns of neighborhoods_gdf to debug missing 'area_km2'
    neighborhoods_gdf = data['neighborhoods']
    logging.info(f"Columns in neighborhoods_gdf after loading: {list(neighborhoods_gdf.columns)}")
    
    # Ensure all GeoDataFrames are in the same CRS
    target_crs = 'EPSG:3826'
    for key in ['neighborhoods', 'buildings', 'roads', 'trees', 'transit', 'urban_masterplan', 'accidents']:
        if key in data and isinstance(data[key], gpd.GeoDataFrame):
            if data[key].crs != target_crs:
                data[key] = data[key].to_crs(target_crs)
                logging.info(f"Converted {key} to CRS {target_crs}")
    
    # Fix geometries in all GeoDataFrames
    for key in ['neighborhoods', 'buildings', 'roads', 'trees', 'transit', 'urban_masterplan', 'accidents']:
        if key in data and isinstance(data[key], gpd.GeoDataFrame):
            data[key]['geometry'] = data[key]['geometry'].apply(fix_geometry)
            invalid_geoms = data[key][~data[key].geometry.is_valid]
            if not invalid_geoms.empty:
                logging.warning(f"Found {len(invalid_geoms)} invalid geometries in {key} after fixing.")
                data[key] = data[key][data[key].geometry.is_valid]
    
    # Compute intersections for neighborhoods
    logging.info("Computing intersections for neighborhoods...")
    roads_gdf = data['roads']
    
    # Log columns of roads_gdf to debug missing 'length_m'
    logging.info(f"Columns in roads_gdf after loading: {list(roads_gdf.columns)}")
    
    # Extract endpoints from road segments
    logging.info("Extracting endpoints from road segments...")
    endpoints = []
    road_indices = []
    for idx, row in tqdm(roads_gdf.iterrows(), total=len(roads_gdf), desc="Extracting endpoints"):
        geom = row['geometry']
        if geom.geom_type == 'LineString':
            coords = list(geom.coords)
            start_point = Point(coords[0])
            end_point = Point(coords[-1])
            if start_point.is_valid and end_point.is_valid:
                endpoints.extend([start_point, end_point])
                road_indices.extend([idx, idx])
        elif geom.geom_type == 'MultiLineString':
            for line in geom.geoms:
                coords = list(line.coords)
                start_point = Point(coords[0])
                end_point = Point(coords[-1])
                if start_point.is_valid and end_point.is_valid:
                    endpoints.extend([start_point, end_point])
                    road_indices.extend([idx, idx])
    
    if not endpoints:
        logging.warning("No valid endpoints extracted from road segments. Using fallback method for intersections.")
        neighborhoods_gdf = data['neighborhoods']
        road_neighborhoods = gpd.sjoin(roads_gdf[['geometry']], neighborhoods_gdf[['geometry', 'LIE_NAME']], how='left', predicate='intersects')
        intersection_counts = road_neighborhoods.groupby('index_right').size()
        neighborhoods_gdf['intersection_count'] = intersection_counts.reindex(neighborhoods_gdf.index, fill_value=0)
    else:
        endpoints_gdf = gpd.GeoDataFrame({'geometry': endpoints, 'road_idx': road_indices}, crs=target_crs)
        
        # Create a spatial index for endpoints
        endpoints_sindex = endpoints_gdf.sindex
        
        # Cluster endpoints to identify intersections (points shared by 3 or more roads)
        logging.info("Building endpoint-to-road mapping...")
        endpoint_to_roads = {}
        for idx, point in tqdm(endpoints_gdf.iterrows(), total=len(endpoints_gdf), desc="Building endpoint-to-road mapping"):
            point_geom = point['geometry']
            road_idx = point['road_idx']
            point_key = (round(point_geom.x, 6), round(point_geom.y, 6))  # Round to avoid floating-point precision issues
            if point_key not in endpoint_to_roads:
                endpoint_to_roads[point_key] = set()
            endpoint_to_roads[point_key].add(road_idx)
        
        logging.info("Identifying intersections...")
        intersections = []
        for point_key, road_ids in tqdm(endpoint_to_roads.items(), desc="Identifying intersections"):
            if len(road_ids) >= 3:  # Intersection if shared by 3 or more roads
                intersections.append(Point(point_key))
        
        if not intersections:
            logging.warning("No intersections found using endpoint clustering. Using fallback method.")
            neighborhoods_gdf = data['neighborhoods']
            road_neighborhoods = gpd.sjoin(roads_gdf[['geometry']], neighborhoods_gdf[['geometry', 'LIE_NAME']], how='left', predicate='intersects')
            intersection_counts = road_neighborhoods.groupby('index_right').size()
            neighborhoods_gdf['intersection_count'] = intersection_counts.reindex(neighborhoods_gdf.index, fill_value=0)
        else:
            intersections_gdf = gpd.GeoDataFrame({'geometry': intersections}, crs=target_crs)
            
            # Count intersections per neighborhood
            logging.info("Counting intersections per neighborhood...")
            neighborhoods_gdf = data['neighborhoods']
            intersections_joined = gpd.sjoin(intersections_gdf, neighborhoods_gdf[['geometry', 'LIE_NAME']], how='left', predicate='intersects')
            intersection_counts = intersections_joined.groupby('index_right').size()
            neighborhoods_gdf['intersection_count'] = intersection_counts.reindex(neighborhoods_gdf.index, fill_value=0)
    
    # Compute or verify area_km2
    if 'area_km2' not in neighborhoods_gdf.columns:
        logging.warning("'area_km2' column missing in neighborhoods_gdf. Computing from geometry...")
        # Compute area in square meters, then convert to square kilometers
        neighborhoods_gdf['area_m2'] = neighborhoods_gdf['geometry'].area
        neighborhoods_gdf['area_km2'] = neighborhoods_gdf['area_m2'] / 1_000_000  # Convert m² to km²
        logging.info(f"Computed area_km2 stats:\n{neighborhoods_gdf['area_km2'].describe()}")
    else:
        logging.info(f"area_km2 already present. Stats:\n{neighborhoods_gdf['area_km2'].describe()}")
    
    # Compute intersection density
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6)
    logging.info(f"Intersection count stats:\n{neighborhoods_gdf['intersection_count'].describe()}")
    logging.info(f"Intersection density stats:\n{neighborhoods_gdf['intersection_density'].describe()}")
    
    # Cache the result
    try:
        neighborhoods_gdf.to_parquet(INTERSECTION_CACHE_PATH)
        logging.info(f"Saved neighborhoods with intersections to {INTERSECTION_CACHE_PATH}")
    except Exception as e:
        logging.error(f"Failed to save neighborhoods with intersections: {e}")
    
    data['neighborhoods'] = neighborhoods_gdf
    
    # Compute tree count per neighborhood
    logging.info("Computing tree count per neighborhood...")
    trees_gdf = data['trees']
    trees_joined = gpd.sjoin(trees_gdf[['geometry']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    tree_counts = trees_joined.groupby('index_right').size()
    data['neighborhoods']['tree_count'] = tree_counts.reindex(data['neighborhoods'].index, fill_value=0)
    
    # Compute transit count per neighborhood
    logging.info("Computing transit count per neighborhood...")
    transit_gdf = data['transit']
    transit_joined = gpd.sjoin(transit_gdf[['geometry']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    transit_counts = transit_joined.groupby('index_right').size()
    data['neighborhoods']['transit_count'] = transit_counts.reindex(data['neighborhoods'].index, fill_value=0)
    
    # Compute accident count per neighborhood
    logging.info("Computing accident count per neighborhood...")
    accidents_gdf = data['accidents']
    accidents_buffered = accidents_gdf.copy()
    accidents_buffered['geometry'] = accidents_buffered['geometry'].buffer(BUFFER_DISTANCE)
    accidents_joined = gpd.sjoin(accidents_buffered[['geometry']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    accident_counts = accidents_joined.groupby('index_right').size()
    data['neighborhoods']['accident_count'] = accident_counts.reindex(data['neighborhoods'].index, fill_value=0)
    
    # Compute road density per neighborhood
    logging.info("Computing road density per neighborhood...")
    roads_gdf = data['roads']
    
    # Log columns of roads_gdf to debug missing 'length_m'
    logging.info(f"Columns in roads_gdf before computing road density: {list(roads_gdf.columns)}")
    
    # Compute length_m if missing
    if 'length_m' not in roads_gdf.columns:
        logging.warning("'length_m' column missing in roads_gdf. Computing from geometry...")
        roads_gdf['length_m'] = roads_gdf['geometry'].length  # Length in meters (since CRS is EPSG:3826)
        logging.info(f"Computed length_m stats:\n{roads_gdf['length_m'].describe()}")
    
    roads_joined = gpd.sjoin(roads_gdf[['geometry', 'length_m']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    road_lengths = roads_joined.groupby('index_right')['length_m'].sum()
    data['neighborhoods']['road_density'] = road_lengths.reindex(data['neighborhoods'].index, fill_value=0) / (data['neighborhoods']['area_km2'] * 1000)
    logging.info(f"Road density stats:\n{data['neighborhoods']['road_density'].describe()}")
    
    # Merge population data
    logging.info("Merging population data...")
    population_df = data['population']
    population_df['LIE_NAME'] = population_df['LIE_NAME'].astype(str).str.strip()
    data['neighborhoods']['LIE_NAME'] = data['neighborhoods']['LIE_NAME'].astype(str).str.strip()
    
    # Check for possible column names for total_population and elderly_percentage
    expected_cols = ['total_population', 'elderly_percentage']
    population_cols = list(population_df.columns)
    missing_cols = [col for col in expected_cols if col not in population_cols]
    
    if missing_cols:
        logging.warning(f"Expected columns {missing_cols} not found in population_df. Attempting to find alternatives...")
        # Possible alternative names
        total_pop_alt = None
        elderly_alt = None
        for col in population_cols:
            col_lower = col.lower()
            if 'population' in col_lower and total_pop_alt is None:
                total_pop_alt = col
                logging.info(f"Found alternative for total_population: {col}")
            if 'elderly' in col_lower and elderly_alt is None:
                elderly_alt = col
                logging.info(f"Found alternative for elderly_percentage: {col}")
        
        # Rename columns if alternatives are found
        if total_pop_alt:
            population_df = population_df.rename(columns={total_pop_alt: 'total_population'})
        else:
            logging.warning("No alternative found for total_population. Setting to 0.")
            population_df['total_population'] = 0
        if elderly_alt:
            population_df = population_df.rename(columns={elderly_alt: 'elderly_percentage'})
        else:
            logging.warning("No alternative found for elderly_percentage. Setting to 0.")
            population_df['elderly_percentage'] = 0
    
    # Perform the merge
    data['neighborhoods'] = data['neighborhoods'].merge(
        population_df[['LIE_NAME', 'total_population', 'elderly_percentage']],
        on='LIE_NAME',
        how='left'
    )
    
    # Compute land use percentages
    logging.info("Computing land use percentages for neighborhoods...")
    urban_masterplan_gdf = data['urban_masterplan']
    print_percentage_calculation(data['neighborhoods'], urban_masterplan_gdf, sample_size=3)
    
    for idx, row in data['neighborhoods'].iterrows():
        neighborhood_geom = row['geometry']
        if not neighborhood_geom.is_valid:
            continue
        relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
        if relevant_masterplan.empty:
            continue
        
        temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs='EPSG:3826')
        intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
        if intersected.empty:
            continue
        
        intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
        intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty]
        if intersected.empty:
            continue
        
        intersected['priority'] = intersected['Category'].map(CATEGORY_PRIORITY)
        intersected = intersected.sort_values(by='priority', ascending=False)
        
        total_area = intersected.geometry.union_all().area
        remaining_geom = intersected.geometry.union_all()
        category_areas = {}
        for category in intersected['Category'].unique():
            category_rows = intersected[intersected['Category'] == category]
            category_geom = category_rows.geometry.union_all()
            try:
                category_area_geom = category_geom.intersection(remaining_geom)
                category_area = category_area_geom.area
                category_areas[category] = category_area
                remaining_geom = remaining_geom.difference(category_area_geom)
            except Exception as e:
                logging.warning(f"Topology error for category {category} in neighborhood {row['LIE_NAME']}: {e}")
                category_areas[category] = 0.0
        
        for category in CATEGORY_PRIORITY.keys():
            area = category_areas.get(category, 0.0)
            percentage = (area / total_area * 100) if total_area > 0 else 0.0
            data['neighborhoods'].at[idx, f'land_use_{category.lower()}_percent'] = percentage
    
    # Fill NaN values in land use percentages
    for category in CATEGORY_PRIORITY.keys():
        col = f'land_use_{category.lower()}_percent'
        data['neighborhoods'][col] = data['neighborhoods'][col].fillna(0.0)
    
    # Fill NaN values in other columns
    for col in ['intersection_count', 'intersection_density', 'tree_count', 'transit_count', 'accident_count', 'road_density', 'total_population', 'elderly_percentage']:
        data['neighborhoods'][col] = data['neighborhoods'][col].fillna(0)
    
    # Print data structure summary
    print_data_structure(data)
    
    logging.info("Finished loading and preparing data.")
    return data

Cell 5 compute_intersection_counts

In [30]:
def compute_intersection_counts(neighborhoods_gdf, roads_gdf):
    logging.info("Computing intersection counts for neighborhoods...")
    
    # Extract endpoints from road segments
    def get_endpoints(line):
        if line is None or line.is_empty:
            return []
        coords = list(line.coords)
        return [Point(coords[0]), Point(coords[-1])]
    
    endpoints = []
    for idx, row in tqdm(roads_gdf.iterrows(), total=len(roads_gdf), desc="Extracting endpoints"):
        points = get_endpoints(row['geometry'])
        for point in points:
            endpoints.append({'geometry': point, 'road_idx': idx})
    
    endpoints_gdf = gpd.GeoDataFrame(endpoints, crs='EPSG:3826')
    
    # Build a mapping of endpoints to road indices
    endpoint_to_roads = {}
    for idx, row in tqdm(endpoints_gdf.iterrows(), total=len(endpoints_gdf), desc="Building endpoint-to-road mapping"):
        point = row['geometry']
        road_idx = row['road_idx']
        point_tuple = (point.x, point.y)
        if point_tuple not in endpoint_to_roads:
            endpoint_to_roads[point_tuple] = set()
        endpoint_to_roads[point_tuple].add(road_idx)
    
    # Identify intersections (endpoints shared by 3 or more roads)
    intersections = []
    for point_tuple, road_indices in tqdm(endpoint_to_roads.items(), desc="Identifying intersections"):
        if len(road_indices) >= 3:  # Intersection if 3 or more roads share the endpoint
            intersections.append({'geometry': Point(point_tuple)})
    
    if not intersections:
        logging.warning("No intersections found. Setting intersection counts to 0.")
        neighborhoods_gdf['intersection_count'] = 0
        neighborhoods_gdf['intersection_density'] = 0.0
        return neighborhoods_gdf
    
    intersections_gdf = gpd.GeoDataFrame(intersections, crs='EPSG:3826')
    
    # Spatial join to count intersections per neighborhood
    intersection_counts = gpd.sjoin(
        neighborhoods_gdf[['geometry', 'LIE_NAME']],
        intersections_gdf,
        how='left',
        predicate='contains'
    )
    intersection_counts = intersection_counts.groupby('LIE_NAME').size().reindex(neighborhoods_gdf['LIE_NAME'], fill_value=0)
    neighborhoods_gdf['intersection_count'] = intersection_counts
    
    # Compute intersection density (intersections per km²)
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_count'] / neighborhoods_gdf['area_km2']
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_density'].replace([np.inf, -np.inf], 0).fillna(0)
    
    logging.info(f"Intersection count stats: {neighborhoods_gdf['intersection_count'].describe()}")
    logging.info(f"Intersection density stats: {neighborhoods_gdf['intersection_density'].describe()}")
    
    return neighborhoods_gdf

def build_graph(data, force_recompute=False):
    logging.info("Stage 2: Building the graph...")
    
    # Compute data hash to check if graph needs recomputing
    data_hash = compute_data_hash(data)
    cached_hash = None
    if os.path.exists(GRAPH_DATA_HASH_PATH):
        with open(GRAPH_DATA_HASH_PATH, 'r') as f:
            cached_hash = f.read().strip()
    
    if not force_recompute and cached_hash == data_hash and all(
        os.path.exists(path) for path in [GRAPH_NODES_CACHE_PATH, GRAPH_EDGES_CACHE_PATH, GRAPH_NODE_ID_CACHE_PATH]
    ):
        logging.info("Data unchanged. Loading graph from cache...")
        nodes_df = cudf.read_parquet(GRAPH_NODES_CACHE_PATH)
        edges_df = cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
        with open(GRAPH_NODE_ID_CACHE_PATH, 'r') as f:
            node_id_to_index = json.load(f)
        G = cugraph.Graph()
        G.from_cudf_edgelist(
            edges_df,
            source='src',
            destination='dst',
            edge_attr='weight'
        )
        G._nodes = nodes_df
        logging.info("Graph loaded from cache.")
        return G
    
    neighborhoods_gdf = data['neighborhoods'].copy()
    buildings_gdf = data['buildings'].copy()
    roads_gdf = data['roads'].copy()
    trees_gdf = data['trees'].copy()
    transit_gdf = data['transit'].copy()
    
    # Create nodes for neighborhoods, buildings, roads, trees, and transit
    nodes = []
    node_id_to_index = {}
    current_idx = 0
    
    # Neighborhood nodes
    for idx, row in neighborhoods_gdf.iterrows():
        node_id = f"neighborhood_{row['LIE_NAME']}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'neighborhood',
            'LIE_NAME': row['LIE_NAME'],
            'geometry': row['geometry'].centroid,
            'ndvi_mean': row['ndvi_mean'],
            'total_population': row['total_population'],
            'elderly_percentage': row['elderly_percentage'],
            'area_km2': row['area_km2']
        })
        current_idx += 1
    
    # Building nodes
    for idx, row in buildings_gdf.iterrows():
        node_id = f"building_{idx}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'building',
            'LIE_NAME': None,
            'geometry': row['geometry'].centroid,
            'building_type': row['building'],
            'area_m2': row['area_m2']
        })
        current_idx += 1
    
    # Road nodes
    for idx, row in roads_gdf.iterrows():
        node_id = f"road_{idx}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'road',
            'LIE_NAME': None,
            'geometry': row['geometry'].centroid,
            'class': row['class'],
            'length_m': row['length_m']
        })
        current_idx += 1
    
    # Tree nodes
    for idx, row in trees_gdf.iterrows():
        node_id = f"tree_{idx}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'tree',
            'LIE_NAME': None,
            'geometry': row['geometry']
        })
        current_idx += 1
    
    # Transit nodes
    for idx, row in transit_gdf.iterrows():
        node_id = f"transit_{idx}"
        node_id_to_index[node_id] = current_idx
        nodes.append({
            'node_idx': current_idx,
            'node_id': node_id,
            'type': 'transit',
            'LIE_NAME': None,
            'geometry': row['geometry'],
            'class': row['class']
        })
        current_idx += 1
    
    nodes_df = pd.DataFrame(nodes)
    nodes_gdf = gpd.GeoDataFrame(nodes_df, geometry='geometry', crs='EPSG:3826')
    nodes_df = cudf.from_pandas(nodes_df.drop(columns=['geometry']))
    
    # Create edges based on spatial proximity
    edges = []
    nodes_gdf_sindex = nodes_gdf.sindex
    
    # Neighborhood-to-neighborhood edges (shared borders)
    logging.info("Creating neighborhood-to-neighborhood edges...")
    for idx1, row1 in neighborhoods_gdf.iterrows():
        geom1 = row1['geometry']
        node_idx1 = node_id_to_index[f"neighborhood_{row1['LIE_NAME']}"]
        possible_matches = list(nodes_gdf_sindex.query(geom1, predicate='intersects'))
        for idx2 in possible_matches:
            row2 = nodes_gdf.iloc[idx2]
            if row2['type'] != 'neighborhood':
                continue
            if row1['LIE_NAME'] == row2['LIE_NAME']:
                continue
            geom2 = neighborhoods_gdf[neighborhoods_gdf['LIE_NAME'] == row2['LIE_NAME']]['geometry'].iloc[0]
            if geom1.intersects(geom2):
                node_idx2 = node_id_to_index[f"neighborhood_{row2['LIE_NAME']}"]
                edges.append({
                    'src': node_idx1,
                    'dst': node_idx2,
                    'weight': 1.0
                })
    
    # Other edges (neighborhood to building, road, tree, transit)
    logging.info("Creating edges between neighborhoods and other entities...")
    for idx, row in tqdm(nodes_gdf.iterrows(), total=len(nodes_gdf), desc="Creating edges"):
        if row['type'] == 'neighborhood':
            continue
        geom = row['geometry']
        possible_matches = list(neighborhoods_gdf.sindex.query(geom, predicate='contains'))
        for match_idx in possible_matches:
            neighborhood = neighborhoods_gdf.iloc[match_idx]
            if neighborhood['geometry'].contains(geom):
                node_idx1 = node_id_to_index[row['node_id']]
                node_idx2 = node_id_to_index[f"neighborhood_{neighborhood['LIE_NAME']}"]
                weight = 1.0
                if row['type'] == 'transit':
                    weight = 2.0  # Higher weight for transit nodes
                edges.append({
                    'src': node_idx1,
                    'dst': node_idx2,
                    'weight': weight
                })
                edges.append({
                    'src': node_idx2,
                    'dst': node_idx1,
                    'weight': weight
                })
    
    edges_df = cudf.DataFrame(edges)
    
    # Build the graph
    G = cugraph.Graph()
    G.from_cudf_edgelist(
        edges_df,
        source='src',
        destination='dst',
        edge_attr='weight'
    )
    G._nodes = nodes_df
    
    # Cache the graph
    nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
    edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
    with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
        json.dump(node_id_to_index, f)
    with open(GRAPH_DATA_HASH_PATH, 'w') as f:
        f.write(data_hash)
    
    logging.info("Graph construction completed.")
    return G

def prepare_gnn_data(G):
    logging.info("Stage 3: Preparing data for GNN...")
    
    nodes_df = G._nodes.to_pandas()
    edges_df = G.edgelist.edgelist_df.to_pandas()
    
    # Create node features
    feature_columns = [
        'ndvi_mean', 'total_population', 'elderly_percentage', 'area_km2',
        'area_m2', 'length_m'
    ]
    features = []
    for idx, row in nodes_df.iterrows():
        node_features = []
        for col in feature_columns:
            value = row.get(col, 0.0)
            if pd.isna(value):
                value = 0.0
            node_features.append(value)
        
        # One-hot encode node type
        node_type = row['type']
        type_encoding = [0] * 5  # 5 types: neighborhood, building, road, tree, transit
        type_mapping = {
            'neighborhood': 0,
            'building': 1,
            'road': 2,
            'tree': 3,
            'transit': 4
        }
        type_idx = type_mapping.get(node_type, 0)
        type_encoding[type_idx] = 1
        node_features.extend(type_encoding)
        
        features.append(node_features)
    
    feature_matrix = np.array(features, dtype=np.float32)
    
    # Normalize numerical features
    numerical_features = feature_matrix[:, :len(feature_columns)]
    means = numerical_features.mean(axis=0)
    stds = numerical_features.std(axis=0)
    stds[stds == 0] = 1  # Avoid division by zero
    numerical_features = (numerical_features - means) / stds
    feature_matrix[:, :len(feature_columns)] = numerical_features
    
    # Create edge indices for PyG
    edge_index = torch.tensor(
        np.array([edges_df['src'].values, edges_df['dst'].values]),
        dtype=torch.long
    )
    edge_attr = torch.tensor(edges_df['weight'].values, dtype=torch.float)
    
    # Create target (walkability score) for neighborhood nodes
    y = np.zeros(len(nodes_df), dtype=np.float32)
    if 'walkability_score' in nodes_df.columns:
        walkability_scores = nodes_df['walkability_score'].fillna(0).values
        mask = nodes_df['type'] == 'neighborhood'
        y[mask] = walkability_scores[mask]
    else:
        logging.warning("Walkability scores not found in nodes_df. Setting targets to 0.")
    
    y = torch.tensor(y, dtype=torch.float)
    
    node_type_mapping = {
        'neighborhood': 0,
        'building': 1,
        'road': 2,
        'tree': 3,
        'transit': 4
    }
    node_type = nodes_df['type'].map(node_type_mapping).fillna(-1).astype(int).values
    node_type = torch.tensor(node_type, dtype=torch.long)
    
    data = Data(
        x=torch.tensor(feature_matrix, dtype=torch.float),
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=y,
        node_type=node_type
    )
    
    logging.info("GNN data prepared.")
    return data

Cell 6: Graph Construction (build_graph)

In [31]:
from multiprocessing import Pool

def compute_neighborhood_neighborhood_edges(args):
    idx, row, neighborhoods_gdf, neighborhood_sindex = args
    edges = []
    geom = row['geometry']
    possible_matches_index = list(neighborhood_sindex.intersection(geom.bounds))
    for other_idx in possible_matches_index:
        if other_idx != idx:
            other_row = neighborhoods_gdf.iloc[other_idx]
            other_geom = other_row['geometry']
            try:
                if geom.buffer(1e-3).intersects(other_geom.buffer(1e-3)) or geom.buffer(1e-3).touches(other_geom.buffer(1e-3)):
                    src = f"neighborhood_{idx}"
                    dst = f"neighborhood_{other_idx}"
                    edges.append({'src': src, 'dst': dst})
            except Exception as e:
                logging.warning(f"Error checking intersection between neighborhood {idx} and {other_idx}: {e}")
    return edges

def compute_neighborhood_building_edges(args):
    idx, row, buildings_gdf, building_sindex = args
    edges = []
    geom = row['geometry']
    possible_matches_index = list(building_sindex.intersection(geom.bounds))
    for building_idx in possible_matches_index:
        building_row = buildings_gdf.iloc[building_idx]
        building_geom = building_row['geometry']
        try:
            if geom.buffer(1e-3).intersects(building_geom.buffer(1e-3)):
                src = f"neighborhood_{idx}"
                dst = f"building_{building_idx}"
                edges.append({'src': src, 'dst': dst})
        except Exception as e:
            logging.warning(f"Error checking intersection between neighborhood {idx} and building {building_idx}: {e}")
    return edges

def compute_neighborhood_road_edges(args):
    idx, row, roads_gdf, road_sindex = args
    edges = []
    geom = row['geometry']
    possible_matches_index = list(road_sindex.intersection(geom.bounds))
    for road_idx in possible_matches_index:
        road_row = roads_gdf.iloc[road_idx]
        road_geom = road_row['geometry']
        try:
            if geom.buffer(1e-3).intersects(road_geom.buffer(1e-3)):
                src = f"neighborhood_{idx}"
                dst = f"road_{road_idx}"
                edges.append({'src': src, 'dst': dst})
        except Exception as e:
            logging.warning(f"Error checking intersection between neighborhood {idx} and road {road_idx}: {e}")
    return edges

def build_graph(data, force_recompute=False):
    logging.info("Stage 2: Building city graph...")
    current_hash = compute_data_hash(data)
    nodes_df = None
    edges_df = None
    node_id_to_index = {}

    if not force_recompute and os.path.exists(GRAPH_NODES_CACHE_PATH) and os.path.exists(GRAPH_EDGES_CACHE_PATH):
        try:
            with open(GRAPH_DATA_HASH_PATH, 'r') as f:
                cached_hash = f.read()
            if cached_hash == current_hash:
                logging.info("Data hash matches cached hash. Loading graph from cache...")
                nodes_df = cudf.read_parquet(GRAPH_NODES_CACHE_PATH)
                edges_df = cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
                with open(GRAPH_NODE_ID_CACHE_PATH, 'r') as f:
                    node_id_to_index = json.load(f)
                G = cugraph.Graph()
                G._nodes = nodes_df
                if not edges_df.empty:
                    G.from_cudf_edgelist(edges_df, source='src', destination='dst')
                logging.info(f"Loaded graph from cache: {len(nodes_df)} nodes, {len(edges_df)} edges")
                return G
        except Exception as e:
            logging.warning(f"Failed to load cached graph: {e}. Recomputing graph...")

    logging.info("Constructing graph nodes...")
    neighborhoods_gdf = data['neighborhoods']
    buildings_gdf = data['buildings']
    roads_gdf = data['roads']

    # Compute area_m2 for buildings if not present
    if 'area_m2' not in buildings_gdf.columns:
        logging.warning("'area_m2' column missing in buildings_gdf. Computing from geometry...")
        buildings_gdf['area_m2'] = buildings_gdf['geometry'].area
        logging.info(f"Computed area_m2 stats:\n{buildings_gdf['area_m2'].describe()}")

    # Create nodes for neighborhoods
    logging.info("Adding neighborhood nodes...")
    neighborhood_nodes = []
    for idx, row in tqdm(neighborhoods_gdf.iterrows(), total=len(neighborhoods_gdf), desc="Neighborhood nodes"):
        node_id = f"neighborhood_{idx}"
        node_id_to_index[node_id] = idx
        neighborhood_nodes.append({
            'vertex': node_id,
            'type': 'neighborhood',
            'LIE_NAME': row['LIE_NAME'],
            'ndvi_mean': row['ndvi_mean'],
            'tree_count': row['tree_count'],
            'transit_count': row['transit_count'],
            'accident_count': row['accident_count'],
            'road_density': row['road_density'],
            'intersection_density': row['intersection_density'],
            'total_population': row['total_population'],
            'elderly_percentage': row['elderly_percentage'],
            # Add land use features for later use
            **{f'land_use_{category.lower()}_percent': row.get(f'land_use_{category.lower()}_percent', 0.0) for category in CATEGORY_PRIORITY.keys()}
        })

    # Create nodes for buildings
    logging.info("Adding building nodes...")
    building_nodes = []
    for idx, row in tqdm(buildings_gdf.iterrows(), total=len(buildings_gdf), desc="Building nodes"):
        node_id = f"building_{idx}"
        node_id_to_index[node_id] = idx + len(neighborhoods_gdf)
        building_nodes.append({
            'vertex': node_id,
            'type': 'building',
            'building': row['building'],
            'area_m2': row['area_m2']
        })

    logging.info("Adding road nodes...")
    road_nodes = []
    for idx, row in tqdm(roads_gdf.iterrows(), total=len(roads_gdf), desc="Road nodes"):
        node_id = f"road_{idx}"
        node_id_to_index[node_id] = idx + len(neighborhoods_gdf) + len(buildings_gdf)
        road_nodes.append({
            'vertex': node_id,
            'type': 'road',
            'class': row['class'],
            'length_m': row['length_m']
        })

    # Combine all nodes
    nodes = neighborhood_nodes + building_nodes + road_nodes
    nodes_df = cudf.DataFrame(nodes)

    # Create spatial indices
    logging.info("Creating spatial indices...")
    neighborhoods_gdf = neighborhoods_gdf.copy()
    buildings_gdf = buildings_gdf.copy()
    roads_gdf = roads_gdf.copy()

    neighborhoods_gdf['geometry'] = neighborhoods_gdf['geometry'].apply(fix_geometry)
    buildings_gdf['geometry'] = buildings_gdf['geometry'].apply(fix_geometry)
    roads_gdf['geometry'] = roads_gdf['geometry'].apply(fix_geometry)

    # Drop rows with invalid geometries
    neighborhoods_gdf = neighborhoods_gdf[neighborhoods_gdf.geometry.is_valid & ~neighborhoods_gdf.geometry.is_empty]
    buildings_gdf = buildings_gdf[buildings_gdf.geometry.is_valid & ~buildings_gdf.geometry.is_empty]
    roads_gdf = roads_gdf[roads_gdf.geometry.is_valid & ~roads_gdf.geometry.is_empty]

    if neighborhoods_gdf.empty or buildings_gdf.empty or roads_gdf.empty:
        logging.error("One or more GeoDataFrames are empty after geometry validation.")
        raise ValueError("GeoDataFrames cannot be empty after geometry validation.")

    neighborhood_sindex = neighborhoods_gdf.sindex
    building_sindex = buildings_gdf.sindex
    road_sindex = roads_gdf.sindex

    # Create edges
    logging.info("Creating edges based on spatial proximity...")
    edges = []
    
    # Parallelize neighborhood-neighborhood edges
    logging.info("Computing neighborhood-neighborhood edges...")
    with Pool() as pool:
        tasks = [(i, row, neighborhoods_gdf, neighborhood_sindex) for i, row in neighborhoods_gdf.iterrows()]
        results = list(tqdm(pool.imap(compute_neighborhood_neighborhood_edges, tasks), total=len(tasks), desc="Neighborhood-Neighborhood edges"))
    for batch in results:
        edges.extend(batch)
    logging.info(f"Created {len(edges)} neighborhood-neighborhood edges")
    
    # Parallelize neighborhood-building edges
    logging.info("Computing neighborhood-building edges...")
    with Pool() as pool:
        tasks = [(i, row, buildings_gdf, building_sindex) for i, row in neighborhoods_gdf.iterrows()]
        results = list(tqdm(pool.imap(compute_neighborhood_building_edges, tasks), total=len(tasks), desc="Neighborhood-Building edges"))
    for batch in results:
        edges.extend(batch)
    logging.info(f"Created {len(edges)} total edges after neighborhood-building")
    
    # Parallelize neighborhood-road edges
    logging.info("Computing neighborhood-road edges...")
    with Pool() as pool:
        tasks = [(i, row, roads_gdf, road_sindex) for i, row in neighborhoods_gdf.iterrows()]
        results = list(tqdm(pool.imap(compute_neighborhood_road_edges, tasks), total=len(tasks), desc="Neighborhood-Road edges"))
    for batch in results:
        edges.extend(batch)
    logging.info(f"Created {len(edges)} total edges after neighborhood-road")
    
    edges_df = cudf.DataFrame(edges)
    
    # Validate edges (relaxed validation)
    valid_nodes = set(nodes_df['vertex'].to_pandas())
    if edges_df.empty:
        logging.warning("No edges created. Graph will have nodes but no edges.")
    else:
        edges_df = edges_df[edges_df['src'].isin(valid_nodes) & edges_df['dst'].isin(valid_nodes)]
        logging.info(f"After validation, {len(edges_df)} edges remain")
        if not edges_df.empty:
            logging.info(f"Sample edges after validation:\n{edges_df.head().to_pandas()}")
    
    # Create the graph
    G = cugraph.Graph()
    G._nodes = nodes_df
    if not edges_df.empty:
        G.from_cudf_edgelist(edges_df, source='src', destination='dst')
    else:
        logging.warning("No valid edges created. Graph will have nodes but no edges.")
    
    # Save graph data to cache
    logging.info("Saving graph data to cache...")
    try:
        nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
        edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
        with open(GRAPH_DATA_HASH_PATH, 'w') as f:
            f.write(current_hash)
        with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
            json.dump(node_id_to_index, f)
        logging.info("Successfully saved graph data to cache.")
    except Exception as e:
        logging.error(f"Failed to save graph data to cache: {e}")
    
    logging.info(f"City graph constructed: {len(nodes_df)} nodes, {len(edges_df)} edges")
    return G

Cell 7: Rule-Based Walkability Scores (compute_walkability_scores)

In [32]:
def compute_walkability_scores(G, data):
    """Compute walkability scores for neighborhoods and update the graph."""
    logging.info("Computing walkability scores for neighborhoods...")
    
    nodes_df = G._nodes.to_pandas()
    
    # Use the original neighborhoods GeoDataFrame
    walkability_components = compute_walkability_components_all(data['neighborhoods'], data)
    
    # Standardize LIE_NAME for merging
    nodes_df['LIE_NAME'] = nodes_df['LIE_NAME'].astype(str).str.strip()
    walkability_components['LIE_NAME'] = walkability_components['LIE_NAME'].astype(str).str.strip()
    
    # Log for debugging
    logging.info(f"Number of neighborhood nodes in nodes_df: {len(nodes_df[nodes_df['type'] == 'neighborhood'])}")
    logging.info(f"Number of entries in walkability_components: {len(walkability_components)}")
    logging.info(f"Sample LIE_NAME in nodes_df: {nodes_df['LIE_NAME'].head().tolist()}")
    logging.info(f"Sample LIE_NAME in walkability_components: {walkability_components['LIE_NAME'].head().tolist()}")
    
    # Map LIE_NAME to walkability_score and walkability_category
    nodes_df['walkability_score'] = np.nan
    nodes_df['walkability_category'] = np.nan
    walkability_dict = dict(zip(walkability_components['LIE_NAME'], walkability_components['walkability_score']))
    category_dict = dict(zip(walkability_components['LIE_NAME'], walkability_components['walkability_category']))
    nodes_df['walkability_score'] = nodes_df['LIE_NAME'].map(walkability_dict)
    nodes_df['walkability_category'] = nodes_df['LIE_NAME'].map(category_dict)
    
    # Check for unmatched nodes
    unmatched = nodes_df[nodes_df['type'] == 'neighborhood']['walkability_score'].isna().sum()
    if unmatched > 0:
        logging.error(f"Found {unmatched} neighborhood nodes without walkability scores. This should not happen after fixing computation.")
        raise ValueError("Failed to assign walkability scores to all neighborhood nodes.")
    
    G._nodes = cudf.from_pandas(nodes_df)
    
    logging.info("Finished computing walkability scores.")
    return G

Cell 8 prepare_gnn_data

In [33]:
def prepare_gnn_data(G):
    """Prepare data for GNN training."""
    logging.info("Preparing data for GNN training...")
    
    nodes_df = G._nodes.to_pandas()
    
    # Define features for different node types
    feature_sets = {
        'neighborhood': ['ndvi_mean', 'tree_count', 'transit_count', 'accident_count', 
                         'road_density', 'intersection_density', 'total_population', 'elderly_percentage',
                         'land_use_residential_percent', 'land_use_commercial_percent', 
                         'land_use_education_percent', 'land_use_city_open_area_percent',
                         'land_use_public_transportation_percent', 'land_use_pedestrian_percent'],
        'building': ['area_m2'],
        'road': ['length_m']
    }
    
    # Determine the maximum number of features (based on neighborhood nodes)
    max_features = max(len(features) for features in feature_sets.values())
    
    # Normalize features for each node type and pad to max_features
    feature_data = []
    for node_type, features in tqdm(feature_sets.items(), desc="Normalizing features by node type"):
        type_nodes = nodes_df[nodes_df['type'] == node_type]
        if type_nodes.empty:
            continue
        
        node_features = []
        for feature in features:
            if feature not in type_nodes.columns:
                logging.warning(f"Feature column {feature} not found in nodes_df for type {node_type}. Setting to 0.")
                type_nodes[feature] = 0
            feature_values = type_nodes[feature].astype(float).fillna(0)
            feature_max = feature_values.max()
            feature_min = feature_values.min()
            if feature_max == feature_min:
                logging.warning(f"Feature {feature} for type {node_type} has no variation (max=min). Setting to 0.")
                normalized = np.zeros_like(feature_values)
            else:
                normalized = (feature_values - feature_min) / (feature_max - feature_min)
            node_features.append(normalized)
        
        # Stack features for this node type
        feature_matrix = np.stack(node_features, axis=1)
        
        # Pad with zeros to match max_features
        if feature_matrix.shape[1] < max_features:
            padding = np.zeros((feature_matrix.shape[0], max_features - feature_matrix.shape[1]))
            feature_matrix = np.hstack((feature_matrix, padding))
        
        feature_data.append((type_nodes.index, feature_matrix))
    
    # Combine features for all nodes
    all_indices = []
    all_features = []
    for indices, features in feature_data:
        all_indices.extend(indices)
        all_features.append(features)
    
    all_features = np.vstack(all_features)
    nodes_df.loc[all_indices, 'feature_index'] = np.arange(len(all_indices))
    
    # Map node types to numerical values
    type_to_int = {'neighborhood': 0, 'building': 1, 'road': 2}
    node_type = nodes_df['type'].map(type_to_int).astype(int).values
    node_type_tensor = torch.tensor(node_type, dtype=torch.long)
    
    # Prepare edge index
    if G.edgelist is not None and not G.edgelist.edgelist_df.empty:
        edges_df = G.edgelist.edgelist_df.to_pandas()
        
        # Create a dictionary for fast lookups of feature_index
        vertex_to_feature_index = dict(zip(nodes_df['vertex'], nodes_df['feature_index']))
        
        edge_index = []
        for _, edge in tqdm(edges_df.iterrows(), total=len(edges_df), desc="Preparing edge index"):
            src_idx = vertex_to_feature_index.get(edge['src'])
            dst_idx = vertex_to_feature_index.get(edge['dst'])
            if src_idx is not None and dst_idx is not None:
                edge_index.append([int(src_idx), int(dst_idx)])
        
        if edge_index:
            edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        else:
            logging.warning("No valid edges after mapping node IDs to indices. GNN will treat nodes independently.")
            edge_index = torch.zeros((2, 0), dtype=torch.long)
    else:
        logging.warning("No edges in the graph. GNN will treat nodes independently.")
        edge_index = torch.zeros((2, 0), dtype=torch.long)
    
    # Prepare node features and labels
    x = torch.tensor(all_features, dtype=torch.float)
    y = torch.tensor(nodes_df['walkability_score'].fillna(0).values, dtype=torch.float)
    
    # Create masks for neighborhood nodes (for training)
    neighborhood_mask = nodes_df['type'] == 'neighborhood'
    train_mask = torch.zeros(len(nodes_df), dtype=torch.bool)
    train_mask[neighborhood_mask] = True
    
    data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, node_type=node_type_tensor)
    
    logging.info(f"Prepared GNN data: {len(nodes_df)} nodes, {edge_index.shape[1]} edges")
    logging.info(f"Feature matrix shape: {x.shape}")
    return data

Cell 9: WalkabilityGNN, train_gnn_model, predict_walkability

In [34]:
class WalkabilityGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads):
        super(WalkabilityGNN, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads)
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1)
        # Add a linear layer for cases with no edges
        self.linear = torch.nn.Linear(in_channels, out_channels)
    
    def forward(self, x, edge_index, edge_attr=None):
        if edge_index.shape[1] == 0:  # No edges
            logging.warning("No edges in the graph. Using linear layer for node features only.")
            return self.linear(x).squeeze()
        x = self.conv1(x, edge_index, edge_attr).relu()
        x = self.conv2(x, edge_index, edge_attr)
        return x

def train_gnn_model(data):
    """Train the GNN model to predict walkability scores."""
    logging.info("Stage 4: Training GNN model...")
    
    # Validate input data
    if data.x.shape[0] == 0 or data.y.shape[0] == 0:
        logging.error("Input data for GNN training is empty.")
        raise ValueError("Input data for GNN training cannot be empty.")
    
    # Log target distribution
    y_stats = pd.Series(data.y.cpu().numpy()).describe()
    logging.info(f"Target (walkability_score) distribution:\n{y_stats}")
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = WalkabilityGNN(
        in_channels=data.x.shape[1],
        hidden_channels=128,
        out_channels=1,
        heads=4
    ).to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    
    # Use the existing train_mask for splitting
    train_mask = data.train_mask
    num_train_nodes = train_mask.sum().item()
    if num_train_nodes == 0:
        logging.error("No nodes available for training. Check train_mask.")
        raise ValueError("No nodes available for training.")
    
    # Split training nodes into train and validation sets (80-20 split)
    indices = torch.arange(num_train_nodes, device=device)
    train_size = int(0.8 * num_train_nodes)
    train_indices, val_indices = indices[:train_size], indices[train_size:]
    
    full_train_mask = data.train_mask.clone()
    full_val_mask = data.train_mask.clone()
    
    # Map indices back to the original node indices
    train_node_indices = torch.where(train_mask)[0][train_indices]
    val_node_indices = torch.where(train_mask)[0][val_indices]
    
    train_mask = torch.zeros_like(full_train_mask, dtype=torch.bool)
    val_mask = torch.zeros_like(full_val_mask, dtype=torch.bool)
    train_mask[train_node_indices] = True
    val_mask[val_node_indices] = True
    
    # Early stopping parameters
    best_val_loss = float('inf')
    patience = 15
    patience_counter = 0
    
    model.train()
    for epoch in tqdm(range(100), desc="Training epochs"):  # Reduced max epochs for efficiency
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.edge_attr).squeeze()
        train_loss = F.mse_loss(out[train_mask], data.y[train_mask])
        train_loss.backward()
        optimizer.step()
        
        # Validation step
        model.eval()
        with torch.no_grad():
            val_loss = F.mse_loss(out[val_mask], data.y[val_mask])
        model.train()
        
        scheduler.step(val_loss)
        
        if epoch % 10 == 0:
            logging.info(f"Epoch {epoch}, Train Loss: {train_loss.item():.4f}, Val Loss: {val_loss.item():.4f}")
        
        # Early stopping based on validation loss
        if val_loss.item() < best_val_loss:
            best_val_loss = val_loss.item()
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience:
            logging.info(f"Early stopping at epoch {epoch}. Best validation loss: {best_val_loss:.4f}")
            break

    model.eval()
    logging.info("Finished training GNN model.")
    return model

def predict_walkability(G, model):
    """Predict walkability scores using the trained GNN model."""
    logging.info("Predicting walkability with GNN...")
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    data = prepare_gnn_data(G)
    data = data.to(device)

    with torch.no_grad():
        predictions = model(data.x, data.edge_index, data.edge_attr).squeeze()

    predictions = predictions.cpu().numpy()
    # Apply sigmoid and clip to ensure predictions are in [0, 1]
    predictions = 1 / (1 + np.exp(-predictions))
    predictions = np.clip(predictions, 0, 1)

    nodes_df = G._nodes.to_pandas()
    nodes_df['walkability_gnn'] = predictions
    
    # Ensure no NaN values in predictions
    if nodes_df['walkability_gnn'].isna().any():
        logging.warning(f"Found {nodes_df['walkability_gnn'].isna().sum()} NaN values in GNN predictions. Filling with 0.")
        nodes_df['walkability_gnn'] = nodes_df['walkability_gnn'].fillna(0)
    
    # Log prediction distribution
    prediction_stats = pd.Series(predictions).describe()
    logging.info(f"GNN prediction (walkability_gnn) distribution:\n{prediction_stats}")
    
    G._nodes = cudf.from_pandas(nodes_df)

    logging.info("Finished predicting walkability with GNN.")
    return G

Cell 10: Interactive Map Generation (create_interactive_map)

In [35]:
def create_interactive_map(G, data):
    """Generate an interactive Kepler.gl map to visualize walkability scores."""
    logging.info("Generating interactive Kepler.gl map...")
    
    nodes_df = G._nodes.to_pandas()
    neighborhoods_gdf = data['neighborhoods'].copy()

    # Standardize LIE_NAME for merging
    nodes_df['LIE_NAME'] = nodes_df['LIE_NAME'].astype(str).str.strip()
    neighborhoods_gdf['LIE_NAME'] = neighborhoods_gdf['LIE_NAME'].astype(str).str.strip()

    # Filter for neighborhood nodes and select necessary columns
    neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood'][['LIE_NAME', 'walkability_score', 'walkability_gnn', 'walkability_category']]

    # Log for debugging
    nodes_lie_names = set(neighborhood_nodes['LIE_NAME'])
    gdf_lie_names = set(neighborhoods_gdf['LIE_NAME'])
    logging.info(f"Neighborhood nodes count: {len(neighborhood_nodes)}")
    logging.info(f"Neighborhoods_gdf count: {len(neighborhoods_gdf)}")
    logging.info(f"Sample LIE_NAME in nodes_df: {list(nodes_lie_names)[:5]}")
    logging.info(f"Sample LIE_NAME in neighborhoods_gdf: {list(gdf_lie_names)[:5]}")
    logging.info(f"Common LIE_NAMEs: {len(nodes_lie_names & gdf_lie_names)}")
    logging.info(f"Nodes LIE_NAMEs not in GDF: {list(nodes_lie_names - gdf_lie_names)}")
    logging.info(f"GDF LIE_NAMEs not in nodes: {list(gdf_lie_names - nodes_lie_names)}")
    logging.info(f"Nodes nulls: {neighborhood_nodes.isna().sum().to_dict()}")
    logging.info(f"GDF geometry nulls: {neighborhoods_gdf['geometry'].isna().sum()}")

    # Merge data
    map_data = neighborhoods_gdf[['LIE_NAME', 'geometry']].merge(
        neighborhood_nodes,
        on='LIE_NAME',
        how='left'
    )

    # Drop duplicates in-place
    map_data.drop_duplicates(subset=['LIE_NAME'], keep='first', inplace=True)

    # Log merge results and score distributions
    logging.info(f"Merged map_data rows: {len(map_data)}")
    logging.info(f"Walkability score nulls: {map_data['walkability_score'].isna().sum()}")
    logging.info(f"Walkability GNN nulls: {map_data['walkability_gnn'].isna().sum()}")
    logging.info(f"Walkability score distribution in map_data:\n{map_data['walkability_score'].describe()}")
    logging.info(f"Walkability GNN distribution in map_data:\n{map_data['walkability_gnn'].describe()}")
    logging.info(f"Walkability category distribution in map_data:\n{map_data['walkability_category'].value_counts()}")

    # Fill NaN values
    map_data['walkability_score'] = map_data['walkability_score'].fillna(0)
    map_data['walkability_gnn'] = map_data['walkability_gnn'].fillna(0)
    map_data['walkability_category'] = map_data['walkability_category'].fillna('low')

    # Convert to GeoDataFrame and transform CRS
    map_data = gpd.GeoDataFrame(map_data, geometry='geometry', crs='EPSG:3826')
    map_data['geometry'] = map_data['geometry'].to_crs('EPSG:4326')
    
    # Prepare data for Kepler.gl
    kepler_data = {
        'neighborhoods': map_data[['LIE_NAME', 'walkability_score', 'walkability_gnn', 'walkability_category', 'geometry']].to_json()
    }

    # Kepler.gl configuration
    config = {
        "version": "v1",
        "config": {
            "visState": {
                "layers": [
                    {
                        "id": "neighborhoods",
                        "type": "geojson",
                        "config": {
                            "dataId": "neighborhoods",
                            "label": "Neighborhoods",
                            "color": [18, 147, 154],
                            "columns": {
                                "geojson": "geometry"
                            },
                            "isVisible": True,
                            "visConfig": {
                                "opacity": 0.7,
                                "strokeOpacity": 0.9,
                                "thickness": 1,
                                "strokeColor": [255, 255, 255],
                                "colorRange": {
                                    "name": "Global Warming",
                                    "type": "sequential",
                                    "colors": [
                                        "#5A1846", "#900C3F", "#C70039",
                                        "#E3611C", "#F1920E", "#FFC107"
                                    ]
                                },
                                "strokeColorRange": {
                                    "name": "Global Warming",
                                    "type": "sequential",
                                    "colors": [
                                        "#5A1846", "#900C3F", "#C70039",
                                        "#E3611C", "#F1920E", "#FFC107"
                                    ]
                                },
                                "colorField": {
                                    "name": "walkability_gnn",
                                    "type": "real"
                                },
                                "colorScale": "quantile"
                            }
                        },
                        "visualChannels": {
                            "colorField": {
                                "name": "walkability_gnn",
                                "type": "real"
                            },
                            "colorScale": "quantile"
                        }
                    }
                ],
                "interactionConfig": {
                    "tooltip": {
                        "fieldsToShow": {
                            "neighborhoods": [
                                {"name": "LIE_NAME", "format": None},
                                {"name": "walkability_score", "format": "{:.3f}"},
                                {"name": "walkability_gnn", "format": "{:.3f}"},
                                {"name": "walkability_category", "format": None}
                            ]
                        },
                        "enabled": True
                    }
                }
            },
            "mapState": {
                "latitude": 25.0330,
                "longitude": 121.5654,
                "zoom": 11
            },
            "mapStyle": {
                "styleType": "dark"
            }
        }
    }

    map_1 = KeplerGl(height=800, data=kepler_data, config=config)
    map_path = os.path.join(BASE_DIR, 'taipei_walkability_map.html')
    map_1.save_to_html(file_name=map_path)
    logging.info(f"Interactive map generated and saved as {map_path}")
    print(f"Map saved to {map_path}!")

Cell 11: Main Execution (main)

In [36]:
def main(force_recompute_graph=False):
    """Main execution pipeline for the analysis."""
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    os.makedirs(SUBGRAPH_DIR, exist_ok=True)
    logging.info(f"Ensured subgraph directory exists: {SUBGRAPH_DIR}")

    # Track timing for each step
    timings = {}
    
    try:
        # Step 1: Load and prepare data
        start_time = time.time()
        print("Starting load_and_prepare_data...")
        data = load_and_prepare_data()
        timings['load_and_prepare_data'] = time.time() - start_time

        # Step 2: Compute road type accident correlation
        start_time = time.time()
        print("Starting compute_road_type_accident_correlation...")
        road_accident_summary = compute_road_type_accident_correlation(
            data['roads'], data['neighborhoods'], data['accidents']
        )
        timings['compute_road_type_accident_correlation'] = time.time() - start_time

        # Step 3: Build graph
        start_time = time.time()
        print("Starting build_graph...")
        G = build_graph(data, force_recompute=force_recompute_graph)
        timings['build_graph'] = time.time() - start_time

        # Validate edge counts
        edge_count = G.edgelist.edgelist_df.shape[0] if G.edgelist else 0
        logging.info(f"Graph edge count: {edge_count}")
        if edge_count == 0:
            logging.warning("Graph has no edges. GNN will not utilize graph structure.")

        # Step 4: Compute walkability scores
        start_time = time.time()
        print("Starting compute_walkability_scores...")
        G = compute_walkability_scores(G, data)
        timings['compute_walkability_scores'] = time.time() - start_time

        # Step 5: Prepare GNN data
        start_time = time.time()
        print("Starting prepare_gnn_data...")
        data_gnn = prepare_gnn_data(G)
        timings['prepare_gnn_data'] = time.time() - start_time

        # Step 6: Train GNN model
        start_time = time.time()
        print("Starting train_gnn_model...")
        model = train_gnn_model(data_gnn)
        timings['train_gnn_model'] = time.time() - start_time

        # Step 7: Predict walkability
        start_time = time.time()
        print("Starting predict_walkability...")
        G = predict_walkability(G, model)
        timings['predict_walkability'] = time.time() - start_time

        # Step 8: Create interactive map
        start_time = time.time()
        print("Starting create_interactive_map...")
        create_interactive_map(G, data)
        timings['create_interactive_map'] = time.time() - start_time

        # Final validation: Check walkability scores
        nodes_df = G._nodes.to_pandas()
        neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood']
        walkability_score_stats = neighborhood_nodes['walkability_score'].describe()
        walkability_gnn_stats = neighborhood_nodes['walkability_gnn'].describe()
        walkability_category_dist = neighborhood_nodes['walkability_category'].value_counts()
        non_zero_walkability = (neighborhood_nodes['walkability_score'] > 0).sum()
        non_zero_walkability_gnn = (neighborhood_nodes['walkability_gnn'] > 0).sum()
        
        logging.info("Final validation - Walkability scores in neighborhood nodes:")
        logging.info(f"Walkability score distribution:\n{walkability_score_stats}")
        logging.info(f"Walkability GNN distribution:\n{walkability_gnn_stats}")
        logging.info(f"Walkability category distribution:\n{walkability_category_dist}")
        logging.info(f"Number of neighborhood nodes with non-zero walkability_score: {non_zero_walkability}/{len(neighborhood_nodes)}")
        logging.info(f"Number of neighborhood nodes with non-zero walkability_gnn: {non_zero_walkability_gnn}/{len(neighborhood_nodes)}")

        # Check for low variation in walkability scores
        if walkability_score_stats['std'] < 0.05:
            logging.warning("Walkability scores have low variation (std < 0.05). Components may need adjustment.")
        if walkability_gnn_stats['std'] < 0.05:
            logging.warning("GNN predictions have low variation (std < 0.05). Check edge creation and model training.")

        # Compute correlation between walkability_score and walkability_gnn
        corr, p_value = pearsonr(neighborhood_nodes['walkability_score'], neighborhood_nodes['walkability_gnn'])
        logging.info(f"Correlation between walkability_score and walkability_gnn: {corr:.2f} (p-value: {p_value:.2f})")
        if corr < 0.5:
            logging.warning("Low correlation between walkability_score and walkability_gnn. GNN predictions may not align well with rule-based scores.")

        # Log timing summary
        logging.info("Processing complete. Timing summary:")
        for step, duration in timings.items():
            logging.info(f"{step}: {duration:.2f} seconds")
        
        print("Pipeline completed successfully.")
        print(G.edgelist.edgelist_df.to_pandas().head())

    except Exception as e:
        logging.error(f"Pipeline failed with error: {str(e)}")
        raise

if __name__ == "__main__":
    main(force_recompute_graph=True)

2025-04-22 21:47:46,856 - INFO - Ensured subgraph directory exists: /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/subgraphs
2025-04-22 21:47:46,857 - INFO - Stage 1: Loading and preparing data...


Starting load_and_prepare_data...


Loading files:   0%|          | 0/8 [00:00<?, ?it/s]2025-04-22 21:47:46,961 - INFO - Loaded neighborhoods with shape (456, 57)
Loading files:  12%|█▎        | 1/8 [00:00<00:00,  9.69it/s]2025-04-22 21:47:48,157 - INFO - Loaded buildings with shape (74306, 9)
Loading files:  25%|██▌       | 2/8 [00:01<00:04,  1.34it/s]2025-04-22 21:47:48,209 - INFO - Loaded roads with shape (81444, 2)
2025-04-22 21:47:48,244 - INFO - Loaded trees with shape (5019, 12)
2025-04-22 21:47:48,319 - INFO - Loaded transit with shape (29892, 11)
Loading files:  62%|██████▎   | 5/8 [00:01<00:00,  3.99it/s]2025-04-22 21:47:48,973 - INFO - Loaded urban_masterplan with shape (15521, 15)
2025-04-22 21:47:50,082 - INFO - Loaded accidents with shape (56133, 8)
Loading files:  88%|████████▊ | 7/8 [00:03<00:00,  1.97it/s]2025-04-22 21:47:50,091 - INFO - Columns in population_df after loading: ['LIE_NAME', 'Total_Population', 'Elderly_Percentage']
2025-04-22 21:47:50,092 - INFO - Loaded population with shape (456, 3)
Loa


--- Percentage Calculation Process ---

Neighborhood: 板溪里 (Index: 373)
Total unique master plan area: 63777.59 m²
Area of City_Open_Area (priority 10): 478.13 m²
Area of Education (priority 6): 8173.36 m²
Area of Commercial (priority 4): 23017.42 m²
Area of Residential (priority 3): 32108.69 m²

Percentages:
City_Open_Area: 0.75%
Education: 12.82%
Commercial: 36.09%
Residential: 50.34%
Sum of percentages: 100.00%

Neighborhood: 芝山里 (Index: 39)
Total unique master plan area: 1061285.95 m²
Area of Education (priority 6): 80442.15 m²
Area of Residential (priority 3): 196204.41 m²
Area of Natural (priority 2): 775753.11 m²
Area of River (priority 1): 2754.93 m²
Area of Government (priority 1): 6131.35 m²

Percentages:
Education: 7.58%
Residential: 18.49%
Natural: 73.10%
River: 0.26%
Government: 0.58%
Sum of percentages: 100.00%

Neighborhood: 和平里 (Index: 340)
Total unique master plan area: 98073.11 m²
Area of City_Open_Area (priority 10): 4608.53 m²
Area of Public_Transportation (priority

2025-04-22 21:48:20,901 - INFO - Finished loading and preparing data.
2025-04-22 21:48:20,948 - INFO - Computing correlation between road types and accident density...



--- Data Structure Summary ---

Dataset: neighborhoods
Shape: (456, 67)
Columns and Data Types:
LIE_NAME                            object
SECT_NAME                           object
2024population                       int32
land_use_city_open_area_count        int32
land_use_city_open_area_area_m2    float64
                                    ...   
transit_count                        int64
accident_count                       int64
road_density                       float64
total_population                     int64
elderly_percentage                 float64
Length: 67, dtype: object
Missing values (total): 0
Missing values per column:
LIE_NAME                           0
SECT_NAME                          0
2024population                     0
land_use_city_open_area_count      0
land_use_city_open_area_area_m2    0
                                  ..
transit_count                      0
accident_count                     0
road_density                       0
total_population  

2025-04-22 21:48:21,412 - INFO - Assigning accidents to nearest road...
2025-04-22 21:48:26,095 - INFO - Matched 27933 accidents out of 56133
2025-04-22 21:48:26,099 - INFO - Reassigning 2200 accidents from footway/cycleway...
2025-04-22 21:48:26,151 - INFO - Reassigned 612 accidents to wider roads
2025-04-22 21:48:26,158 - INFO - Accidents by road type:
class
bridleway           0
cycleway          110
footway          1478
living_street      36
motorway           62
path               40
pedestrian         36
primary          3503
residential      4474
secondary        9515
service          2231
steps              22
tertiary         3985
track               4
trunk            1276
unclassified      719
unknown            23
Name: accident_count, dtype: int64
2025-04-22 21:48:26,167 - INFO - Road type counts:
class
service          21204
footway          16755
residential      14861
tertiary          5113
secondary         3869
path              3610
steps             2968
unclassifi

Road type counts:
class
service          21204
footway          16755
residential      14861
tertiary          5113
secondary         3869
path              3610
steps             2968
unclassified      1894
primary           1209
cycleway           825
track              716
trunk              593
motorway           313
pedestrian         297
living_street      264
unknown             54
bridleway            2
Name: count, dtype: int64

--- Road Type Accident Density Summary ---
            class    length_m  accident_count  accident_density  width_rank
1        cycleway   263682.07             109              0.12           1
2         footway  1767503.72            1443              0.33           1
3   living_street    23929.78              35              0.76           3
4        motorway   215317.82              62              1.55           5
5            path   720602.60              39              0.04           1
6      pedestrian    31830.26              35              

2025-04-22 21:48:26,712 - INFO - Bar chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_bar.png


Bar chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_bar.png


2025-04-22 21:48:27,135 - INFO - Box chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_box.png


Box chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_box.png


2025-04-22 21:48:27,386 - INFO - Scatter plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_scatter.png
2025-04-22 21:48:27,389 - INFO - Top 3 road types by accident density:
        class  accident_density
7     primary             30.15
9   secondary             26.77
14      trunk              8.54
2025-04-22 21:48:27,390 - INFO - Stage 2: Building city graph...
2025-04-22 21:48:27,391 - INFO - Dataset neighborhoods column types:
LIE_NAME                            object
SECT_NAME                           object
2024population                       int32
land_use_city_open_area_count        int32
land_use_city_open_area_area_m2    float64
                                    ...   
accident_count                       int64
road_density                       float64
total_population                     int64
elderly_percentage                 float64
avg_road_accident_density          float64
Length: 68, dtype: object
2025-04-22 21:48:27,400 - I

Scatter plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_scatter.png
Top 3 road types by accident density:
        class  accident_density
7     primary             30.15
9   secondary             26.77
14      trunk              8.54
Starting build_graph...


Neighborhood nodes: 100%|██████████| 456/456 [00:00<00:00, 19563.67it/s]
2025-04-22 21:48:27,454 - INFO - Adding building nodes...
Building nodes: 100%|██████████| 74306/74306 [00:01<00:00, 56245.83it/s]
2025-04-22 21:48:28,777 - INFO - Adding road nodes...
Road nodes: 100%|██████████| 81444/81444 [00:01<00:00, 52667.21it/s]
2025-04-22 21:48:30,717 - INFO - Creating spatial indices...
2025-04-22 21:48:33,218 - INFO - Creating edges based on spatial proximity...
2025-04-22 21:48:33,219 - INFO - Computing neighborhood-neighborhood edges...
Neighborhood-Neighborhood edges: 100%|██████████| 456/456 [00:03<00:00, 120.49it/s]
2025-04-22 21:48:37,363 - INFO - Created 2796 neighborhood-neighborhood edges
2025-04-22 21:48:37,363 - INFO - Computing neighborhood-building edges...
Neighborhood-Building edges: 100%|██████████| 456/456 [05:20<00:00,  1.42it/s]
2025-04-22 21:53:58,231 - INFO - Created 77576 total edges after neighborhood-building
2025-04-22 21:53:58,231 - INFO - Computing neighborhoo

Starting compute_walkability_scores...


2025-04-22 21:58:20,308 - INFO - Filtered 20977 roads of types ['footway', 'pedestrian', 'cycleway'] out of 81444 total roads.
2025-04-22 21:58:20,309 - INFO - Checking spatial overlap between neighborhoods and pedestrian_roads...
2025-04-22 21:58:20,630 - INFO - neighborhoods bounds: [ 296266.05303084 2761514.89561711  317197.26073793 2789176.16901603]
2025-04-22 21:58:20,631 - INFO - pedestrian_roads bounds: [ 295216.88848867 2758756.21684514  314125.37860428 2787925.43931427]
2025-04-22 21:58:20,631 - INFO - Bounding boxes overlap: True
2025-04-22 21:58:20,640 - INFO - Sample intersection check: 337 intersections found out of 10 samples.
2025-04-22 21:58:20,749 - INFO - Pedestrian roads join resulted in 24636 matches.
2025-04-22 21:58:20,754 - INFO - Pedestrian road density stats:
count    456.000000
mean      15.841759
std       10.276229
min        0.000000
25%        8.202007
50%       15.006599
75%       21.558429
max       57.267153
Name: pedestrian_road_density, dtype: float64

Starting prepare_gnn_data...


Normalizing features by node type: 100%|██████████| 3/3 [00:00<00:00, 69.27it/s]
Preparing edge index: 100%|██████████| 143871/143871 [00:01<00:00, 97204.32it/s]
2025-04-22 21:58:22,944 - INFO - Prepared GNN data: 156206 nodes, 0 edges
2025-04-22 21:58:22,945 - INFO - Feature matrix shape: torch.Size([156206, 14])
2025-04-22 21:58:22,954 - INFO - Stage 4: Training GNN model...
2025-04-22 21:58:22,961 - INFO - Target (walkability_score) distribution:
count    156206.000000
mean          0.000732
std           0.013985
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           0.487566
dtype: float64


Starting train_gnn_model...


2025-04-22 21:58:23,463 - INFO - Epoch 0, Train Loss: 0.1660, Val Loss: 0.1804
2025-04-22 21:58:23,516 - INFO - Epoch 10, Train Loss: 0.1376, Val Loss: 0.1511
2025-04-22 21:58:23,554 - INFO - Epoch 20, Train Loss: 0.1125, Val Loss: 0.1250
2025-04-22 21:58:23,592 - INFO - Epoch 30, Train Loss: 0.0908, Val Loss: 0.1023
2025-04-22 21:58:23,632 - INFO - Epoch 40, Train Loss: 0.0726, Val Loss: 0.0831
2025-04-22 21:58:23,670 - INFO - Epoch 50, Train Loss: 0.0576, Val Loss: 0.0671
2025-04-22 21:58:24,010 - INFO - Epoch 60, Train Loss: 0.0454, Val Loss: 0.0540
2025-04-22 21:58:24,055 - INFO - Epoch 70, Train Loss: 0.0358, Val Loss: 0.0435
2025-04-22 21:58:24,098 - INFO - Epoch 80, Train Loss: 0.0283, Val Loss: 0.0351
2025-04-22 21:58:24,134 - INFO - Epoch 90, Train Loss: 0.0226, Val Loss: 0.0286
Training epochs: 100%|██████████| 100/100 [00:01<00:00, 95.87it/s]
2025-04-22 21:58:24,165 - INFO - Finished training GNN model.
2025-04-22 21:58:24,166 - INFO - Predicting walkability with GNN...
2025

Starting predict_walkability...


Normalizing features by node type: 100%|██████████| 3/3 [00:00<00:00, 73.09it/s]
Preparing edge index: 100%|██████████| 143871/143871 [00:01<00:00, 90284.28it/s]
2025-04-22 21:58:26,111 - INFO - Prepared GNN data: 156206 nodes, 0 edges
2025-04-22 21:58:26,112 - INFO - Feature matrix shape: torch.Size([156206, 14])
2025-04-22 21:58:26,222 - INFO - GNN prediction (walkability_gnn) distribution:
count    156206.000000
mean          0.503724
std           0.002088
min           0.499809
25%           0.503542
50%           0.503557
75%           0.503633
max           0.620021
dtype: float64
2025-04-22 21:58:26,338 - INFO - Finished predicting walkability with GNN.
2025-04-22 21:58:26,343 - INFO - Generating interactive Kepler.gl map...
2025-04-22 21:58:26,468 - INFO - Neighborhood nodes count: 456
2025-04-22 21:58:26,469 - INFO - Neighborhoods_gdf count: 456
2025-04-22 21:58:26,469 - INFO - Sample LIE_NAME in nodes_df: ['南福里', '天母里', '松光里', '社子里', '新和里']
2025-04-22 21:58:26,469 - INFO - S

Starting create_interactive_map...


2025-04-22 21:58:26,607 - INFO - Interactive map generated and saved as /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html
2025-04-22 21:58:26,697 - INFO - Final validation - Walkability scores in neighborhood nodes:
2025-04-22 21:58:26,698 - INFO - Walkability score distribution:
count    456.000000
mean       0.250785
std        0.065527
min        0.046299
25%        0.209132
50%        0.254688
75%        0.289512
max        0.487566
Name: walkability_score, dtype: float64
2025-04-22 21:58:26,698 - INFO - Walkability GNN distribution:
count    456.000000
mean       0.537118
std        0.018447
min        0.499809
25%        0.524181
50%        0.533877
75%        0.548079
max        0.620021
Name: walkability_gnn, dtype: float64
2025-04-22 21:58:26,699 - INFO - Walkability category distribution:
walkability_category
low       413
medium     43
Name: count, dtype: int64
2025-04-22 21:58:26,699 - INFO - Number of neighborhood nodes with non-zero walkab

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html!
Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html!
Pipeline completed successfully.
     src     dst
0  41293    7328
1  41293   54168
2  41293  108167
3  41293  120864
4  41293   49042
