Cell 0: RAPIDS (cuDF / cuGraph) version check

In [1]:
import cudf, cugraph
# Print the installed cuDF and cuGraph versions (in that order).
print(cudf.__version__, cugraph.__version__)

25.02.02 25.02.00


Cell 1: Imports, Global Constants

In [2]:
# Standard library imports
import os
import logging
import hashlib
import json
import time

# Third-party imports
import geopandas as gpd
import pandas as pd
import cudf
import cugraph
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, BatchNorm
from scipy.stats import pearsonr, spearmanr
from shapely import make_valid
from shapely.errors import GEOSException
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
from tqdm import tqdm
from keplergl import KeplerGl
from multiprocessing import Pool, cpu_count
from sklearn.metrics import mean_absolute_error, r2_score
from torch_geometric.nn import GATConv, BatchNorm
import torch.nn.functional as F
import contextily as ctx
# Plotting configuration
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the sans-serif family and list the Noto CJK fonts ahead of DejaVu Sans
# (presumably so Chinese labels such as neighborhood names render - confirm the fonts are installed).
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Noto Sans CJK TC', 'Noto Serif CJK TC', 'Noto Sans Mono CJK TC', 'DejaVu Sans']
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# Directory and file paths
# The data root can be overridden with the GNN_DATA_DIR environment variable so the
# notebook is runnable on machines other than the original author's. When the
# variable is unset the original location is used, so existing setups are unaffected.
BASE_DIR = os.environ.get("GNN_DATA_DIR", "/home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data")

# Input datasets (all resolved relative to BASE_DIR)
LANDUSE_NDVI_PATH = os.path.join(BASE_DIR, "neighborhoods_with_ndvi_numerical_corrected.geojson")
OSM_BUILDINGS_PATH = os.path.join(BASE_DIR, "Taipei_Buildings_fulldata.geojson")
OSM_ROADS_PATH = os.path.join(BASE_DIR, "taipei_segments_cleaned_verified.geoparquet")
OSM_TREES_PATH = os.path.join(BASE_DIR, "taipei_land.geoparquet")
OSM_TRANSIT_PATH = os.path.join(BASE_DIR, "taipei_infrastructure.geoparquet")
URBAN_MASTERPLAN_PATH = os.path.join(BASE_DIR, "Taipei_urban_masterplan.geojson")
ACCIDENTS_PATH = os.path.join(BASE_DIR, "2023_accidents.geojson")
POPULATION_PATH = os.path.join(BASE_DIR, "population_corrected.json")

# Output / cache locations (all resolved relative to BASE_DIR)
SUBGRAPH_DIR = os.path.join(BASE_DIR, "subgraphs")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
INTERSECTION_CACHE_PATH = os.path.join(BASE_DIR, "neighborhoods_with_intersections.geoparquet")
GRAPH_NODES_CACHE_PATH = os.path.join(BASE_DIR, "graph_nodes.parquet")
GRAPH_EDGES_CACHE_PATH = os.path.join(BASE_DIR, "graph_edges.parquet")
GRAPH_NODE_ID_CACHE_PATH = os.path.join(BASE_DIR, "graph_node_id_to_index.json")
GRAPH_DATA_HASH_PATH = os.path.join(BASE_DIR, "graph_data_hash.txt")

# Ensure directories exist (exist_ok makes re-running this cell safe)
os.makedirs(SUBGRAPH_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Constants for spatial analysis
BUFFER_DISTANCE = 10  # Meters, buffer distance for border sharing of accidents (tunable based on spatial resolution)
MIN_ROAD_LENGTH = 10  # Meters, minimum road length to avoid inflated accident density (tunable based on dataset)

# Land use category priorities for area assignment
# Keys are masterplan 'Category' values; a larger number means higher priority.
# print_percentage_calculation sorts intersected masterplan polygons by this value
# (descending) and credits area to categories in that order, so where categories
# overlap the higher-priority one receives the shared area.
CATEGORY_PRIORITY = {
    'City_Open_Area': 10,
    'Pedestrian': 9,
    'Public_Transportation': 8,
    'Amenity': 7,
    'Education': 6,
    'Medical': 5,
    'Commercial': 4,
    'Residential': 3,
    'Natural': 2,
    'Road': 1,
    'River': 1,
    'Infrastructure': 1,
    'Government': 1,
    'Special_Zone': 1,
    'Military': 1,
    'Industrial': 1,
    'Agriculture': 1
}

# Weights for land use diversity in walkability scoring
# Keys are the lowercase, underscore-separated forms of the category names above;
# compute_land_use_percentages lower-cases 'Category' values the same way before
# writing `land_use_<key>_percent` columns.
# NOTE(review): compute_walkability_components_all reads data['land_use_weights'];
# presumably this dict is what is stored under that key - confirm at the call site.
land_use_weights = {
    'city_open_area': 0.8,
    'commercial': 0.7,
    'infrastructure': 0.4,
    'government': 0.5,
    'public_transportation': 0.8,
    'education': 0.7,
    'medical': 0.6,
    'amenity': 0.8,
    'road': 0.3,
    'pedestrian': 1.0,
    'natural': 0.7,
    'special_zone': 0.4,
    'river': 0.7,
    'military': 0.2,
    'residential': 0.6,
    'industrial': 0.3,
    'agriculture': 0.4
}

"""
Normalise Information to Standard 
Read and Group to standard
Merge to standard
graph
kepler merge
"""

'\nNormalise Information to Standard \nRead and Group to standard\nMerge to standard\ngraph\nkepler merge\n'

Cell 2: Utility Functions

In [3]:
import pandas as pd
import numpy as np
import logging
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from scipy.stats import pearsonr
import geopandas as gpd
from shapely import make_valid

def _assign_equal_land_use_share(neighborhoods_gdf, land_use_categories):
    """Set every ``land_use_<cat>_percent`` column to an equal share of 100%."""
    if land_use_categories:
        equal_share = 100.0 / len(land_use_categories)
        for cat in land_use_categories:
            neighborhoods_gdf[f'land_use_{cat}_percent'] = equal_share
    return neighborhoods_gdf


def compute_land_use_percentages(neighborhoods_gdf, urban_masterplan, land_use_categories, category_priority):
    """Compute land use percentages for each neighborhood based on urban_masterplan intersections.

    For every neighborhood the intersecting masterplan polygons are clipped to the
    neighborhood, then area is credited to categories in descending priority so that
    overlapping polygons are only counted once. Percentages are relative to the
    union of the clipped masterplan area, not to the neighborhood's own area.

    Parameters
    ----------
    neighborhoods_gdf : GeoDataFrame
        Neighborhood polygons. A copy is modified; the input is left untouched.
        'LIE_NAME' is used for log messages when present.
    urban_masterplan : GeoDataFrame
        Masterplan polygons with a 'Category' column.
    land_use_categories : iterable of str
        Lowercase, underscore-separated category keys; one
        ``land_use_<key>_percent`` column is produced per key.
    category_priority : dict
        Maps masterplan 'Category' values to a numeric priority (higher wins).
        Unmapped categories get priority 0.

    Returns
    -------
    GeoDataFrame
        Copy of ``neighborhoods_gdf`` with the percentage columns added. If the
        masterplan is empty, lacks 'Category', or nothing could be computed, every
        category receives an equal share of 100%.
    """
    logging.info("Computing land use percentages for neighborhoods...")
    neighborhoods_gdf = neighborhoods_gdf.copy()
    land_use_categories = list(land_use_categories)
    # Defined up front: the final "nothing computed" check needs these column names.
    land_use_cols = [f'land_use_{cat}_percent' for cat in land_use_categories]

    # Validate urban_masterplan
    if urban_masterplan.empty:
        logging.error("urban_masterplan is empty. Using default percentages.")
        return _assign_equal_land_use_share(neighborhoods_gdf, land_use_categories)

    if 'Category' not in urban_masterplan.columns:
        logging.error("urban_masterplan missing 'Category' column. Using default percentages.")
        return _assign_equal_land_use_share(neighborhoods_gdf, land_use_categories)

    # Ensure CRS alignment
    if neighborhoods_gdf.crs != urban_masterplan.crs:
        logging.info(f"Aligning urban_masterplan CRS from {urban_masterplan.crs} to {neighborhoods_gdf.crs}")
        urban_masterplan = urban_masterplan.to_crs(neighborhoods_gdf.crs)

    # Fix geometries
    neighborhoods_gdf['geometry'] = neighborhoods_gdf['geometry'].apply(make_valid)
    urban_masterplan = urban_masterplan.copy()
    urban_masterplan['geometry'] = urban_masterplan['geometry'].apply(make_valid)

    # Initialize columns
    for col in land_use_cols:
        neighborhoods_gdf[col] = 0.0

    for idx, row in neighborhoods_gdf.iterrows():
        # Resolved outside the try block so the error handler can never raise on it.
        lie_name = row.get('LIE_NAME', idx)
        try:
            neighborhood_geom = row['geometry']
            if not neighborhood_geom.is_valid:
                logging.warning(f"Invalid geometry for neighborhood {lie_name}. Skipping.")
                continue

            # Find intersecting masterplan polygons
            relevant_masterplan = urban_masterplan[urban_masterplan.intersects(neighborhood_geom)]
            if relevant_masterplan.empty:
                logging.debug(f"No masterplan polygons intersect with neighborhood {lie_name}.")
                continue

            # Perform overlay to get intersection areas
            temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs=neighborhoods_gdf.crs)
            intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
            if intersected.empty:
                logging.debug(f"No valid intersections for neighborhood {lie_name} after overlay.")
                continue

            intersected['geometry'] = intersected['geometry'].apply(make_valid)
            intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty]
            if intersected.empty:
                logging.debug(f"No valid geometries after fixing for neighborhood {lie_name}.")
                continue

            # Assign priorities and sort so unique() below yields categories in priority order
            intersected['priority'] = intersected['Category'].map(category_priority).fillna(0)
            intersected = intersected.sort_values(by='priority', ascending=False)

            # The union is both the denominator and the pool of not-yet-credited area.
            remaining_geom = intersected.geometry.union_all()
            total_area = remaining_geom.area
            if total_area == 0:
                logging.debug(f"Zero total area for neighborhood {lie_name}.")
                continue

            # Compute areas for each category
            category_areas = {}
            for category in intersected['Category'].unique():
                if not isinstance(category, str):
                    # Missing category labels cannot be mapped to a column; they claim no area.
                    continue
                category_geom = intersected.loc[intersected['Category'] == category].geometry.union_all()
                try:
                    category_area_geom = category_geom.intersection(remaining_geom)
                    category_areas[category] = category_area_geom.area
                    remaining_geom = remaining_geom.difference(category_area_geom)
                except Exception as e:
                    logging.warning(f"Error computing area for category {category} in {lie_name}: {e}")
                    category_areas[category] = 0.0

            # Assign percentages
            for category, area in category_areas.items():
                cat_key = category.lower().replace(' ', '_')
                if cat_key in land_use_categories:
                    percentage = area / total_area * 100
                    neighborhoods_gdf.at[idx, f'land_use_{cat_key}_percent'] = percentage
                    logging.debug(f"Neighborhood {lie_name}: {cat_key} = {percentage:.2f}%")

        except Exception as e:
            # Best effort: one bad neighborhood must not abort the whole run.
            logging.error(f"Error processing neighborhood {lie_name}: {e}")
            continue

    # Check if any percentages were computed
    if neighborhoods_gdf[land_use_cols].sum().sum() == 0:
        logging.warning("No land use percentages computed. Assigning default equal distribution.")
        _assign_equal_land_use_share(neighborhoods_gdf, land_use_categories)

    return neighborhoods_gdf

In [4]:
def _land_use_fraction(gdf, column):
    """Return ``gdf[column]`` (a percentage) as a 0-1 fraction, or zeros when the column is absent."""
    if column in gdf.columns:
        return gdf[column].fillna(0) / 100.0
    logging.warning(f"Column {column} missing. Treating its share as 0.")
    return pd.Series(0.0, index=gdf.index)


def _safe_pearson(x, y, default=0.5):
    """Pearson r over rows where both inputs are present; ``default`` when r is undefined."""
    both_present = x.notna() & y.notna()
    x, y = x[both_present], y[both_present]
    # r is undefined for fewer than two points or a constant input.
    if len(x) < 2 or x.nunique() < 2 or y.nunique() < 2:
        return default
    corr, _ = pearsonr(x, y)
    return corr if np.isfinite(corr) else default


def _add_zscore(gdf, comp, label="Component"):
    """Add ``<comp>_norm`` (z-score of ``comp``) to ``gdf`` in place."""
    mean = gdf[comp].mean()
    std = gdf[comp].std()
    if pd.isna(std) or std == 0:
        # A constant (or single-row) column carries no information: dividing by 1
        # keeps the result deterministic and centred at 0 instead of producing NaN/inf.
        logging.warning(f"{label} {comp} has no variation. Normalized values are centred at 0.")
        std = 1.0
    gdf[f'{comp}_norm'] = (gdf[comp] - mean) / std
    logging.info(f"{comp}_norm stats:\n{gdf[f'{comp}_norm'].describe()}")


def compute_walkability_components_all(neighborhoods_gdf, data):
    """Compute walkability components, a combined 0-1 score and a low/medium/high category.

    Parameters
    ----------
    neighborhoods_gdf : GeoDataFrame
        Reads 'LIE_NAME', 'geometry', 'area_km2', 'total_population',
        'transit_count', 'accident_count', 'intersection_density' and either 'ndvi'
        or 'tree_count'. ``land_use_<cat>_percent`` columns are used when present.
        NOTE: the columns computed before the amenities merge ('land_use_score',
        'population_density', 'transit_density', 'green_space', 'accident_density',
        'sidewalk_coverage', and 'intersection_density', which is overwritten with
        its clipped log) are written onto the passed frame. Calling this twice on
        the same frame therefore log-transforms 'intersection_density' twice; pass
        a copy if the frame is reused.
    data : dict
        Reads 'land_use_weights' (category -> weight), 'roads' (columns 'class',
        'length_m') and 'urban_masterplan' (column 'Category').

    Returns
    -------
    tuple
        ``(frame, components)``: ``frame`` holds 'LIE_NAME', 'walkability_score',
        'walkability_category', 'cluster', 'green_space' and one ``<component>_norm``
        column per component; ``components`` maps each component name to
        'higher_better' or 'lower_better'. The ``_norm`` columns are plain z-scores
        (not sign-flipped); the direction is applied only inside the score.
    """
    # Step 1: Validate Land Use Percentages
    land_use_weights = data.get('land_use_weights', {})
    land_use_categories = list(land_use_weights.keys())
    land_use_cols = [f'land_use_{cat}_percent' for cat in land_use_categories]
    available_cols = [col for col in land_use_cols if col in neighborhoods_gdf.columns]

    logging.info(f"Available land use columns: {available_cols}")
    logging.info(f"Missing land use columns: {[col for col in land_use_cols if col not in available_cols]}")

    # Check variation in land use columns
    for col in available_cols:
        logging.info(f"{col} stats:\n{neighborhoods_gdf[col].describe()}")

    def compute_land_use_score(row):
        """Weighted sum of land-use shares, with fallbacks when nothing usable is present."""
        score = 0.0
        valid_cats = 0
        for cat, weight in land_use_weights.items():
            col = f'land_use_{cat}_percent'
            if col in row and not pd.isna(row[col]) and row[col] > 0:
                score += weight * (row[col] / 100.0)
                valid_cats += 1
        if valid_cats > 0 and score != 0:
            return score
        if valid_cats > 0:
            # Categories are present but every weight is zero: fall back to a diversity ratio.
            score = valid_cats / len(land_use_categories)
            logging.debug(f"Row {row.name}: Using non-zero categories: {valid_cats}, score: {score}")
        else:
            # Fallback: Use cluster-based score (0 when the frame has no 'cluster' column yet)
            cluster = row.get('cluster', 0)
            score = 0.5 + (cluster * 0.1)
            logging.debug(f"Row {row.name}: No valid land use data. Using cluster-based score: {score}")
        return score

    neighborhoods_gdf['land_use_score'] = neighborhoods_gdf.apply(compute_land_use_score, axis=1)
    logging.info(f"land_use_score stats:\n{neighborhoods_gdf['land_use_score'].describe()}")
    logging.info(f"Sample land_use_score values (first 5 rows):\n{neighborhoods_gdf['land_use_score'].head()}")

    # Step 2: Density components (zero areas are replaced by 1e-6 km2 to avoid division by zero)
    neighborhoods_gdf['population_density'] = np.log1p(neighborhoods_gdf['total_population'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6))
    logging.info(f"population_density (log) stats:\n{neighborhoods_gdf['population_density'].describe()}")

    neighborhoods_gdf['transit_density'] = np.log1p(neighborhoods_gdf['transit_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6))
    logging.info(f"transit_density (log) stats:\n{neighborhoods_gdf['transit_density'].describe()}")

    # Land-use shares used by the green-space term; absent columns count as 0
    # instead of raising KeyError.
    city_open_col = 'land_use_city_open_area_percent'
    river_col = 'land_use_river_percent'
    city_open_contribution = _land_use_fraction(neighborhoods_gdf, city_open_col)
    river_share = _land_use_fraction(neighborhoods_gdf, river_col)

    if 'ndvi' in neighborhoods_gdf.columns:
        if city_open_col in neighborhoods_gdf.columns and river_col in neighborhoods_gdf.columns:
            city_open_corr = _safe_pearson(neighborhoods_gdf['ndvi'], city_open_contribution)
            river_corr = _safe_pearson(neighborhoods_gdf['ndvi'], river_share)
            logging.info(f"Correlation between NDVI and City_Open_Area: {city_open_corr:.3f}")
            logging.info(f"Correlation between NDVI and River: {river_corr:.3f}")
        else:
            city_open_corr, river_corr = 0.5, 0.5
            logging.warning("Land use columns for City_Open_Area or River missing. Using default correlation weights.")

        # River area only counts as green space to the extent it is pedestrian-accessible.
        river_accessibility = np.minimum(_land_use_fraction(neighborhoods_gdf, 'land_use_pedestrian_percent'), 1.0)
        river_contribution = river_share * river_accessibility

        default_green_weights = (0.5, 0.25, 0.25)  # what the default correlations (0.5, 0.5) yield
        denom = city_open_corr + river_corr + 1
        if abs(denom) < 1e-9:
            logging.warning("Correlation-based green space weights are degenerate. Using default weights.")
            w1, w2, w3 = default_green_weights
        else:
            w1, w2, w3 = 0.5, city_open_corr / denom, river_corr / denom
            total_w = w1 + w2 + w3
            if abs(total_w) < 1e-9:
                logging.warning("Correlation-based green space weights are degenerate. Using default weights.")
                w1, w2, w3 = default_green_weights
            else:
                w1, w2, w3 = w1 / total_w, w2 / total_w, w3 / total_w

        combined_green = (w1 * neighborhoods_gdf['ndvi'] + w2 * city_open_contribution + w3 * river_contribution)
        # Logistic squashing centred at 0.5 maps the combined value into (0, 1).
        neighborhoods_gdf['green_space'] = 1 / (1 + np.exp(-5 * (combined_green - 0.5)))
        logging.info(f"green_space stats:\n{neighborhoods_gdf['green_space'].describe()}")
    else:
        logging.warning("'ndvi' missing. Using tree_density.")
        neighborhoods_gdf['tree_density'] = neighborhoods_gdf['tree_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6)
        combined_green = (0.5 * np.log1p(neighborhoods_gdf['tree_density']) +
                          0.25 * city_open_contribution +
                          0.25 * river_share)
        neighborhoods_gdf['green_space'] = 1 / (1 + np.exp(-5 * (combined_green - 0.5)))

    neighborhoods_gdf['accident_density'] = np.log1p(neighborhoods_gdf['accident_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6))
    # Clip at the 95th percentile before the log to damp outliers.
    neighborhoods_gdf['intersection_density'] = np.log1p(neighborhoods_gdf['intersection_density'].fillna(0).clip(upper=neighborhoods_gdf['intersection_density'].quantile(0.95)))

    # Sidewalk coverage: total length of pedestrian-type road segments per km2
    pedestrian_roads = data['roads'][data['roads']['class'].isin(['footway', 'pedestrian', 'cycleway'])]
    pedestrian_roads_in_neighborhoods = gpd.sjoin(pedestrian_roads, neighborhoods_gdf[['geometry', 'LIE_NAME', 'area_km2']], how='left', predicate='intersects')
    pedestrian_length_per_neighborhood = pedestrian_roads_in_neighborhoods.groupby('LIE_NAME')['length_m'].sum()
    neighborhoods_gdf['sidewalk_coverage'] = (neighborhoods_gdf['LIE_NAME'].map(pedestrian_length_per_neighborhood).fillna(0) /
                                             neighborhoods_gdf['area_km2'].replace(0, 1e-6))
    neighborhoods_gdf['sidewalk_coverage'] = np.log1p(neighborhoods_gdf['sidewalk_coverage'])

    # Amenities: masterplan polygons of selected categories intersecting the
    # neighborhood buffered by 500 CRS units
    # (NOTE(review): assumes a metric CRS such as EPSG:3826 - confirm).
    amenities_gdf = data['urban_masterplan'][data['urban_masterplan']['Category'].isin(['Commercial', 'Amenity', 'Education', 'Medical'])]
    neighborhoods_gdf_buffered = neighborhoods_gdf.copy()
    neighborhoods_gdf_buffered['geometry'] = neighborhoods_gdf_buffered['geometry'].buffer(500)
    amenities_in_neighborhoods = gpd.sjoin(amenities_gdf, neighborhoods_gdf_buffered[['geometry', 'LIE_NAME', 'area_km2']], how='left', predicate='intersects')
    amenities_count = amenities_in_neighborhoods.groupby('LIE_NAME').size().reset_index(name='amenities_count')
    # The merge returns a new frame: columns added from here on do not reach the caller's frame.
    neighborhoods_gdf = neighborhoods_gdf.merge(amenities_count, on='LIE_NAME', how='left')
    neighborhoods_gdf['amenities_count'] = neighborhoods_gdf['amenities_count'].fillna(0)
    neighborhoods_gdf['amenities_density'] = np.log1p(neighborhoods_gdf['amenities_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6))

    # Step 3: Define Components
    base_components = {
        'land_use_score': 'higher_better',
        'intersection_density': 'higher_better',
        'population_density': 'higher_better',
        'transit_density': 'higher_better',
        'green_space': 'higher_better',
        'accident_density': 'lower_better',
        'sidewalk_coverage': 'higher_better',
        'amenities_density': 'higher_better'
    }

    # Step 4: Cluster Neighborhoods
    clustering_features = ['population_density', 'transit_density', 'green_space']
    clustering_data = neighborhoods_gdf[clustering_features].fillna(0)
    kmeans = KMeans(n_clusters=3, random_state=42)
    neighborhoods_gdf['cluster'] = kmeans.fit_predict(clustering_data)
    logging.info(f"Neighborhood clusters:\n{neighborhoods_gdf['cluster'].value_counts()}")

    # Step 5: Dynamic Weighting with Increased Variation
    base_weights = {
        'land_use_score': 0.20, 'intersection_density': 0.05, 'population_density': 0.20,
        'transit_density': 0.20, 'green_space': 0.15, 'accident_density': 0.10,
        'sidewalk_coverage': 0.05, 'amenities_density': 0.05,
        'pop_transit_interaction': 0.05, 'safety_green_interaction': 0.05
    }

    def adjust_weights(cluster):
        """Return the base weights shifted for ``cluster`` and renormalised to sum to 1."""
        # NOTE(review): KMeans label numbers are arbitrary, so the Urban/Suburban/Rural
        # reading of labels 0/1/2 is an assumption - verify against the cluster centres.
        # NOTE(review): the -0.20 shifts drive some weights to 0 or below
        # (e.g. green_space in cluster 0); confirm this is intended.
        weights = base_weights.copy()
        if cluster == 0:  # Urban
            weights['transit_density'] += 0.20
            weights['amenities_density'] += 0.20
            weights['green_space'] -= 0.20
        elif cluster == 1:  # Suburban
            weights['sidewalk_coverage'] += 0.20
            weights['population_density'] -= 0.20
        else:  # Rural
            weights['green_space'] += 0.20
            weights['accident_density'] += 0.20
            weights['transit_density'] -= 0.20
        total = sum(weights.values())
        return {k: v/total for k, v in weights.items()}

    # Step 6: Normalize Components
    for comp in base_components:
        _add_zscore(neighborhoods_gdf, comp, "Component")

    # Step 7: Interaction Terms
    neighborhoods_gdf['pop_transit_interaction'] = neighborhoods_gdf['population_density_norm'] * neighborhoods_gdf['transit_density_norm']
    neighborhoods_gdf['safety_green_interaction'] = neighborhoods_gdf['accident_density_norm'] * neighborhoods_gdf['green_space_norm']
    interaction_components = {
        'pop_transit_interaction': 'higher_better',
        'safety_green_interaction': 'lower_better'
    }
    for comp in interaction_components:
        _add_zscore(neighborhoods_gdf, comp, "Interaction component")

    # Step 8: Combine Components
    components = {**base_components, **interaction_components}

    # Step 9: Compute Walkability Score with Debugging
    # 'lower_better' components must reduce the score, so they enter with a negative sign.
    signs = {comp: (-1.0 if direction == 'lower_better' else 1.0) for comp, direction in components.items()}
    weights_by_cluster = {}  # adjust_weights only depends on the cluster, so compute it once per cluster
    scores = []
    for position, (idx, row) in enumerate(neighborhoods_gdf.iterrows()):
        cluster = row['cluster']
        if cluster not in weights_by_cluster:
            weights_by_cluster[cluster] = adjust_weights(cluster)
        weights = weights_by_cluster[cluster]
        score = 0.0
        for comp in components:
            score += signs[comp] * weights.get(comp, 0.05) * row[f'{comp}_norm']
        scores.append(score)
        if position < 5:  # Log first 5 rows
            logging.info(f"Row {idx}: cluster={cluster}, weights={weights}, score={score}")

    neighborhoods_gdf['walkability_score'] = scores
    logging.info(f"Walkability scores before scaling:\n{neighborhoods_gdf['walkability_score'].describe()}")

    # Step 10: Scale to [0, 1]
    min_score = neighborhoods_gdf['walkability_score'].min()
    max_score = neighborhoods_gdf['walkability_score'].max()
    logging.info(f"Min score: {min_score}, Max score: {max_score}")
    if max_score != min_score:
        neighborhoods_gdf['walkability_score'] = (neighborhoods_gdf['walkability_score'] - min_score) / (max_score - min_score)
    else:
        logging.warning("All scores identical. Setting to 0.5.")
        neighborhoods_gdf['walkability_score'] = 0.5
    neighborhoods_gdf['walkability_score'] = neighborhoods_gdf['walkability_score'].clip(0, 1)
    logging.info(f"Final walkability_score stats:\n{neighborhoods_gdf['walkability_score'].describe()}")

    # Step 11: Categorize (terciles of the scaled score)
    low_threshold = neighborhoods_gdf['walkability_score'].quantile(0.33)
    high_threshold = neighborhoods_gdf['walkability_score'].quantile(0.66)
    neighborhoods_gdf['walkability_category'] = neighborhoods_gdf['walkability_score'].apply(
        lambda x: 'low' if x < low_threshold else 'medium' if x < high_threshold else 'high'
    )
    logging.info(f"Walkability category distribution:\n{neighborhoods_gdf['walkability_category'].value_counts()}")

    return_cols = ['LIE_NAME', 'walkability_score', 'walkability_category', 'cluster', 'green_space'] + [f'{comp}_norm' for comp in components]
    return neighborhoods_gdf[return_cols], components

In [5]:
def fix_geometry(geom, buffer_size=1e-5):
    """Return a valid version of ``geom``, falling back to ``Point(0, 0)``.

    Repair is attempted in escalating steps: ``make_valid``, then ``buffer(0)``,
    then ``buffer(buffer_size)`` followed by another ``make_valid``. ``None``,
    empty geometries, geometries that stay invalid, and any exception raised
    during repair all produce the ``Point(0, 0)`` placeholder.
    """
    if geom is None or geom.is_empty:
        logging.debug("Geometry is None or empty, returning a default Point(0,0).")
        return Point(0, 0)

    try:
        repaired = make_valid(geom)
        if repaired.is_valid:
            # First attempt succeeded; nothing more to do.
            return repaired

        logging.debug(f"Geometry invalid after make_valid, applying buffer(0): {repaired.bounds}")
        repaired = repaired.buffer(0)
        if not repaired.is_valid:
            logging.debug(f"Geometry still invalid, applying buffer with size {buffer_size}: {repaired.bounds}")
            repaired = make_valid(repaired.buffer(buffer_size))

        if repaired.is_valid:
            return repaired

        logging.warning(f"Geometry remains invalid after all attempts: {repaired.bounds}. Returning default Point(0,0).")
        return Point(0, 0)
    except Exception as e:
        logging.error(f"Error fixing geometry: {e}. Returning default Point(0,0).")
        return Point(0, 0)

def print_data_structure(data_dict):
    """Print shape, dtypes, null counts and a two-row sample for each tabular dataset.

    Entries of ``data_dict`` that are not a GeoDataFrame, pandas DataFrame or
    cuDF DataFrame are skipped. cuDF frames are converted to pandas first.
    """
    print("\n--- Data Structure Summary ---")
    for name, dataset in data_dict.items():
        if not isinstance(dataset, (gpd.GeoDataFrame, pd.DataFrame, cudf.DataFrame)):
            continue

        table = dataset.to_pandas() if isinstance(dataset, cudf.DataFrame) else dataset
        nulls_per_column = table.isnull().sum()

        print(f"\nDataset: {name}")
        print(f"Shape: {table.shape}")
        print(f"Columns and Data Types:\n{table.dtypes}")
        print(f"Missing values (total): {nulls_per_column.sum()}")
        print(f"Missing values per column:\n{nulls_per_column}")

        column_names = table.columns
        if 'LIE_NAME' in column_names:
            print(f"Unique LIE_NAME: {table['LIE_NAME'].nunique()}")
        # Road class counts are only reported for the dataset stored under 'roads'.
        if 'class' in column_names and name == 'roads':
            print(f"Road class counts:\n{table['class'].value_counts()}")
        print(f"Sample data (first 2 rows):\n{table.head(2)}")
    print("--- End of Data Structure Summary ---\n")

def print_percentage_calculation(neighborhoods_gdf, urban_masterplan_gdf, sample_size=3, category_priority=None):
    """Print the land use percentage calculation process for a sample of neighborhoods.

    A fixed-seed sample of neighborhoods is drawn; for each one the intersecting
    masterplan polygons are clipped to the neighborhood and area is credited to
    categories in descending priority, so overlapping polygons are counted once.
    Percentages are relative to the union of the clipped masterplan area.

    Parameters
    ----------
    neighborhoods_gdf : GeoDataFrame
        Needs 'LIE_NAME' and 'geometry'.
    urban_masterplan_gdf : GeoDataFrame
        Needs 'Category' and 'geometry'. Reprojected to the neighborhoods' CRS
        when the two differ.
    sample_size : int
        Maximum number of neighborhoods to print.
    category_priority : dict, optional
        Category -> priority (higher wins). Defaults to the module-level
        CATEGORY_PRIORITY.

    Returns
    -------
    None
        Output is printed only.
    """
    if category_priority is None:
        category_priority = CATEGORY_PRIORITY

    # Work in the neighborhoods' own CRS instead of assuming EPSG:3826; the old
    # hard-coded value is kept only as a fallback when no CRS is set.
    target_crs = neighborhoods_gdf.crs if neighborhoods_gdf.crs is not None else 'EPSG:3826'
    if urban_masterplan_gdf.crs is not None and urban_masterplan_gdf.crs != target_crs:
        urban_masterplan_gdf = urban_masterplan_gdf.to_crs(target_crs)

    print("\n--- Percentage Calculation Process ---")
    sample_neighborhoods = neighborhoods_gdf.sample(min(sample_size, len(neighborhoods_gdf)), random_state=42)

    for idx, row in sample_neighborhoods.iterrows():
        lie_name = row['LIE_NAME']
        print(f"\nNeighborhood: {lie_name} (Index: {idx})")

        neighborhood_geom = fix_geometry(row['geometry'])
        if not neighborhood_geom.is_valid:
            print(f"Neighborhood geometry is invalid after fixing: {lie_name}")
            continue

        relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
        if relevant_masterplan.empty:
            print("No master plan polygons intersect with this neighborhood.")
            continue

        temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs=target_crs)
        intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
        if intersected.empty:
            print("No valid intersections after overlay.")
            continue

        intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
        intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty]
        if intersected.empty:
            print("No valid geometries after fixing intersected polygons.")
            continue

        # Unmapped categories get priority 0, matching compute_land_use_percentages
        # and the value printed below.
        intersected['priority'] = intersected['Category'].map(category_priority).fillna(0)
        intersected = intersected.sort_values(by='priority', ascending=False)

        total_area_geom = intersected.geometry.union_all()
        total_area = total_area_geom.area
        print(f"Total unique master plan area: {total_area:.2f} m²")

        # Area still unclaimed; each category takes its share and removes it from the pool.
        remaining_geom = total_area_geom
        category_areas = {}
        for category in intersected['Category'].unique():
            category_rows = intersected[intersected['Category'] == category]
            category_geom = category_rows.geometry.union_all()
            try:
                category_area_geom = category_geom.intersection(remaining_geom)
                category_area = category_area_geom.area
                category_areas[category] = category_area
                print(f"Area of {category} (priority {category_priority.get(category, 0)}): {category_area:.2f} m²")
                remaining_geom = remaining_geom.difference(category_area_geom)
            except GEOSException as e:
                print(f"Topology error for category {category}: {e}")
                category_areas[category] = 0.0

        print("\nPercentages:")
        total_percentage = 0.0
        for category, area in category_areas.items():
            percentage = (area / total_area * 100) if total_area > 0 else 0.0
            total_percentage += percentage
            print(f"{category}: {percentage:.2f}%")
        print(f"Sum of percentages: {total_percentage:.2f}%")
    print("--- End of Percentage Calculation Process ---\n")


In [6]:
def compute_data_hash(data_dict):
    """Return a SHA-256 hex digest fingerprinting the tabular datasets in ``data_dict``.

    For each GeoDataFrame / pandas DataFrame / cuDF DataFrame (in dict order) the
    digest absorbs the frame's shape, its sorted column names and a JSON dump of
    its first five rows with any 'geometry' column removed. Other values are
    ignored. Rows beyond the first five therefore only influence the hash
    through the shape.
    """
    def _to_plain(value):
        # Convert numpy containers/scalars to builtin Python objects before serialising.
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, (np.floating, np.integer)):
            return float(value)
        return value

    digest = hashlib.sha256()
    for name, dataset in data_dict.items():
        if not isinstance(dataset, (gpd.GeoDataFrame, pd.DataFrame, cudf.DataFrame)):
            continue

        table = dataset.to_pandas() if isinstance(dataset, cudf.DataFrame) else dataset
        digest.update(str(table.shape).encode('utf-8'))
        digest.update(str(sorted(table.columns)).encode('utf-8'))

        logging.info(f"Dataset {name} column types:\n{table.dtypes}")

        head_rows = table.head(5).copy()
        if 'geometry' in head_rows.columns:
            head_rows = head_rows.drop(columns=['geometry'])
        for column in head_rows.columns:
            head_rows[column] = head_rows[column].apply(_to_plain)

        try:
            serialized = head_rows.to_json()
            digest.update(serialized.encode('utf-8'))
        except Exception as e:
            # Fall back to the dict repr when the sample cannot be JSON-encoded.
            logging.error(f"Failed to serialize sample for dataset {name}: {e}")
            serialized = str(head_rows.to_dict())
            digest.update(serialized.encode('utf-8'))
    return digest.hexdigest()

def check_spatial_overlap(gdf1, gdf2, label1="gdf1", label2="gdf2"):
    """Check for spatial overlap between two GeoDataFrames and log the results.

    The inputs are not modified. Rows with missing or empty geometries are
    ignored, ``gdf2`` is reprojected to ``gdf1``'s CRS when they differ, and
    remaining geometries are repaired with ``fix_geometry``.

    Parameters
    ----------
    gdf1, gdf2 : GeoDataFrame
        Layers to compare.
    label1, label2 : str
        Names used in log messages.

    Returns
    -------
    bool
        True when the total bounding boxes of the two layers overlap. False when
        they do not, or when either layer has no usable geometry (previously an
        empty layer produced NaN bounds, which compared as "overlapping").
        A sample-based intersection count is logged but does not affect the result.
    """
    logging.info(f"Checking spatial overlap between {label1} and {label2}...")

    cleaned = []
    for gdf, label in ((gdf1, label1), (gdf2, label2)):
        if not gdf.empty:
            # Drop null/empty geometries up front: fix_geometry would turn them into
            # Point(0, 0), which stretches total_bounds towards the CRS origin.
            gdf = gdf[gdf.geometry.notna() & ~gdf.geometry.is_empty].copy()
        if gdf.empty:
            logging.warning(f"{label} has no usable geometries. Reporting no spatial overlap.")
            return False
        cleaned.append(gdf)
    gdf1, gdf2 = cleaned

    if gdf1.crs != gdf2.crs:
        logging.warning(f"CRS mismatch between {label1} ({gdf1.crs}) and {label2} ({gdf2.crs}). Aligning to {gdf1.crs}...")
        gdf2 = gdf2.to_crs(gdf1.crs)

    gdf1['geometry'] = gdf1['geometry'].apply(fix_geometry)
    gdf2['geometry'] = gdf2['geometry'].apply(fix_geometry)

    # total_bounds is (minx, miny, maxx, maxy)
    gdf1_bounds = gdf1.total_bounds
    gdf2_bounds = gdf2.total_bounds
    logging.info(f"{label1} bounds: {gdf1_bounds}")
    logging.info(f"{label2} bounds: {gdf2_bounds}")

    # Boxes overlap unless one lies entirely left/right/below/above the other.
    bounds_overlap = not (gdf1_bounds[2] < gdf2_bounds[0] or
                         gdf1_bounds[0] > gdf2_bounds[2] or
                         gdf1_bounds[3] < gdf2_bounds[1] or
                         gdf1_bounds[1] > gdf2_bounds[3])
    logging.info(f"Bounding boxes overlap: {bounds_overlap}")

    # Both layers are non-empty here, so the sample always has at least one row.
    sample_size = min(10, len(gdf1), len(gdf2))
    sample_gdf1 = gdf1.sample(sample_size, random_state=42)
    intersects = gpd.sjoin(sample_gdf1, gdf2, how='inner', predicate='intersects')
    logging.info(f"Sample intersection check: {len(intersects)} intersections found out of {sample_size} samples.")

    return bounds_overlap

In [7]:
def validate_data(gdf, required_cols, name="GeoDataFrame"):
    """Raise if ``gdf`` is empty, lacks required columns, or has missing/invalid geometries.

    Checks run in that order and the first failure is logged at error level and
    raised: ``ValueError`` for an empty frame, ``KeyError`` for missing columns,
    ``ValueError`` for missing or invalid geometries. Returns ``None`` on success.
    """
    def _fail(exc_type, message):
        # Log and raise with the same text so the log and the traceback agree.
        logging.error(message)
        raise exc_type(message)

    if gdf.empty:
        _fail(ValueError, f"{name} is empty.")

    absent = [col for col in required_cols if col not in gdf.columns]
    if absent:
        _fail(KeyError, f"Missing columns in {name}: {absent}")

    geometries = gdf.geometry
    if geometries.isna().any():
        _fail(ValueError, f"Missing geometries in {name}")
    if not geometries.is_valid.all():
        _fail(ValueError, f"Invalid geometries in {name}")

Cell 3: Walkability Computation Functions

In [8]:
# Cell 3: Walkability Computation Functions (Updated to Return neighborhoods_gdf)
def compute_road_type_accident_correlation(roads_gdf, neighborhoods_gdf, accidents_gdf):
    """Relate road classes to accident density and average it per neighborhood.

    Pipeline:
        1. Validate the three inputs and bring them to EPSG:3826.
        2. Rank every road class by width and snap each accident to its
           nearest road (roads with rank >= 4 are buffered by 5 CRS units
           first). Accidents snapped to a footway/cycleway are moved to a
           rank >= 4 road when one lies within 10 CRS units.
        3. Compute a per-road accident density (accidents per
           ``length_m / 1000``), scale it by ``width_rank / 5``, add 0.1 and
           apply ``log1p``.
        4. Summarise per road class and save bar/box/scatter charts to
           ``BASE_DIR``.
        5. Average the per-road density over the roads intersecting each
           neighborhood and merge it in as ``avg_road_accident_density``.

    Args:
        roads_gdf: Roads with ``class``, ``geometry`` and ``length_m`` columns.
        neighborhoods_gdf: Neighborhood polygons with ``LIE_NAME`` and ``geometry``.
        accidents_gdf: Accident locations with a ``geometry`` column.

    Returns:
        Always a 3-tuple ``(summary, road_densities, neighborhoods_gdf)``:
        ``summary`` has one row per road class, ``road_densities`` holds the
        ``accident_density_raw`` / ``accident_density`` columns of the roads
        kept after the minimum-length filter, and ``neighborhoods_gdf`` is a
        new frame carrying ``avg_road_accident_density``. When no accident has
        a usable geometry the first two frames are empty and the density
        column is 0 for every neighborhood.

    Raises:
        ValueError, KeyError: Propagated from ``validate_data``.

    Side effects:
        The three input frames get their CRS set / reprojected in place, and
        four PNG files are written to ``BASE_DIR``.
    """
    logging.info("Computing correlation between road types and accident density...")

    # Validate input data
    validate_data(roads_gdf, ['class', 'geometry', 'length_m'], "roads_gdf")
    validate_data(neighborhoods_gdf, ['LIE_NAME', 'geometry'], "neighborhoods_gdf")
    validate_data(accidents_gdf, ['geometry'], "accidents_gdf")

    # Ensure correct CRS (in place: the caller's frames are reprojected as well)
    target_crs = 'EPSG:3826'
    source_crs = 'EPSG:4326'
    for gdf, name in [(roads_gdf, "roads"), (neighborhoods_gdf, "neighborhoods"), (accidents_gdf, "accidents")]:
        if gdf.crs is None:
            logging.warning(f"{name} has no CRS defined. Assuming {source_crs}.")
            gdf.set_crs(source_crs, inplace=True)
        if gdf.crs != target_crs:
            logging.info(f"Reprojecting {name} from {gdf.crs} to {target_crs}")
            gdf.to_crs(target_crs, inplace=True)

    # Log CRS, bounds, and sample geometries for debugging
    logging.info(f"Roads CRS: {roads_gdf.crs}, Bounds: {roads_gdf.total_bounds}")
    logging.info(f"Neighborhoods CRS: {neighborhoods_gdf.crs}, Bounds: {neighborhoods_gdf.total_bounds}")
    logging.info(f"Accidents CRS: {accidents_gdf.crs}, Bounds: {accidents_gdf.total_bounds}")
    logging.info(f"Roads geometry types: {roads_gdf.geometry.type.unique()}")
    logging.info(f"Neighborhoods geometry types: {neighborhoods_gdf.geometry.type.unique()}")
    sample_roads = roads_gdf.head(5)['geometry'].apply(lambda x: str(x)[:100])
    sample_neighborhoods = neighborhoods_gdf.head(5)['geometry'].apply(lambda x: str(x)[:100])
    logging.info(f"Sample road geometries:\n{sample_roads}")
    logging.info(f"Sample neighborhood geometries:\n{sample_neighborhoods}")

    # Visualize data for debugging
    overlay_path = os.path.join(BASE_DIR, 'roads_neighborhoods_overlap.png')
    fig, ax = plt.subplots(figsize=(10, 10))
    neighborhoods_gdf.plot(ax=ax, color='blue', alpha=0.5, label='Neighborhoods')
    roads_gdf.plot(ax=ax, color='red', alpha=0.5, label='Roads')
    plt.legend()
    plt.title('Roads and Neighborhoods Overlay')
    plt.savefig(overlay_path)
    plt.close()
    logging.info(f"Overlay plot saved to {overlay_path}")

    # Make local copies for roads and accidents
    roads_gdf_local = roads_gdf.copy()
    accidents_gdf_local = accidents_gdf.copy()

    # Add unique identifier to accidents
    accidents_gdf_local['accident_id'] = range(len(accidents_gdf_local))

    # Define width ranking
    width_ranking = {
        'motorway': 5, 'trunk': 5, 'primary': 4, 'secondary': 4, 'tertiary': 3,
        'residential': 3, 'living_street': 3, 'service': 2, 'track': 2,
        'path': 1, 'footway': 1, 'cycleway': 1, 'steps': 1, 'pedestrian': 1,
        'unclassified': 0, 'bridleway': 0, 'unknown': 0
    }
    roads_gdf_local['width_rank'] = roads_gdf_local['class'].map(width_ranking).fillna(0).astype(int)

    # Buffer wider roads for accident assignment
    roads_gdf_buffered = roads_gdf_local.copy()
    roads_gdf_buffered['geometry'] = roads_gdf_buffered.apply(
        lambda row: row['geometry'].buffer(5) if row['width_rank'] >= 4 else row['geometry'], axis=1
    )

    logging.info("Assigning accidents to nearest road...")
    accidents_gdf_local['geometry'] = accidents_gdf_local['geometry'].apply(fix_geometry)
    accidents_gdf_local = accidents_gdf_local[accidents_gdf_local['geometry'].is_valid & ~accidents_gdf_local['geometry'].is_empty]

    summary_columns = ['class', 'length_m', 'accident_count', 'accident_density', 'width_rank']

    if accidents_gdf_local.empty:
        logging.warning("No valid accidents after geometry fixing.")
        # Keep the same 3-tuple shape as the normal exit so callers can always unpack.
        return (
            pd.DataFrame(columns=summary_columns),
            pd.DataFrame(columns=['accident_density_raw', 'accident_density']),
            neighborhoods_gdf.assign(avg_road_accident_density=0.0),
        )

    # Assign accidents to nearest road
    nearest = gpd.sjoin_nearest(
        accidents_gdf_local,
        roads_gdf_buffered[['geometry', 'class', 'width_rank']],
        how='left',
        distance_col='distance'
    )
    # Ties produce several rows per accident; favour wider roads, then keep one row each.
    nearest['weighted_distance'] = nearest['distance'] / (nearest['width_rank'].replace(0, 1) ** 2)
    nearest = nearest.sort_values('weighted_distance').drop_duplicates(subset=['accident_id'], keep='first')

    matched_accidents = nearest[['accident_id', 'index_right']].copy()
    matched_accidents.columns = ['accident_id', 'road_idx']
    matched_accidents = matched_accidents.dropna(subset=['road_idx'])
    matched_accidents['road_idx'] = matched_accidents['road_idx'].astype(int)

    logging.info(f"Matched {len(matched_accidents)} accidents out of {len(accidents_gdf_local)}")

    # Reassign accidents from footway/cycleway to wider roads if possible
    footway_cycleway_accidents = matched_accidents[
        matched_accidents['road_idx'].isin(
            roads_gdf_local[roads_gdf_local['class'].isin(['footway', 'cycleway'])].index
        )
    ]
    if not footway_cycleway_accidents.empty:
        logging.info(f"Reassigning {len(footway_cycleway_accidents)} accidents from footway/cycleway...")
        accidents_to_reassign = accidents_gdf_local[accidents_gdf_local['accident_id'].isin(footway_cycleway_accidents['accident_id'])].copy()
        wider_roads = roads_gdf_buffered[roads_gdf_buffered['width_rank'] >= 4]
        if not wider_roads.empty:
            reassigned = gpd.sjoin_nearest(
                accidents_to_reassign,
                wider_roads[['geometry', 'class']],
                how='left',
                max_distance=10
            )
            reassigned_matches = reassigned[['accident_id', 'index_right']].copy()
            reassigned_matches.columns = ['accident_id', 'road_idx']
            reassigned_matches = reassigned_matches.dropna(subset=['road_idx'])
            # Equidistant wide roads (e.g. overlapping buffers at a junction) yield one row
            # per road; keep a single match so an accident is never counted twice.
            reassigned_matches = reassigned_matches.drop_duplicates(subset=['accident_id'], keep='first')
            reassigned_matches['road_idx'] = reassigned_matches['road_idx'].astype(int)
            matched_accidents = matched_accidents[~matched_accidents['accident_id'].isin(reassigned_matches['accident_id'])]
            matched_accidents = pd.concat([matched_accidents, reassigned_matches], ignore_index=True)
            logging.info(f"Reassigned {len(reassigned_matches)} accidents to wider roads")

    # Count accidents per road
    accident_counts = matched_accidents.groupby('road_idx').size().reindex(roads_gdf_local.index, fill_value=0)
    roads_gdf_local['accident_count'] = accident_counts

    logging.info(f"Accidents by road type:\n{roads_gdf_local.groupby('class')['accident_count'].sum()}")

    # Filter roads by minimum length (copy so the column assignments below hit a real frame, not a slice)
    roads_gdf_local = roads_gdf_local[roads_gdf_local['length_m'] >= MIN_ROAD_LENGTH].copy()

    # Log data integrity before calculating accident density
    logging.info(f"length_m stats:\n{roads_gdf_local['length_m'].describe()}")
    logging.info(f"accident_count stats:\n{roads_gdf_local['accident_count'].describe()}")

    # Calculate accident density
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_count'] / (roads_gdf_local['length_m'] / 1000)
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_density'].fillna(0).replace([np.inf, -np.inf], 0)

    # Store raw accident density before adjustments
    roads_gdf_local['accident_density_raw'] = roads_gdf_local['accident_density']

    # Adjust density by width rank
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_density'] * (roads_gdf_local['width_rank'].replace(0, 1) / 5)

    # Apply smoothing and logarithmic transformation to reduce sparsity and skewness
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_density'] + 0.1  # Add small constant to reduce zeros
    roads_gdf_local['accident_density'] = np.log1p(roads_gdf_local['accident_density'])

    # Log accident density integrity
    logging.info(f"NaN in accident_density: {roads_gdf_local['accident_density'].isna().sum()}")
    logging.info(f"accident_density stats:\n{roads_gdf_local['accident_density'].describe()}")

    # Compute road type summary
    road_types = [rt for rt in roads_gdf_local['class'].unique() if rt != 'bridleway']
    summary_data = []
    for rt in road_types:
        rt_data = roads_gdf_local[roads_gdf_local['class'] == rt]
        summary_data.append({
            'class': rt,
            'length_m': rt_data['length_m'].sum(),
            'accident_count': rt_data['accident_count'].sum(),
            'accident_density': rt_data['accident_density'].mean(),
            'width_rank': rt_data['width_rank'].iloc[0] if not rt_data.empty else 0
        })

    # Explicit columns keep the frame well-formed even when no road survived the filter.
    summary = pd.DataFrame(summary_data, columns=summary_columns)
    if not summary.empty:
        summary = summary[summary['length_m'] > 0]

    if summary.empty:
        logging.warning("No road types left to summarise; skipping accident density plots.")
    else:
        _plot_road_type_accident_summary(summary, roads_gdf_local)

        top_types = summary.nlargest(3, 'accident_density')[['class', 'accident_density']]
        logging.info(f"Top 3 road types by accident density:\n{top_types.round(2)}")
        print(f"Top 3 road types by accident density:\n{top_types.round(2)}")

    logging.info("Computing average road accident density per neighborhood...")
    logging.info(f"Roads DataFrame shape before join: {roads_gdf_local.shape}")
    logging.info(f"Neighborhoods DataFrame shape before join: {neighborhoods_gdf.shape}")

    roads_with_neighborhood = gpd.sjoin(
        roads_gdf_local[['geometry', 'class', 'length_m', 'width_rank', 'accident_density']],
        neighborhoods_gdf[['geometry', 'LIE_NAME']],
        how='left',
        predicate='intersects'
    )
    logging.info(f"Road-neighborhood join resulted in {len(roads_with_neighborhood)} matches with columns: {list(roads_with_neighborhood.columns)}")
    logging.info(f"Non-NaN LIE_NAME count: {roads_with_neighborhood['LIE_NAME'].notna().sum()}")
    logging.info(f"Non-NaN accident_density count: {roads_with_neighborhood['accident_density'].notna().sum()}")
    logging.info(f"Unique LIE_NAME values: {roads_with_neighborhood['LIE_NAME'].nunique()}")

    avg_accident_density = roads_with_neighborhood.groupby('LIE_NAME')['accident_density'].mean().reset_index()
    avg_accident_density.columns = ['LIE_NAME', 'avg_road_accident_density']
    logging.info(f"Number of neighborhoods with calculated avg_accident_density: {len(avg_accident_density)}")
    logging.info(f"NaN in avg_accident_density: {avg_accident_density['avg_road_accident_density'].isna().sum()}")

    # Drop a stale column from an earlier run; otherwise merge would emit _x/_y suffixes
    # and the lookup below would raise KeyError.
    neighborhoods_gdf = neighborhoods_gdf.drop(columns=['avg_road_accident_density'], errors='ignore')
    neighborhoods_gdf = neighborhoods_gdf.merge(avg_accident_density, on='LIE_NAME', how='left')
    neighborhoods_gdf['avg_road_accident_density'] = neighborhoods_gdf['avg_road_accident_density'].fillna(0)
    logging.info(f"Assigned avg_road_accident_density to {len(neighborhoods_gdf)} neighborhoods")
    logging.info(f"Avg road accident density stats:\n{neighborhoods_gdf['avg_road_accident_density'].describe()}")

    return summary, roads_gdf_local[['accident_density_raw', 'accident_density']], neighborhoods_gdf


def _plot_road_type_accident_summary(summary, roads_gdf_local):
    """Save the bar, box and scatter charts of accident density by road type to BASE_DIR."""
    # Bar chart: mean density per road class, widest classes first
    plt.figure(figsize=(12, 6))
    summary_sorted = summary.sort_values('width_rank', ascending=False)
    sns.barplot(data=summary_sorted, x='class', y='accident_density', hue='width_rank', dodge=False)
    plt.xlabel('Road Type')
    plt.ylabel('Mean Accident Density (log scale)')
    plt.title('Mean Accident Density by Road Type')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Width Rank')
    plt.tight_layout()
    bar_path = os.path.join(BASE_DIR, 'road_type_accident_bar.png')
    plt.savefig(bar_path)
    plt.close()
    logging.info(f"Bar chart saved to {bar_path}")
    print(f"Bar chart saved to {bar_path}")

    # Box plot: per-road density distribution for the classes present in the summary
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=roads_gdf_local[roads_gdf_local['class'].isin(summary['class'])],
                x='class', y='accident_density', hue='width_rank', dodge=False)
    plt.xlabel('Road Type')
    plt.ylabel('Accident Density (log scale)')
    plt.title('Distribution of Accident Density by Road Type')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Width Rank')
    plt.tight_layout()
    box_path = os.path.join(BASE_DIR, 'road_type_accident_box.png')
    plt.savefig(box_path)
    plt.close()
    logging.info(f"Box chart saved to {box_path}")
    print(f"Box chart saved to {box_path}")

    # Scatter plot: width rank against mean density, with a linear trend when it is defined
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=summary, x='width_rank', y='accident_density',
                    size='length_m', sizes=(50, 500), hue='class', style='class', alpha=0.7)
    if summary['width_rank'].nunique() >= 2:
        z = np.polyfit(summary['width_rank'], summary['accident_density'], 1)
        p = np.poly1d(z)
        plt.plot(summary['width_rank'], p(summary['width_rank']), "r--", alpha=0.5)
    else:
        # A degree-1 fit needs at least two distinct x values.
        logging.warning("Fewer than two distinct width ranks; skipping the trend line.")
    plt.xlabel('Road Width Rank (1=Path, 5=Motorway)')
    plt.ylabel('Mean Accident Density (log scale)')
    plt.title('Road Type vs. Accident Density')
    plt.grid(True)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    scatter_path = os.path.join(BASE_DIR, 'road_type_accident_scatter.png')
    plt.savefig(scatter_path)
    plt.close()
    logging.info(f"Scatter plot saved to {scatter_path}")
    print(f"Scatter plot saved to {scatter_path}")

In [9]:
def compute_pedestrian_road_density(roads_gdf, neighborhoods_gdf):
    """Add a ``pedestrian_road_density`` column to ``neighborhoods_gdf``.

    Roads whose ``class`` is footway, pedestrian or cycleway are spatially
    joined (``intersects``) to the neighborhoods, their ``length_m`` is summed
    per ``LIE_NAME`` and the total is divided by ``area_km2 * 1000``. With
    ``length_m`` in metres this is kilometres of pedestrian road per km²
    (not metres per km²).

    Notes:
        * A road intersecting several neighborhoods contributes its full
          ``length_m`` to each of them.
        * A zero ``area_km2`` is replaced by 1e-6 before dividing.
        * Neighborhoods without any matching road get a density of 0.

    Args:
        roads_gdf: Roads with ``class``, ``geometry`` and ``length_m`` columns.
        neighborhoods_gdf: Neighborhoods with ``geometry``, ``LIE_NAME`` and
            ``area_km2`` columns; modified in place.

    Returns:
        The same ``neighborhoods_gdf`` object with the new column.
    """
    logging.info("Computing pedestrian road density per neighborhood...")

    # Keep only the walkable / cyclable road classes and repair their geometries
    is_pedestrian = roads_gdf['class'].isin(['footway', 'pedestrian', 'cycleway'])
    walk_roads = roads_gdf[is_pedestrian].copy()
    walk_roads['geometry'] = walk_roads['geometry'].apply(fix_geometry)

    # Work on a trimmed copy of the neighborhoods so the repair does not touch the input geometries
    hoods = neighborhoods_gdf[['geometry', 'LIE_NAME', 'area_km2']].copy()
    hoods['geometry'] = hoods['geometry'].apply(fix_geometry)

    # Attach each pedestrian road to every neighborhood it intersects and total the lengths
    joined = gpd.sjoin(
        walk_roads[['geometry', 'length_m']],
        hoods,
        how='left',
        predicate='intersects'
    )
    length_by_name = joined.groupby('LIE_NAME')['length_m'].sum()

    # Density = summed length / (area_km2 * 1000), guarding against zero areas
    total_length = neighborhoods_gdf['LIE_NAME'].map(length_by_name).fillna(0)
    safe_area_km2 = neighborhoods_gdf['area_km2'].replace(0, 1e-6)
    neighborhoods_gdf['pedestrian_road_density'] = total_length / (safe_area_km2 * 1000)

    logging.info(f"pedestrian_road_density stats:\n{neighborhoods_gdf['pedestrian_road_density'].describe()}")
    return neighborhoods_gdf

Cell 4: Main Data Loading and Processing

In [10]:
def load_and_prepare_data():
    """Load every input dataset and derive the per-neighborhood feature table.

    Steps:
        1. Read the files behind the module-level ``*_PATH`` constants
           (GeoJSON / GeoParquet via geopandas, the population file as JSON).
        2. Reproject every GeoDataFrame to EPSG:3826, repair geometries with
           ``fix_geometry`` and drop rows that are still invalid.
        3. On the neighborhoods frame compute ``intersection_count`` /
           ``intersection_density`` (endpoints shared by >= 3 roads),
           ``area_km2`` (if absent), ``tree_count``, ``transit_count``,
           ``accident_count`` (accidents buffered by ``BUFFER_DISTANCE``) and
           ``road_density``.
        4. Left-merge ``total_population`` / ``elderly_percentage`` on
           ``LIE_NAME``.
        5. Compute ``land_use_<category>_percent`` for each key of
           ``CATEGORY_PRIORITY`` from the urban masterplan overlay.

    Returns:
        dict: One entry per dataset key (``neighborhoods``, ``buildings``,
        ``roads``, ``trees``, ``transit``, ``urban_masterplan``, ``accidents``,
        ``population``); ``data['neighborhoods']`` carries the derived columns.

    Raises:
        Exception: Any loading error is logged and re-raised.

    Side effects:
        Writes the neighborhoods frame to ``INTERSECTION_CACHE_PATH``
        (failures are logged, not raised) and prints summaries through
        ``print_percentage_calculation`` and ``print_data_structure``.
    """
    logging.info("Stage 1: Loading and preparing data...")

    target_crs = 'EPSG:3826'
    geo_keys = ['neighborhoods', 'buildings', 'roads', 'trees', 'transit', 'urban_masterplan', 'accidents']

    # Define file paths and their corresponding keys
    data_files = {
        'neighborhoods': LANDUSE_NDVI_PATH,
        'buildings': OSM_BUILDINGS_PATH,
        'roads': OSM_ROADS_PATH,
        'trees': OSM_TREES_PATH,
        'transit': OSM_TRANSIT_PATH,
        'urban_masterplan': URBAN_MASTERPLAN_PATH,
        'accidents': ACCIDENTS_PATH,
        'population': POPULATION_PATH
    }

    data = {}

    # Load data with progress bar
    for key, path in tqdm(data_files.items(), desc="Loading files"):
        try:
            if key == 'population':
                # Explicit UTF-8 so non-ASCII names load the same on every platform.
                with open(path, 'r', encoding='utf-8') as f:
                    data[key] = pd.DataFrame(json.load(f))
                logging.info(f"Columns in population_df after loading: {list(data[key].columns)}")
            elif path.endswith('.geoparquet'):
                data[key] = gpd.read_parquet(path)
            else:
                data[key] = gpd.read_file(path)
            logging.info(f"Loaded {key} with shape {data[key].shape}")
        except Exception as e:
            logging.error(f"Failed to load {key} from {path}: {e}")
            raise

    # Log columns of the neighborhoods frame to debug missing 'ndvi' and 'area_km2'
    loaded_neighborhoods = data['neighborhoods']
    logging.info(f"Columns in neighborhoods_gdf after loading: {list(loaded_neighborhoods.columns)}")

    # Check for alternative NDVI column names and copy the first one found into 'ndvi'
    if 'ndvi' not in loaded_neighborhoods.columns:
        for col in ['ndvi_mean', 'NDVI', 'ndvi_value']:
            if col in loaded_neighborhoods.columns:
                logging.info(f"Found alternative NDVI column '{col}'. Renaming to 'ndvi'.")
                loaded_neighborhoods['ndvi'] = loaded_neighborhoods[col]
                break

    # Ensure all GeoDataFrames are in the same CRS
    for key in geo_keys:
        if key in data and isinstance(data[key], gpd.GeoDataFrame):
            if data[key].crs is None:
                logging.warning(f"No CRS defined for {key}. Assuming EPSG:4326.")
                data[key].set_crs('EPSG:4326', inplace=True)
            if data[key].crs != target_crs:
                data[key] = data[key].to_crs(target_crs)
                logging.info(f"Converted {key} to CRS {target_crs}")

    # Fix geometries in all GeoDataFrames
    for key in geo_keys:
        if key in data and isinstance(data[key], gpd.GeoDataFrame):
            data[key]['geometry'] = data[key]['geometry'].apply(fix_geometry)
            invalid_geoms = data[key][~data[key].geometry.is_valid]
            if not invalid_geoms.empty:
                logging.warning(f"Found {len(invalid_geoms)} invalid geometries in {key} after fixing.")
                # Copy so later column assignments do not target a slice of the old frame.
                data[key] = data[key][data[key].geometry.is_valid].copy()

    # Bind the working frame only now: to_crs() and the validity filter above replace
    # data['neighborhoods'] with new objects, so an earlier reference would be stale
    # (still in the source CRS and unrepaired).
    neighborhoods_gdf = data['neighborhoods']

    # Compute intersections for neighborhoods
    logging.info("Computing intersections for neighborhoods...")
    roads_gdf = data['roads']

    # Log columns of roads_gdf to debug missing 'length_m'
    logging.info(f"Columns in roads_gdf after loading: {list(roads_gdf.columns)}")

    logging.info("Extracting endpoints from road segments...")
    intersections, endpoint_count = _extract_road_intersections(roads_gdf)

    if intersections:
        intersections_gdf = gpd.GeoDataFrame({'geometry': intersections}, crs=target_crs)
        logging.info("Counting intersections per neighborhood...")
        neighborhoods_gdf['intersection_count'] = _count_features_per_neighborhood(intersections_gdf, neighborhoods_gdf)
    else:
        if endpoint_count == 0:
            logging.warning("No valid endpoints extracted from road segments. Using fallback method for intersections.")
        else:
            logging.warning("No intersections found using endpoint clustering. Using fallback method.")
        # Fallback: use the number of roads touching each neighborhood as a proxy.
        neighborhoods_gdf['intersection_count'] = _count_features_per_neighborhood(roads_gdf, neighborhoods_gdf)

    # Compute or verify area_km2
    if 'area_km2' not in neighborhoods_gdf.columns:
        logging.warning("'area_km2' column missing in neighborhoods_gdf. Computing from geometry...")
        neighborhoods_gdf['area_m2'] = neighborhoods_gdf['geometry'].area
        neighborhoods_gdf['area_km2'] = neighborhoods_gdf['area_m2'] / 1_000_000  # Convert m² to km²
        logging.info(f"Computed area_km2 stats:\n{neighborhoods_gdf['area_km2'].describe()}")
    else:
        logging.info(f"area_km2 already present. Stats:\n{neighborhoods_gdf['area_km2'].describe()}")

    # Zero areas are swapped for a tiny value so the density divisions never produce inf.
    safe_area_km2 = neighborhoods_gdf['area_km2'].replace(0, 1e-6)

    # Compute intersection density
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_count'] / safe_area_km2
    logging.info(f"Intersection count stats:\n{neighborhoods_gdf['intersection_count'].describe()}")
    logging.info(f"Intersection density stats:\n{neighborhoods_gdf['intersection_density'].describe()}")

    # Cache the result (best effort: a failed write must not abort the pipeline)
    try:
        neighborhoods_gdf.to_parquet(INTERSECTION_CACHE_PATH)
        logging.info(f"Saved neighborhoods with intersections to {INTERSECTION_CACHE_PATH}")
    except Exception as e:
        logging.error(f"Failed to save neighborhoods with intersections: {e}")

    data['neighborhoods'] = neighborhoods_gdf

    # Compute tree count per neighborhood
    logging.info("Computing tree count per neighborhood...")
    neighborhoods_gdf['tree_count'] = _count_features_per_neighborhood(data['trees'], neighborhoods_gdf)

    # Compute transit count per neighborhood
    logging.info("Computing transit count per neighborhood...")
    neighborhoods_gdf['transit_count'] = _count_features_per_neighborhood(data['transit'], neighborhoods_gdf)

    # Compute accident count per neighborhood (accidents are buffered before the join)
    logging.info("Computing accident count per neighborhood...")
    accidents_buffered = data['accidents'].copy()
    accidents_buffered['geometry'] = accidents_buffered['geometry'].buffer(BUFFER_DISTANCE)
    neighborhoods_gdf['accident_count'] = _count_features_per_neighborhood(accidents_buffered, neighborhoods_gdf)

    # Compute road density per neighborhood
    logging.info("Computing road density per neighborhood...")

    # Log columns of roads_gdf to debug missing 'length_m'
    logging.info(f"Columns in roads_gdf before computing road density: {list(roads_gdf.columns)}")

    # Compute length_m if missing
    if 'length_m' not in roads_gdf.columns:
        logging.warning("'length_m' column missing in roads_gdf. Computing from geometry...")
        roads_gdf['length_m'] = roads_gdf['geometry'].length  # Length in meters (since CRS is EPSG:3826)
        logging.info(f"Computed length_m stats:\n{roads_gdf['length_m'].describe()}")

    roads_joined = gpd.sjoin(roads_gdf[['geometry', 'length_m']], neighborhoods_gdf[['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    road_lengths = roads_joined.groupby('index_right')['length_m'].sum()
    # Same zero-area guard as intersection_density, so an empty polygon cannot yield inf.
    neighborhoods_gdf['road_density'] = road_lengths.reindex(neighborhoods_gdf.index, fill_value=0) / (safe_area_km2 * 1000)
    logging.info(f"Road density stats:\n{neighborhoods_gdf['road_density'].describe()}")

    # Merge population data
    logging.info("Merging population data...")
    population_df = data['population']
    population_df['LIE_NAME'] = population_df['LIE_NAME'].astype(str).str.strip()
    neighborhoods_gdf['LIE_NAME'] = neighborhoods_gdf['LIE_NAME'].astype(str).str.strip()

    # Look for substitutes only for the columns that are actually missing, and never
    # reuse a column that already carries an expected name (that would create duplicate
    # column names or rename one column twice).
    expected_cols = ['total_population', 'elderly_percentage']
    keywords = {'total_population': 'population', 'elderly_percentage': 'elderly'}
    missing_cols = [col for col in expected_cols if col not in population_df.columns]

    if missing_cols:
        logging.warning(f"Expected columns {missing_cols} not found in population_df. Attempting to find alternatives...")
        reserved = {'LIE_NAME', *expected_cols}
        for target in missing_cols:
            alternative = next(
                (col for col in population_df.columns
                 if col not in reserved and keywords[target] in str(col).lower()),
                None,
            )
            if alternative is not None:
                logging.info(f"Found alternative for {target}: {alternative}")
                population_df = population_df.rename(columns={alternative: target})
            else:
                logging.warning(f"No alternative found for {target}. Setting to 0.")
                population_df[target] = 0

    # Perform the merge
    neighborhoods_gdf = neighborhoods_gdf.merge(
        population_df[['LIE_NAME', 'total_population', 'elderly_percentage']],
        on='LIE_NAME',
        how='left'
    )
    data['neighborhoods'] = neighborhoods_gdf

    # Compute land use percentages
    logging.info("Computing land use percentages for neighborhoods...")
    urban_masterplan_gdf = data['urban_masterplan']
    print_percentage_calculation(neighborhoods_gdf, urban_masterplan_gdf, sample_size=3)

    land_use_cols = {category: f'land_use_{category.lower()}_percent' for category in CATEGORY_PRIORITY.keys()}
    # Make sure every land-use column exists even if no neighborhood overlaps the masterplan.
    for col in land_use_cols.values():
        if col not in neighborhoods_gdf.columns:
            neighborhoods_gdf[col] = 0.0

    for idx, neighborhood_geom, lie_name in zip(neighborhoods_gdf.index, neighborhoods_gdf['geometry'], neighborhoods_gdf['LIE_NAME']):
        percentages = _land_use_percentages(neighborhood_geom, urban_masterplan_gdf, target_crs, lie_name)
        if percentages is None:
            continue
        for category, col in land_use_cols.items():
            neighborhoods_gdf.at[idx, col] = percentages[category]

    # Fill NaN values in land use percentages
    for col in land_use_cols.values():
        neighborhoods_gdf[col] = neighborhoods_gdf[col].fillna(0.0)

    # Fill NaN values in other columns
    for col in ['intersection_count', 'intersection_density', 'tree_count', 'transit_count', 'accident_count', 'road_density', 'total_population', 'elderly_percentage']:
        neighborhoods_gdf[col] = neighborhoods_gdf[col].fillna(0)

    # Print data structure summary
    print_data_structure(data)

    logging.info("Finished loading and preparing data.")
    return data


def _count_features_per_neighborhood(features_gdf, neighborhoods_gdf):
    """Count the features intersecting each neighborhood, aligned to ``neighborhoods_gdf.index``."""
    joined = gpd.sjoin(
        features_gdf[['geometry']],
        neighborhoods_gdf[['geometry', 'LIE_NAME']],
        how='left',
        predicate='intersects'
    )
    return joined.groupby('index_right').size().reindex(neighborhoods_gdf.index, fill_value=0)


def _extract_road_intersections(roads_gdf, min_roads=3, precision=6):
    """Return ``(intersection_points, endpoint_count)`` for endpoints shared by >= ``min_roads`` roads."""
    endpoint_to_roads = {}
    endpoint_count = 0
    road_items = zip(roads_gdf.index, roads_gdf['geometry'])
    for road_idx, geom in tqdm(road_items, total=len(roads_gdf), desc="Extracting endpoints"):
        # Missing or empty geometries have no endpoints to contribute.
        if geom is None or geom.is_empty:
            continue
        if geom.geom_type == 'LineString':
            lines = [geom]
        elif geom.geom_type == 'MultiLineString':
            lines = list(geom.geoms)
        else:
            continue
        for line in lines:
            coords = list(line.coords)
            if not coords:
                continue
            start_point = Point(coords[0])
            end_point = Point(coords[-1])
            if not (start_point.is_valid and end_point.is_valid):
                continue
            for point in (start_point, end_point):
                # Round to avoid floating-point precision issues when matching endpoints
                point_key = (round(point.x, precision), round(point.y, precision))
                endpoint_to_roads.setdefault(point_key, set()).add(road_idx)
                endpoint_count += 1

    logging.info("Identifying intersections...")
    intersections = [
        Point(point_key)
        for point_key, road_ids in endpoint_to_roads.items()
        if len(road_ids) >= min_roads
    ]
    return intersections, endpoint_count


def _land_use_percentages(neighborhood_geom, urban_masterplan_gdf, crs, neighborhood_name):
    """Return {category: percent of covered area} for one neighborhood, or None if nothing overlaps."""
    if neighborhood_geom is None or not neighborhood_geom.is_valid:
        return None
    relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
    if relevant_masterplan.empty:
        return None

    clip_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs=crs)
    intersected = gpd.overlay(clip_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
    if intersected.empty:
        return None

    intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
    intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty].copy()
    if intersected.empty:
        return None

    # Highest priority first, so overlapping zones are credited to the top-priority category.
    intersected['priority'] = intersected['Category'].map(CATEGORY_PRIORITY)
    intersected = intersected.sort_values(by='priority', ascending=False)

    remaining_geom = intersected.geometry.union_all()
    total_area = remaining_geom.area
    category_areas = {}
    for category in intersected['Category'].unique():
        category_geom = intersected[intersected['Category'] == category].geometry.union_all()
        try:
            # Only the part not already claimed by a higher-priority category counts.
            category_area_geom = category_geom.intersection(remaining_geom)
            category_areas[category] = category_area_geom.area
            remaining_geom = remaining_geom.difference(category_area_geom)
        except Exception as e:
            logging.warning(f"Topology error for category {category} in neighborhood {neighborhood_name}: {e}")
            category_areas[category] = 0.0

    return {
        category: (category_areas.get(category, 0.0) / total_area * 100) if total_area > 0 else 0.0
        for category in CATEGORY_PRIORITY.keys()
    }

Cell 5: compute_intersection_counts

In [11]:
def compute_intersection_counts(neighborhoods_gdf, roads_gdf):
    """Count road intersections inside each neighborhood and derive their density.

    An intersection is a coordinate that is the first or last vertex of at
    least three distinct road segments.  Two columns are written onto
    ``neighborhoods_gdf`` in place:

    * ``intersection_count``   - number of intersections the polygon contains
    * ``intersection_density`` - ``intersection_count / area_km2`` (0 where the
      division is undefined)

    Args:
        neighborhoods_gdf: GeoDataFrame with ``geometry``, ``LIE_NAME`` and
            ``area_km2`` columns.
        roads_gdf: GeoDataFrame whose ``geometry`` column holds the road lines.
            Coordinates are tagged as EPSG:3826 below, so both frames are
            assumed to already be in that CRS.

    Returns:
        The same ``neighborhoods_gdf`` object, with the two columns added.
    """
    logging.info("Computing intersection counts for neighborhoods...")

    def get_endpoints(line):
        """Return the (x, y) start/end vertex of every linear part of ``line``."""
        if line is None or line.is_empty:
            return []
        # Multi-part geometries have no ``coords``; walk their parts instead.
        parts = getattr(line, 'geoms', [line])
        endpoints = []
        for part in parts:
            coords = list(part.coords)
            if coords:
                endpoints.append(tuple(coords[0][:2]))
                endpoints.append(tuple(coords[-1][:2]))
        return endpoints

    # Map every endpoint coordinate to the set of road segments that end there.
    endpoint_to_roads = {}
    for road_idx, line in tqdm(zip(roads_gdf.index, roads_gdf['geometry']),
                               total=len(roads_gdf), desc="Extracting endpoints"):
        for xy in get_endpoints(line):
            endpoint_to_roads.setdefault(xy, set()).add(road_idx)

    # Identify intersections (endpoints shared by 3 or more roads)
    intersection_points = [
        Point(xy)
        for xy, road_indices in tqdm(endpoint_to_roads.items(), desc="Identifying intersections")
        if len(road_indices) >= 3
    ]

    if not intersection_points:
        logging.warning("No intersections found. Setting intersection counts to 0.")
        neighborhoods_gdf['intersection_count'] = 0
        neighborhoods_gdf['intersection_density'] = 0.0
        return neighborhoods_gdf

    intersections_gdf = gpd.GeoDataFrame(geometry=intersection_points, crs='EPSG:3826')

    # Inner join: one output row per (neighborhood, contained intersection) pair,
    # indexed by the neighborhood's own index label.  A left join would keep an
    # unmatched row for every empty neighborhood and make it count as 1.
    joined = gpd.sjoin(
        neighborhoods_gdf[['geometry', 'LIE_NAME']],
        intersections_gdf,
        how='inner',
        predicate='contains'
    )
    # Group on the row index (not LIE_NAME) so the result aligns with
    # neighborhoods_gdf on assignment and duplicate names are not merged.
    counts = joined.groupby(level=0).size()
    neighborhoods_gdf['intersection_count'] = (
        counts.reindex(neighborhoods_gdf.index, fill_value=0).astype(int)
    )

    # Compute intersection density (intersections per km²)
    density = neighborhoods_gdf['intersection_count'] / neighborhoods_gdf['area_km2']
    neighborhoods_gdf['intersection_density'] = density.replace([np.inf, -np.inf], 0).fillna(0)

    logging.info(f"Intersection count stats: {neighborhoods_gdf['intersection_count'].describe()}")
    logging.info(f"Intersection density stats: {neighborhoods_gdf['intersection_density'].describe()}")

    return neighborhoods_gdf

def build_graph(data, force_recompute=False):
    """Build (or load from cache) a cugraph graph of neighborhoods and the features inside them.

    Nodes are created for every neighborhood, building, road, tree and transit
    row.  Edges are:

    * neighborhood <-> neighborhood when the two polygons intersect (weight 1.0)
    * feature <-> neighborhood, in both directions, when the feature's node
      geometry (centroid for buildings/roads, the raw geometry otherwise) lies
      within the neighborhood polygon (weight 2.0 for transit, else 1.0)

    The node table is attached to the returned graph as ``G._nodes`` and the
    node/edge tables, the node-id mapping and the data hash are written to the
    ``GRAPH_*`` cache paths.

    NOTE(review): a second ``build_graph`` is defined further down in this
    notebook; when cells are run in order that later definition replaces this one.

    Args:
        data: dict with 'neighborhoods', 'buildings', 'roads', 'trees' and
            'transit' GeoDataFrames.
        force_recompute: rebuild even when the cached hash matches.

    Returns:
        cugraph.Graph with the extra ``_nodes`` attribute (cudf.DataFrame).
    """
    logging.info("Stage 2: Building the graph...")

    def make_graph(nodes_table, edges_table):
        """Wrap the node/edge tables in a cugraph.Graph, tolerating an empty edge list."""
        graph = cugraph.Graph()
        if len(edges_table) > 0:
            graph.from_cudf_edgelist(
                edges_table,
                source='src',
                destination='dst',
                edge_attr='weight'
            )
        else:
            logging.warning("Edge list is empty. Graph will have nodes but no edges.")
        graph._nodes = nodes_table
        return graph

    # Compute data hash to check if graph needs recomputing
    data_hash = compute_data_hash(data)
    cached_hash = None
    if os.path.exists(GRAPH_DATA_HASH_PATH):
        with open(GRAPH_DATA_HASH_PATH, 'r') as f:
            cached_hash = f.read().strip()

    cache_paths = [GRAPH_NODES_CACHE_PATH, GRAPH_EDGES_CACHE_PATH, GRAPH_NODE_ID_CACHE_PATH]
    cache_is_fresh = cached_hash == data_hash and all(os.path.exists(path) for path in cache_paths)
    if not force_recompute and cache_is_fresh:
        logging.info("Data unchanged. Loading graph from cache...")
        G = make_graph(
            cudf.read_parquet(GRAPH_NODES_CACHE_PATH),
            cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
        )
        logging.info("Graph loaded from cache.")
        return G

    neighborhoods_gdf = data['neighborhoods'].copy()
    buildings_gdf = data['buildings'].copy()
    roads_gdf = data['roads'].copy()
    trees_gdf = data['trees'].copy()
    transit_gdf = data['transit'].copy()

    # Create nodes for neighborhoods, buildings, roads, trees, and transit
    nodes = []
    node_id_to_index = {}

    def add_node(node_id, node_type, geometry, attrs=None, lie_name=None):
        """Append one node record and return its integer node index."""
        node_idx = len(nodes)
        node_id_to_index[node_id] = node_idx
        record = {
            'node_idx': node_idx,
            'node_id': node_id,
            'type': node_type,
            'LIE_NAME': lie_name,
            'geometry': geometry
        }
        record.update(attrs or {})
        nodes.append(record)
        return node_idx

    # Neighborhood nodes.  Their node indices are also kept by row position so
    # edges never depend on LIE_NAME being unique.
    neighborhood_node_idx = []
    for idx, row in neighborhoods_gdf.iterrows():
        neighborhood_node_idx.append(add_node(
            f"neighborhood_{row['LIE_NAME']}",
            'neighborhood',
            row['geometry'].centroid,
            attrs={
                'ndvi_mean': row['ndvi_mean'],
                'total_population': row['total_population'],
                'elderly_percentage': row['elderly_percentage'],
                'area_km2': row['area_km2']
            },
            lie_name=row['LIE_NAME']
        ))

    # Building nodes
    for idx, row in buildings_gdf.iterrows():
        add_node(
            f"building_{idx}", 'building', row['geometry'].centroid,
            attrs={'building_type': row['building'], 'area_m2': row['area_m2']}
        )

    # Road nodes
    for idx, row in roads_gdf.iterrows():
        add_node(
            f"road_{idx}", 'road', row['geometry'].centroid,
            attrs={'class': row['class'], 'length_m': row['length_m']}
        )

    # Tree nodes
    for idx, row in trees_gdf.iterrows():
        add_node(f"tree_{idx}", 'tree', row['geometry'])

    # Transit nodes
    for idx, row in transit_gdf.iterrows():
        add_node(f"transit_{idx}", 'transit', row['geometry'], attrs={'class': row['class']})

    nodes_df = pd.DataFrame(nodes)
    nodes_gdf = gpd.GeoDataFrame(nodes_df, geometry='geometry', crs='EPSG:3826')
    nodes_df = cudf.from_pandas(nodes_df.drop(columns=['geometry']))

    # Create edges based on spatial relationships
    edges = []
    neighborhood_sindex = neighborhoods_gdf.sindex

    # Neighborhood-to-neighborhood edges (shared borders).  The polygons
    # themselves are queried: the node table only stores centroids, and a
    # centroid almost never falls inside an adjacent polygon.
    logging.info("Creating neighborhood-to-neighborhood edges...")
    src_pos, dst_pos = neighborhood_sindex.query(neighborhoods_gdf.geometry, predicate='intersects')
    for src, dst in zip(src_pos, dst_pos):
        if src == dst:
            continue
        edges.append({
            'src': neighborhood_node_idx[src],
            'dst': neighborhood_node_idx[dst],
            'weight': 1.0
        })

    # Other edges (neighborhood to building, road, tree, transit).
    # sindex.query evaluates predicate(input, indexed), so "feature within
    # neighborhood" is the query that matches neighborhood.contains(feature).
    logging.info("Creating edges between neighborhoods and other entities...")
    other_nodes = nodes_gdf[nodes_gdf['type'] != 'neighborhood']
    other_node_idx = other_nodes['node_idx'].to_numpy()
    other_node_type = other_nodes['type'].to_numpy()
    feature_pos, containing_pos = neighborhood_sindex.query(other_nodes.geometry, predicate='within')
    for feat, hood in tqdm(zip(feature_pos, containing_pos), total=len(feature_pos), desc="Creating edges"):
        feature_idx = int(other_node_idx[feat])
        hood_idx = neighborhood_node_idx[hood]
        # Higher weight for transit nodes
        weight = 2.0 if other_node_type[feat] == 'transit' else 1.0
        edges.append({'src': feature_idx, 'dst': hood_idx, 'weight': weight})
        edges.append({'src': hood_idx, 'dst': feature_idx, 'weight': weight})

    # Fixed columns/dtypes so an empty edge list still yields a well-formed table.
    edges_pdf = pd.DataFrame(edges, columns=['src', 'dst', 'weight']).astype(
        {'src': 'int64', 'dst': 'int64', 'weight': 'float64'}
    )
    edges_df = cudf.from_pandas(edges_pdf)

    # Build the graph
    G = make_graph(nodes_df, edges_df)

    # Cache the graph
    nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
    edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
    with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
        json.dump(node_id_to_index, f)
    with open(GRAPH_DATA_HASH_PATH, 'w') as f:
        f.write(data_hash)

    logging.info("Graph construction completed.")
    return G

def prepare_gnn_data(G):
    """Convert the graph's node/edge tables into a PyTorch Geometric ``Data`` object.

    Each node gets six numeric features (missing or NaN values become 0, then
    every column is z-scored over all nodes) followed by a 5-way one-hot of
    its node type.  Targets are the ``walkability_score`` of neighborhood
    nodes and 0 everywhere else; if that column is absent all targets are 0.

    Args:
        G: graph exposing ``_nodes`` and ``edgelist.edgelist_df`` (each with a
            ``to_pandas()`` method).

    Returns:
        ``Data`` with ``x``, ``edge_index``, ``edge_attr``, ``y`` and ``node_type``.
    """
    logging.info("Stage 3: Preparing data for GNN...")

    node_table = G._nodes.to_pandas()
    edge_table = G.edgelist.edgelist_df.to_pandas()

    # Numeric attributes copied into the feature matrix, in this order
    feature_columns = [
        'ndvi_mean', 'total_population', 'elderly_percentage', 'area_km2',
        'area_m2', 'length_m'
    ]
    # Node type -> one-hot slot / integer code
    type_codes = {
        'neighborhood': 0,
        'building': 1,
        'road': 2,
        'tree': 3,
        'transit': 4
    }

    def numeric_value(node_row, column):
        """Return the attribute, or 0.0 when it is absent or NaN."""
        raw = node_row.get(column, 0.0)
        return 0.0 if pd.isna(raw) else raw

    feature_rows = []
    for _, node_row in node_table.iterrows():
        numeric_part = [numeric_value(node_row, column) for column in feature_columns]
        one_hot = [0] * 5  # 5 types: neighborhood, building, road, tree, transit
        # Unknown types fall into slot 0
        one_hot[type_codes.get(node_row['type'], 0)] = 1
        feature_rows.append(numeric_part + one_hot)

    feature_matrix = np.array(feature_rows, dtype=np.float32)

    # Z-score the numeric columns; constant columns are left centred at 0
    n_numeric = len(feature_columns)
    numeric_block = feature_matrix[:, :n_numeric]
    column_means = numeric_block.mean(axis=0)
    column_stds = numeric_block.std(axis=0)
    column_stds[column_stds == 0] = 1  # Avoid division by zero
    feature_matrix[:, :n_numeric] = (numeric_block - column_means) / column_stds

    # Edge tensors for PyG
    endpoints = np.array([edge_table['src'].values, edge_table['dst'].values])
    edge_index = torch.tensor(endpoints, dtype=torch.long)
    edge_attr = torch.tensor(edge_table['weight'].values, dtype=torch.float)

    # Targets: walkability score on neighborhood nodes, 0 elsewhere
    targets = np.zeros(len(node_table), dtype=np.float32)
    if 'walkability_score' not in node_table.columns:
        logging.warning("Walkability scores not found in nodes_df. Setting targets to 0.")
    else:
        scores = node_table['walkability_score'].fillna(0).values
        is_neighborhood = node_table['type'] == 'neighborhood'
        targets[is_neighborhood] = scores[is_neighborhood]
    y = torch.tensor(targets, dtype=torch.float)

    # Integer node-type codes (-1 for types outside the mapping)
    type_ids = node_table['type'].map(type_codes).fillna(-1).astype(int).values
    node_type = torch.tensor(type_ids, dtype=torch.long)

    data = Data(
        x=torch.tensor(feature_matrix, dtype=torch.float),
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=y,
        node_type=node_type
    )

    logging.info("GNN data prepared.")
    return data

Cell 6: Graph Construction (build_graph)

In [12]:
def compute_neighborhood_neighborhood_edges(args):
    """Return edges from one neighborhood to every other neighborhood it (nearly) intersects.

    Args:
        args: tuple ``(idx, row, neighborhoods_gdf, neighborhood_sindex)`` where
            ``row['geometry']`` is the neighborhood geometry and ``idx`` is its
            position in ``neighborhoods_gdf``.

    Returns:
        List of ``{'src': 'neighborhood_<idx>', 'dst': 'neighborhood_<other>'}`` dicts.
    """
    idx, row, neighborhoods_gdf, neighborhood_sindex = args
    return _buffered_intersection_edges(
        idx, row['geometry'], neighborhoods_gdf, neighborhood_sindex,
        dst_prefix='neighborhood', target_label='', skip_same_index=True
    )


def compute_neighborhood_building_edges(args):
    """Return edges from one neighborhood to every building it (nearly) intersects.

    Args:
        args: tuple ``(idx, row, buildings_gdf, building_sindex)``.

    Returns:
        List of ``{'src': 'neighborhood_<idx>', 'dst': 'building_<pos>'}`` dicts.
    """
    idx, row, buildings_gdf, building_sindex = args
    return _buffered_intersection_edges(
        idx, row['geometry'], buildings_gdf, building_sindex,
        dst_prefix='building', target_label='building '
    )


def compute_neighborhood_road_edges(args):
    """Return edges from one neighborhood to every road it (nearly) intersects.

    Args:
        args: tuple ``(idx, row, roads_gdf, road_sindex)``.

    Returns:
        List of ``{'src': 'neighborhood_<idx>', 'dst': 'road_<pos>'}`` dicts.
    """
    idx, row, roads_gdf, road_sindex = args
    return _buffered_intersection_edges(
        idx, row['geometry'], roads_gdf, road_sindex,
        dst_prefix='road', target_label='road '
    )


def _buffered_intersection_edges(idx, geom, targets_gdf, targets_sindex, dst_prefix,
                                 target_label='', skip_same_index=False, tolerance=1e-3):
    """Shared worker: link neighborhood ``idx`` to each target whose buffered geometry intersects it.

    Candidates come from ``targets_sindex.intersection(geom.bounds)`` and are
    read positionally from ``targets_gdf``.  Both geometries are buffered by
    ``tolerance`` before the ``intersects`` test (which already covers
    geometries that merely touch).  A failure on one candidate is logged and
    that candidate is skipped.
    """
    edges = []
    buffered_geom = None  # buffered lazily, once, instead of once per candidate
    for target_idx in list(targets_sindex.intersection(geom.bounds)):
        if skip_same_index and target_idx == idx:
            continue
        target_geom = targets_gdf.iloc[target_idx]['geometry']
        try:
            if buffered_geom is None:
                buffered_geom = geom.buffer(tolerance)
            if buffered_geom.intersects(target_geom.buffer(tolerance)):
                edges.append({
                    'src': f"neighborhood_{idx}",
                    'dst': f"{dst_prefix}_{target_idx}"
                })
        except Exception as e:
            logging.warning(
                f"Error checking intersection between neighborhood {idx} and {target_label}{target_idx}: {e}"
            )
    return edges



In [13]:
def build_graph(data, force_recompute=False):
    """Build (or load from cache) the city graph of neighborhoods, buildings and roads.

    One node is created per row of ``data['neighborhoods']``,
    ``data['buildings']`` and ``data['roads']``, in that order, so a node's
    ``index`` equals its position in the concatenated sequence.  Edges run from
    each neighborhood to every neighborhood / building / road whose
    axis-aligned bounding box overlaps the neighborhood's bounding box; no
    exact geometry test is performed here (the ``compute_neighborhood_*_edges``
    helpers defined above are not called).

    Side effects:
        * adds an ``area_m2`` column to ``data['buildings']`` when it is missing
        * writes the node table, edge table, data hash and the
          ``{str(node index): node_id}`` mapping to the ``GRAPH_*`` cache paths

    Args:
        data: dict holding the 'neighborhoods', 'buildings' and 'roads' GeoDataFrames.
        force_recompute: ignore the cache and rebuild.

    Returns:
        cugraph.Graph whose node table is attached as ``G._nodes``.
    """
    logging.info("Stage 2: Building city graph...")

    # Compute data hash to check if cached graph can be used
    current_hash = compute_data_hash(data)
    if not force_recompute and os.path.exists(GRAPH_NODES_CACHE_PATH) and os.path.exists(GRAPH_DATA_HASH_PATH):
        with open(GRAPH_DATA_HASH_PATH, 'r') as f:
            cached_hash = f.read()
        if cached_hash == current_hash:
            try:
                nodes_df = cudf.read_parquet(GRAPH_NODES_CACHE_PATH)
                edges_df = cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
                G = cugraph.Graph()
                G._nodes = nodes_df
                if not edges_df.empty:
                    G.from_cudf_edgelist(edges_df, source='src', destination='dst')
                logging.info(f"Loaded cached graph: {len(nodes_df)} nodes, {len(edges_df)} edges")
                return G
            except Exception as e:
                logging.warning(f"Failed to load cached graph: {e}. Recomputing graph...")

    nodes = []
    node_id_to_vertex = {}

    def add_node(node_type, label, attrs, bounds):
        """Append one node record; returns the record so callers can extend it."""
        node_index = len(nodes)
        node_id = f"{node_type}_{label}"
        # Keyed by the graph-wide node index: row labels repeat across the
        # three source tables and would overwrite each other.
        node_id_to_vertex[str(node_index)] = node_id
        record = {'index': node_index, 'type': node_type, 'node_id': node_id}
        record.update(attrs)
        record.update(
            min_x=float(bounds[0]), min_y=float(bounds[1]),
            max_x=float(bounds[2]), max_y=float(bounds[3])
        )
        nodes.append(record)
        return record

    # Add neighborhood nodes
    logging.info("Adding neighborhood nodes...")
    neighborhoods_gdf = data['neighborhoods']
    neighborhood_bounds = neighborhoods_gdf.geometry.bounds.to_numpy(dtype=float)
    neighborhood_offset = len(nodes)
    land_use_columns = [f'land_use_{cat.lower()}_percent' for cat in CATEGORY_PRIORITY.keys()]
    neighborhood_rows = tqdm(neighborhoods_gdf.iterrows(), total=len(neighborhoods_gdf), desc="Neighborhood nodes")
    for pos, (label, row) in enumerate(neighborhood_rows):
        record = add_node('neighborhood', label, {
            'LIE_NAME': row['LIE_NAME'],
            'avg_road_accident_density': row.get('avg_road_accident_density', 0),
            'pedestrian_road_density': row.get('pedestrian_road_density', 0),
            'ndvi': row.get('ndvi', 0),
            'tree_count': row.get('tree_count', 0),
            'transit_count': row.get('transit_count', 0),
            'accident_count': row.get('accident_count', 0),
            'road_density': row.get('road_density', 0),
            'intersection_density': row.get('intersection_density', 0),
            'total_population': row.get('total_population', 0),
            'elderly_percentage': row.get('elderly_percentage', 0),
        }, neighborhood_bounds[pos])
        for col in land_use_columns:
            record[col] = row.get(col, 0)

    # Add building nodes
    logging.info("Adding building nodes...")
    buildings_gdf = data['buildings']
    if 'area_m2' not in buildings_gdf.columns:
        logging.warning("'area_m2' missing. Computing from geometry...")
        buildings_gdf['area_m2'] = buildings_gdf.geometry.area
    else:
        logging.info("Using existing 'area_m2' column.")
    building_bounds = buildings_gdf.geometry.bounds.to_numpy(dtype=float)
    building_offset = len(nodes)
    building_rows = tqdm(buildings_gdf.iterrows(), total=len(buildings_gdf), desc="Building nodes")
    for pos, (label, row) in enumerate(building_rows):
        add_node('building', label, {
            'building': row.get('building', 'unknown'),
            'area_m2': row.get('area_m2', 0),
        }, building_bounds[pos])

    # Add road nodes
    logging.info("Adding road nodes...")
    roads_gdf = data['roads']
    road_bounds = roads_gdf.geometry.bounds.to_numpy(dtype=float)
    road_offset = len(nodes)
    road_rows = tqdm(roads_gdf.iterrows(), total=len(roads_gdf), desc="Road nodes")
    for pos, (label, row) in enumerate(road_rows):
        add_node('road', label, {
            'class': row.get('class', 'unknown'),
            'length_m': row.get('length_m', 0),
        }, road_bounds[pos])

    nodes_df = cudf.DataFrame(nodes)

    # Create edges.  Pairs are positional within each table; adding the table's
    # offset turns a position into the node index assigned above, so every
    # endpoint is a valid node index by construction.
    logging.info("Creating edges from bounding-box overlaps...")
    logging.info("Computing neighborhood-neighborhood edges...")
    nn_src, nn_dst = _bbox_overlap_pairs(
        neighborhood_bounds, neighborhood_bounds,
        desc="Neighborhood-Neighborhood edges", skip_same_position=True
    )
    logging.info("Computing neighborhood-building edges...")
    nb_src, nb_dst = _bbox_overlap_pairs(
        neighborhood_bounds, building_bounds, desc="Neighborhood-Building edges"
    )
    logging.info("Computing neighborhood-road edges...")
    nr_src, nr_dst = _bbox_overlap_pairs(
        neighborhood_bounds, road_bounds, desc="Neighborhood-Road edges"
    )

    edge_counts = {
        'neighborhood-neighborhood': int(len(nn_src)),
        'neighborhood-building': int(len(nb_src)),
        'neighborhood-road': int(len(nr_src)),
    }
    logging.info(f"Edge counts by type: {edge_counts}")

    edges_df = cudf.DataFrame({
        'src': np.concatenate([nn_src, nb_src, nr_src]) + neighborhood_offset,
        'dst': np.concatenate([
            nn_dst + neighborhood_offset,
            nb_dst + building_offset,
            nr_dst + road_offset,
        ]),
    })
    logging.info(f"Created {len(edges_df)} total edges")

    # Create graph
    G = cugraph.Graph()
    G._nodes = nodes_df
    if not edges_df.empty:
        G.from_cudf_edgelist(edges_df, source='src', destination='dst')
    else:
        logging.warning("No valid edges created. Graph will have nodes but no edges.")

    # Save graph data to cache (best effort: a failed write must not lose the graph)
    logging.info("Saving graph data to cache...")
    try:
        nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
        edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
        with open(GRAPH_DATA_HASH_PATH, 'w') as f:
            f.write(current_hash)
        with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
            json.dump(node_id_to_vertex, f)
        logging.info("Successfully saved graph data to cache.")
    except Exception as e:
        logging.error(f"Failed to save graph data to cache: {e}")

    logging.info(f"City graph constructed: {len(nodes_df)} nodes, {len(edges_df)} edges")
    return G


def _bbox_overlap_pairs(src_bounds, dst_bounds, desc, skip_same_position=False):
    """Return positional (src, dst) index arrays of rows whose bounding boxes overlap.

    Args:
        src_bounds: float array of shape (n_src, 4) as (min_x, min_y, max_x, max_y).
        dst_bounds: float array of shape (n_dst, 4) in the same layout.
        desc: progress-bar label.
        skip_same_position: drop pairs where src and dst positions are equal
            (used when both arrays describe the same table).

    Returns:
        Two int64 arrays of equal length, ordered by src position and then dst position.
    """
    dst_min_x, dst_min_y, dst_max_x, dst_max_y = dst_bounds.T
    src_parts, dst_parts = [], []
    for pos in tqdm(range(len(src_bounds)), desc=desc):
        min_x, min_y, max_x, max_y = src_bounds[pos]
        # Two boxes overlap unless one lies entirely to one side of the other.
        disjoint = (
            (max_x < dst_min_x) | (min_x > dst_max_x) |
            (max_y < dst_min_y) | (min_y > dst_max_y)
        )
        hits = np.flatnonzero(~disjoint).astype(np.int64)
        if skip_same_position:
            hits = hits[hits != pos]
        src_parts.append(np.full(len(hits), pos, dtype=np.int64))
        dst_parts.append(hits)
    if not src_parts:
        return np.empty(0, dtype=np.int64), np.empty(0, dtype=np.int64)
    return np.concatenate(src_parts), np.concatenate(dst_parts)

Cell 7: Rule-Based Walkability Scores (compute_walkability_scores)

In [14]:
def compute_walkability_scores(G, data):
    """Attach rule-based walkability scores to the neighborhood nodes of ``G``.

    Scores come from ``compute_walkability_components_all`` and are joined
    onto the node table by ``LIE_NAME``.  Neighborhood nodes without a match
    get score 0.5 and category 'medium'; other node types are left without a
    score.  The function can be re-run on a graph that was already scored.

    Args:
        G: graph whose node table is stored in ``G._nodes`` (cudf.DataFrame
            with at least ``type`` and ``LIE_NAME`` columns).
        data: dataset dict; ``data['neighborhoods']`` is passed to the scorer.

    Returns:
        The same graph object, with ``G._nodes`` replaced by the scored table.
    """
    logging.info("Computing walkability scores for neighborhoods...")
    nodes_df = G._nodes.to_pandas()

    # The scorer returns (scores table, components); only the table is used here.
    walkability_df, _ = compute_walkability_components_all(data['neighborhoods'], data)

    # Validate data
    neighborhood_names = nodes_df.loc[nodes_df['type'] == 'neighborhood', 'LIE_NAME']
    logging.info(f"Number of neighborhood nodes in nodes_df: {len(neighborhood_names)}")
    logging.info(f"Number of entries in walkability_components: {len(walkability_df)}")
    logging.info(f"Sample LIE_NAME in nodes_df: {neighborhood_names.head().tolist()}")
    logging.info(f"Sample LIE_NAME in walkability_components: {walkability_df['LIE_NAME'].head().tolist()}")

    score_columns = ['walkability_score', 'walkability_category']

    # Drop scores left by a previous run, otherwise the merge would emit
    # *_x / *_y columns and the lookups below would fail.
    nodes_df = nodes_df.drop(columns=[col for col in score_columns if col in nodes_df.columns])

    # Keep the merge many-to-one: missing keys would match every node without a
    # LIE_NAME, and duplicate keys would duplicate node rows and break the
    # node/edge index alignment.
    scores = walkability_df[['LIE_NAME'] + score_columns].dropna(subset=['LIE_NAME'])
    if scores['LIE_NAME'].duplicated().any():
        logging.warning("Duplicate LIE_NAME entries in walkability scores; keeping the first of each.")
        scores = scores.drop_duplicates(subset='LIE_NAME', keep='first')

    # Merge walkability scores into nodes_df
    nodes_df = nodes_df.merge(scores, on='LIE_NAME', how='left')

    # Check for unmatched neighborhoods
    is_neighborhood = nodes_df['type'] == 'neighborhood'
    unmatched = nodes_df[is_neighborhood & nodes_df['walkability_score'].isna()]
    if not unmatched.empty:
        logging.warning(f"Found {len(unmatched)} neighborhood nodes without walkability scores:")
        logging.warning(unmatched[['LIE_NAME']])
        # Neutral defaults, applied to the unmatched neighborhoods only
        nodes_df.loc[unmatched.index, 'walkability_score'] = 0.5
        nodes_df.loc[unmatched.index, 'walkability_category'] = 'medium'
    else:
        logging.info("All neighborhood nodes matched with walkability scores.")

    # Update the graph nodes
    G._nodes = cudf.from_pandas(nodes_df)

    logging.info("Finished computing walkability scores.")
    return G

Cell 8: prepare_gnn_data

In [15]:
def prepare_gnn_data(G):
    """Turn the graph's node and edge tables into a PyTorch Geometric ``Data`` object.

    Features are assembled per node type (neighborhood, building, road) and
    stacked in that order:

    * numeric columns, z-scored within each node type (columns with zero or
      undefined variance are left as they are)
    * one-hot columns for every building type and road class present

    Neighborhood rows also get ``land_use_diversity``, the entropy of their
    ``land_use_*_percent`` columns.  Labels are the neighborhoods'
    ``walkability_score`` (0 when missing) and 0 for every other node.

    Args:
        G: graph with the node table in ``G._nodes`` (cudf.DataFrame) and,
            optionally, an edge list in ``G.edgelist.edgelist_df``.

    Returns:
        ``Data`` with ``x``, ``edge_index``, ``y`` of shape [n, 1], and a
        ``node_types`` list holding one type string per row of ``x``.

    Raises:
        ValueError: if no node of any supported type exists (nothing to stack).
    """
    logging.info("Preparing data for GNN training...")
    nodes_df = G._nodes
    # NOTE(review): this reads cugraph's stored edge list directly; confirm its
    # src/dst values are the original node indices (not renumbered ids) and that
    # the row order of G._nodes matches the 'index' column used for the edges.
    edges_df = G.edgelist.edgelist_df if G.edgelist else cudf.DataFrame()

    numerical_features = [
        'ndvi', 'tree_count', 'transit_count', 'accident_count',
        'road_density', 'intersection_density', 'total_population',
        'elderly_percentage', 'area_m2', 'length_m', 'avg_road_accident_density',
        'pedestrian_road_density'
    ] + [f'land_use_{cat.lower()}_percent' for cat in CATEGORY_PRIORITY.keys()]
    numerical_features.append('land_use_diversity')

    def observed_categories(node_type, column):
        """Non-null distinct values of ``column`` among nodes of ``node_type`` ([] if absent)."""
        if column not in nodes_df.columns:
            return []
        values = nodes_df[nodes_df['type'] == node_type][column].to_pandas().unique()
        return [value for value in values if pd.notna(value)]

    building_types = observed_categories('building', 'building')
    road_classes = observed_categories('road', 'class')
    categorical_features = (
        [f'building_{bt}' for bt in building_types] +
        [f'road_class_{rc}' for rc in road_classes]
    )

    all_features = numerical_features + categorical_features

    features_list = []
    labels_list = []
    node_types = []

    for node_type in tqdm(['neighborhood', 'building', 'road'], desc="Normalizing features by node type"):
        subset = nodes_df[nodes_df['type'] == node_type].to_pandas()
        if subset.empty:
            logging.warning(f"No nodes of type {node_type} found. Skipping.")
            continue

        # Every node type shares the same column layout; unused columns stay 0.
        subset_features = pd.DataFrame(0.0, index=subset.index, columns=all_features)

        if node_type == 'neighborhood':
            # Compute land_use_diversity (entropy of the land-use shares)
            land_use_cols = [col for col in subset.columns if col.startswith('land_use_') and col.endswith('_percent')]
            if land_use_cols:
                subset['land_use_diversity'] = subset[land_use_cols].apply(
                    lambda row: -np.sum([p * np.log(p + 1e-10) for p in row / 100.0 if p > 0]), axis=1
                )
            else:
                subset['land_use_diversity'] = 0

            for col in numerical_features:
                if col in subset.columns:
                    subset_features[col] = subset[col].astype(float).fillna(0)
                    logging.debug(f"{node_type} - {col} pre-normalization std: {subset[col].std():.4f}")
                else:
                    logging.warning(f"Column {col} missing in neighborhood nodes. Setting to 0.")
                    subset_features[col] = 0
        elif node_type == 'building':
            if 'area_m2' in subset.columns:
                subset_features['area_m2'] = subset['area_m2'].astype(float).fillna(0)
                logging.debug(f"{node_type} - area_m2 pre-normalization std: {subset['area_m2'].std():.4f}")
            for bt in building_types:
                subset_features[f'building_{bt}'] = (subset['building'] == bt).astype(float)
        else:  # road
            if 'length_m' in subset.columns:
                subset_features['length_m'] = subset['length_m'].astype(float).fillna(0)
                logging.debug(f"{node_type} - length_m pre-normalization std: {subset['length_m'].std():.4f}")
            for rc in road_classes:
                subset_features[f'road_class_{rc}'] = (subset['class'] == rc).astype(float)

        # Z-score normalization for numerical features
        for col in numerical_features:
            col_std = subset_features[col].std()
            if col_std > 0:
                subset_features[col] = (
                    (subset_features[col] - subset_features[col].mean()) / col_std
                ).fillna(0)
                logging.debug(f"{node_type} - {col} post-normalization std: {subset_features[col].std():.4f}")
            else:
                logging.debug(f"Column {col} has zero variance or is missing for {node_type}. Setting to 0.")

        logging.info(f"Node type {node_type}: {len(subset)} nodes, feature shape: {subset_features.shape}")

        features_list.append(subset_features.values)

        if node_type == 'neighborhood' and 'walkability_score' in subset.columns:
            labels = subset['walkability_score'].astype(float).fillna(0).values
            labels_list.append(labels[:, None])  # Shape [n, 1]
        else:
            if node_type == 'neighborhood':
                # Scores are attached by a separate step; without them train on zeros
                # rather than failing with a KeyError.
                logging.warning("Walkability scores not found in nodes_df. Setting targets to 0.")
            labels_list.append(np.zeros((len(subset), 1)))

        node_types.extend([node_type] * len(subset))

    try:
        features = np.vstack(features_list)
    except ValueError as e:
        logging.error(f"Failed to stack features: {e}")
        raise

    labels = np.vstack(labels_list)
    features_tensor = torch.tensor(features, dtype=torch.float)
    labels_tensor = torch.tensor(labels, dtype=torch.float)

    if not edges_df.empty:
        edge_index = torch.tensor(edges_df[['src', 'dst']].to_pandas().values.T, dtype=torch.long)
        logging.info(f"Edge index created with {edge_index.shape[1]} edges")
        max_index = nodes_df['index'].max()
        if edge_index.max() > max_index or edge_index.min() < 0:
            # Any out-of-range endpoint discards the whole edge set.
            logging.warning(f"Edge indices out of bounds: min={edge_index.min()}, max={edge_index.max()}, expected max={max_index}")
            edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)
        logging.warning("No edges found in graph.")

    data = Data(
        x=features_tensor,
        edge_index=edge_index,
        y=labels_tensor
    )

    # Plain Python list of type strings, one per row of x
    data.node_types = node_types

    logging.info(f"Prepared GNN data: {features_tensor.shape[0]} nodes, {edge_index.shape[1]} edges")
    logging.info(f"Feature matrix shape: {features_tensor.shape}")
    logging.info(f"Label tensor shape: {labels_tensor.shape}")
    return data

Cell 9: GNNWalkabilityPredictor, train_gnn_model, predict_walkability

In [16]:
class GNNWalkabilityPredictor(torch.nn.Module):
    """Two GAT layers followed by a two-layer MLP head emitting one sigmoid score per node.

    Args:
        num_features: Width of the node feature matrix ``data.x``.
        hidden_dim: Per-head width of the first GAT layer; later layers use
            ``hidden_dim // 2`` and ``hidden_dim // 4``.
        num_heads: Number of attention heads in the first GAT layer.
        dropout_rate: Dropout probability used after the first GAT block and
            after the first fully connected layer.
    """

    def __init__(self, num_features, hidden_dim=128, num_heads=4, dropout_rate=0.3):
        super().__init__()
        self.conv1 = GATConv(num_features, hidden_dim, heads=num_heads, concat=True)
        self.bn1 = BatchNorm(hidden_dim * num_heads)
        self.conv2 = GATConv(hidden_dim * num_heads, hidden_dim // 2, heads=1, concat=True)
        self.bn2 = BatchNorm(hidden_dim // 2)
        self.fc1 = torch.nn.Linear(hidden_dim // 2, hidden_dim // 4)
        self.fc2 = torch.nn.Linear(hidden_dim // 4, 1)
        self.dropout = torch.nn.Dropout(dropout_rate)
        # Plain Python flag (neither parameter nor buffer), so the state_dict
        # layout is unchanged and existing checkpoints still load.
        self._warned_no_edges = False

    def forward(self, data):
        """Return a ``[num_nodes, 1]`` tensor of scores in (0, 1) for ``data.x`` / ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index

        if edge_index.numel() == 0 and not getattr(self, '_warned_no_edges', False):
            # Warn once instead of on every forward pass of every epoch.
            logging.warning("No edges in the graph. GAT layers will attend over self-loops only (node features only).")
            self._warned_no_edges = True

        # The GAT layers must always run: fc1 expects hidden_dim // 2 inputs, so
        # skipping them (as the previous no-edge branch did) fed raw
        # num_features-wide input into fc1 and failed on the shape mismatch.
        # GATConv adds self-loops by default, so an empty edge_index is valid.
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.relu(x)

        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return torch.sigmoid(x)

def train_gnn_model(data_gnn, hidden_dim=128, num_heads=4, dropout_rate=0.3, lr=0.005, weight_decay=1e-4, epochs=500, patience=30):
    """Train a GNNWalkabilityPredictor on the neighborhood nodes of ``data_gnn``.

    Neighborhood nodes (per ``data_gnn.node_types``) are split 80/20 into
    training and validation sets with a fixed seed. The model is optimised with
    Adam and MSE loss, the learning rate is halved when validation loss
    plateaus, and training stops early after ``patience`` epochs without
    improvement. The best weights are written to
    ``CHECKPOINT_DIR/best_gnn_model.pth`` and restored before returning.

    Args:
        data_gnn: Graph data object exposing ``x``, ``y``, ``edge_index`` and a
            ``node_types`` list with one entry per node.
        hidden_dim, num_heads, dropout_rate: Forwarded to GNNWalkabilityPredictor.
        lr, weight_decay: Adam hyper-parameters.
        epochs: Maximum number of training epochs.
        patience: Early-stopping patience, in epochs.

    Returns:
        dict with the trained ``model`` and per-epoch histories
        ``train_losses``, ``val_losses``, ``train_maes``, ``val_maes``,
        ``train_r2s``, ``val_r2s``.

    Raises:
        ValueError: If there are no neighborhood nodes, or too few to form
            both a training and a validation set.
    """
    logging.info("Stage 4: Training GNN model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_gnn = data_gnn.to(device)

    neighborhood_mask = np.array([t == 'neighborhood' for t in data_gnn.node_types])
    train_indices = np.where(neighborhood_mask)[0]

    if len(train_indices) == 0:
        logging.error("No neighborhood nodes found for training.")
        raise ValueError("No neighborhood nodes found for training.")

    np.random.seed(42)
    train_idx = np.random.choice(train_indices, size=int(0.8 * len(train_indices)), replace=False)
    val_idx = np.setdiff1d(train_indices, train_idx)

    if len(train_idx) == 0 or len(val_idx) == 0:
        # Fail with a clear message instead of a NaN loss / opaque metric error.
        logging.error(f"Not enough neighborhood nodes ({len(train_indices)}) for an 80/20 train/validation split.")
        raise ValueError("Not enough neighborhood nodes for a train/validation split.")

    # Build the masks on CPU, then move them next to the tensors they index.
    train_mask = torch.zeros(data_gnn.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(data_gnn.num_nodes, dtype=torch.bool)
    train_mask[train_idx] = True
    val_mask[val_idx] = True
    data_gnn.train_mask = train_mask.to(device)
    data_gnn.val_mask = val_mask.to(device)

    neighborhood_labels = data_gnn.y.cpu().numpy()[neighborhood_mask]
    logging.info(f"Target (walkability_score) distribution for neighborhood nodes:\n{pd.Series(neighborhood_labels.flatten()).describe()}")

    model = GNNWalkabilityPredictor(num_features=data_gnn.x.shape[1], hidden_dim=hidden_dim, num_heads=num_heads, dropout_rate=dropout_rate).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)
    criterion = torch.nn.MSELoss()

    best_val_loss = float('inf')
    # In-memory copy of the best weights. Restoring from this (rather than from
    # disk) guarantees a stale checkpoint from an earlier run is never loaded
    # when no epoch of THIS run improved on the initial best_val_loss.
    best_state = None
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    best_model_path = os.path.join(CHECKPOINT_DIR, 'best_gnn_model.pth')
    early_stop_counter = 0

    train_losses = []
    val_losses = []
    train_maes = []
    val_maes = []
    train_r2s = []
    val_r2s = []

    for epoch in tqdm(range(epochs), desc="Training epochs"):
        model.train()
        optimizer.zero_grad()
        out = model(data_gnn)
        loss = criterion(out[data_gnn.train_mask], data_gnn.y[data_gnn.train_mask])
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_out = model(data_gnn)
            val_loss = criterion(val_out[data_gnn.val_mask], data_gnn.y[data_gnn.val_mask]).item()
            train_loss = loss.item()

            # Training metrics come from the dropout-enabled forward pass above.
            train_pred = out[data_gnn.train_mask].detach().cpu().numpy()
            train_true = data_gnn.y[data_gnn.train_mask].cpu().numpy()
            val_pred = val_out[data_gnn.val_mask].detach().cpu().numpy()
            val_true = data_gnn.y[data_gnn.val_mask].cpu().numpy()

            train_mae = mean_absolute_error(train_true, train_pred)
            train_r2 = r2_score(train_true, train_pred)
            val_mae = mean_absolute_error(val_true, val_pred)
            val_r2 = r2_score(val_true, val_pred)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            train_maes.append(train_mae)
            val_maes.append(val_mae)
            train_r2s.append(train_r2)
            val_r2s.append(val_r2)

        scheduler.step(val_loss)

        if epoch % 10 == 0:
            logging.info(f"Epoch {epoch}, Train Loss: {train_loss:.4f}, Train MAE: {train_mae:.4f}, Train R2: {train_r2:.4f}, Val Loss: {val_loss:.4f}, Val MAE: {val_mae:.4f}, Val R2: {val_r2:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            # state_dict() returns live references, so clone before the next step.
            best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
            torch.save(model.state_dict(), best_model_path)
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                logging.info(f"Early stopping at epoch {epoch}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)
    else:
        logging.warning("Validation loss never improved; returning the model with its final weights.")
    logging.info("Finished training GNN model.")
    return {
        'model': model,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_maes': train_maes,
        'val_maes': val_maes,
        'train_r2s': train_r2s,
        'val_r2s': val_r2s
    }

def predict_walkability(G, model):
    """Score every neighborhood node with ``model`` and store the results on ``G._nodes``.

    Writes two columns for rows whose ``type`` is ``'neighborhood'``:
    ``walkability_gnn`` (prediction clipped to [0, 1]) and
    ``walkability_category`` ('low' / 'medium' / 'high', split at the 33rd and
    66th percentiles of the neighborhood predictions).

    Args:
        G: Graph object whose ``_nodes`` cudf frame has a ``type`` column.
        model: Trained model accepting the output of ``prepare_gnn_data(G)``.

    Returns:
        The same ``G`` with ``_nodes`` replaced by the updated frame.

    Raises:
        ValueError: If the number of neighborhood predictions differs from the
            number of neighborhood rows in ``G._nodes``.
    """
    logging.info("Predicting walkability scores using GNN model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    data_gnn = prepare_gnn_data(G)
    data_gnn = data_gnn.to(device)

    with torch.no_grad():
        predictions = model(data_gnn)

    # Move to NumPy first: indexing a (possibly CUDA) tensor with a pandas
    # boolean Series relies on implicit conversions.
    scores = predictions.detach().cpu().numpy().reshape(-1)

    # Select rows by the tensor's own node_types (as train_gnn_model does)
    # rather than by the row order of G._nodes, which need not match the
    # type-grouped order used when the feature matrix is stacked.
    # NOTE(review): assumes prepare_gnn_data keeps neighborhood rows in the same
    # relative order as G._nodes - confirm against its implementation.
    tensor_is_neighborhood = np.array([t == 'neighborhood' for t in data_gnn.node_types], dtype=bool)
    neighborhood_scores = scores[tensor_is_neighborhood]

    nodes_df = G._nodes.to_pandas()
    neighborhood_mask = nodes_df['type'] == 'neighborhood'

    expected_count = int(neighborhood_mask.sum())
    if len(neighborhood_scores) != expected_count:
        logging.error(f"Got {len(neighborhood_scores)} neighborhood predictions for {expected_count} neighborhood nodes.")
        raise ValueError("Mismatch between neighborhood predictions and neighborhood nodes.")

    nodes_df.loc[neighborhood_mask, 'walkability_gnn'] = neighborhood_scores

    nodes_df['walkability_gnn'] = nodes_df['walkability_gnn'].clip(0, 1)

    # Compute walkability_category with dynamic thresholds for GNN predictions
    low_threshold = nodes_df.loc[neighborhood_mask, 'walkability_gnn'].quantile(0.33)
    high_threshold = nodes_df.loc[neighborhood_mask, 'walkability_gnn'].quantile(0.66)
    logging.info(f"GNN walkability category thresholds - low: {low_threshold:.4f}, high: {high_threshold:.4f}")

    def categorize_gnn_score(score):
        """Map one prediction to 'low', 'medium' or 'high' using the tercile thresholds."""
        if score < low_threshold:
            return 'low'
        elif score < high_threshold:
            return 'medium'
        else:
            return 'high'

    nodes_df.loc[neighborhood_mask, 'walkability_category'] = nodes_df.loc[neighborhood_mask, 'walkability_gnn'].apply(categorize_gnn_score)

    G._nodes = cudf.from_pandas(nodes_df)

    logging.info("Finished predicting walkability scores.")
    return G

Cell 10: Interactive Map Generation (create_interactive_map)

In [17]:
def _optional_layer_geojson(data, key, wanted_columns):
    """Return GeoJSON text (EPSG:4326) for the optional ``data[key]`` layer, or None if it must be skipped."""
    label = key.capitalize()
    if key not in data:
        logging.warning(f"{label} data not found in data dictionary. Skipping {key} layer.")
        return None

    layer_gdf = data[key].copy()
    if layer_gdf.crs != 'EPSG:4326':
        layer_gdf = layer_gdf.to_crs('EPSG:4326')

    available_columns = [col for col in wanted_columns if col in layer_gdf.columns]
    if 'geometry' not in available_columns:
        logging.warning(f"{label} GeoDataFrame missing 'geometry' column. Skipping {key} layer.")
        return None
    return layer_gdf[available_columns].to_json()


def _ordinal_geojson_layer(data_id, label, color, opacity, stroke_opacity, thickness,
                           stroke_color, color_field, palette_name, palette_colors):
    """Build a Kepler.gl geojson layer config coloured by the string field ``color_field``."""
    return {
        "id": data_id,
        "type": "geojson",
        "config": {
            "dataId": data_id,
            "label": label,
            "color": color,
            "columns": {
                "geojson": "geometry"
            },
            "isVisible": True,
            "visConfig": {
                "opacity": opacity,
                "strokeOpacity": stroke_opacity,
                "thickness": thickness,
                "strokeColor": stroke_color,
                "colorField": {
                    "name": color_field,
                    "type": "string"
                },
                "colorScale": "ordinal",
                "colorRange": {
                    "name": palette_name,
                    "type": "all",
                    "category": "ColorBrewer",
                    "colors": palette_colors
                }
            }
        },
        "visualChannels": {
            "colorField": {
                "name": color_field,
                "type": "string"
            },
            "colorScale": "ordinal"
        }
    }


def create_interactive_map(G, data):
    """Generate an interactive Kepler.gl map to visualize walkability scores and other geodata.

    Neighborhood polygons from ``data['neighborhoods']`` are joined on
    ``LIE_NAME`` with the neighborhood rows of ``G._nodes`` and coloured by
    ``walkability_gnn``. ``data['roads']`` and ``data['buildings']`` are added
    as extra layers when present. The map is saved to
    ``BASE_DIR/taipei_walkability_map.html``.

    Args:
        G: Graph object whose ``_nodes`` frame has ``LIE_NAME``, ``type``,
            ``walkability_score``, ``walkability_gnn`` and
            ``walkability_category`` columns.
        data: Dict of GeoDataFrames; ``'neighborhoods'`` is required,
            ``'roads'`` and ``'buildings'`` are optional.
    """
    logging.info("Generating interactive Kepler.gl map...")

    nodes_df = G._nodes.to_pandas()
    neighborhoods_gdf = data['neighborhoods'].copy()

    # Standardize LIE_NAME for merging
    nodes_df['LIE_NAME'] = nodes_df['LIE_NAME'].astype(str).str.strip()
    neighborhoods_gdf['LIE_NAME'] = neighborhoods_gdf['LIE_NAME'].astype(str).str.strip()

    # Filter for neighborhood nodes and select necessary columns
    neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood'][['LIE_NAME', 'walkability_score', 'walkability_gnn', 'walkability_category']]

    # Merge data
    map_data = neighborhoods_gdf[['LIE_NAME', 'geometry']].merge(
        neighborhood_nodes,
        on='LIE_NAME',
        how='left'
    )

    # Keep one polygon per LIE_NAME
    map_data = map_data.drop_duplicates(subset=['LIE_NAME'], keep='first')

    # Fill NaN values
    map_data['walkability_score'] = map_data['walkability_score'].fillna(0)
    map_data['walkability_gnn'] = map_data['walkability_gnn'].fillna(0)
    map_data['walkability_category'] = map_data['walkability_category'].fillna('low')

    # Reproject to WGS84 from the CRS the data actually carries. Forcing
    # EPSG:3826 onto geometries that already have another CRS either raises a
    # CRS-mismatch error or mislabels the coordinates; EPSG:3826 is only
    # assumed when no CRS is recorded at all.
    map_data = gpd.GeoDataFrame(map_data, geometry='geometry')
    if map_data.crs is None:
        logging.info("No CRS defined for neighborhoods. Assuming EPSG:3826.")
        map_data = map_data.set_crs('EPSG:3826')
    if map_data.crs != 'EPSG:4326':
        map_data = map_data.to_crs('EPSG:4326')

    # Prepare kepler_data
    kepler_data = {
        'neighborhoods': map_data[['LIE_NAME', 'walkability_score', 'walkability_gnn', 'walkability_category', 'geometry']].to_json()
    }

    # Optional roads / buildings datasets
    roads_json = _optional_layer_geojson(data, 'roads', ['class', 'length_m', 'geometry'])
    if roads_json is not None:
        kepler_data['roads'] = roads_json
    buildings_json = _optional_layer_geojson(data, 'buildings', ['building', 'area_m2', 'geometry'])
    if buildings_json is not None:
        kepler_data['buildings'] = buildings_json

    def _global_warming_range():
        """Return a fresh copy of the sequential palette used for fill and stroke."""
        return {
            "name": "Global Warming",
            "type": "sequential",
            "colors": [
                "#5A1846", "#900C3F", "#C70039",
                "#E3611C", "#F1920E", "#FFC107"
            ]
        }

    # Define neighborhoods layer
    neighborhoods_layer = {
        "id": "neighborhoods",
        "type": "geojson",
        "config": {
            "dataId": "neighborhoods",
            "label": "Neighborhoods",
            "color": [18, 147, 154],
            "columns": {
                "geojson": "geometry"
            },
            "isVisible": True,
            "visConfig": {
                "opacity": 0.7,
                "strokeOpacity": 0.9,
                "thickness": 1,
                "strokeColor": [255, 255, 255],
                "colorRange": _global_warming_range(),
                "strokeColorRange": _global_warming_range(),
                "colorField": {
                    "name": "walkability_gnn",
                    "type": "real"
                },
                "colorScale": "quantile"
            }
        },
        "visualChannels": {
            "colorField": {
                "name": "walkability_gnn",
                "type": "real"
            },
            "colorScale": "quantile"
        }
    }

    # Create layers list
    layers = [neighborhoods_layer]
    if 'roads' in kepler_data:
        layers.append(_ordinal_geojson_layer(
            data_id="roads",
            label="Roads",
            color=[255, 0, 0],
            opacity=0.8,
            stroke_opacity=0.8,
            thickness=2,
            stroke_color=[255, 0, 0],
            color_field="class",
            palette_name="ColorBrewer Paired-12",
            palette_colors=["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928"],
        ))
    if 'buildings' in kepler_data:
        layers.append(_ordinal_geojson_layer(
            data_id="buildings",
            label="Buildings",
            color=[0, 255, 0],
            opacity=0.5,
            stroke_opacity=0.5,
            thickness=0.5,
            stroke_color=[0, 0, 0],
            color_field="building",
            palette_name="ColorBrewer Set3-12",
            palette_colors=["#8dd3c7", "#ffffb3", "#bebada", "#fb8072", "#80b1d3", "#fdb462", "#b3de69", "#fccde5", "#d9d9d9", "#bc80bd", "#ccebc5", "#ffed6f"],
        ))

    # Define tooltips
    # NOTE(review): the "format" values are Python-style format strings; confirm
    # Kepler.gl accepts them (its tooltip formats may expect d3-format such as ".3f").
    tooltips = {
        "neighborhoods": [
            {"name": "LIE_NAME", "format": None},
            {"name": "walkability_score", "format": "{:.3f}"},
            {"name": "walkability_gnn", "format": "{:.3f}"},
            {"name": "walkability_category", "format": None}
        ]
    }
    if 'roads' in kepler_data:
        tooltips['roads'] = [
            {"name": "class", "format": None},
            {"name": "length_m", "format": "{:.2f}"}
        ]
    if 'buildings' in kepler_data:
        tooltips['buildings'] = [
            {"name": "building", "format": None},
            {"name": "area_m2", "format": "{:.2f}"}
        ]

    # Update config
    config = {
        "version": "v1",
        "config": {
            "visState": {
                "layers": layers,
                "interactionConfig": {
                    "tooltip": {
                        "fieldsToShow": tooltips,
                        "enabled": True
                    }
                }
            },
            "mapState": {
                "latitude": 25.0330,
                "longitude": 121.5654,
                "zoom": 11
            },
            "mapStyle": {
                "styleType": "dark"
            }
        }
    }

    map_1 = KeplerGl(height=800, data=kepler_data, config=config)
    map_path = os.path.join(BASE_DIR, 'taipei_walkability_map.html')
    map_1.save_to_html(file_name=map_path)
    logging.info(f"Interactive map generated and saved as {map_path}")
    print(f"Map saved to {map_path}!")

Cell 11: Data Loading and Main Execution (load_and_prepare_data, main)

In [18]:
def load_and_prepare_data():
    """Load and prepare data for analysis.

    Reads the neighborhoods, roads, accidents, urban masterplan, transit and
    trees layers plus the population table from the module-level paths, fills
    in ``area_km2`` / ``length_m`` / ``Category`` when those columns are
    missing, validates every GeoDataFrame, and attaches ``CATEGORY_PRIORITY``
    and ``land_use_weights``.

    Returns:
        dict keyed by ``'neighborhoods'``, ``'roads'``, ``'accidents'``,
        ``'urban_masterplan'``, ``'transit'``, ``'trees'``, ``'population'``,
        ``'category_priority'`` and ``'land_use_weights'``.

    Raises:
        ValueError: If a GeoDataFrame is empty or contains missing geometries.
        Exception: Any loading error is logged and re-raised unchanged.
    """
    logging.info("Starting load_and_prepare_data...")
    data = {}

    def _read_layer(label, path, reader):
        """Read one spatial layer with ``reader`` and log its shape and columns."""
        logging.info(f"Loading {label} from {path}")
        layer = reader(path)
        logging.info(f"{label.capitalize()} loaded: shape={layer.shape}, columns={list(layer.columns)}")
        return layer

    def _metric_geometry(gdf, label):
        """Return ``gdf``'s geometry in a CRS suitable for metre-based area/length."""
        if gdf.crs is None:
            logging.info(f"No CRS defined for {label}. Assuming EPSG:3826.")
            gdf.set_crs('EPSG:3826', inplace=True)
        if gdf.crs.is_geographic:
            # .area / .length on lon/lat coordinates would be in degrees, so
            # measure on a projected copy; the stored geometry is left as-is.
            logging.warning(f"{label.capitalize()} use a geographic CRS ({gdf.crs}). Measuring in EPSG:3826.")
            return gdf.geometry.to_crs('EPSG:3826')
        return gdf.geometry

    try:
        # Load neighborhoods
        data['neighborhoods'] = _read_layer('neighborhoods', LANDUSE_NDVI_PATH, gpd.read_file)

        # Compute area_km2 if missing
        if 'area_km2' not in data['neighborhoods'].columns:
            logging.warning("'area_km2' column missing in neighborhoods. Computing from geometry...")
            neighborhood_geometry = _metric_geometry(data['neighborhoods'], 'neighborhoods')
            data['neighborhoods']['area_km2'] = neighborhood_geometry.area / 1e6  # Convert m² to km²
            logging.info(f"Computed area_km2 stats:\n{data['neighborhoods']['area_km2'].describe()}")

        # Load roads
        data['roads'] = _read_layer('roads', OSM_ROADS_PATH, gpd.read_parquet)

        # Compute length_m if missing
        if 'length_m' not in data['roads'].columns:
            logging.warning("'length_m' column missing in roads. Computing from geometry...")
            road_geometry = _metric_geometry(data['roads'], 'roads')
            data['roads']['length_m'] = road_geometry.length
            logging.info(f"Computed length_m stats:\n{data['roads']['length_m'].describe()}")

        # Load accidents
        data['accidents'] = _read_layer('accidents', ACCIDENTS_PATH, gpd.read_file)

        # Load urban masterplan
        data['urban_masterplan'] = _read_layer('urban masterplan', URBAN_MASTERPLAN_PATH, gpd.read_file)
        if 'Category' not in data['urban_masterplan'].columns:
            logging.warning("'Category' column missing in urban_masterplan. Adding default category.")
            data['urban_masterplan']['Category'] = 'Unknown'

        # Load transit
        data['transit'] = _read_layer('transit', OSM_TRANSIT_PATH, gpd.read_parquet)

        # Load trees
        data['trees'] = _read_layer('trees', OSM_TREES_PATH, gpd.read_parquet)

        # Load population (explicit UTF-8 so non-ASCII names decode the same
        # way regardless of the platform's default locale encoding)
        logging.info(f"Loading population from {POPULATION_PATH}")
        with open(POPULATION_PATH, 'r', encoding='utf-8') as f:
            pop_data = json.load(f)
        data['population'] = pd.DataFrame(pop_data)
        logging.info(f"Population loaded: shape={data['population'].shape}, columns={list(data['population'].columns)}")

        # Validate data
        for key, df in data.items():
            if not isinstance(df, gpd.GeoDataFrame):
                continue
            if df.empty:
                logging.error(f"{key} is empty.")
                raise ValueError(f"{key} is empty.")
            if df.geometry.isna().any():
                logging.error(f"Missing geometries in {key}.")
                raise ValueError(f"Missing geometries in {key}.")
            if not df.geometry.is_valid.all():
                logging.warning(f"Invalid geometries in {key}. Attempting to fix.")
                # df is the object stored in data, so the repair is in place.
                df['geometry'] = df['geometry'].apply(make_valid)

        # Add category_priority and land_use_weights to data
        data['category_priority'] = CATEGORY_PRIORITY
        data['land_use_weights'] = land_use_weights

        logging.info("load_and_prepare_data completed successfully.")
        return data

    except Exception as e:
        logging.error(f"Error in load_and_prepare_data: {str(e)}")
        raise

In [19]:
def main(force_recompute_graph=False):
    """Main execution pipeline for the analysis."""
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    os.makedirs(SUBGRAPH_DIR, exist_ok=True)
    logging.info(f"Ensured subgraph directory exists: {SUBGRAPH_DIR}")

    # Import f_oneway for ANOVA test
    from scipy.stats import f_oneway

    # Track timing for each step
    timings = {}
    
    try:
        # Step 1: Load and prepare data
        start_time = time.time()
        print("Starting load_and_prepare_data...")
        data = load_and_prepare_data()
        print_data_structure(data)
        timings['load_and_prepare_data'] = time.time() - start_time

        # Validate urban_masterplan
        if 'urban_masterplan' not in data:
            logging.error("urban_masterplan missing from data dictionary.")
            raise KeyError("urban_masterplan missing from data dictionary.")
        if not isinstance(data['urban_masterplan'], gpd.GeoDataFrame):
            logging.error("urban_masterplan is not a GeoDataFrame.")
            raise TypeError("urban_masterplan is not a GeoDataFrame.")
        logging.info(f"urban_masterplan shape: {data['urban_masterplan'].shape}")
        logging.info(f"urban_masterplan columns: {list(data['urban_masterplan'].columns)}")
        logging.info(f"urban_masterplan sample (first 2 rows):\n{data['urban_masterplan'].head(2).to_string()}")

        # Step 2: Compute road type accident correlation
        start_time = time.time()
        print("Starting compute_road_type_accident_correlation...")
        road_accident_summary, roads_data, neighborhoods_gdf = compute_road_type_accident_correlation(
            data['roads'], data['neighborhoods'], data['accidents']
        )
        data['neighborhoods'] = neighborhoods_gdf
        timings['compute_road_type_accident_correlation'] = time.time() - start_time

        # Step 2.5: Compute pedestrian road density
        start_time = time.time()
        print("Starting compute_pedestrian_road_density...")
        data['neighborhoods'] = compute_pedestrian_road_density(data['roads'], data['neighborhoods'])
        timings['compute_pedestrian_road_density'] = time.time() - start_time

        # Step 3: Build graph
        start_time = time.time()
        print("Starting build_graph...")
        G = build_graph(data, force_recompute=force_recompute_graph)
        timings['build_graph'] = time.time() - start_time

        # Validate edge counts
        edge_count = G.edgelist.edgelist_df.shape[0] if G.edgelist else 0
        logging.info(f"Graph edge count: {edge_count}")
        if edge_count == 0:
            logging.warning("Graph has no edges. GNN will not utilize graph structure.")

        # Step 4: Compute walkability scores
        start_time = time.time()
        print("Starting compute_walkability_scores...")
        walkability_components, components = compute_walkability_components_all(data['neighborhoods'], data)
        logging.info(f"Columns in walkability_components: {list(walkability_components.columns)}")
        G = compute_walkability_scores(G, data)
        timings['compute_walkability_scores'] = time.time() - start_time

        # Update data['neighborhoods'] with the cluster column
        data['neighborhoods'] = data['neighborhoods'].merge(
            walkability_components[['LIE_NAME', 'cluster']],
            on='LIE_NAME',
            how='left'
        )

        # Step 5: Prepare GNN data
        start_time = time.time()
        print("Starting prepare_gnn_data...")
        data_gnn = prepare_gnn_data(G)
        timings['prepare_gnn_data'] = time.time() - start_time

        # Step 6: Train GNN model
        start_time = time.time()
        print("Starting train_gnn_model...")
        results = train_gnn_model(data_gnn)
        model = results['model']
        timings['train_gnn_model'] = time.time() - start_time

        # Step 7: Predict walkability
        start_time = time.time()
        print("Starting predict_walkability...")
        G = predict_walkability(G, model)
        timings['predict_walkability'] = time.time() - start_time

        # Step 8: Create interactive map
        start_time = time.time()
        print("Starting create_interactive_map...")
        create_interactive_map(G, data)
        timings['create_interactive_map'] = time.time() - start_time

        # Final validation: Check walkability scores
        nodes_df = G._nodes.to_pandas()
        neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood']
        walkability_score_stats = neighborhood_nodes['walkability_score'].describe()
        walkability_gnn_stats = neighborhood_nodes['walkability_gnn'].describe()
        walkability_category_dist = neighborhood_nodes['walkability_category'].value_counts()
        non_zero_walkability = (neighborhood_nodes['walkability_score'] > 0).sum()
        non_zero_walkability_gnn = (neighborhood_nodes['walkability_gnn'] > 0).sum()
        
        logging.info("Final validation - Walkability scores in neighborhood nodes:")
        logging.info(f"Walkability score distribution:\n{walkability_score_stats}")
        logging.info(f"Walkability GNN distribution:\n{walkability_gnn_stats}")
        logging.info(f"Walkability category distribution:\n{walkability_category_dist}")
        logging.info(f"Number of neighborhood nodes with non-zero walkability_score: {non_zero_walkability}/{len(neighborhood_nodes)}")
        logging.info(f"Number of neighborhood nodes with non-zero walkability_gnn: {non_zero_walkability_gnn}/{len(neighborhood_nodes)}")

        # Check for low variation in walkability scores
        if walkability_score_stats['std'] < 0.05:
            logging.warning("Walkability scores have low variation (std < 0.05). Components may need adjustment.")
        if walkability_gnn_stats['std'] < 0.05:
            logging.warning("GNN predictions have low variation (std < 0.05). Check edge creation and model training.")

        # Compute correlation between walkability_score and walkability_gnn
        corr, p_value = pearsonr(neighborhood_nodes['walkability_score'], neighborhood_nodes['walkability_gnn'])
        logging.info(f"Correlation between walkability_score and walkability_gnn: {corr:.2f} (p-value: {p_value:.2f})")
        if corr < 0.5:
            logging.warning("Low correlation between walkability_score and walkability_gnn. GNN predictions may not align well with rule-based scores.")

        # Generate additional charts
        # 1. Distribution of accident_density Before and After Smoothing
        fig, ax1 = plt.subplots(figsize=(12, 6))
        roads_data['accident_density_adjusted'] = roads_data['accident_density_raw'] + 0.01
        roads_data['accident_density_adjusted'] = np.log1p(roads_data['accident_density_adjusted'])
        sns.histplot(roads_data['accident_density_raw'], kde=True, color='blue', label='Raw (Before Smoothing)', stat='density', log_scale=True, ax=ax1)
        sns.histplot(roads_data['accident_density_adjusted'], kde=True, color='orange', label='After Smoothing & Log Transform', stat='density', ax=ax1)
        ax1.set_xlabel('Accident Density (accidents/km, log scale)')
        ax1.set_ylabel('Density')
        ax1.legend(loc='upper left')
        ax1.grid(True, linestyle='--', alpha=0.7)

        ax2 = ax1.twiny()
        ax2.set_xlim(ax1.get_xlim())
        ax2.set_xticks(ax1.get_xticks())
        ax2.set_xticklabels([f"{10**tick:.0f}" for tick in ax1.get_xticks()])
        ax2.set_xlabel('Accident Density (accidents/km, linear scale)')

        ax3 = ax1.twinx()
        sns.ecdfplot(roads_data['accident_density_raw'], color='blue', linestyle='-.', linewidth=2, ax=ax3, label='Raw CDF')
        sns.ecdfplot(roads_data['accident_density_adjusted'], color='orange', linestyle='-.', linewidth=2, ax=ax3, label='Transformed CDF')
        ax3.set_ylabel('Cumulative Probability')
        ax3.legend(loc='upper right')

        plt.title('Distribution of Accident Density Before and After Smoothing')
        plt.tight_layout()
        accident_density_dist_path = os.path.join(BASE_DIR, 'accident_density_distribution.png')
        plt.savefig(accident_density_dist_path)
        plt.close()
        logging.info(f"Accident density distribution plot saved to {accident_density_dist_path}")

        # 2. Comparison of avg_road_accident_density Across Neighborhood Clusters
        plt.figure(figsize=(10, 6))
        logging.info(f"Columns in data['neighborhoods']: {list(data['neighborhoods'].columns)}")
        if 'avg_road_accident_density' not in data['neighborhoods'].columns:
            logging.error("'avg_road_accident_density' column missing in data['neighborhoods']")
            raise KeyError("'avg_road_accident_density' column missing")
        sns.boxplot(x='cluster', y='avg_road_accident_density', data=data['neighborhoods'], palette='Set2')
        sns.stripplot(x='cluster', y='avg_road_accident_density', data=data['neighborhoods'], color='black', size=3, alpha=0.5)
        plt.xlabel('Neighborhood Cluster (0=Urban, 1=Suburban, 2=Rural)')
        plt.ylabel('Avg. Road Accident Density\n(accidents/km, log scale)')
        plt.title('Average Road Accident Density by Neighborhood Cluster')
        cluster_counts = data['neighborhoods']['cluster'].value_counts().sort_index()
        max_y = data['neighborhoods']['avg_road_accident_density'].max() + 0.2
        for i, count in enumerate(cluster_counts):
            plt.text(i, max_y, f'n={count}', horizontalalignment='center', fontsize=10)
        clusters = data['neighborhoods'].groupby('cluster')['avg_road_accident_density'].apply(list)
        f_stat, p_value = f_oneway(*clusters)
        plt.text(1, max_y + 0.1, f'ANOVA p={p_value:.4f}', horizontalalignment='center', fontsize=10)
        plt.tight_layout()
        accident_density_cluster_path = os.path.join(BASE_DIR, 'accident_density_by_cluster.png')
        plt.savefig(accident_density_cluster_path)
        plt.close()
        logging.info(f"Accident density by cluster plot saved to {accident_density_cluster_path}")
        logging.info(f"ANOVA for avg_road_accident_density across clusters: F={f_stat:.2f}, p={p_value:.4f}")

        # 3. Walkability Score Distributions
        plt.figure(figsize=(10, 6))
        sns.kdeplot(neighborhood_nodes['walkability_score'], label='Rule-Based Walkability Score', color='blue')
        sns.kdeplot(neighborhood_nodes['walkability_gnn'], label='GNN Predicted Walkability', color='orange')
        plt.xlabel('Walkability Score')
        plt.ylabel('Density')
        plt.title('Distribution of Walkability Scores: Rule-Based vs. GNN Predicted')
        plt.legend()
        plt.tight_layout()
        walkability_dist_path = os.path.join(BASE_DIR, 'walkability_score_distribution.png')
        plt.savefig(walkability_dist_path)
        plt.close()
        logging.info(f"Walkability score distribution plot saved to {walkability_dist_path}")

        # 4. Walkability Category Distribution
        plt.figure(figsize=(8, 6))
        sns.countplot(x='walkability_category', data=neighborhood_nodes, order=['low', 'medium', 'high'])
        plt.xlabel('Walkability Category')
        plt.ylabel('Number of Neighborhoods')
        plt.title('Distribution of Walkability Categories')
        plt.tight_layout()
        walkability_category_path = os.path.join(BASE_DIR, 'walkability_category_distribution.png')
        plt.savefig(walkability_category_path)
        plt.close()
        logging.info(f"Walkability category distribution plot saved to {walkability_category_path}")

        # 5. Correlation Between walkability_score and walkability_gnn
        plt.figure(figsize=(8, 6))
        sns.scatterplot(x='walkability_score', y='walkability_gnn', data=neighborhood_nodes)
        sns.regplot(x='walkability_score', y='walkability_gnn', data=neighborhood_nodes, scatter=False, color='red')
        plt.xlabel('Rule-Based Walkability Score')
        plt.ylabel('GNN Predicted Walkability Score')
        plt.title(f'Correlation: Rule-Based vs. GNN Predicted Walkability (r={corr:.2f})')
        plt.tight_layout()
        walkability_correlation_path = os.path.join(BASE_DIR, 'walkability_correlation.png')
        plt.savefig(walkability_correlation_path)
        plt.close()
        logging.info(f"Walkability correlation plot saved to {walkability_correlation_path}")

        # 6. Component Contributions to walkability_score
        if 'green_space' not in walkability_components.columns:
            logging.error("'green_space' column is missing in walkability_components.")
            raise KeyError("'green_space' column is missing.")
        logging.info(f"Raw green_space stats (before normalization):\n{walkability_components['green_space'].describe()}")
        logging.info(f"Green space normalized stats:\n{walkability_components['green_space_norm'].describe()}")
        plt.figure(figsize=(10, 6))
        sns.histplot(walkability_components['green_space'], kde=True, color='green', label='Raw Green Space', stat='density')
        sns.histplot(walkability_components['green_space_norm'], kde=True, color='darkgreen', label='Normalized Green Space', stat='density')
        plt.xlabel('Green Space Value')
        plt.ylabel('Density')
        plt.title('Distribution of Green Space (Raw vs. Normalized)')
        plt.legend()
        plt.tight_layout()
        green_space_dist_path = os.path.join(BASE_DIR, 'green_space_distribution.png')
        plt.savefig(green_space_dist_path)
        plt.close()
        logging.info(f"Green space distribution plot saved to {green_space_dist_path}")

        logging.info(f"Components: {list(components.keys())}")
        logging.info(f"Number of components: {len(components)}")
        logging.info(f"Walkability components columns: {list(walkability_components.columns)}")

        components = {
            'land_use_score': 'higher_better',
            'intersection_density': 'higher_better',
            'population_density': 'higher_better',
            'transit_density': 'higher_better',
            'green_space': 'higher_better',
            'accident_density': 'lower_better',
            'sidewalk_coverage': 'higher_better',
            'amenities_density': 'higher_better',
            'pop_transit_interaction': 'higher_better',
            'safety_green_interaction': 'lower_better'
        }

        component_cols = [f'{comp}_norm' for comp in components]
        missing_cols = [col for col in component_cols if col not in walkability_components.columns]
        if missing_cols:
            logging.error(f"Missing component columns in walkability_components: {missing_cols}")
            raise ValueError(f"Missing component columns: {missing_cols}")

        walkability_components[component_cols].to_csv(os.path.join(BASE_DIR, 'walkability_components.csv'), index=False)
        logging.info(f"walkability_components saved to {os.path.join(BASE_DIR, 'walkability_components.csv')}")
        logging.info(f"walkability_components shape: {walkability_components.shape}")
        logging.info(f"walkability_components summary statistics:\n{walkability_components[component_cols].describe()}")
        logging.info(f"walkability_components non-NaN counts:\n{walkability_components[component_cols].notna().sum()}")
        logging.info(f"walkability_components sample data (first 5 rows):\n{walkability_components[component_cols].head()}")

        base_weights = {
            'land_use_score': 0.20,
            'intersection_density': 0.05,
            'population_density': 0.20,
            'transit_density': 0.20,
            'green_space': 0.15,
            'accident_density': 0.10,
            'sidewalk_coverage': 0.05,
            'amenities_density': 0.05,
            'pop_transit_interaction': 0.05,
            'safety_green_interaction': 0.05
        }

        def adjust_weights(cluster):
            """Return cluster-specific component weights, rescaled by their total.

            A copy of ``base_weights`` is shifted by +/-0.20 on a few components
            chosen by ``cluster`` (0 = urban, 1 = suburban, any other value =
            rural), and every weight is then divided by the sum of the shifted
            weights so the result sums to 1 (up to floating-point rounding).

            The shifts are applied without clamping, so an individual weight can
            end up at or below zero before rescaling (for cluster 0,
            ``green_space`` becomes 0.15 - 0.20).
            """
            # Pick the (component, delta) pairs for this cluster type.
            if cluster == 0:  # Urban
                shifts = (
                    ('transit_density', 0.20),
                    ('amenities_density', 0.20),
                    ('green_space', -0.20),
                )
            elif cluster == 1:  # Suburban
                shifts = (
                    ('sidewalk_coverage', 0.20),
                    ('population_density', -0.20),
                )
            else:  # Rural
                shifts = (
                    ('green_space', 0.20),
                    ('accident_density', 0.20),
                    ('transit_density', -0.20),
                )

            # Work on a private copy so base_weights itself is never modified.
            adjusted = dict(base_weights)
            for name, delta in shifts:
                adjusted[name] = adjusted[name] + delta

            norm = sum(adjusted.values())
            return {name: value / norm for name, value in adjusted.items()}

        weighted_contributions = pd.DataFrame(index=walkability_components.index, columns=component_cols)
        for idx, row in walkability_components.iterrows():
            cluster = row['cluster']
            weights = adjust_weights(cluster)
            for comp in components:
                norm_col = f'{comp}_norm'
                weighted_contributions.at[idx, norm_col] = weights.get(comp, 0.05) * row[norm_col]

        component_means = pd.Series(index=component_cols, dtype=float)
        component_stds = pd.Series(index=component_cols, dtype=float)
        for col in component_cols:
            if col in weighted_contributions.columns and weighted_contributions[col].notna().sum() > 0:
                component_means[col] = weighted_contributions[col].mean()
                component_stds[col] = weighted_contributions[col].std()
            else:
                logging.warning(f"No valid data for component {col}. Setting mean and std to 0.")
                component_means[col] = 0.0
                component_stds[col] = 0.0
        logging.info(f"Computed component_means (weighted):\n{component_means}")
        logging.info(f"Computed component_stds (weighted):\n{component_stds}")

        plot_data_list = [
            {'Component': col, 'Mean': component_means[col], 'Std': component_stds[col]}
            for col in component_cols
        ]
        plot_data = pd.DataFrame(plot_data_list)
        plot_data['Mean'] = plot_data['Mean'].astype(float)
        plot_data['Std'] = plot_data['Std'].astype(float)
        plot_data.to_csv(os.path.join(BASE_DIR, 'plot_data.csv'), index=False)
        logging.info(f"plot_data saved to {os.path.join(BASE_DIR, 'plot_data.csv')}")
        logging.info(f"plot_data shape: {plot_data.shape}")
        logging.info(f"plot_data dtypes: {plot_data.dtypes}")
        logging.info(f"plot_data contents:\n{plot_data}")

        if len(plot_data) != len(component_cols):
            logging.error(f"plot_data has incorrect number of rows: {len(plot_data)}, expected {len(component_cols)}")
            raise ValueError(f"plot_data has incorrect number of rows: {len(plot_data)}")

        logging.info(f"Input to sns.barplot: x='Component', y='Mean', data=\n{plot_data[['Component', 'Mean']]}")

        plt.figure(figsize=(12, 6))
        sns.barplot(x='Component', y='Mean', data=plot_data, capsize=0.2)
        plt.axhline(y=0, color='black', linestyle='--', linewidth=1)
        plt.xlabel('Component')
        plt.ylabel('Mean Weighted Contribution to Walkability Score')
        plt.title('Average Weighted Contribution of Components to Walkability Score')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        component_contribution_path = os.path.join(BASE_DIR, 'component_contributions.png')
        plt.savefig(component_contribution_path)
        plt.close()
        logging.info(f"Component contributions plot saved to {component_contribution_path}")

        # 7. Spatial Distribution of Walkability Scores
        import contextily as ctx
        neighborhoods_gdf = data['neighborhoods'].merge(
            neighborhood_nodes[['LIE_NAME', 'walkability_score', 'walkability_gnn']],
            on='LIE_NAME',
            how='left'
        )
        fig, ax = plt.subplots(figsize=(12, 10))
        neighborhoods_gdf.plot(
            column='walkability_score', 
            cmap='YlGn', 
            legend=True, 
            ax=ax,
            legend_kwds={
                'label': "Walkability Score",
                'orientation': "horizontal",
                'pad': 0.05,
                'fraction': 0.05,
                'aspect': 30
            }
        )
        ctx.add_basemap(
            ax, 
            crs=neighborhoods_gdf.crs.to_string(), 
            source=ctx.providers.OpenStreetMap.Mapnik,
            attribution="© OpenStreetMap contributors",
            attribution_size=8
        )
        plt.title('Spatial Distribution of Rule-Based Walkability Scores', pad=20)
        plt.xlabel('Easting (EPSG:3826)', labelpad=10)
        plt.ylabel('Northing (EPSG:3826)', labelpad=10)
        plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.15)
        walkability_map_path = os.path.join(BASE_DIR, 'walkability_score_map.png')
        plt.savefig(walkability_map_path, bbox_inches='tight')
        plt.close()
        logging.info(f"Walkability score map saved to {walkability_map_path}")

        fig, ax = plt.subplots(figsize=(12, 10))
        neighborhoods_gdf.plot(
            column='walkability_gnn', 
            cmap='YlGn', 
            legend=True, 
            ax=ax,
            legend_kwds={
                'label': "GNN Predicted Walkability",
                'orientation': "horizontal",
                'pad': 0.05,
                'fraction': 0.05,
                'aspect': 30
            }
        )
        ctx.add_basemap(
            ax, 
            crs=neighborhoods_gdf.crs.to_string(), 
            source=ctx.providers.OpenStreetMap.Mapnik,
            attribution="© OpenStreetMap contributors",
            attribution_size=8
        )
        plt.title('Spatial Distribution of GNN Predicted Walkability Scores', pad=20)
        plt.xlabel('Easting (EPSG:3826)', labelpad=10)
        plt.ylabel('Northing (EPSG:3826)', labelpad=10)
        plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.15)
        walkability_gnn_map_path = os.path.join(BASE_DIR, 'walkability_gnn_map.png')
        plt.savefig(walkability_gnn_map_path, bbox_inches='tight')
        plt.close()
        logging.info(f"GNN walkability map saved to {walkability_gnn_map_path}")

        logging.info("Processing complete. Timing summary:")
        for step, duration in timings.items():
            logging.info(f"{step}: {duration:.2f} seconds")
        
        print("Pipeline completed successfully.")
        print(G.edgelist.edgelist_df.to_pandas().head())

        return results

    except Exception as e:
        logging.error(f"Pipeline failed with error: {str(e)}")
        raise

def plot_training_history(results):
    """Save train/validation curves for loss, MAE and R2 as PNG files.

    Three figures are written into ``BASE_DIR``:
    ``training_validation_loss.png``, ``training_validation_mae.png`` and
    ``training_validation_r2.png``. Each figure is closed after saving and
    its output path is logged at INFO level.

    Args:
        results: Subscriptable object providing the series to plot under the
            keys 'train_losses', 'val_losses', 'train_maes', 'val_maes',
            'train_r2s' and 'val_r2s'. Keys are looked up figure by figure,
            so a missing key fails only when its figure is reached.
    """
    import matplotlib.pyplot as plt

    # One row per figure:
    # (train key, val key, train label, val label, y-axis label, title,
    #  output file name, word used in the log message)
    figure_specs = (
        ('train_losses', 'val_losses', 'Train Loss', 'Val Loss',
         'Loss', 'Training and Validation Loss',
         'training_validation_loss.png', 'loss'),
        ('train_maes', 'val_maes', 'Train MAE', 'Val MAE',
         'MAE', 'Training and Validation MAE',
         'training_validation_mae.png', 'MAE'),
        ('train_r2s', 'val_r2s', 'Train R2', 'Val R2',
         'R2 Score', 'Training and Validation R2 Score',
         'training_validation_r2.png', 'R2'),
    )

    for (train_key, val_key, train_label, val_label,
         y_label, heading, file_name, log_word) in figure_specs:
        plt.figure(figsize=(10, 5))
        plt.plot(results[train_key], label=train_label)
        plt.plot(results[val_key], label=val_label)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(heading)
        # Build the destination once and reuse it for saving and logging.
        out_path = os.path.join(BASE_DIR, file_name)
        plt.savefig(out_path)
        plt.close()
        logging.info(f"Training and validation {log_word} plot saved to {out_path}")

# Script entry point: run the pipeline via main(), then plot the training
# history from the results it returns.
if __name__ == "__main__":
    # NOTE(review): force_recompute_graph=True presumably makes main() rebuild
    # the graph rather than reuse a previously stored one — confirm in main().
    results = main(force_recompute_graph=True)
    plot_training_history(results)

2025-05-05 17:04:14,765 - INFO - Ensured subgraph directory exists: /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/subgraphs
2025-05-05 17:04:14,765 - INFO - Starting load_and_prepare_data...
2025-05-05 17:04:14,766 - INFO - Loading neighborhoods from /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/neighborhoods_with_ndvi_numerical_corrected.geojson
2025-05-05 17:04:14,903 - INFO - Neighborhoods loaded: shape=(456, 57), columns=['LIE_NAME', 'SECT_NAME', '2024population', 'land_use_city_open_area_count', 'land_use_city_open_area_area_m2', 'land_use_city_open_area_percent', 'land_use_commercial_count', 'land_use_commercial_area_m2', 'land_use_commercial_percent', 'land_use_infrastructure_count', 'land_use_infrastructure_area_m2', 'land_use_infrastructure_percent', 'land_use_government_count', 'land_use_government_area_m2', 'land_use_government_percent', 'land_use_public_transportation_count', 'land_use_public_transportation_area_m2', 'land_use_public_transportation_percent

Starting load_and_prepare_data...


2025-05-05 17:04:15,096 - INFO - Roads loaded: shape=(81444, 2), columns=['class', 'geometry']
2025-05-05 17:04:15,105 - INFO - Computed length_m stats:
count     81444.000000
mean        145.622456
std        2304.902398
min           0.030284
25%          28.160770
50%          61.698697
75%         130.534001
max      426414.891763
Name: length_m, dtype: float64
2025-05-05 17:04:15,105 - INFO - Loading accidents from /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/2023_accidents.geojson
2025-05-05 17:04:15,658 - INFO - Accidents loaded: shape=(56133, 8), columns=['Month', 'Day', 'Hours', 'Minute', 'Location', 'Speed_limit', 'Roadtype', 'geometry']
2025-05-05 17:04:15,658 - INFO - Loading urban masterplan from /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/Taipei_urban_masterplan.geojson
2025-05-05 17:04:16,334 - INFO - Urban masterplan loaded: shape=(15521, 15), columns=['編號', '圖層', '顏色', '街廓編號', '分區代碼', '分區簡稱', '使用分區', '分區說明', '原屬分區', '變更前代碼', '變更前簡稱', '變更前分區', 'Cate


--- Data Structure Summary ---

Dataset: neighborhoods
Shape: (456, 58)
Columns and Data Types:
LIE_NAME                                    object
SECT_NAME                                   object
2024population                               int32
land_use_city_open_area_count                int32
land_use_city_open_area_area_m2            float64
land_use_city_open_area_percent            float64
land_use_commercial_count                    int32
land_use_commercial_area_m2                float64
land_use_commercial_percent                float64
land_use_infrastructure_count                int32
land_use_infrastructure_area_m2            float64
land_use_infrastructure_percent            float64
land_use_government_count                    int32
land_use_government_area_m2                float64
land_use_government_percent                float64
land_use_public_transportation_count         int32
land_use_public_transportation_area_m2     float64
land_use_public_transportation_perce

See: https://matplotlib.org/stable/tutorials/intermediate/legend_guide.html#implementing-a-custom-legend-handler
  plt.legend()
2025-05-05 17:04:18,705 - INFO - Overlay plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/roads_neighborhoods_overlap.png
2025-05-05 17:04:19,172 - INFO - Assigning accidents to nearest road...
2025-05-05 17:04:24,302 - INFO - Matched 56133 accidents out of 56133
2025-05-05 17:04:24,307 - INFO - Reassigning 4991 accidents from footway/cycleway...
2025-05-05 17:04:24,407 - INFO - Reassigned 1429 accidents to wider roads
2025-05-05 17:04:24,415 - INFO - Accidents by road type:
class
bridleway            0
cycleway           247
footway           3315
living_street       79
motorway           109
path                86
pedestrian          84
primary           6535
residential      10110
secondary        16180
service           5011
steps               47
tertiary          9135
track                8
trunk             2493
unclassified      166

Bar chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_bar.png


2025-05-05 17:04:25,025 - INFO - Box chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_box.png
2025-05-05 17:04:25,209 - INFO - Scatter plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_scatter.png
2025-05-05 17:04:25,211 - INFO - Top 3 road types by accident density:
        class  accident_density
8   secondary              2.14
12    primary              2.10
16      trunk              1.16
2025-05-05 17:04:25,212 - INFO - Computing average road accident density per neighborhood...
2025-05-05 17:04:25,213 - INFO - Roads DataFrame shape before join: (75149, 7)
2025-05-05 17:04:25,213 - INFO - Neighborhoods DataFrame shape before join: (456, 58)


Box chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_box.png
Scatter plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_scatter.png
Top 3 road types by accident density:
        class  accident_density
8   secondary              2.14
12    primary              2.10
16      trunk              1.16


2025-05-05 17:04:25,541 - INFO - Road-neighborhood join resulted in 85340 matches with columns: ['geometry', 'class', 'length_m', 'width_rank', 'accident_density', 'index_right', 'LIE_NAME']
2025-05-05 17:04:25,543 - INFO - Non-NaN LIE_NAME count: 60885
2025-05-05 17:04:25,544 - INFO - Non-NaN accident_density count: 85340
2025-05-05 17:04:25,545 - INFO - Unique LIE_NAME values: 456
2025-05-05 17:04:25,549 - INFO - Number of neighborhoods with calculated avg_accident_density: 456
2025-05-05 17:04:25,549 - INFO - NaN in avg_accident_density: 0
2025-05-05 17:04:25,551 - INFO - Assigned avg_road_accident_density to 456 neighborhoods
2025-05-05 17:04:25,553 - INFO - Avg road accident density stats:
count    456.000000
mean       0.818358
std        0.343088
min        0.101213
25%        0.545024
50%        0.806933
75%        1.051564
max        1.904460
Name: avg_road_accident_density, dtype: float64
2025-05-05 17:04:25,569 - INFO - Computing pedestrian road density per neighborhood...


Starting compute_pedestrian_road_density...


2025-05-05 17:04:25,979 - INFO - pedestrian_road_density stats:
count    456.000000
mean      15.841759
std       10.276229
min        0.000000
25%        8.202007
50%       15.006599
75%       21.558429
max       57.267153
Name: pedestrian_road_density, dtype: float64
2025-05-05 17:04:25,983 - INFO - Stage 2: Building city graph...
2025-05-05 17:04:25,984 - INFO - Dataset neighborhoods column types:
LIE_NAME                                    object
SECT_NAME                                   object
2024population                               int32
land_use_city_open_area_count                int32
land_use_city_open_area_area_m2            float64
land_use_city_open_area_percent            float64
land_use_commercial_count                    int32
land_use_commercial_area_m2                float64
land_use_commercial_percent                float64
land_use_infrastructure_count                int32
land_use_infrastructure_area_m2            float64
land_use_infrastructure_percent    

Starting build_graph...


Neighborhood nodes: 100%|██████████| 456/456 [00:00<00:00, 10078.85it/s]
2025-05-05 17:04:26,057 - INFO - Adding building nodes...
2025-05-05 17:04:26,057 - ERROR - Pipeline failed with error: 'buildings'


KeyError: 'buildings'