Cell 0: RAPIDS (cuDF / cuGraph) version check

In [1]:
# Print the installed cuDF and cuGraph library versions.
import cudf, cugraph
print(cudf.__version__, cugraph.__version__)

25.02.02 25.02.00


Cell 1: Imports, Global Constants

In [None]:
# Standard library imports
import os
import logging
import hashlib
import json
import time

# Third-party imports
import geopandas as gpd
import pandas as pd
import cudf
import cugraph
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, BatchNorm
from scipy.stats import pearsonr, spearmanr
from shapely import make_valid
from shapely.errors import GEOSException
from shapely.geometry import Point
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import networkx as nx
from tqdm import tqdm
from keplergl import KeplerGl
from multiprocessing import Pool, cpu_count
from sklearn.metrics import mean_absolute_error, r2_score
from torch_geometric.nn import GATConv, BatchNorm
import torch.nn.functional as F
# Plotting configuration
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['font.family'] = 'sans-serif'
# Sans-serif fallback order: the Noto CJK fonts are tried first, then DejaVu Sans
# (presumably so Chinese labels render correctly — confirm the fonts are installed).
plt.rcParams['font.sans-serif'] = ['Noto Sans CJK TC', 'Noto Serif CJK TC', 'Noto Sans Mono CJK TC', 'DejaVu Sans']
# Draw negative numbers with the ASCII hyphen-minus instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False

# Directory and file paths
# The data root can be overridden with the GNN_DATA_DIR environment variable so the
# notebook can run on another machine; the original location stays the default.
BASE_DIR = os.environ.get("GNN_DATA_DIR", "/home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data")

# Input datasets
LANDUSE_NDVI_PATH = os.path.join(BASE_DIR, "neighborhoods_with_ndvi_numerical_corrected.geojson")
OSM_BUILDINGS_PATH = os.path.join(BASE_DIR, "Taipei_Buildings_fulldata.geojson")
OSM_ROADS_PATH = os.path.join(BASE_DIR, "taipei_segments_cleaned_verified.geoparquet")
OSM_TREES_PATH = os.path.join(BASE_DIR, "taipei_land.geoparquet")
OSM_TRANSIT_PATH = os.path.join(BASE_DIR, "taipei_infrastructure.geoparquet")
URBAN_MASTERPLAN_PATH = os.path.join(BASE_DIR, "Taipei_urban_masterplan.geojson")
ACCIDENTS_PATH = os.path.join(BASE_DIR, "2023_accidents.geojson")
POPULATION_PATH = os.path.join(BASE_DIR, "population_corrected.json")

# Output / cache locations
SUBGRAPH_DIR = os.path.join(BASE_DIR, "subgraphs")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
INTERSECTION_CACHE_PATH = os.path.join(BASE_DIR, "neighborhoods_with_intersections.geoparquet")
GRAPH_NODES_CACHE_PATH = os.path.join(BASE_DIR, "graph_nodes.parquet")
GRAPH_EDGES_CACHE_PATH = os.path.join(BASE_DIR, "graph_edges.parquet")
GRAPH_NODE_ID_CACHE_PATH = os.path.join(BASE_DIR, "graph_node_id_to_index.json")
GRAPH_DATA_HASH_PATH = os.path.join(BASE_DIR, "graph_data_hash.txt")

# Ensure directories exist
os.makedirs(SUBGRAPH_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Constants for spatial analysis
# NOTE(review): BUFFER_DISTANCE is not referenced in the visible part of the notebook — confirm where it is used.
BUFFER_DISTANCE = 10  # Meters, buffer distance for border sharing of accidents (tunable based on spatial resolution)
# Roads shorter than this are dropped before the per-road accident density is computed
# (see compute_road_type_accident_correlation).
MIN_ROAD_LENGTH = 10  # Meters, minimum road length to avoid inflated accident density (tunable based on dataset)

# Land use category priorities for area assignment.
# Higher numbers win: print_percentage_calculation sorts master-plan polygons by this
# value in descending order before claiming area for each category.
CATEGORY_PRIORITY = {
    'City_Open_Area': 10,
    'Pedestrian': 9,
    'Public_Transportation': 8,
    'Amenity': 7,
    'Education': 6,
    'Medical': 5,
    'Commercial': 4,
    'Residential': 3,
    'Natural': 2,
    # Every remaining category shares the lowest priority.
    **dict.fromkeys(
        (
            'Road',
            'River',
            'Infrastructure',
            'Government',
            'Special_Zone',
            'Military',
            'Industrial',
            'Agriculture',
        ),
        1,
    ),
}

# Weights for land use diversity in walkability scoring.
# Keys are the lower-case category names used in the `land_use_<category>_percent`
# columns read by compute_walkability_components_all.
land_use_weights = dict(
    city_open_area=0.8,
    commercial=0.7,
    infrastructure=0.4,
    government=0.5,
    public_transportation=0.8,
    education=0.7,
    medical=0.6,
    amenity=0.8,
    road=0.3,
    pedestrian=1.0,
    natural=0.7,
    special_zone=0.4,
    river=0.7,
    military=0.2,
    residential=0.6,
    industrial=0.3,
    agriculture=0.4,
)

"""
Normalise Information to Standard 
Read and Group to standard
Merge to standard
graph
kepler merge
"""

In [24]:
# Ad-hoc preview: reads the full buildings file and displays it as the cell output.
# NOTE(review): the result is not stored, so the file is read again wherever it is needed.
gpd.read_file(OSM_BUILDINGS_PATH)

Unnamed: 0,full_id,osm_id,building,屋齡,建物高度,地上層數,構造種類,使用分區,geometry
0,r2633015,2633015,dormitory,,,,Unknown,Unknown,"POLYGON ((121.53954 25.13558, 121.53983 25.135..."
1,r2633016,2633016,university,,,,Unknown,Unknown,"POLYGON ((121.5391 25.13456, 121.53905 25.1347..."
2,r2633017,2633017,yes,,,,Unknown,Unknown,"POLYGON ((121.53777 25.13606, 121.53826 25.136..."
3,r2633018,2633018,yes,,,,Unknown,Unknown,"POLYGON ((121.53752 25.13559, 121.53777 25.135..."
4,r2633019,2633019,yes,,,,Unknown,Unknown,"POLYGON ((121.53791 25.13688, 121.53797 25.136..."
...,...,...,...,...,...,...,...,...,...
74301,w1318077126,1318077126,house,,,,Unknown,Unknown,"POLYGON ((121.49534 25.02304, 121.49555 25.023..."
74302,w1324938980,1324938980,apartments,,,,Unknown,Unknown,"POLYGON ((121.50182 25.04497, 121.50201 25.045..."
74303,w1329134163,1329134163,residential,,,,Unknown,Unknown,"POLYGON ((121.50542 25.04627, 121.50548 25.046..."
74304,w1329134164,1329134164,residential,28.0,36.85,12.0,RC造,商4,"POLYGON ((121.50544 25.04632, 121.50542 25.046..."


Cell 2: Utility Functions

In [3]:
def compute_walkability_components_all(neighborhoods_gdf, data):
    """Compute a weighted walkability score and a low/medium/high category per neighborhood.

    Intermediate columns are added to ``neighborhoods_gdf`` in place:
    ``land_use_score``, ``population_density``, ``transit_density``,
    ``green_space`` (and ``tree_density`` when ``ndvi`` is absent),
    ``accident_density``, one ``<component>_norm`` column per available
    component, ``walkability_score`` and ``walkability_category``.

    Args:
        neighborhoods_gdf: Frame with ``LIE_NAME``, ``total_population``,
            ``area_km2``, ``transit_count``, ``accident_count`` and either
            ``ndvi`` or ``tree_count``. ``intersection_density`` and the
            ``land_use_<category>_percent`` columns are optional.
        data: Unused; kept so existing callers keep working.

    Returns:
        The ``LIE_NAME``, ``walkability_score`` and ``walkability_category``
        columns of ``neighborhoods_gdf``.
    """
    # Compute land_use_score
    land_use_categories = list(land_use_weights.keys())

    def compute_land_use_score(row):
        """Weighted sum of the land-use percentages (0-100) present in ``row``."""
        score = 0.0
        for cat in land_use_categories:
            col = f'land_use_{cat}_percent'
            if col in row:
                p = row[col] / 100.0
                score += land_use_weights[cat] * p
        return score

    neighborhoods_gdf['land_use_score'] = neighborhoods_gdf.apply(compute_land_use_score, axis=1)
    logging.info(f"land_use_score stats:\n{neighborhoods_gdf['land_use_score'].describe()}")

    # Zero areas are replaced by a tiny value so the densities below never divide by zero.
    safe_area_km2 = neighborhoods_gdf['area_km2'].replace(0, 1e-6)

    # Compute population_density
    neighborhoods_gdf['population_density'] = neighborhoods_gdf['total_population'] / safe_area_km2
    logging.info(f"population_density stats:\n{neighborhoods_gdf['population_density'].describe()}")

    # Compute transit_density
    neighborhoods_gdf['transit_density'] = neighborhoods_gdf['transit_count'] / safe_area_km2
    logging.info(f"transit_density stats:\n{neighborhoods_gdf['transit_density'].describe()}")

    # Use ndvi for green_space, fallback to tree_density if ndvi is missing
    if 'ndvi' in neighborhoods_gdf.columns:
        neighborhoods_gdf['green_space'] = neighborhoods_gdf['ndvi']
        logging.info(f"ndvi stats:\n{neighborhoods_gdf['ndvi'].describe()}")
    else:
        logging.warning("'ndvi' column missing. Using tree_density instead.")
        neighborhoods_gdf['tree_density'] = neighborhoods_gdf['tree_count'] / safe_area_km2
        neighborhoods_gdf['green_space'] = neighborhoods_gdf['tree_density']
    logging.info(f"green_space stats:\n{neighborhoods_gdf['green_space'].describe()}")

    # Compute accident_density
    neighborhoods_gdf['accident_density'] = neighborhoods_gdf['accident_count'] / safe_area_km2
    logging.info(f"accident_density stats:\n{neighborhoods_gdf['accident_density'].describe()}")

    # Log intersection_density if present
    if 'intersection_density' in neighborhoods_gdf.columns:
        logging.info(f"intersection_density stats:\n{neighborhoods_gdf['intersection_density'].describe()}")
    else:
        logging.warning("intersection_density column missing.")

    # Define components and their directions
    components = {
        'land_use_score': 'higher_better',
        'intersection_density': 'higher_better',
        'population_density': 'higher_better',
        'transit_density': 'higher_better',
        'green_space': 'higher_better',
        'accident_density': 'lower_better'
    }

    # Define weights to emphasize high-variation components
    component_weights = {
        'land_use_score': 0.20,  # Increase if diversity is key
        'intersection_density': 0.05,
        'population_density': 0.25,
        'transit_density': 0.25,
        'green_space': 0.15,
        'accident_density': 0.10  # Increase if safety is critical
    }

    # Normalize each component to [0, 1] with min-max scaling.
    # Median and IQR are collected for the log output only; they do not affect the scaling.
    medians = {}
    iqrs = {}
    for comp, direction in components.items():
        if comp not in neighborhoods_gdf.columns:
            # A missing optional column (e.g. intersection_density) must not abort the run;
            # the component simply contributes nothing to the score.
            logging.warning(f"Component {comp} is missing and is excluded from the walkability score.")
            continue

        values = neighborhoods_gdf[comp]
        if comp == 'land_use_score':
            # Already a weighted fraction, so it is used as-is.
            medians[comp] = 0.0
            iqrs[comp] = 1.0
            neighborhoods_gdf[f'{comp}_norm'] = values
            continue

        medians[comp] = values.median()
        q75, q25 = values.quantile([0.75, 0.25])
        iqrs[comp] = q75 - q25 if q75 != q25 else 1.0

        lowest, highest = values.min(), values.max()
        value_range = highest - lowest
        if not value_range > 0:
            # Zero range (constant column) or NaN range (all values missing):
            # min-max scaling would divide by zero, so use a neutral 0.5 instead.
            logging.warning(f"Component {comp} has no variation. Setting normalized value to 0.5")
            neighborhoods_gdf[f'{comp}_norm'] = 0.5
        elif direction == 'higher_better':
            neighborhoods_gdf[f'{comp}_norm'] = (values - lowest) / value_range
        else:
            neighborhoods_gdf[f'{comp}_norm'] = (highest - values) / value_range

    logging.info("Normalization statistics:")
    for comp in medians:
        logging.info(f"{comp} - median: {medians[comp]:.4f}, IQR: {iqrs[comp]:.4f}")

    # Compute walkability_score as weighted sum
    neighborhoods_gdf['walkability_score'] = 0.0
    for comp in components:
        norm_col = f'{comp}_norm'
        if norm_col in neighborhoods_gdf.columns:
            neighborhoods_gdf['walkability_score'] += component_weights[comp] * neighborhoods_gdf[norm_col]
        else:
            logging.error(f"Normalized column {norm_col} not found.")

    # Ensure it's between 0 and 1
    neighborhoods_gdf['walkability_score'] = neighborhoods_gdf['walkability_score'].clip(0, 1)
    logging.info(f"walkability_score stats:\n{neighborhoods_gdf['walkability_score'].describe()}")

    # Compute walkability_category with dynamic thresholds
    low_threshold = neighborhoods_gdf['walkability_score'].quantile(0.33)
    high_threshold = neighborhoods_gdf['walkability_score'].quantile(0.66)
    logging.info(f"Walkability category thresholds - low: {low_threshold:.4f}, high: {high_threshold:.4f}")

    def categorize_score(score):
        """Map a score to 'low', 'medium' or 'high' using the tercile thresholds."""
        if score < low_threshold:
            return 'low'
        elif score < high_threshold:
            return 'medium'
        else:
            return 'high'

    neighborhoods_gdf['walkability_category'] = neighborhoods_gdf['walkability_score'].apply(categorize_score)
    logging.info(f"Walkability category distribution:\n{neighborhoods_gdf['walkability_category'].value_counts()}")

    # Return relevant columns
    return neighborhoods_gdf[['LIE_NAME', 'walkability_score', 'walkability_category']]

In [4]:
def fix_geometry(geom, buffer_size=1e-5):
    """Return a valid version of ``geom``, falling back to ``Point(0, 0)``.

    Repairs are tried in order and the first valid result is returned:
    ``make_valid``, then ``buffer(0)``, then ``buffer(buffer_size)`` followed
    by ``make_valid``. ``None``/empty input, an unrepairable geometry, or any
    exception raised during repair all yield ``Point(0, 0)``.
    """
    if geom is None or geom.is_empty:
        logging.debug("Geometry is None or empty, returning a default Point(0,0).")
        return Point(0, 0)

    try:
        repaired = make_valid(geom)
        if repaired.is_valid:
            return repaired

        # Second attempt: a zero-width buffer.
        logging.debug(f"Geometry invalid after make_valid, applying buffer(0): {repaired.bounds}")
        repaired = repaired.buffer(0)
        if repaired.is_valid:
            return repaired

        # Last attempt: a small positive buffer, then make_valid again.
        logging.debug(f"Geometry still invalid, applying buffer with size {buffer_size}: {repaired.bounds}")
        repaired = make_valid(repaired.buffer(buffer_size))
        if repaired.is_valid:
            return repaired

        logging.warning(f"Geometry remains invalid after all attempts: {repaired.bounds}. Returning default Point(0,0).")
        return Point(0, 0)
    except Exception as e:
        logging.error(f"Error fixing geometry: {e}. Returning default Point(0,0).")
        return Point(0, 0)

def print_data_structure(data_dict):
    """Print shape, dtypes, missing-value counts and a two-row sample per dataset.

    Only values that are GeoDataFrames, pandas DataFrames or cuDF DataFrames
    are reported; cuDF frames are converted to pandas first. Other values in
    ``data_dict`` are skipped silently.
    """
    print("\n--- Data Structure Summary ---")
    for name, frame in data_dict.items():
        if not isinstance(frame, (gpd.GeoDataFrame, pd.DataFrame, cudf.DataFrame)):
            continue
        if isinstance(frame, cudf.DataFrame):
            frame = frame.to_pandas()

        nulls_per_column = frame.isnull().sum()

        print(f"\nDataset: {name}")
        print(f"Shape: {frame.shape}")
        print(f"Columns and Data Types:\n{frame.dtypes}")
        print(f"Missing values (total): {nulls_per_column.sum()}")
        print(f"Missing values per column:\n{nulls_per_column}")
        if 'LIE_NAME' in frame.columns:
            print(f"Unique LIE_NAME: {frame['LIE_NAME'].nunique()}")
        # Road class breakdown is only shown for the dataset keyed 'roads'.
        if name == 'roads' and 'class' in frame.columns:
            print(f"Road class counts:\n{frame['class'].value_counts()}")
        print(f"Sample data (first 2 rows):\n{frame.head(2)}")
    print("--- End of Data Structure Summary ---\n")

def print_percentage_calculation(neighborhoods_gdf, urban_masterplan_gdf, sample_size=3):
    """Print the land use percentage calculation process for a sample of neighborhoods.

    For each sampled neighborhood the master-plan polygons are clipped to the
    neighborhood, and each category is credited with the part of its area that
    has not already been claimed by a previously processed category. The areas
    and their share of the total clipped area are printed; nothing is returned.

    Args:
        neighborhoods_gdf: Frame with ``LIE_NAME`` and ``geometry`` columns.
        urban_masterplan_gdf: Frame with ``Category`` and geometry columns.
        sample_size: Maximum number of neighborhoods to sample (seeded with 42).
    """
    print("\n--- Percentage Calculation Process ---")
    sample_neighborhoods = neighborhoods_gdf.sample(min(sample_size, len(neighborhoods_gdf)), random_state=42)
    
    for idx, row in sample_neighborhoods.iterrows():
        lie_name = row['LIE_NAME']
        print(f"\nNeighborhood: {lie_name} (Index: {idx})")
        
        neighborhood_geom = fix_geometry(row['geometry'])
        if not neighborhood_geom.is_valid:
            print(f"Neighborhood geometry is invalid after fixing: {lie_name}")
            continue
        
        relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
        if relevant_masterplan.empty:
            print("No master plan polygons intersect with this neighborhood.")
            continue
        
        # NOTE(review): the CRS is hard-coded; this assumes both inputs are already in EPSG:3826 — confirm.
        temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs='EPSG:3826')
        intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
        if intersected.empty:
            print("No valid intersections after overlay.")
            continue
        
        intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
        intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty]
        if intersected.empty:
            print("No valid geometries after fixing intersected polygons.")
            continue
        
        # Sort by priority so that unique() below yields categories from highest to lowest priority.
        # Categories absent from CATEGORY_PRIORITY map to NaN.
        intersected['priority'] = intersected['Category'].map(CATEGORY_PRIORITY)
        intersected = intersected.sort_values(by='priority', ascending=False)
        
        total_area_geom = intersected.geometry.union_all()
        total_area = total_area_geom.area
        print(f"Total unique master plan area: {total_area:.2f} m²")
        
        # remaining_geom shrinks as each category claims its share, so overlapping
        # area is credited only to the first category that covers it.
        remaining_geom = total_area_geom
        category_areas = {}
        for category in intersected['Category'].unique():
            category_rows = intersected[intersected['Category'] == category]
            category_geom = category_rows.geometry.union_all()
            try:
                category_area_geom = category_geom.intersection(remaining_geom)
                category_area = category_area_geom.area
                category_areas[category] = category_area
                print(f"Area of {category} (priority {CATEGORY_PRIORITY.get(category, 0)}): {category_area:.2f} m²")
                remaining_geom = remaining_geom.difference(category_area_geom)
            except GEOSException as e:
                # A topology failure zeroes this category and leaves remaining_geom unchanged.
                print(f"Topology error for category {category}: {e}")
                category_areas[category] = 0.0
        
        print("\nPercentages:")
        total_percentage = 0.0
        for category, area in category_areas.items():
            percentage = (area / total_area * 100) if total_area > 0 else 0.0
            total_percentage += percentage
            print(f"{category}: {percentage:.2f}%")
        print(f"Sum of percentages: {total_percentage:.2f}%")
    print("--- End of Percentage Calculation Process ---\n")

def compute_data_hash(data_dict):
    """Return a SHA-256 hex digest that fingerprints the datasets for caching.

    For every DataFrame-like value the digest covers its shape, its sorted
    column names and a JSON dump of the first five rows (without the
    ``geometry`` column). Values of other types are ignored. Rows beyond the
    first five do not influence the hash.
    """
    def _to_plain(value):
        """Convert numpy arrays/scalars to plain Python objects; pass others through."""
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, (np.floating, np.integer)):
            return float(value)
        return value

    digest = hashlib.sha256()
    for key, frame in data_dict.items():
        if not isinstance(frame, (gpd.GeoDataFrame, pd.DataFrame, cudf.DataFrame)):
            continue
        if isinstance(frame, cudf.DataFrame):
            frame = frame.to_pandas()

        digest.update(str(frame.shape).encode('utf-8'))
        digest.update(str(sorted(frame.columns)).encode('utf-8'))

        logging.info(f"Dataset {key} column types:\n{frame.dtypes}")

        preview = frame.head(5).copy()
        if 'geometry' in preview.columns:
            preview = preview.drop(columns=['geometry'])
        for column in preview.columns:
            preview[column] = preview[column].apply(_to_plain)

        try:
            serialized = preview.to_json()
            digest.update(serialized.encode('utf-8'))
        except Exception as e:
            # Fall back to the dict repr when the preview cannot be JSON-encoded.
            logging.error(f"Failed to serialize sample for dataset {key}: {e}")
            serialized = str(preview.to_dict())
            digest.update(serialized.encode('utf-8'))
    return digest.hexdigest()

def check_spatial_overlap(gdf1, gdf2, label1="gdf1", label2="gdf2"):
    """Log how two GeoDataFrames overlap and return whether their bounding boxes do.

    Works on copies of both inputs. The second frame is reprojected to the
    first frame's CRS when they differ, and geometries are passed through
    ``fix_geometry``. Up to ten rows of the first frame are sampled (seed 42)
    and spatially joined against the second; only the match count is logged.

    Returns:
        True when the total bounding boxes overlap, otherwise False.
    """
    logging.info(f"Checking spatial overlap between {label1} and {label2}...")
    left = gdf1.copy()
    right = gdf2.copy()

    if left.crs != right.crs:
        logging.warning(f"CRS mismatch between {label1} ({left.crs}) and {label2} ({right.crs}). Aligning to {left.crs}...")
        right = right.to_crs(left.crs)

    for frame in (left, right):
        frame['geometry'] = frame['geometry'].apply(fix_geometry)

    left_bounds = left.total_bounds
    right_bounds = right.total_bounds
    logging.info(f"{label1} bounds: {left_bounds}")
    logging.info(f"{label2} bounds: {right_bounds}")

    # total_bounds is (minx, miny, maxx, maxy).
    left_minx, left_miny, left_maxx, left_maxy = left_bounds
    right_minx, right_miny, right_maxx, right_maxy = right_bounds
    boxes_disjoint = (left_maxx < right_minx or
                      left_minx > right_maxx or
                      left_maxy < right_miny or
                      left_miny > right_maxy)
    bounds_overlap = not boxes_disjoint
    logging.info(f"Bounding boxes overlap: {bounds_overlap}")

    n_samples = min(10, len(left), len(right))
    if n_samples > 0:
        sampled = left.sample(n_samples, random_state=42)
        hits = gpd.sjoin(sampled, right, how='inner', predicate='intersects')
        logging.info(f"Sample intersection check: {len(hits)} intersections found out of {n_samples} samples.")

    return bounds_overlap

def validate_data(gdf, required_cols, name="GeoDataFrame"):
    """Raise if ``gdf`` is unusable; return ``None`` when every check passes.

    Checks run in this order and the first failure is logged and raised:

    * empty frame -> ``ValueError``
    * any of ``required_cols`` missing -> ``KeyError``
    * any missing geometry -> ``ValueError``
    * any invalid geometry -> ``ValueError``
    """
    def _reject(error_type, message):
        """Log ``message`` at error level, then raise it as ``error_type``."""
        logging.error(message)
        raise error_type(message)

    if gdf.empty:
        _reject(ValueError, f"{name} is empty.")

    absent = [column for column in required_cols if column not in gdf.columns]
    if absent:
        _reject(KeyError, f"Missing columns in {name}: {absent}")

    if gdf.geometry.isna().any():
        _reject(ValueError, f"Missing geometries in {name}")

    if not all(gdf.geometry.is_valid):
        _reject(ValueError, f"Invalid geometries in {name}")


Cell 3: Walkability Computation Functions

In [5]:
def compute_road_type_accident_correlation(roads_gdf, neighborhoods_gdf, accidents_gdf):
    """Relate road class to accident density and attach a per-neighborhood average.

    Steps: validate inputs, reproject to EPSG:3826, assign each accident to its
    nearest road (moving footway/cycleway matches to a wide road within 10
    units of distance when one exists), compute a width-adjusted accident
    density per road, summarise by road class, and average the road densities
    per neighborhood.

    Side effects:
        * The three input frames are reprojected **in place** to EPSG:3826
          (frames without a CRS are assumed to be EPSG:4326).
        * ``neighborhoods_gdf`` gains an ``avg_road_accident_density`` column.
        * Four PNG files are written to ``BASE_DIR``.

    Args:
        roads_gdf: Roads with ``class``, ``geometry`` and ``length_m`` columns.
        neighborhoods_gdf: Neighborhoods with ``LIE_NAME`` and ``geometry``.
        accidents_gdf: Accident geometries.

    Returns:
        A per-road-class summary (``class``, ``length_m``, ``accident_count``,
        ``accident_density``, ``width_rank``) restricted to classes with more
        than 1000 of total ``length_m`` and a positive width rank, or an empty
        ``pandas.DataFrame`` when no accident survives geometry fixing.

    Raises:
        ValueError, KeyError: Propagated from ``validate_data``.
    """
    logging.info("Computing correlation between road types and accident density...")

    # Validate input data
    validate_data(roads_gdf, ['class', 'geometry', 'length_m'], "roads_gdf")
    validate_data(neighborhoods_gdf, ['LIE_NAME', 'geometry'], "neighborhoods_gdf")
    validate_data(accidents_gdf, ['geometry'], "accidents_gdf")

    # Ensure correct CRS (note: this mutates the caller's frames)
    target_crs = 'EPSG:3826'
    source_crs = 'EPSG:4326'
    for gdf, name in [(roads_gdf, "roads"), (neighborhoods_gdf, "neighborhoods"), (accidents_gdf, "accidents")]:
        if gdf.crs is None:
            logging.warning(f"{name} has no CRS defined. Assuming {source_crs}.")
            gdf.set_crs(source_crs, inplace=True)
        if gdf.crs != target_crs:
            logging.info(f"Reprojecting {name} from {gdf.crs} to {target_crs}")
            gdf.to_crs(target_crs, inplace=True)

    # Log CRS, bounds, and sample geometries for debugging
    logging.info(f"Roads CRS: {roads_gdf.crs}, Bounds: {roads_gdf.total_bounds}")
    logging.info(f"Neighborhoods CRS: {neighborhoods_gdf.crs}, Bounds: {neighborhoods_gdf.total_bounds}")
    logging.info(f"Accidents CRS: {accidents_gdf.crs}, Bounds: {accidents_gdf.total_bounds}")
    logging.info(f"Roads geometry types: {roads_gdf.geometry.type.unique()}")
    logging.info(f"Neighborhoods geometry types: {neighborhoods_gdf.geometry.type.unique()}")
    sample_roads = roads_gdf.head(5)['geometry'].apply(lambda x: str(x)[:100])
    sample_neighborhoods = neighborhoods_gdf.head(5)['geometry'].apply(lambda x: str(x)[:100])
    logging.info(f"Sample road geometries:\n{sample_roads}")
    logging.info(f"Sample neighborhood geometries:\n{sample_neighborhoods}")

    # Visualize data for debugging
    fig, ax = plt.subplots(figsize=(10, 10))
    neighborhoods_gdf.plot(ax=ax, color='blue', alpha=0.5, label='Neighborhoods')
    roads_gdf.plot(ax=ax, color='red', alpha=0.5, label='Roads')
    plt.legend()
    plt.title('Roads and Neighborhoods Overlay')
    plt.savefig(os.path.join(BASE_DIR, 'roads_neighborhoods_overlap.png'))
    plt.close()
    logging.info(f"Overlay plot saved to {os.path.join(BASE_DIR, 'roads_neighborhoods_overlap.png')}")

    # Make local copies for roads and accidents
    roads_gdf_local = roads_gdf.copy()
    accidents_gdf_local = accidents_gdf.copy()

    # Add unique identifier to accidents
    accidents_gdf_local['accident_id'] = range(len(accidents_gdf_local))

    # Define width ranking (classes not listed here are treated as rank 0)
    width_ranking = {
        'motorway': 5, 'trunk': 5, 'primary': 4, 'secondary': 4, 'tertiary': 3,
        'residential': 3, 'living_street': 3, 'service': 2, 'track': 2,
        'path': 1, 'footway': 1, 'cycleway': 1, 'steps': 1, 'pedestrian': 1,
        'unclassified': 0, 'bridleway': 0, 'unknown': 0
    }
    roads_gdf_local['width_rank'] = roads_gdf_local['class'].map(width_ranking).fillna(0).astype(int)

    # Buffer wider roads for accident assignment
    roads_gdf_buffered = roads_gdf_local.copy()
    roads_gdf_buffered['geometry'] = roads_gdf_buffered.apply(
        lambda row: row['geometry'].buffer(5) if row['width_rank'] >= 4 else row['geometry'], axis=1
    )

    logging.info("Assigning accidents to nearest road...")
    accidents_gdf_local['geometry'] = accidents_gdf_local['geometry'].apply(fix_geometry)
    accidents_gdf_local = accidents_gdf_local[accidents_gdf_local['geometry'].is_valid & ~accidents_gdf_local['geometry'].is_empty]

    if accidents_gdf_local.empty:
        logging.warning("No valid accidents after geometry fixing.")
        # Keep the output column defined on this path too, matching the
        # zero fallback used further down when no averages can be computed.
        neighborhoods_gdf['avg_road_accident_density'] = 0
        return pd.DataFrame()

    # Assign accidents to nearest road
    nearest = gpd.sjoin_nearest(
        accidents_gdf_local,
        roads_gdf_buffered[['geometry', 'class', 'width_rank']],
        how='left',
        distance_col='distance'
    )
    # Equidistant roads produce several rows per accident; prefer the wider road.
    nearest['weighted_distance'] = nearest['distance'] / (nearest['width_rank'].replace(0, 1) ** 2)
    nearest = nearest.sort_values('weighted_distance').drop_duplicates(subset=['accident_id'], keep='first')

    matched_accidents = nearest[['accident_id', 'index_right']].copy()
    matched_accidents.columns = ['accident_id', 'road_idx']
    matched_accidents = matched_accidents.dropna(subset=['road_idx'])
    matched_accidents['road_idx'] = matched_accidents['road_idx'].astype(int)

    logging.info(f"Matched {len(matched_accidents)} accidents out of {len(accidents_gdf_local)}")

    # Reassign accidents from footway/cycleway to wider roads if possible
    footway_cycleway_accidents = matched_accidents[
        matched_accidents['road_idx'].isin(
            roads_gdf_local[roads_gdf_local['class'].isin(['footway', 'cycleway'])].index
        )
    ]
    if not footway_cycleway_accidents.empty:
        logging.info(f"Reassigning {len(footway_cycleway_accidents)} accidents from footway/cycleway...")
        accidents_to_reassign = accidents_gdf_local[accidents_gdf_local['accident_id'].isin(footway_cycleway_accidents['accident_id'])].copy()
        wider_roads = roads_gdf_buffered[roads_gdf_buffered['width_rank'] >= 4]
        if not wider_roads.empty:
            reassigned = gpd.sjoin_nearest(
                accidents_to_reassign,
                wider_roads[['geometry', 'class']],
                how='left',
                max_distance=10
            )
            reassigned_matches = reassigned[['accident_id', 'index_right']].copy()
            reassigned_matches.columns = ['accident_id', 'road_idx']
            reassigned_matches = reassigned_matches.dropna(subset=['road_idx'])
            # sjoin_nearest returns one row per equidistant road; keep a single
            # match per accident so it is not counted more than once.
            reassigned_matches = reassigned_matches.drop_duplicates(subset=['accident_id'], keep='first')
            reassigned_matches['road_idx'] = reassigned_matches['road_idx'].astype(int)
            matched_accidents = matched_accidents[~matched_accidents['accident_id'].isin(reassigned_matches['accident_id'])]
            matched_accidents = pd.concat([matched_accidents, reassigned_matches], ignore_index=True)
            logging.info(f"Reassigned {len(reassigned_matches)} accidents to wider roads")

    # Count accidents per road
    accident_counts = matched_accidents.groupby('road_idx').size().reindex(roads_gdf_local.index, fill_value=0)
    roads_gdf_local['accident_count'] = accident_counts

    logging.info(f"Accidents by road type:\n{roads_gdf_local.groupby('class')['accident_count'].sum()}")

    # Filter roads by minimum length; copy so the column assignments below
    # write to an independent frame rather than to a slice.
    roads_gdf_local = roads_gdf_local[roads_gdf_local['length_m'] >= MIN_ROAD_LENGTH].copy()

    # Log data integrity before calculating accident density
    logging.info(f"length_m stats:\n{roads_gdf_local['length_m'].describe()}")
    logging.info(f"accident_count stats:\n{roads_gdf_local['accident_count'].describe()}")

    # Calculate accident density (accidents per 1000 units of length_m)
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_count'] / (roads_gdf_local['length_m'] / 1000)
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_density'].fillna(0).replace([np.inf, -np.inf], 0)

    # Adjust density by width rank
    roads_gdf_local['accident_density'] = roads_gdf_local['accident_density'] * (roads_gdf_local['width_rank'].replace(0, 1) / 5)

    # Log accident density integrity
    logging.info(f"NaN in accident_density: {roads_gdf_local['accident_density'].isna().sum()}")
    logging.info(f"accident_density stats:\n{roads_gdf_local['accident_density'].describe()}")

    logging.info(f"Road type counts:\n{roads_gdf_local['class'].value_counts()}")
    print(f"Road type counts:\n{roads_gdf_local['class'].value_counts()}")

    # Summarize by road type
    summary = roads_gdf_local.groupby('class').agg({
        'length_m': 'sum',
        'accident_count': 'sum',
        'accident_density': 'mean',
        'width_rank': 'first'
    }).reset_index()

    summary = summary[summary['length_m'] > 1000]
    summary = summary[summary['width_rank'] > 0]

    print("\n--- Road Type Accident Density Summary ---")
    print(summary[['class', 'length_m', 'accident_count', 'accident_density', 'width_rank']].round(2))

    if len(summary) >= 2:
        corr, p_value = spearmanr(summary['width_rank'], summary['accident_density'])
        logging.info(f"Spearman's correlation between road width rank and accident density: {corr:.3f} (p-value: {p_value:.3f})")
        print(f"Spearman's correlation: {corr:.3f} (p-value: {p_value:.3f})")
    else:
        logging.warning("Insufficient road types for correlation analysis.")
        print("Insufficient road types for correlation analysis.")

    # Compute average road accident density per neighborhood
    logging.info("Computing average road accident density per neighborhood...")
    logging.info(f"Roads DataFrame shape before join: {roads_gdf_local.shape}")
    logging.info(f"Neighborhoods DataFrame shape before join: {neighborhoods_gdf.shape}")

    # Fix geometries before join
    roads_gdf_with_idx = roads_gdf_local[['geometry', 'class', 'length_m', 'width_rank', 'accident_density']].copy()
    roads_gdf_with_idx['geometry'] = roads_gdf_with_idx['geometry'].apply(fix_geometry)
    neighborhoods_gdf_with_idx = neighborhoods_gdf[['geometry', 'LIE_NAME']].copy()
    neighborhoods_gdf_with_idx['geometry'] = neighborhoods_gdf_with_idx['geometry'].apply(fix_geometry)

    # Buffer roads slightly to capture near-intersections
    roads_gdf_with_idx['geometry'] = roads_gdf_with_idx['geometry'].buffer(1)  # 1-meter buffer

    # Perform spatial join
    road_neighborhoods = gpd.sjoin(
        roads_gdf_with_idx,
        neighborhoods_gdf_with_idx,
        how='left',
        predicate='intersects'
    )
    logging.info(f"Road-neighborhood join resulted in {len(road_neighborhoods)} matches with columns: {road_neighborhoods.columns.tolist()}")

    # Log non-NaN counts
    logging.info(f"Non-NaN LIE_NAME count: {road_neighborhoods['LIE_NAME'].notna().sum()}")
    logging.info(f"Non-NaN accident_density count: {road_neighborhoods['accident_density'].notna().sum()}")
    logging.info(f"Unique LIE_NAME values: {road_neighborhoods['LIE_NAME'].nunique()}")

    # Calculate average accident density
    avg_accident_density = road_neighborhoods.groupby('LIE_NAME')['accident_density'].mean()
    logging.info(f"Number of neighborhoods with calculated avg_accident_density: {len(avg_accident_density)}")
    logging.info(f"NaN in avg_accident_density: {avg_accident_density.isna().sum()}")

    # Assign to neighborhoods with robust fallback
    if avg_accident_density.empty or avg_accident_density.isna().all():
        logging.error("No valid accident density averages. Assigning 0 to all neighborhoods.")
        neighborhoods_gdf['avg_road_accident_density'] = 0
    else:
        neighborhoods_gdf['avg_road_accident_density'] = neighborhoods_gdf['LIE_NAME'].map(avg_accident_density).fillna(0)

    assigned_count = sum(~neighborhoods_gdf['avg_road_accident_density'].isna())
    logging.info(f"Assigned avg_road_accident_density to {assigned_count} neighborhoods")
    logging.info(f"Avg road accident density stats:\n{neighborhoods_gdf['avg_road_accident_density'].describe()}")

    # Generate plots
    plt.figure(figsize=(12, 6))
    summary_sorted = summary.sort_values('width_rank', ascending=False)
    sns.barplot(data=summary_sorted, x='class', y='accident_density', hue='width_rank', dodge=False)
    plt.xlabel('Road Type')
    plt.ylabel('Mean Accident Density (Accidents per km)')
    plt.title('Mean Accident Density by Road Type')
    plt.xticks(rotation=45, ha='right')
    plt.legend(title='Width Rank')
    plt.tight_layout()
    bar_path = os.path.join(BASE_DIR, 'road_type_accident_bar.png')
    plt.savefig(bar_path)
    plt.close()
    logging.info(f"Bar chart saved to {bar_path}")
    print(f"Bar chart saved to {bar_path}")

    plt.figure(figsize=(12, 6))
    sns.boxplot(data=roads_gdf_local[roads_gdf_local['class'].isin(summary['class'])],
                x='class', y='accident_density', hue='width_rank', dodge=False)
    plt.xlabel('Road Type')
    plt.ylabel('Accident Density (Accidents per km)')
    plt.title('Distribution of Accident Density by Road Type')
    plt.xticks(rotation=45, ha='right')
    plt.yscale('log')
    plt.legend(title='Width Rank')
    plt.tight_layout()
    box_path = os.path.join(BASE_DIR, 'road_type_accident_box.png')
    plt.savefig(box_path)
    plt.close()
    logging.info(f"Box chart saved to {box_path}")
    print(f"Box chart saved to {box_path}")

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=summary, x='width_rank', y='accident_density',
                    size='length_m', sizes=(50, 500), hue='class', style='class', alpha=0.7)
    # A linear trend needs at least two road types; np.polyfit raises on empty input.
    if len(summary) >= 2:
        z = np.polyfit(summary['width_rank'], summary['accident_density'], 1)
        p = np.poly1d(z)
        plt.plot(summary['width_rank'], p(summary['width_rank']), "r--", alpha=0.5)
    else:
        logging.warning("Insufficient road types for trend line; skipping it.")
    plt.xlabel('Road Width Rank (1=Path, 5=Motorway)')
    plt.ylabel('Mean Accident Density (Accidents per km)')
    plt.title('Road Type vs. Accident Density')
    plt.yscale('log')
    plt.grid(True)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    scatter_path = os.path.join(BASE_DIR, 'road_type_accident_scatter.png')
    plt.savefig(scatter_path)
    plt.close()
    logging.info(f"Scatter plot saved to {scatter_path}")
    print(f"Scatter plot saved to {scatter_path}")

    top_types = summary.nlargest(3, 'accident_density')[['class', 'accident_density']]
    logging.info(f"Top 3 road types by accident density:\n{top_types.round(2)}")
    print(f"Top 3 road types by accident density:\n{top_types.round(2)}")

    return summary

In [6]:
def compute_pedestrian_road_density(roads_gdf, neighborhoods_gdf):
    """Compute pedestrian road density for each neighborhood.

    Roads whose ``class`` is 'footway', 'pedestrian' or 'cycleway' are spatially
    joined (``intersects``) to the neighborhood polygons and their ``length_m``
    values are summed per neighborhood. The sum is divided by
    ``area_km2 * 1000``, so with ``length_m`` in metres the stored value is
    kilometres of pedestrian road per km² (the same scaling used for
    ``road_density`` in ``load_and_prepare_data``).

    A road that intersects several neighborhoods contributes its full length to
    each of them; lengths are not clipped to the polygon.

    Args:
        roads_gdf: GeoDataFrame of road segments with ``geometry`` and ``class``
            columns. ``length_m`` is used when present, otherwise it is derived
            from the geometry length (assumes a metric CRS - TODO confirm).
        neighborhoods_gdf: GeoDataFrame with ``geometry`` and ``area_km2``
            columns. It is modified in place.

    Returns:
        The same ``neighborhoods_gdf`` object, with a
        ``pedestrian_road_density`` column added.
    """
    logging.info("Computing pedestrian road density per neighborhood...")

    # Keep only the road classes treated as pedestrian infrastructure.
    pedestrian_classes = ['footway', 'pedestrian', 'cycleway']
    pedestrian_roads = roads_gdf[roads_gdf['class'].isin(pedestrian_classes)].copy()
    pedestrian_roads['geometry'] = pedestrian_roads['geometry'].apply(fix_geometry)
    if 'length_m' not in pedestrian_roads.columns:
        logging.warning("'length_m' column missing in roads_gdf. Computing from geometry...")
        pedestrian_roads['length_m'] = pedestrian_roads.geometry.length

    # Key the join on row position rather than LIE_NAME: grouping by name would
    # merge neighborhoods that share a name and silently drop unnamed ones.
    neighborhood_polygons = neighborhoods_gdf[['geometry']].copy()
    neighborhood_polygons['geometry'] = neighborhood_polygons['geometry'].apply(fix_geometry)
    neighborhood_polygons['_neighborhood_pos'] = np.arange(len(neighborhood_polygons))

    joined = gpd.sjoin(
        pedestrian_roads[['geometry', 'length_m']],
        neighborhood_polygons,
        how='left',
        predicate='intersects'
    )

    # Roads outside every neighborhood carry NaN in the position column.
    matched = joined.dropna(subset=['_neighborhood_pos'])
    length_by_pos = matched.groupby(matched['_neighborhood_pos'].astype(int))['length_m'].sum()
    pedestrian_length = length_by_pos.reindex(
        np.arange(len(neighborhoods_gdf)), fill_value=0.0
    ).to_numpy()

    # Zero areas are replaced by a tiny value so the division never yields inf.
    area_km2 = neighborhoods_gdf['area_km2'].replace(0, 1e-6)
    neighborhoods_gdf['pedestrian_road_density'] = pedestrian_length / (area_km2 * 1000)

    logging.info(f"pedestrian_road_density stats:\n{neighborhoods_gdf['pedestrian_road_density'].describe()}")
    return neighborhoods_gdf

Cell 4: Main Data Loading and Processing

In [7]:
def _count_features_per_neighborhood(features_gdf, neighborhoods_gdf):
    """Count the features that intersect each neighborhood polygon.

    Returns a Series aligned to ``neighborhoods_gdf.index``; neighborhoods with
    no intersecting feature get 0. A feature intersecting several neighborhoods
    is counted once in each of them.
    """
    joined = gpd.sjoin(
        features_gdf[['geometry']],
        neighborhoods_gdf[['geometry', 'LIE_NAME']],
        how='left',
        predicate='intersects'
    )
    # Unmatched features have NaN in 'index_right' and are dropped by groupby.
    return joined.groupby('index_right').size().reindex(neighborhoods_gdf.index, fill_value=0)


def _find_road_intersections(roads_gdf):
    """Detect road intersections from the endpoints of road segments.

    The first and last coordinate of every LineString (and of every part of a
    MultiLineString) is rounded to 6 decimals; a location shared by the
    endpoints of three or more distinct roads is reported as an intersection.

    Returns:
        ``(endpoint_count, intersections)`` where ``endpoint_count`` is the
        number of endpoints collected and ``intersections`` is a list of Points.
    """
    endpoint_to_roads = {}
    endpoint_count = 0
    road_geometries = zip(roads_gdf.index, roads_gdf.geometry)
    for road_idx, geom in tqdm(road_geometries, total=len(roads_gdf), desc="Extracting endpoints"):
        if geom is None or geom.is_empty:
            continue
        if geom.geom_type == 'LineString':
            lines = [geom]
        elif geom.geom_type == 'MultiLineString':
            lines = list(geom.geoms)
        else:
            continue
        for line in lines:
            coords = list(line.coords)
            if not coords:
                continue
            start_point = Point(coords[0])
            end_point = Point(coords[-1])
            if not (start_point.is_valid and end_point.is_valid):
                continue
            for endpoint in (start_point, end_point):
                endpoint_count += 1
                # Rounding avoids floating-point mismatches between segments.
                point_key = (round(endpoint.x, 6), round(endpoint.y, 6))
                endpoint_to_roads.setdefault(point_key, set()).add(road_idx)

    logging.info("Identifying intersections...")
    intersections = [
        Point(point_key)
        for point_key, road_ids in endpoint_to_roads.items()
        if len(road_ids) >= 3
    ]
    return endpoint_count, intersections


def load_and_prepare_data():
    """Load every input layer and derive the per-neighborhood attributes.

    Steps, in order:
      1. Read the files configured in the module-level path constants
         (population as JSON, ``.geoparquet`` via ``read_parquet``, anything
         else via ``read_file``).
      2. Alias an alternative NDVI column to ``ndvi`` when ``ndvi`` is absent.
      3. Reproject every spatial layer to EPSG:3826 (layers without a CRS are
         assumed to be EPSG:4326), run ``fix_geometry`` on them and drop
         geometries that are still invalid.
      4. Add to the neighborhoods layer: ``intersection_count``,
         ``intersection_density``, ``tree_count``, ``transit_count``,
         ``accident_count`` (accidents buffered by ``BUFFER_DISTANCE``),
         ``road_density``, population columns and one
         ``land_use_<category>_percent`` column per ``CATEGORY_PRIORITY`` key.
      5. Write the neighborhoods layer to ``INTERSECTION_CACHE_PATH``
         (failures are logged, not raised).

    Returns:
        dict mapping layer name ('neighborhoods', 'buildings', 'roads',
        'trees', 'transit', 'urban_masterplan', 'accidents', 'population') to
        its (Geo)DataFrame.

    Raises:
        Exception: any error raised while reading an input file is logged and
        re-raised.
    """
    logging.info("Stage 1: Loading and preparing data...")

    # Define file paths and their corresponding keys
    data_files = {
        'neighborhoods': LANDUSE_NDVI_PATH,
        'buildings': OSM_BUILDINGS_PATH,
        'roads': OSM_ROADS_PATH,
        'trees': OSM_TREES_PATH,
        'transit': OSM_TRANSIT_PATH,
        'urban_masterplan': URBAN_MASTERPLAN_PATH,
        'accidents': ACCIDENTS_PATH,
        'population': POPULATION_PATH
    }
    spatial_keys = ['neighborhoods', 'buildings', 'roads', 'trees', 'transit', 'urban_masterplan', 'accidents']

    data = {}

    # Load data with progress bar
    for key, path in tqdm(data_files.items(), desc="Loading files"):
        try:
            if key == 'population':
                with open(path, 'r') as f:
                    data[key] = pd.DataFrame(json.load(f))
                logging.info(f"Columns in population_df after loading: {list(data[key].columns)}")
            elif path.endswith('.geoparquet'):
                data[key] = gpd.read_parquet(path)
            else:
                data[key] = gpd.read_file(path)
            logging.info(f"Loaded {key} with shape {data[key].shape}")
        except Exception as e:
            logging.error(f"Failed to load {key} from {path}: {e}")
            raise

    # Log columns of neighborhoods_gdf to debug missing 'ndvi' and 'area_km2'
    logging.info(f"Columns in neighborhoods_gdf after loading: {list(data['neighborhoods'].columns)}")

    # Check for alternative NDVI column names and alias the first one found
    if 'ndvi' not in data['neighborhoods'].columns:
        for col in ['ndvi_mean', 'NDVI', 'ndvi_value']:
            if col in data['neighborhoods'].columns:
                logging.info(f"Found alternative NDVI column '{col}'. Renaming to 'ndvi'.")
                data['neighborhoods']['ndvi'] = data['neighborhoods'][col]
                break

    # Ensure all GeoDataFrames are in the same CRS
    target_crs = 'EPSG:3826'
    for key in spatial_keys:
        layer = data.get(key)
        if not isinstance(layer, gpd.GeoDataFrame):
            continue
        if layer.crs is None:
            logging.warning(f"No CRS defined for {key}. Assuming EPSG:4326.")
            layer = layer.set_crs('EPSG:4326')
        if layer.crs != target_crs:
            layer = layer.to_crs(target_crs)
            logging.info(f"Converted {key} to CRS {target_crs}")
        data[key] = layer

    # Fix geometries in all GeoDataFrames and drop the ones that stay invalid
    for key in spatial_keys:
        layer = data.get(key)
        if not isinstance(layer, gpd.GeoDataFrame):
            continue
        layer['geometry'] = layer['geometry'].apply(fix_geometry)
        valid_mask = layer.geometry.is_valid
        if not valid_mask.all():
            logging.warning(f"Found {int((~valid_mask).sum())} invalid geometries in {key} after fixing.")
            # .copy() so later column assignments do not target a slice.
            layer = layer[valid_mask].copy()
        data[key] = layer

    # Bind the working frames only now: reprojection and filtering above may
    # have replaced the objects stored in `data`, and a reference taken earlier
    # would still point at the unprojected, unfiltered neighborhoods.
    neighborhoods_gdf = data['neighborhoods']
    roads_gdf = data['roads']

    # Compute intersections for neighborhoods
    logging.info("Computing intersections for neighborhoods...")

    # Log columns of roads_gdf to debug missing 'length_m'
    logging.info(f"Columns in roads_gdf after loading: {list(roads_gdf.columns)}")

    logging.info("Extracting endpoints from road segments...")
    endpoint_count, intersections = _find_road_intersections(roads_gdf)

    if endpoint_count == 0:
        # Fallback: count road segments touching each neighborhood instead.
        logging.warning("No valid endpoints extracted from road segments. Using fallback method for intersections.")
        counted_features = roads_gdf
    elif not intersections:
        logging.warning("No intersections found using endpoint clustering. Using fallback method.")
        counted_features = roads_gdf
    else:
        counted_features = gpd.GeoDataFrame({'geometry': intersections}, crs=target_crs)
        logging.info("Counting intersections per neighborhood...")
    neighborhoods_gdf['intersection_count'] = _count_features_per_neighborhood(counted_features, neighborhoods_gdf)

    # Compute or verify area_km2
    if 'area_km2' not in neighborhoods_gdf.columns:
        logging.warning("'area_km2' column missing in neighborhoods_gdf. Computing from geometry...")
        neighborhoods_gdf['area_m2'] = neighborhoods_gdf['geometry'].area
        neighborhoods_gdf['area_km2'] = neighborhoods_gdf['area_m2'] / 1_000_000  # Convert m² to km²
        logging.info(f"Computed area_km2 stats:\n{neighborhoods_gdf['area_km2'].describe()}")
    else:
        logging.info(f"area_km2 already present. Stats:\n{neighborhoods_gdf['area_km2'].describe()}")

    # Compute intersection density (zero areas replaced to avoid division by zero)
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_count'] / neighborhoods_gdf['area_km2'].replace(0, 1e-6)
    logging.info(f"Intersection count stats:\n{neighborhoods_gdf['intersection_count'].describe()}")
    logging.info(f"Intersection density stats:\n{neighborhoods_gdf['intersection_density'].describe()}")

    # Cache the result; a failed write is logged and does not abort the run
    try:
        neighborhoods_gdf.to_parquet(INTERSECTION_CACHE_PATH)
        logging.info(f"Saved neighborhoods with intersections to {INTERSECTION_CACHE_PATH}")
    except Exception as e:
        logging.error(f"Failed to save neighborhoods with intersections: {e}")

    data['neighborhoods'] = neighborhoods_gdf

    # Compute tree count per neighborhood
    logging.info("Computing tree count per neighborhood...")
    data['neighborhoods']['tree_count'] = _count_features_per_neighborhood(data['trees'], data['neighborhoods'])

    # Compute transit count per neighborhood
    logging.info("Computing transit count per neighborhood...")
    data['neighborhoods']['transit_count'] = _count_features_per_neighborhood(data['transit'], data['neighborhoods'])

    # Compute accident count per neighborhood (accidents are buffered first)
    logging.info("Computing accident count per neighborhood...")
    accidents_buffered = data['accidents'].copy()
    accidents_buffered['geometry'] = accidents_buffered['geometry'].buffer(BUFFER_DISTANCE)
    data['neighborhoods']['accident_count'] = _count_features_per_neighborhood(accidents_buffered, data['neighborhoods'])

    # Compute road density per neighborhood
    logging.info("Computing road density per neighborhood...")

    # Log columns of roads_gdf to debug missing 'length_m'
    logging.info(f"Columns in roads_gdf before computing road density: {list(roads_gdf.columns)}")

    # Compute length_m if missing
    if 'length_m' not in roads_gdf.columns:
        logging.warning("'length_m' column missing in roads_gdf. Computing from geometry...")
        roads_gdf['length_m'] = roads_gdf['geometry'].length  # Length in meters (since CRS is EPSG:3826)
        logging.info(f"Computed length_m stats:\n{roads_gdf['length_m'].describe()}")

    roads_joined = gpd.sjoin(roads_gdf[['geometry', 'length_m']], data['neighborhoods'][['geometry', 'LIE_NAME']], how='left', predicate='intersects')
    road_lengths = roads_joined.groupby('index_right')['length_m'].sum()
    # length_m / (area_km2 * 1000) == km of road per km²; zero areas are guarded
    # the same way as for intersection_density.
    data['neighborhoods']['road_density'] = road_lengths.reindex(data['neighborhoods'].index, fill_value=0) / (data['neighborhoods']['area_km2'].replace(0, 1e-6) * 1000)
    logging.info(f"Road density stats:\n{data['neighborhoods']['road_density'].describe()}")

    # Merge population data
    logging.info("Merging population data...")
    population_df = data['population']
    population_df['LIE_NAME'] = population_df['LIE_NAME'].astype(str).str.strip()
    data['neighborhoods']['LIE_NAME'] = data['neighborhoods']['LIE_NAME'].astype(str).str.strip()

    # Check for possible column names for total_population and elderly_percentage
    expected_cols = ['total_population', 'elderly_percentage']
    missing_cols = [col for col in expected_cols if col not in population_df.columns]

    if missing_cols:
        logging.warning(f"Expected columns {missing_cols} not found in population_df. Attempting to find alternatives...")
        keywords = {'total_population': 'population', 'elderly_percentage': 'elderly'}
        for target in missing_cols:
            # Only look for a stand-in for a column that is really missing, and
            # never steal one of the expected columns, so a rename cannot
            # create duplicate column names.
            alternative = next(
                (col for col in population_df.columns
                 if col not in expected_cols and keywords[target] in str(col).lower()),
                None
            )
            if alternative is not None:
                logging.info(f"Found alternative for {target}: {alternative}")
                population_df = population_df.rename(columns={alternative: target})
            else:
                logging.warning(f"No alternative found for {target}. Setting to 0.")
                population_df[target] = 0

    # Perform the merge
    data['neighborhoods'] = data['neighborhoods'].merge(
        population_df[['LIE_NAME', 'total_population', 'elderly_percentage']],
        on='LIE_NAME',
        how='left'
    )

    # Compute land use percentages
    logging.info("Computing land use percentages for neighborhoods...")
    urban_masterplan_gdf = data['urban_masterplan']
    print_percentage_calculation(data['neighborhoods'], urban_masterplan_gdf, sample_size=3)

    for idx, row in data['neighborhoods'].iterrows():
        neighborhood_geom = row['geometry']
        if not neighborhood_geom.is_valid:
            continue
        relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
        if relevant_masterplan.empty:
            continue

        temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs=target_crs)
        intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
        if intersected.empty:
            continue

        intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
        intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty].copy()
        if intersected.empty:
            continue

        # Categories are processed from highest to lowest priority; each one
        # claims only the area not already claimed by an earlier category.
        intersected['priority'] = intersected['Category'].map(CATEGORY_PRIORITY)
        intersected = intersected.sort_values(by='priority', ascending=False)

        remaining_geom = intersected.geometry.union_all()
        total_area = remaining_geom.area
        category_areas = {}
        for category in intersected['Category'].unique():
            category_rows = intersected[intersected['Category'] == category]
            category_geom = category_rows.geometry.union_all()
            try:
                category_area_geom = category_geom.intersection(remaining_geom)
                category_areas[category] = category_area_geom.area
                remaining_geom = remaining_geom.difference(category_area_geom)
            except Exception as e:
                logging.warning(f"Topology error for category {category} in neighborhood {row['LIE_NAME']}: {e}")
                category_areas[category] = 0.0

        for category in CATEGORY_PRIORITY.keys():
            area = category_areas.get(category, 0.0)
            percentage = (area / total_area * 100) if total_area > 0 else 0.0
            data['neighborhoods'].at[idx, f'land_use_{category.lower()}_percent'] = percentage

    # Fill NaN values in land use percentages; create the column when no
    # neighborhood overlapped the masterplan so the lookup cannot fail.
    for category in CATEGORY_PRIORITY.keys():
        col = f'land_use_{category.lower()}_percent'
        if col in data['neighborhoods'].columns:
            data['neighborhoods'][col] = data['neighborhoods'][col].fillna(0.0)
        else:
            data['neighborhoods'][col] = 0.0

    # Fill NaN values in other columns
    for col in ['intersection_count', 'intersection_density', 'tree_count', 'transit_count', 'accident_count', 'road_density', 'total_population', 'elderly_percentage']:
        data['neighborhoods'][col] = data['neighborhoods'][col].fillna(0)

    # Print data structure summary
    print_data_structure(data)

    logging.info("Finished loading and preparing data.")
    return data

Cell 5: compute_intersection_counts, build_graph, prepare_gnn_data

In [8]:
def compute_intersection_counts(neighborhoods_gdf, roads_gdf):
    """Count road intersections inside each neighborhood and derive their density.

    An intersection is a location shared by the endpoints of three or more
    distinct road segments. Endpoint coordinates are rounded to 6 decimals
    before matching, as in ``load_and_prepare_data``. An intersection is
    attributed to a neighborhood when the polygon ``contains`` the point.

    Args:
        neighborhoods_gdf: GeoDataFrame with ``geometry``, ``LIE_NAME`` and
            ``area_km2`` columns. It is modified in place.
        roads_gdf: GeoDataFrame of LineString / MultiLineString road segments.
            Other geometry types, ``None`` and empty geometries are ignored.

    Returns:
        The same ``neighborhoods_gdf`` object with ``intersection_count`` and
        ``intersection_density`` (count / ``area_km2``; infinite or NaN results
        become 0) columns set.
    """
    logging.info("Computing intersection counts for neighborhoods...")

    def get_endpoints(geom):
        """Return the start and end Point of every line part of ``geom``."""
        if geom is None or geom.is_empty:
            return []
        if geom.geom_type == 'LineString':
            parts = [geom]
        elif geom.geom_type == 'MultiLineString':
            # Multi-part geometries expose no .coords; walk the parts instead.
            parts = [part for part in geom.geoms if not part.is_empty]
        else:
            return []
        points = []
        for part in parts:
            coords = list(part.coords)
            points.extend([Point(coords[0]), Point(coords[-1])])
        return points

    # Map each rounded endpoint location to the set of roads ending there.
    endpoint_to_roads = {}
    road_geometries = zip(roads_gdf.index, roads_gdf.geometry)
    for road_idx, geom in tqdm(road_geometries, total=len(roads_gdf), desc="Extracting endpoints"):
        for point in get_endpoints(geom):
            point_key = (round(point.x, 6), round(point.y, 6))
            endpoint_to_roads.setdefault(point_key, set()).add(road_idx)

    # Identify intersections (endpoints shared by 3 or more roads)
    intersections = [
        Point(point_key)
        for point_key, road_ids in endpoint_to_roads.items()
        if len(road_ids) >= 3
    ]

    if not intersections:
        logging.warning("No intersections found. Setting intersection counts to 0.")
        neighborhoods_gdf['intersection_count'] = 0
        neighborhoods_gdf['intersection_density'] = 0.0
        return neighborhoods_gdf

    intersections_gdf = gpd.GeoDataFrame({'geometry': intersections}, crs='EPSG:3826')

    # Inner join: one row per (neighborhood, contained intersection). A left
    # join would also emit one row for every neighborhood without a match and
    # size() would then report 1 instead of 0 for it.
    joined = gpd.sjoin(
        neighborhoods_gdf[['geometry', 'LIE_NAME']],
        intersections_gdf,
        how='inner',
        predicate='contains'
    )
    # The join keeps the neighborhood index, so group on it and assign the
    # counts positionally; assigning a LIE_NAME-indexed Series would not align
    # with the frame's own index.
    counts = joined.groupby(level=0).size()
    neighborhoods_gdf['intersection_count'] = counts.reindex(neighborhoods_gdf.index, fill_value=0).to_numpy()

    # Compute intersection density (intersections per km²)
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_count'] / neighborhoods_gdf['area_km2']
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_density'].replace([np.inf, -np.inf], 0).fillna(0)

    logging.info(f"Intersection count stats: {neighborhoods_gdf['intersection_count'].describe()}")
    logging.info(f"Intersection density stats: {neighborhoods_gdf['intersection_density'].describe()}")

    return neighborhoods_gdf

def build_graph(data, force_recompute=False):
    """Build (or load from cache) the heterogeneous city graph.

    Nodes are created, in this order, for neighborhoods, buildings, roads,
    trees and transit features; a node's integer index is its position in that
    sequence. Edges are:

      * neighborhood <-> neighborhood when the two polygons intersect
        (weight 1.0);
      * entity <-> neighborhood, in both directions, when the entity's node
        geometry lies within the neighborhood polygon (weight 2.0 for transit
        nodes, 1.0 otherwise).

    The node table is attached to the returned graph as ``G._nodes`` (an ad-hoc
    attribute read by ``prepare_gnn_data``). Nodes, edges, the node-id mapping
    and the data hash are written to the ``GRAPH_*`` cache paths.

    NOTE(review): a later cell defines another ``build_graph``; when the cells
    are run in order that definition replaces this one.

    Args:
        data: dict of layers as returned by ``load_and_prepare_data``.
        force_recompute: when True, ignore any cached graph.

    Returns:
        cugraph.Graph with edge weights and a ``_nodes`` cudf DataFrame.

    Raises:
        ValueError: if no edge could be created.
    """
    logging.info("Stage 2: Building the graph...")

    # Compute data hash to check if graph needs recomputing
    data_hash = compute_data_hash(data)
    cached_hash = None
    if os.path.exists(GRAPH_DATA_HASH_PATH):
        with open(GRAPH_DATA_HASH_PATH, 'r') as f:
            cached_hash = f.read().strip()

    cache_paths = [GRAPH_NODES_CACHE_PATH, GRAPH_EDGES_CACHE_PATH, GRAPH_NODE_ID_CACHE_PATH]
    cache_is_current = cached_hash == data_hash and all(os.path.exists(path) for path in cache_paths)
    if not force_recompute and cache_is_current:
        logging.info("Data unchanged. Loading graph from cache...")
        nodes_df = cudf.read_parquet(GRAPH_NODES_CACHE_PATH)
        edges_df = cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
        G = cugraph.Graph()
        G.from_cudf_edgelist(
            edges_df,
            source='src',
            destination='dst',
            edge_attr='weight'
        )
        G._nodes = nodes_df
        logging.info("Graph loaded from cache.")
        return G

    neighborhoods_gdf = data['neighborhoods'].copy()
    buildings_gdf = data['buildings'].copy()
    roads_gdf = data['roads'].copy()
    trees_gdf = data['trees'].copy()
    transit_gdf = data['transit'].copy()

    # Create nodes for neighborhoods, buildings, roads, trees, and transit
    nodes = []
    node_id_to_index = {}

    def add_node(node_id, node_type, lie_name, geometry, **attributes):
        """Append one node record; its index is its position in `nodes`."""
        node_idx = len(nodes)
        node_id_to_index[node_id] = node_idx
        nodes.append({
            'node_idx': node_idx,
            'node_id': node_id,
            'type': node_type,
            'LIE_NAME': lie_name,
            'geometry': geometry,
            **attributes
        })

    # Neighborhood nodes (added first, so node index == row position)
    for idx, row in neighborhoods_gdf.iterrows():
        add_node(
            f"neighborhood_{row['LIE_NAME']}", 'neighborhood', row['LIE_NAME'],
            row['geometry'].centroid,
            ndvi_mean=row['ndvi_mean'],
            total_population=row['total_population'],
            elderly_percentage=row['elderly_percentage'],
            area_km2=row['area_km2']
        )

    # Building nodes
    for idx, row in buildings_gdf.iterrows():
        add_node(
            f"building_{idx}", 'building', None, row['geometry'].centroid,
            building_type=row['building'],
            area_m2=row['area_m2']
        )

    # Road nodes
    for idx, row in roads_gdf.iterrows():
        add_node(
            f"road_{idx}", 'road', None, row['geometry'].centroid,
            **{'class': row['class'], 'length_m': row['length_m']}
        )

    # Tree nodes
    for idx, row in trees_gdf.iterrows():
        add_node(f"tree_{idx}", 'tree', None, row['geometry'])

    # Transit nodes
    for idx, row in transit_gdf.iterrows():
        add_node(f"transit_{idx}", 'transit', None, row['geometry'], **{'class': row['class']})

    nodes_df = pd.DataFrame(nodes)
    nodes_gdf = gpd.GeoDataFrame(nodes_df, geometry='geometry', crs='EPSG:3826')
    nodes_df = cudf.from_pandas(nodes_df.drop(columns=['geometry']))

    n_neighborhoods = len(neighborhoods_gdf)
    neighborhood_sindex = neighborhoods_gdf.sindex

    # Neighborhood-to-neighborhood edges (shared borders). The polygons
    # themselves are queried: the node table only holds neighborhood centroids,
    # which cannot reveal a shared border.
    logging.info("Creating neighborhood-to-neighborhood edges...")
    src_pos, dst_pos = neighborhood_sindex.query(neighborhoods_gdf.geometry, predicate='intersects')
    not_self = src_pos != dst_pos
    neighbor_edges = pd.DataFrame({
        'src': src_pos[not_self],
        'dst': dst_pos[not_self],
        'weight': 1.0
    })

    # Other edges (neighborhood to building, road, tree, transit).
    # sindex.query evaluates predicate(input_geometry, tree_geometry), so
    # "entity within neighborhood" is the direction that matches
    # neighborhood.contains(entity).
    logging.info("Creating edges between neighborhoods and other entities...")
    entity_geoms = nodes_gdf.geometry.iloc[n_neighborhoods:]
    entity_pos, container_pos = neighborhood_sindex.query(entity_geoms, predicate='within')
    entity_idx = entity_pos + n_neighborhoods
    node_types = nodes_gdf['type'].to_numpy()
    # Higher weight for transit nodes
    entity_weights = np.where(node_types[entity_idx] == 'transit', 2.0, 1.0)
    entity_edges = pd.DataFrame({
        'src': np.concatenate([entity_idx, container_pos]),
        'dst': np.concatenate([container_pos, entity_idx]),
        'weight': np.concatenate([entity_weights, entity_weights])
    })

    edges_pd = pd.concat([neighbor_edges, entity_edges], ignore_index=True).astype(
        {'src': 'int64', 'dst': 'int64', 'weight': 'float64'}
    )
    if edges_pd.empty:
        raise ValueError("No edges could be created for the graph; check that the input layers overlap.")
    edges_df = cudf.from_pandas(edges_pd)

    # Build the graph
    G = cugraph.Graph()
    G.from_cudf_edgelist(
        edges_df,
        source='src',
        destination='dst',
        edge_attr='weight'
    )
    G._nodes = nodes_df

    # Cache the graph
    nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
    edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
    with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
        json.dump(node_id_to_index, f)
    with open(GRAPH_DATA_HASH_PATH, 'w') as f:
        f.write(data_hash)

    logging.info("Graph construction completed.")
    return G

def prepare_gnn_data(G):
    """Convert the city graph into a PyTorch Geometric ``Data`` object.

    Node features are six numeric columns (``ndvi_mean``, ``total_population``,
    ``elderly_percentage``, ``area_km2``, ``area_m2``, ``length_m``; a missing
    column or missing value counts as 0), standardised column-wise over all
    nodes, followed by a 5-way one-hot encoding of the node type. Unknown node
    types fall into the first one-hot slot but get ``-1`` in ``node_type``.

    The target ``y`` is ``walkability_score`` for neighborhood nodes and 0 for
    every other node; it is all zeros when that column is absent.

    NOTE(review): the edge list is read from ``G.edgelist.edgelist_df``; confirm
    that its vertex ids match the row positions of ``G._nodes``.

    Args:
        G: graph object exposing ``_nodes`` and ``edgelist.edgelist_df``, both
            with a ``to_pandas()`` method (as produced by ``build_graph``).

    Returns:
        ``Data`` with ``x``, ``edge_index``, ``edge_attr``, ``y`` and
        ``node_type`` tensors.
    """
    logging.info("Stage 3: Preparing data for GNN...")

    nodes_df = G._nodes.to_pandas()
    edges_df = G.edgelist.edgelist_df.to_pandas()

    feature_columns = [
        'ndvi_mean', 'total_population', 'elderly_percentage', 'area_km2',
        'area_m2', 'length_m'
    ]
    # 5 types: neighborhood, building, road, tree, transit
    type_mapping = {
        'neighborhood': 0,
        'building': 1,
        'road': 2,
        'tree': 3,
        'transit': 4
    }
    node_count = len(nodes_df)

    # Numeric features: absent columns and missing values become 0.
    numerical_features = (
        nodes_df.reindex(columns=feature_columns)
        .fillna(0.0)
        .to_numpy(dtype=np.float32)
    )

    # Normalize numerical features
    means = numerical_features.mean(axis=0)
    stds = numerical_features.std(axis=0)
    stds[stds == 0] = 1  # Avoid division by zero
    numerical_features = (numerical_features - means) / stds

    # One-hot encode node type
    type_codes = nodes_df['type'].map(type_mapping)
    type_encoding = np.zeros((node_count, len(type_mapping)), dtype=np.float32)
    type_encoding[np.arange(node_count), type_codes.fillna(0).astype(int).to_numpy()] = 1.0

    feature_matrix = np.hstack([numerical_features, type_encoding])

    # Create edge indices for PyG
    edge_index = torch.tensor(
        np.array([edges_df['src'].values, edges_df['dst'].values]),
        dtype=torch.long
    )
    # The weight column may be absent (graph loaded without edge_attr) or
    # carry another name; fall back to unit weights instead of failing.
    weight_column = next((col for col in ('weight', 'weights') if col in edges_df.columns), None)
    if weight_column is None:
        logging.warning("No edge weight column found in edge list. Using unit weights.")
        edge_weights = np.ones(len(edges_df), dtype=np.float32)
    else:
        edge_weights = edges_df[weight_column].values
    edge_attr = torch.tensor(edge_weights, dtype=torch.float)

    # Create target (walkability score) for neighborhood nodes
    y = np.zeros(node_count, dtype=np.float32)
    if 'walkability_score' in nodes_df.columns:
        walkability_scores = nodes_df['walkability_score'].fillna(0).to_numpy()
        is_neighborhood = (nodes_df['type'] == 'neighborhood').to_numpy()
        y[is_neighborhood] = walkability_scores[is_neighborhood]
    else:
        logging.warning("Walkability scores not found in nodes_df. Setting targets to 0.")

    y = torch.tensor(y, dtype=torch.float)

    node_type = torch.tensor(type_codes.fillna(-1).astype(int).to_numpy(), dtype=torch.long)

    data = Data(
        x=torch.tensor(feature_matrix, dtype=torch.float),
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=y,
        node_type=node_type
    )

    logging.info("GNN data prepared.")
    return data

Cell 6: Graph Construction (build_graph)

In [9]:
def compute_neighborhood_neighborhood_edges(args):
    idx, row, neighborhoods_gdf, neighborhood_sindex = args
    edges = []
    geom = row['geometry']
    possible_matches_index = list(neighborhood_sindex.intersection(geom.bounds))
    for other_idx in possible_matches_index:
        if other_idx != idx:
            other_row = neighborhoods_gdf.iloc[other_idx]
            other_geom = other_row['geometry']
            try:
                if geom.buffer(1e-3).intersects(other_geom.buffer(1e-3)) or geom.buffer(1e-3).touches(other_geom.buffer(1e-3)):
                    src = f"neighborhood_{idx}"
                    dst = f"neighborhood_{other_idx}"
                    edges.append({'src': src, 'dst': dst})
            except Exception as e:
                logging.warning(f"Error checking intersection between neighborhood {idx} and {other_idx}: {e}")
    return edges

def compute_neighborhood_building_edges(args):
    idx, row, buildings_gdf, building_sindex = args
    edges = []
    geom = row['geometry']
    possible_matches_index = list(building_sindex.intersection(geom.bounds))
    for building_idx in possible_matches_index:
        building_row = buildings_gdf.iloc[building_idx]
        building_geom = building_row['geometry']
        try:
            if geom.buffer(1e-3).intersects(building_geom.buffer(1e-3)):
                src = f"neighborhood_{idx}"
                dst = f"building_{building_idx}"
                edges.append({'src': src, 'dst': dst})
        except Exception as e:
            logging.warning(f"Error checking intersection between neighborhood {idx} and building {building_idx}: {e}")
    return edges

def compute_neighborhood_road_edges(args):
    """Find the road segments that intersect a single neighborhood.

    Args:
        args: Tuple ``(idx, row, roads_gdf, road_sindex)``. ``idx`` identifies
            the neighborhood, ``row['geometry']`` is its geometry,
            ``roads_gdf`` is accessed positionally through ``.iloc`` and
            ``road_sindex.intersection(bounds)`` must return candidate road
            positions.

    Returns:
        list[dict]: One ``{'src': 'neighborhood_<idx>', 'dst': 'road_<pos>'}``
        entry per candidate road whose buffered geometry intersects the
        buffered neighborhood geometry. Geometry errors are logged and the
        pair is skipped.
    """
    idx, row, roads_gdf, road_sindex = args
    geom = row['geometry']

    # Bounding-box pre-filter from the spatial index.
    candidates = list(road_sindex.intersection(geom.bounds))
    if not candidates:
        return []

    # Buffer the neighborhood once; it is the same for every candidate.
    try:
        buffered_geom = geom.buffer(1e-3)
    except Exception as e:
        logging.warning(f"Error buffering neighborhood {idx}: {e}")
        return []

    src = f"neighborhood_{idx}"
    edges = []
    for road_idx in candidates:
        road_geom = roads_gdf.iloc[road_idx]['geometry']
        try:
            if buffered_geom.intersects(road_geom.buffer(1e-3)):
                edges.append({'src': src, 'dst': f"road_{road_idx}"})
        except Exception as e:
            logging.warning(f"Error checking intersection between neighborhood {idx} and road {road_idx}: {e}")
    return edges



In [10]:
def build_graph(data, force_recompute=False):
    """Build the city graph of neighborhood, building and road nodes, or load it from cache.

    Nodes are numbered consecutively in the order neighborhoods, buildings,
    roads. Edges connect a neighborhood to every neighborhood, building or
    road whose *bounding box* overlaps the neighborhood's bounding box; no
    exact geometry test is performed here, so the edge set is a superset of
    the true spatial intersections.

    Args:
        data (dict): Must contain GeoDataFrames under ``'neighborhoods'``,
            ``'buildings'`` and ``'roads'``. ``data['buildings']`` gains an
            ``'area_m2'`` column in place when it lacks one.
        force_recompute (bool): When True the cache is ignored and rebuilt.

    Returns:
        cugraph.Graph: Graph built from the integer edge list, with the node
        table attached as the ad-hoc attribute ``G._nodes`` (a cudf DataFrame).
    """
    import cudf
    import cugraph
    import logging
    import json
    from tqdm import tqdm

    logging.info("Stage 2: Building city graph...")

    # Compute data hash to check if cached graph can be used
    current_hash = compute_data_hash(data)
    if not force_recompute and os.path.exists(GRAPH_NODES_CACHE_PATH) and os.path.exists(GRAPH_DATA_HASH_PATH):
        with open(GRAPH_DATA_HASH_PATH, 'r') as f:
            cached_hash = f.read()
        if cached_hash == current_hash:
            try:
                nodes_df = cudf.read_parquet(GRAPH_NODES_CACHE_PATH)
                edges_df = cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
                # The mapping is not used further in this function; reading it
                # makes a missing or corrupt file trigger a rebuild below.
                with open(GRAPH_NODE_ID_CACHE_PATH, 'r') as f:
                    node_id_to_vertex = json.load(f)
                G = cugraph.Graph()
                G._nodes = nodes_df
                if not edges_df.empty:
                    G.from_cudf_edgelist(edges_df, source='src', destination='dst')
                logging.info(f"Loaded cached graph: {len(nodes_df)} nodes, {len(edges_df)} edges")
                return G
            except Exception as e:
                logging.warning(f"Failed to load cached graph: {e}. Recomputing graph...")

    nodes = []
    vertex_to_index = {}
    # NOTE(review): keys are the per-layer row labels, so entries written for
    # neighborhoods are overwritten by buildings/roads sharing the same label.
    # Kept as-is because consumers of this cache file are not visible here.
    node_id_to_vertex = {}

    def _add_node(node_type, label, geometry, attrs):
        """Append one node record (global index = insertion order) and return it."""
        node_id = f"{node_type}_{label}"
        index = len(nodes)
        vertex_to_index[node_id] = index
        node_id_to_vertex[str(label)] = node_id
        bounds = geometry.bounds
        record = {'index': index, 'type': node_type, 'node_id': node_id}
        record.update(attrs)
        record['min_x'] = float(bounds[0])
        record['min_y'] = float(bounds[1])
        record['max_x'] = float(bounds[2])
        record['max_y'] = float(bounds[3])
        nodes.append(record)
        return record

    # Add neighborhood nodes
    logging.info("Adding neighborhood nodes...")
    neighborhoods_gdf = data['neighborhoods']
    for i, row in tqdm(neighborhoods_gdf.iterrows(), total=len(neighborhoods_gdf), desc="Neighborhood nodes"):
        record = _add_node('neighborhood', i, row.geometry, {
            'LIE_NAME': row['LIE_NAME'],
            'avg_road_accident_density': row.get('avg_road_accident_density', 0),
            'pedestrian_road_density': row.get('pedestrian_road_density', 0),
            'ndvi': row.get('ndvi', 0),
            'tree_count': row.get('tree_count', 0),
            'transit_count': row.get('transit_count', 0),
            'accident_count': row.get('accident_count', 0),
            'road_density': row.get('road_density', 0),
            'intersection_density': row.get('intersection_density', 0),
            'total_population': row.get('total_population', 0),
            'elderly_percentage': row.get('elderly_percentage', 0),
        })
        # Land-use shares go after the bounds to keep the original column order.
        for cat in CATEGORY_PRIORITY.keys():
            col = f'land_use_{cat.lower()}_percent'
            record[col] = row.get(col, 0)

    # Add building nodes
    logging.info("Adding building nodes...")
    buildings_gdf = data['buildings']
    if 'area_m2' not in buildings_gdf.columns:
        logging.warning("'area_m2' missing. Computing from geometry...")
        buildings_gdf['area_m2'] = buildings_gdf.geometry.area
    else:
        logging.info("Using existing 'area_m2' column.")

    for i, row in tqdm(buildings_gdf.iterrows(), total=len(buildings_gdf), desc="Building nodes"):
        _add_node('building', i, row.geometry, {
            'building': row.get('building', 'unknown'),
            'area_m2': row.get('area_m2', 0),
        })

    # Add road nodes
    logging.info("Adding road nodes...")
    roads_gdf = data['roads']
    for i, row in tqdm(roads_gdf.iterrows(), total=len(roads_gdf), desc="Road nodes"):
        _add_node('road', i, row.geometry, {
            'class': row.get('class', 'unknown'),
            'length_m': row.get('length_m', 0),
        })

    nodes_df = cudf.DataFrame(nodes)

    # Convert GeoDataFrames to cudf for GPU processing
    logging.info("Converting GeoDataFrames to cudf for GPU processing...")
    neighborhoods_cudf = cudf.DataFrame.from_pandas(neighborhoods_gdf.drop(columns=['geometry']))
    buildings_cudf = cudf.DataFrame.from_pandas(buildings_gdf.drop(columns=['geometry']))
    roads_cudf = cudf.DataFrame.from_pandas(roads_gdf.drop(columns=['geometry']))

    def _attach_bounds(frame, gdf):
        """Add min/max bound columns to ``frame`` and return the host-side bounds list."""
        bounds = [g.bounds for g in gdf.geometry]
        for pos, name in enumerate(('min_x', 'min_y', 'max_x', 'max_y')):
            # Reuse the frame's own index so the values stay row-aligned even
            # when the GeoDataFrame does not carry a default 0..n-1 index.
            frame[name] = cudf.Series([float(b[pos]) for b in bounds], index=frame.index)
        return bounds

    # Extract bounding box coordinates
    logging.info("Extracting bounding box coordinates...")
    neighborhood_bounds = _attach_bounds(neighborhoods_cudf, neighborhoods_gdf)
    _attach_bounds(buildings_cudf, buildings_gdf)
    _attach_bounds(roads_cudf, roads_gdf)

    logging.info(f"neighborhoods_cudf['min_x'] dtype: {neighborhoods_cudf['min_x'].dtype}")
    logging.info(f"buildings_cudf['min_x'] dtype: {buildings_cudf['min_x'].dtype}")
    logging.info(f"roads_cudf['min_x'] dtype: {roads_cudf['min_x'].dtype}")

    # Create edges using GPU-accelerated spatial joins
    logging.info("Creating edges using GPU-accelerated spatial joins...")
    edges = []
    edge_counts = {'neighborhood-neighborhood': 0, 'neighborhood-building': 0, 'neighborhood-road': 0}
    # Iterate index labels (not positions) because node ids were built from labels.
    neighborhood_labels = list(neighborhoods_gdf.index)

    def _add_bbox_edges(target_cudf, target_type, count_key, desc, skip_self=False):
        """Link every neighborhood to the ``target_type`` rows whose bounding box overlaps its own."""
        queries = zip(neighborhood_labels, neighborhood_bounds)
        for label, bounds in tqdm(queries, total=len(neighborhood_labels), desc=desc):
            # Query bounds come from the host list, avoiding four device reads per row.
            q_min_x, q_min_y, q_max_x, q_max_y = (float(v) for v in bounds[:4])
            disjoint = (
                (q_max_x < target_cudf['min_x']) |
                (q_min_x > target_cudf['max_x']) |
                (q_max_y < target_cudf['min_y']) |
                (q_min_y > target_cudf['max_y'])
            )
            matches = target_cudf[~disjoint]
            for j in matches.index.values_host:
                if skip_self and j == label:
                    continue
                edges.append({
                    'src': vertex_to_index[f"neighborhood_{label}"],
                    'dst': vertex_to_index[f"{target_type}_{j}"],
                })
                edge_counts[count_key] += 1

    # Neighborhood-Neighborhood edges
    logging.info("Computing neighborhood-neighborhood edges...")
    _add_bbox_edges(neighborhoods_cudf, 'neighborhood', 'neighborhood-neighborhood',
                    "Neighborhood-Neighborhood edges", skip_self=True)

    # Neighborhood-Building edges
    logging.info("Computing neighborhood-building edges...")
    _add_bbox_edges(buildings_cudf, 'building', 'neighborhood-building', "Neighborhood-Building edges")

    # Neighborhood-Road edges
    logging.info("Computing neighborhood-road edges...")
    _add_bbox_edges(roads_cudf, 'road', 'neighborhood-road', "Neighborhood-Road edges")

    logging.info(f"Edge counts by type: {edge_counts}")

    edges_df = cudf.DataFrame(edges)
    logging.info(f"Created {len(edges_df)} total edges")

    # Validate edges
    valid_indices = set(nodes_df['index'].to_pandas())
    if edges_df.empty:
        logging.warning("No edges created. Graph will have nodes but no edges.")
    else:
        edges_df = edges_df[edges_df['src'].isin(valid_indices) & edges_df['dst'].isin(valid_indices)]
        logging.info(f"After validation, {len(edges_df)} edges remain")
        if not edges_df.empty:
            logging.info(f"Sample edges after validation:\n{edges_df.head().to_pandas()}")

    # Create graph
    G = cugraph.Graph()
    G._nodes = nodes_df
    if not edges_df.empty:
        G.from_cudf_edgelist(edges_df, source='src', destination='dst')
    else:
        logging.warning("No valid edges created. Graph will have nodes but no edges.")

    # Save graph data to cache
    logging.info("Saving graph data to cache...")
    try:
        nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
        edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
        with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
            json.dump(node_id_to_vertex, f)
        # The hash marks the cache as valid, so write it only after every
        # other file has been written successfully.
        with open(GRAPH_DATA_HASH_PATH, 'w') as f:
            f.write(current_hash)
        logging.info("Successfully saved graph data to cache.")
    except Exception as e:
        logging.error(f"Failed to save graph data to cache: {e}")

    logging.info(f"City graph constructed: {len(nodes_df)} nodes, {len(edges_df)} edges")
    return G

Cell 7: Rule-Based Walkability Scores (compute_walkability_scores)

In [11]:
def compute_walkability_scores(G, data):
    """
    Compute walkability scores for neighborhood nodes in the graph and assign them.

    The scores come from ``compute_walkability_components_all`` and are joined
    onto the node table by ``LIE_NAME``. The join is guaranteed not to change
    the number or order of node rows, and the function can be re-run on a
    graph that already carries score columns.

    Args:
        G (cugraph.Graph): The city graph; its node table is read from and
            written back to ``G._nodes``.
        data (dict): Dictionary containing ``'neighborhoods'`` and the other
            datasets required by ``compute_walkability_components_all``.

    Returns:
        cugraph.Graph: The same graph object with ``walkability_score`` and
        ``walkability_category`` columns on its node table. Neighborhood nodes
        without a match get score 0 and category ``'low'``.
    """
    logging.info("Computing walkability scores for neighborhoods...")
    nodes_df = G._nodes.to_pandas()

    walkability_components = compute_walkability_components_all(data['neighborhoods'], data)

    logging.info(f"Number of neighborhood nodes in nodes_df: {len(nodes_df[nodes_df['type'] == 'neighborhood'])}")
    logging.info(f"Number of entries in walkability_components: {len(walkability_components)}")
    logging.info(f"Sample LIE_NAME in nodes_df: {nodes_df[nodes_df['type'] == 'neighborhood']['LIE_NAME'].head().tolist()}")
    logging.info(f"Sample LIE_NAME in walkability_components: {walkability_components['LIE_NAME'].head().tolist()}")

    score_columns = ['walkability_score', 'walkability_category']

    # Drop results of a previous run; otherwise the merge would emit
    # `_x`/`_y` suffixed columns and the lookups below would fail.
    stale_columns = [col for col in score_columns if col in nodes_df.columns]
    if stale_columns:
        nodes_df = nodes_df.drop(columns=stale_columns)

    scores = walkability_components[['LIE_NAME'] + score_columns]
    # pandas matches null keys with null keys; nodes without a LIE_NAME
    # (buildings, roads) must never pick up a score that way.
    scores = scores[scores['LIE_NAME'].notna()]
    # A repeated key on the right side would duplicate node rows and break the
    # one-row-per-node layout the rest of the pipeline relies on.
    repeated = scores['LIE_NAME'].duplicated(keep='first')
    if repeated.any():
        logging.warning(
            f"Found {int(repeated.sum())} duplicate LIE_NAME entries in walkability components. "
            "Keeping the first score for each name."
        )
        scores = scores[~repeated]

    nodes_df = nodes_df.merge(scores, on='LIE_NAME', how='left')

    is_neighborhood = nodes_df['type'] == 'neighborhood'
    unmatched = nodes_df[is_neighborhood & nodes_df['walkability_score'].isna()]
    if len(unmatched) > 0:
        logging.warning(f"Found {len(unmatched)} neighborhood nodes without walkability scores. Filling with 0.")
        nodes_df.loc[is_neighborhood, 'walkability_score'] = nodes_df['walkability_score'].fillna(0)
        nodes_df.loc[is_neighborhood, 'walkability_category'] = nodes_df['walkability_category'].fillna('low')

    G._nodes = cudf.from_pandas(nodes_df)

    logging.info("Finished computing walkability scores.")
    return G

Cell 8: GNN Data Preparation (prepare_gnn_data)

In [12]:
def prepare_gnn_data(G):
    """Convert the city graph into a PyTorch Geometric ``Data`` object.

    Feature rows are emitted grouped by node type in the order neighborhood,
    building, road, each group in node-table order. Numerical features are
    z-scored within each node type; building types and road classes are
    one-hot encoded. Edge indices are taken from the graph's edge list and
    refer to the ``'index'`` column of the node table.

    Args:
        G (cugraph.Graph): Graph whose node table is stored in ``G._nodes``.
            Neighborhood nodes must carry a ``walkability_score`` column.

    Returns:
        torch_geometric.data.Data: ``x`` is the float feature matrix, ``y`` a
        ``[num_nodes, 1]`` label tensor (walkability score for neighborhoods,
        0 for other nodes), ``edge_index`` a ``[2, num_edges]`` long tensor and
        ``node_types`` a list with one type string per row.

    Raises:
        ValueError: If the per-type feature blocks cannot be stacked.
    """
    import torch
    import logging
    import pandas as pd
    import numpy as np
    from tqdm import tqdm

    logging.info("Preparing data for GNN training...")
    nodes_df = G._nodes
    edges_df = G.edgelist.edgelist_df if G.edgelist else cudf.DataFrame()

    numerical_features = [
        'ndvi', 'tree_count', 'transit_count', 'accident_count',
        'road_density', 'intersection_density', 'total_population',
        'elderly_percentage', 'area_m2', 'length_m', 'avg_road_accident_density',
        'pedestrian_road_density'
    ] + [f'land_use_{cat.lower()}_percent' for cat in CATEGORY_PRIORITY.keys()]

    numerical_features.append('land_use_diversity')

    building_types = nodes_df[nodes_df['type'] == 'building']['building'].to_pandas().unique()
    road_classes = nodes_df[nodes_df['type'] == 'road']['class'].to_pandas().unique()
    categorical_features = (
        [f'building_{bt}' for bt in building_types if pd.notna(bt)] +
        [f'road_class_{rc}' for rc in road_classes if pd.notna(rc)]
    )

    all_features = numerical_features + categorical_features

    features_list = []
    labels_list = []
    node_types = []

    for node_type in tqdm(['neighborhood', 'building', 'road'], desc="Normalizing features by node type"):
        subset = nodes_df[nodes_df['type'] == node_type].to_pandas()
        if subset.empty:
            logging.warning(f"No nodes of type {node_type} found. Skipping.")
            continue

        # Every node type shares the same feature columns; columns that do not
        # apply to a type stay at 0.
        subset_features = pd.DataFrame(0.0, index=subset.index, columns=all_features)

        if node_type == 'neighborhood':
            # Compute land_use_diversity (Shannon entropy of the land-use shares)
            land_use_cols = [col for col in subset.columns if col.startswith('land_use_') and col.endswith('_percent')]
            if land_use_cols:
                subset['land_use_diversity'] = subset[land_use_cols].apply(
                    lambda row: -np.sum([p * np.log(p + 1e-10) for p in row / 100.0 if p > 0]), axis=1
                )
            else:
                subset['land_use_diversity'] = 0

            for col in numerical_features:
                if col in subset.columns:
                    subset_features[col] = subset[col].astype(float).fillna(0)
                    logging.debug(f"{node_type} - {col} pre-normalization std: {subset[col].std():.4f}")
                else:
                    logging.warning(f"Column {col} missing in neighborhood nodes. Setting to 0.")
                    subset_features[col] = 0
        elif node_type == 'building':
            if 'area_m2' in subset.columns:
                subset_features['area_m2'] = subset['area_m2'].astype(float).fillna(0)
                logging.debug(f"{node_type} - area_m2 pre-normalization std: {subset['area_m2'].std():.4f}")
        else:  # road
            if 'length_m' in subset.columns:
                subset_features['length_m'] = subset['length_m'].astype(float).fillna(0)
                logging.debug(f"{node_type} - length_m pre-normalization std: {subset['length_m'].std():.4f}")

        if node_type == 'building':
            for bt in building_types:
                if pd.notna(bt):
                    subset_features[f'building_{bt}'] = (subset['building'] == bt).astype(float)
        elif node_type == 'road':
            for rc in road_classes:
                if pd.notna(rc):
                    subset_features[f'road_class_{rc}'] = (subset['class'] == rc).astype(float)

        # Z-score normalization for numerical features
        for col in numerical_features:
            col_std = subset_features[col].std()
            if col_std > 0:
                subset_features[col] = (
                    (subset_features[col] - subset_features[col].mean()) / col_std
                ).fillna(0)
                logging.debug(f"{node_type} - {col} post-normalization std: {subset_features[col].std():.4f}")
            else:
                # Zero variance (or NaN std for a single row): zero the column
                # so an unscaled constant never reaches the model.
                subset_features[col] = 0.0
                logging.debug(f"Column {col} has zero variance or is missing for {node_type}. Setting to 0.")

        logging.info(f"Node type {node_type}: {len(subset)} nodes, feature shape: {subset_features.shape}")

        features_list.append(subset_features.values)

        if node_type == 'neighborhood':
            labels = subset['walkability_score'].astype(float).fillna(0).values
            labels_list.append(labels[:, None])  # Shape [n, 1]
        else:
            # Only neighborhoods carry a target; other rows get a placeholder 0.
            labels_list.append(np.zeros((len(subset), 1)))

        node_types.extend([node_type] * len(subset))

    try:
        features = np.vstack(features_list)
    except ValueError as e:
        logging.error(f"Failed to stack features: {e}")
        raise

    labels = np.vstack(labels_list)
    features_tensor = torch.tensor(features, dtype=torch.float)
    labels_tensor = torch.tensor(labels, dtype=torch.float)

    if not edges_df.empty:
        edge_index = torch.tensor(edges_df[['src', 'dst']].to_pandas().values.T, dtype=torch.long)
        logging.info(f"Edge index created with {edge_index.shape[1]} edges")
        max_index = nodes_df['index'].max()
        if edge_index.max() > max_index or edge_index.min() < 0:
            # A single out-of-range endpoint discards the whole edge set.
            logging.warning(f"Edge indices out of bounds: min={edge_index.min()}, max={edge_index.max()}, expected max={max_index}")
            edge_index = torch.empty((2, 0), dtype=torch.long)
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long)
        logging.warning("No edges found in graph.")

    data = Data(
        x=features_tensor,
        edge_index=edge_index,
        y=labels_tensor
    )

    data.node_types = node_types

    logging.info(f"Prepared GNN data: {features_tensor.shape[0]} nodes, {edge_index.shape[1]} edges")
    logging.info(f"Feature matrix shape: {features_tensor.shape}")
    logging.info(f"Label tensor shape: {labels_tensor.shape}")
    return data

Cell 9: GNN Model, Training and Prediction (GNNWalkabilityPredictor, train_gnn_model, predict_walkability)

In [13]:
class GNNWalkabilityPredictor(torch.nn.Module):
    """Two GAT layers followed by a two-layer MLP head producing one score per node.

    Args:
        num_features (int): Width of the input node feature vectors.
        hidden_dim (int): Per-head width of the first GAT layer; the second
            GAT layer outputs ``hidden_dim // 2`` and the MLP hidden layer
            ``hidden_dim // 4``.
        num_heads (int): Number of attention heads in the first GAT layer
            (head outputs are concatenated).
        dropout_rate (float): Dropout probability applied after the first GAT
            layer and inside the MLP head.
    """

    def __init__(self, num_features, hidden_dim=128, num_heads=4, dropout_rate=0.3):
        super().__init__()
        self.conv1 = GATConv(num_features, hidden_dim, heads=num_heads, concat=True)
        self.bn1 = BatchNorm(hidden_dim * num_heads)
        self.conv2 = GATConv(hidden_dim * num_heads, hidden_dim // 2, heads=1, concat=True)
        self.bn2 = BatchNorm(hidden_dim // 2)
        self.fc1 = torch.nn.Linear(hidden_dim // 2, hidden_dim // 4)
        self.fc2 = torch.nn.Linear(hidden_dim // 4, 1)
        self.dropout = torch.nn.Dropout(dropout_rate)

    def forward(self, data):
        """Return a ``[num_nodes, 1]`` tensor of sigmoid-squashed scores.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index
        if edge_index.numel() == 0:
            # The GAT layers must still run: `fc1` expects `hidden_dim // 2`
            # inputs, so feeding it the raw features would fail on shape.
            # GATConv adds self-loops by default, so an empty edge set is valid
            # input and each node simply attends to itself.
            logging.warning("No edges in the graph. GAT layers will attend over self-loops only.")
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return torch.sigmoid(x)

def train_gnn_model(data_gnn, hidden_dim=128, num_heads=4, dropout_rate=0.3, lr=0.005, weight_decay=1e-4, epochs=500, patience=30):
    """Train ``GNNWalkabilityPredictor`` on the neighborhood nodes of ``data_gnn``.

    Neighborhood nodes are split 80/20 into train/validation with a fixed seed
    (42). The model is fitted full-batch with Adam and MSE loss; the learning
    rate is halved when validation loss plateaus, and training stops early
    after ``patience`` epochs without improvement. The best-validation weights
    are checkpointed to ``CHECKPOINT_DIR/best_gnn_model.pth`` and restored at
    the end.

    Args:
        data_gnn: PyG ``Data`` object with ``x``, ``y``, ``edge_index`` and a
            ``node_types`` list; ``train_mask``/``val_mask`` are attached to it.
        hidden_dim, num_heads, dropout_rate: Forwarded to the model.
        lr (float): Adam learning rate.
        weight_decay (float): Adam weight decay.
        epochs (int): Maximum number of epochs.
        patience (int): Early-stopping patience in epochs.

    Returns:
        dict: ``'model'`` plus per-epoch lists ``'train_losses'``,
        ``'val_losses'``, ``'train_maes'``, ``'val_maes'``, ``'train_r2s'``
        and ``'val_r2s'``.

    Raises:
        ValueError: If ``data_gnn`` contains no neighborhood nodes.
    """
    logging.info("Stage 4: Training GNN model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data_gnn = data_gnn.to(device)

    neighborhood_mask = np.array([t == 'neighborhood' for t in data_gnn.node_types])
    train_indices = np.where(neighborhood_mask)[0]

    if len(train_indices) == 0:
        logging.error("No neighborhood nodes found for training.")
        raise ValueError("No neighborhood nodes found for training.")

    np.random.seed(42)
    train_idx = np.random.choice(train_indices, size=int(0.8 * len(train_indices)), replace=False)
    val_idx = np.setdiff1d(train_indices, train_idx)

    # Keep the masks on the same device as the tensors they index.
    train_mask = torch.zeros(data_gnn.num_nodes, dtype=torch.bool, device=device)
    val_mask = torch.zeros(data_gnn.num_nodes, dtype=torch.bool, device=device)
    train_mask[torch.as_tensor(train_idx, dtype=torch.long, device=device)] = True
    val_mask[torch.as_tensor(val_idx, dtype=torch.long, device=device)] = True
    data_gnn.train_mask = train_mask
    data_gnn.val_mask = val_mask

    neighborhood_labels = data_gnn.y[torch.as_tensor(neighborhood_mask, device=device)].cpu().numpy()
    logging.info(f"Target (walkability_score) distribution for neighborhood nodes:\n{pd.Series(neighborhood_labels.flatten()).describe()}")

    model = GNNWalkabilityPredictor(num_features=data_gnn.x.shape[1], hidden_dim=hidden_dim, num_heads=num_heads, dropout_rate=dropout_rate).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)
    criterion = torch.nn.MSELoss()

    best_val_loss = float('inf')
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    best_model_path = os.path.join(CHECKPOINT_DIR, 'best_gnn_model.pth')
    # Tracks whether THIS run wrote a checkpoint, so a file left behind by an
    # earlier run is never loaded by mistake.
    checkpoint_saved = False
    early_stop_counter = 0

    train_losses = []
    val_losses = []
    train_maes = []
    val_maes = []
    train_r2s = []
    val_r2s = []

    for epoch in tqdm(range(epochs), desc="Training epochs"):
        model.train()
        optimizer.zero_grad()
        out = model(data_gnn)
        loss = criterion(out[data_gnn.train_mask], data_gnn.y[data_gnn.train_mask])
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            val_out = model(data_gnn)
            val_loss = criterion(val_out[data_gnn.val_mask], data_gnn.y[data_gnn.val_mask])

            # Train metrics use the training-mode forward pass (dropout active).
            train_pred = out[data_gnn.train_mask].detach().cpu().numpy()
            train_true = data_gnn.y[data_gnn.train_mask].cpu().numpy()
            val_pred = val_out[data_gnn.val_mask].detach().cpu().numpy()
            val_true = data_gnn.y[data_gnn.val_mask].cpu().numpy()

            train_mae = mean_absolute_error(train_true, train_pred)
            train_r2 = r2_score(train_true, train_pred)
            val_mae = mean_absolute_error(val_true, val_pred)
            val_r2 = r2_score(val_true, val_pred)

            current_val_loss = val_loss.item()
            train_losses.append(loss.item())
            val_losses.append(current_val_loss)
            train_maes.append(train_mae)
            val_maes.append(val_mae)
            train_r2s.append(train_r2)
            val_r2s.append(val_r2)

        scheduler.step(current_val_loss)

        if epoch % 10 == 0:
            logging.info(f"Epoch {epoch}, Train Loss: {loss.item():.4f}, Train MAE: {train_mae:.4f}, Train R2: {train_r2:.4f}, Val Loss: {current_val_loss:.4f}, Val MAE: {val_mae:.4f}, Val R2: {val_r2:.4f}")

        if current_val_loss < best_val_loss:
            best_val_loss = current_val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                logging.info(f"Early stopping at epoch {epoch}")
                break

    if checkpoint_saved:
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    else:
        # Happens when the validation loss is NaN from the start or epochs == 0.
        logging.warning("Validation loss never improved; keeping last-epoch weights instead of loading a checkpoint.")
    logging.info("Finished training GNN model.")
    return {
        'model': model,
        'train_losses': train_losses,
        'val_losses': val_losses,
        'train_maes': train_maes,
        'val_maes': val_maes,
        'train_r2s': train_r2s,
        'val_r2s': val_r2s
    }

def predict_walkability(G, model):
    """Run the trained model over the graph and store its scores on neighborhood nodes.

    Predictions are written to a ``walkability_gnn`` column (clipped to
    ``[0, 1]``). ``walkability_category`` is then OVERWRITTEN for neighborhood
    nodes with ``'low'``/``'medium'``/``'high'`` derived from the 33rd and 66th
    percentiles of the GNN scores.

    Args:
        G (cugraph.Graph): Graph whose node table is stored in ``G._nodes``.
        model (torch.nn.Module): Trained model accepting the object returned
            by ``prepare_gnn_data``.

    Returns:
        cugraph.Graph: The same graph object with the updated node table.

    Raises:
        ValueError: If the model returns a different number of rows than the
            node table has.
    """
    logging.info("Predicting walkability scores using GNN model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()

    data_gnn = prepare_gnn_data(G)
    data_gnn = data_gnn.to(device)

    with torch.no_grad():
        predictions = model(data_gnn)
    # Move to host NumPy before masking, so no (possibly CUDA) tensor is
    # indexed with a pandas Series.
    scores = predictions.detach().cpu().numpy().flatten()

    nodes_df = G._nodes.to_pandas()
    if len(scores) != len(nodes_df):
        raise ValueError(
            f"Model returned {len(scores)} predictions for {len(nodes_df)} nodes."
        )

    # Assumes prediction rows follow node-table row order (true when the table
    # is ordered neighborhoods, buildings, roads) -- TODO confirm for other inputs.
    neighborhood_mask = nodes_df['type'] == 'neighborhood'
    nodes_df.loc[neighborhood_mask, 'walkability_gnn'] = scores[neighborhood_mask.to_numpy(dtype=bool)]

    nodes_df['walkability_gnn'] = nodes_df['walkability_gnn'].clip(0, 1)

    # Compute walkability_category with dynamic thresholds for GNN predictions
    low_threshold = nodes_df.loc[neighborhood_mask, 'walkability_gnn'].quantile(0.33)
    high_threshold = nodes_df.loc[neighborhood_mask, 'walkability_gnn'].quantile(0.66)
    logging.info(f"GNN walkability category thresholds - low: {low_threshold:.4f}, high: {high_threshold:.4f}")

    def categorize_gnn_score(score):
        """Map a score to 'low' / 'medium' / 'high' using the percentile cut-offs."""
        if score < low_threshold:
            return 'low'
        elif score < high_threshold:
            return 'medium'
        else:
            return 'high'

    nodes_df.loc[neighborhood_mask, 'walkability_category'] = nodes_df.loc[neighborhood_mask, 'walkability_gnn'].apply(categorize_gnn_score)

    G._nodes = cudf.from_pandas(nodes_df)

    logging.info("Finished predicting walkability scores.")
    return G

Cell 10: Interactive Map Generation (create_interactive_map)

In [14]:
def _optional_layer_geojson(data, key, wanted_columns, target_crs='EPSG:4326'):
    """Serialise the optional GeoDataFrame ``data[key]`` to GeoJSON text.

    Only the columns of ``wanted_columns`` that exist are kept. The layer is
    skipped (with a warning) when the dataset is absent, has no ``geometry``
    column, or has no CRS to reproject from.

    Returns:
        ``(geojson_text, attribute_columns)`` where ``attribute_columns`` are
        the kept non-geometry columns, or ``(None, [])`` when skipped.
    """
    label = key.capitalize()
    if key not in data:
        logging.warning(f"{label} data not found in data dictionary. Skipping {key} layer.")
        return None, []

    layer_gdf = data[key]
    present = [col for col in wanted_columns if col in layer_gdf.columns]
    if 'geometry' not in present:
        logging.warning(f"{label} GeoDataFrame missing 'geometry' column. Skipping {key} layer.")
        return None, []
    if layer_gdf.crs is None:
        # to_crs() cannot transform geometries without a source CRS.
        logging.warning(f"{label} GeoDataFrame has no CRS; cannot reproject to {target_crs}. Skipping {key} layer.")
        return None, []

    # Select first so only the needed columns are carried through reprojection.
    layer_gdf = layer_gdf[present]
    if layer_gdf.crs != target_crs:
        layer_gdf = layer_gdf.to_crs(target_crs)
    return layer_gdf.to_json(), [col for col in present if col != 'geometry']


def _categorical_geojson_layer(layer_id, label, color, stroke_color, opacity,
                               thickness, color_field, palette_name, palette):
    """Build a Kepler.gl geojson layer config coloured by a string column.

    ``color_field`` may be ``None``; the layer then carries no colour channel
    and falls back to the flat ``color``.
    """
    channel = {}
    if color_field is not None:
        channel = {
            "colorField": {"name": color_field, "type": "string"},
            "colorScale": "ordinal",
        }
    vis_config = {
        "opacity": opacity,
        "strokeOpacity": opacity,
        "thickness": thickness,
        "strokeColor": stroke_color,
    }
    vis_config.update(channel)
    vis_config["colorRange"] = {
        "name": palette_name,
        "type": "all",
        "category": "ColorBrewer",
        "colors": list(palette),
    }
    return {
        "id": layer_id,
        "type": "geojson",
        "config": {
            "dataId": layer_id,
            "label": label,
            "color": color,
            # NOTE(review): datasets are handed to Kepler.gl as GeoJSON text;
            # confirm "geometry" is the column name Kepler.gl exposes for them
            # (it may be "_geojson").
            "columns": {"geojson": "geometry"},
            "isVisible": True,
            "visConfig": vis_config,
        },
        "visualChannels": dict(channel),
    }


def _neighborhoods_layer():
    """Kepler.gl layer config for neighborhood polygons coloured by ``walkability_gnn``."""
    warm_palette = ["#5A1846", "#900C3F", "#C70039", "#E3611C", "#F1920E", "#FFC107"]
    gnn_channel = {
        "colorField": {"name": "walkability_gnn", "type": "real"},
        "colorScale": "quantile",
    }
    return {
        "id": "neighborhoods",
        "type": "geojson",
        "config": {
            "dataId": "neighborhoods",
            "label": "Neighborhoods",
            "color": [18, 147, 154],
            # NOTE(review): see the same remark in _categorical_geojson_layer
            # about the geojson column name.
            "columns": {"geojson": "geometry"},
            "isVisible": True,
            "visConfig": {
                "opacity": 0.7,
                "strokeOpacity": 0.9,
                "thickness": 1,
                "strokeColor": [255, 255, 255],
                "colorRange": {
                    "name": "Global Warming",
                    "type": "sequential",
                    "colors": list(warm_palette),
                },
                "strokeColorRange": {
                    "name": "Global Warming",
                    "type": "sequential",
                    "colors": list(warm_palette),
                },
                **gnn_channel,
            },
        },
        "visualChannels": dict(gnn_channel),
    }


def create_interactive_map(G, data):
    """Generate an interactive Kepler.gl map of walkability scores and save it as HTML.

    Neighborhood polygons from ``data['neighborhoods']`` are joined (on the
    whitespace-stripped ``LIE_NAME``) with the neighborhood rows of ``G._nodes``
    and coloured by ``walkability_gnn``. ``data['roads']`` and
    ``data['buildings']`` are added as extra layers when present.

    Neighborhoods without a matching node get ``walkability_score = 0``,
    ``walkability_gnn = 0`` and ``walkability_category = 'low'``; the number of
    such rows is logged as a warning.

    Args:
        G: Graph object exposing a cudf node table as ``G._nodes`` with the
            columns ``LIE_NAME``, ``type``, ``walkability_score``,
            ``walkability_gnn`` and ``walkability_category``.
        data: Dict of GeoDataFrames; must contain ``'neighborhoods'`` with
            ``LIE_NAME`` and ``geometry`` columns.

    Returns:
        None. The map is written to ``taipei_walkability_map.html`` in
        ``BASE_DIR``.
    """
    logging.info("Generating interactive Kepler.gl map...")

    score_columns = ['walkability_score', 'walkability_gnn', 'walkability_category']

    nodes_df = G._nodes.to_pandas()
    neighborhoods_gdf = data['neighborhoods'][['LIE_NAME', 'geometry']].copy()

    # Standardize LIE_NAME on both sides so the merge keys compare equal.
    nodes_df['LIE_NAME'] = nodes_df['LIE_NAME'].astype(str).str.strip()
    neighborhoods_gdf['LIE_NAME'] = neighborhoods_gdf['LIE_NAME'].astype(str).str.strip()

    neighborhood_nodes = nodes_df.loc[
        nodes_df['type'] == 'neighborhood', ['LIE_NAME'] + score_columns
    ]

    map_data = (
        neighborhoods_gdf
        .merge(neighborhood_nodes, on='LIE_NAME', how='left')
        .drop_duplicates(subset=['LIE_NAME'], keep='first')
    )

    unmatched = int(map_data['walkability_gnn'].isna().sum())
    if unmatched:
        logging.warning(
            f"{unmatched} neighborhoods have no GNN score after the merge; "
            "they will be shown with score 0 and category 'low'."
        )

    map_data = map_data.assign(
        walkability_score=map_data['walkability_score'].fillna(0),
        walkability_gnn=map_data['walkability_gnn'].fillna(0),
        walkability_category=map_data['walkability_category'].fillna('low'),
    )

    # Reproject the whole frame so the frame-level CRS and the coordinates stay
    # consistent. EPSG:3826 is only assumed when the data carries no CRS at all.
    map_data = gpd.GeoDataFrame(map_data, geometry='geometry')
    if map_data.crs is None:
        logging.warning("Neighborhood geometries have no CRS; assuming EPSG:3826.")
        map_data = map_data.set_crs('EPSG:3826')
    map_data = map_data.to_crs('EPSG:4326')

    kepler_data = {
        'neighborhoods': map_data[['LIE_NAME'] + score_columns + ['geometry']].to_json()
    }
    layers = [_neighborhoods_layer()]

    # Tooltip formats are d3-format specifiers (not Python format strings).
    tooltips = {
        "neighborhoods": [
            {"name": "LIE_NAME", "format": None},
            {"name": "walkability_score", "format": ".3f"},
            {"name": "walkability_gnn", "format": ".3f"},
            {"name": "walkability_category", "format": None}
        ]
    }

    optional_layers = (
        {
            "key": "roads",
            "label": "Roads",
            "columns": ['class', 'length_m', 'geometry'],
            "color_field": "class",
            "tooltip": [("class", None), ("length_m", ".2f")],
            "color": [255, 0, 0],
            "stroke_color": [255, 0, 0],
            "opacity": 0.8,
            "thickness": 2,
            "palette_name": "ColorBrewer Paired-12",
            "palette": ["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c",
                        "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a", "#ffff99", "#b15928"],
        },
        {
            "key": "buildings",
            "label": "Buildings",
            "columns": ['building', 'area_m2', 'geometry'],
            "color_field": "building",
            "tooltip": [("building", None), ("area_m2", ".2f")],
            "color": [0, 255, 0],
            "stroke_color": [0, 0, 0],
            "opacity": 0.5,
            "thickness": 0.5,
            "palette_name": "ColorBrewer Set3-12",
            "palette": ["#8dd3c7", "#ffffb3", "#bebada", "#fb8072", "#80b1d3", "#fdb462",
                        "#b3de69", "#fccde5", "#d9d9d9", "#bc80bd", "#ccebc5", "#ffed6f"],
        },
    )

    for spec in optional_layers:
        geojson_text, attribute_columns = _optional_layer_geojson(data, spec["key"], spec["columns"])
        if geojson_text is None:
            continue
        kepler_data[spec["key"]] = geojson_text
        # Only colour by / show columns that actually exist in the dataset.
        color_field = spec["color_field"] if spec["color_field"] in attribute_columns else None
        layers.append(_categorical_geojson_layer(
            layer_id=spec["key"],
            label=spec["label"],
            color=spec["color"],
            stroke_color=spec["stroke_color"],
            opacity=spec["opacity"],
            thickness=spec["thickness"],
            color_field=color_field,
            palette_name=spec["palette_name"],
            palette=spec["palette"],
        ))
        tooltips[spec["key"]] = [
            {"name": column, "format": fmt}
            for column, fmt in spec["tooltip"]
            if column in attribute_columns
        ]

    config = {
        "version": "v1",
        "config": {
            "visState": {
                "layers": layers,
                "interactionConfig": {
                    "tooltip": {
                        "fieldsToShow": tooltips,
                        "enabled": True
                    }
                }
            },
            "mapState": {
                "latitude": 25.0330,
                "longitude": 121.5654,
                "zoom": 11
            },
            "mapStyle": {
                "styleType": "dark"
            }
        }
    }

    map_1 = KeplerGl(height=800, data=kepler_data, config=config)
    map_path = os.path.join(BASE_DIR, 'taipei_walkability_map.html')
    # save_to_html already prints "Map saved to <path>!" itself, so no extra
    # print is issued here (the message used to appear twice in the output).
    map_1.save_to_html(file_name=map_path)
    logging.info(f"Interactive map generated and saved as {map_path}")

Cell 11: Main Execution (main)

In [15]:
def _run_timed_step(step_name, timings, step_func, *args, **kwargs):
    """Announce a pipeline step, run it and store its duration in ``timings``.

    The duration (seconds) is recorded under ``step_name`` only when the step
    returns normally. Returns whatever ``step_func`` returns.
    """
    print(f"Starting {step_name}...")
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments.
    started = time.perf_counter()
    outcome = step_func(*args, **kwargs)
    timings[step_name] = time.perf_counter() - started
    return outcome


def main(force_recompute_graph=False):
    """Main execution pipeline for the analysis.

    Runs, in order: data loading, road-type/accident correlation, pedestrian
    road density, graph construction, rule-based walkability scoring, GNN data
    preparation, GNN training, GNN prediction and interactive map generation.
    Afterwards it logs summary statistics of the neighborhood scores, their
    correlation, and the time spent in each step.

    Args:
        force_recompute_graph: Forwarded to ``build_graph`` as
            ``force_recompute``.

    Returns:
        The object returned by ``train_gnn_model`` (the trained model is read
        from its ``'model'`` key).

    Raises:
        Exception: Any error raised by a pipeline step is logged with its
            traceback and re-raised unchanged.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    os.makedirs(SUBGRAPH_DIR, exist_ok=True)
    logging.info(f"Ensured subgraph directory exists: {SUBGRAPH_DIR}")

    # Seconds spent in each step, keyed by step name.
    timings = {}

    try:
        # Step 1: Load and prepare data
        data = _run_timed_step('load_and_prepare_data', timings, load_and_prepare_data)

        # Step 2: Compute road type accident correlation.
        # The returned summary is not used here; the call is kept for whatever
        # it logs/plots/updates as a side effect.
        _run_timed_step(
            'compute_road_type_accident_correlation', timings,
            compute_road_type_accident_correlation,
            data['roads'], data['neighborhoods'], data['accidents'],
        )

        # Step 2.5: Compute pedestrian road density
        data['neighborhoods'] = _run_timed_step(
            'compute_pedestrian_road_density', timings,
            compute_pedestrian_road_density,
            data['roads'], data['neighborhoods'],
        )

        # Step 3: Build graph
        G = _run_timed_step(
            'build_graph', timings, build_graph, data, force_recompute=force_recompute_graph
        )

        # Validate edge counts
        edge_count = G.edgelist.edgelist_df.shape[0] if G.edgelist else 0
        logging.info(f"Graph edge count: {edge_count}")
        if edge_count == 0:
            logging.warning("Graph has no edges. GNN will not utilize graph structure.")

        # Step 4: Compute walkability scores
        G = _run_timed_step('compute_walkability_scores', timings, compute_walkability_scores, G, data)

        # Step 5: Prepare GNN data
        data_gnn = _run_timed_step('prepare_gnn_data', timings, prepare_gnn_data, G)

        # Step 6: Train GNN model
        results = _run_timed_step('train_gnn_model', timings, train_gnn_model, data_gnn)
        model = results['model']

        # Step 7: Predict walkability
        G = _run_timed_step('predict_walkability', timings, predict_walkability, G, model)

        # Step 8: Create interactive map
        _run_timed_step('create_interactive_map', timings, create_interactive_map, G, data)

        # Final validation: Check walkability scores
        nodes_df = G._nodes.to_pandas()
        neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood']
        walkability_score_stats = neighborhood_nodes['walkability_score'].describe()
        walkability_gnn_stats = neighborhood_nodes['walkability_gnn'].describe()
        walkability_category_dist = neighborhood_nodes['walkability_category'].value_counts()
        non_zero_walkability = (neighborhood_nodes['walkability_score'] > 0).sum()
        non_zero_walkability_gnn = (neighborhood_nodes['walkability_gnn'] > 0).sum()

        logging.info("Final validation - Walkability scores in neighborhood nodes:")
        logging.info(f"Walkability score distribution:\n{walkability_score_stats}")
        logging.info(f"Walkability GNN distribution:\n{walkability_gnn_stats}")
        logging.info(f"Walkability category distribution:\n{walkability_category_dist}")
        logging.info(f"Number of neighborhood nodes with non-zero walkability_score: {non_zero_walkability}/{len(neighborhood_nodes)}")
        logging.info(f"Number of neighborhood nodes with non-zero walkability_gnn: {non_zero_walkability_gnn}/{len(neighborhood_nodes)}")

        # Check for low variation in walkability scores
        if walkability_score_stats['std'] < 0.05:
            logging.warning("Walkability scores have low variation (std < 0.05). Components may need adjustment.")
        if walkability_gnn_stats['std'] < 0.05:
            logging.warning("GNN predictions have low variation (std < 0.05). Check edge creation and model training.")

        # Correlation between rule-based and GNN scores. Pearson's r is
        # undefined for fewer than two pairs or a constant column, and a NaN
        # result would slip past the `corr < 0.5` check, so guard explicitly.
        paired_scores = neighborhood_nodes[['walkability_score', 'walkability_gnn']].dropna()
        if len(paired_scores) < 2 or paired_scores.nunique().min() < 2:
            logging.warning("Cannot compute correlation between walkability_score and walkability_gnn: too few valid or non-constant values.")
        else:
            corr, p_value = pearsonr(paired_scores['walkability_score'], paired_scores['walkability_gnn'])
            # .3g keeps very small p-values readable instead of printing 0.00.
            logging.info(f"Correlation between walkability_score and walkability_gnn: {corr:.2f} (p-value: {p_value:.3g})")
            if corr < 0.5:
                logging.warning("Low correlation between walkability_score and walkability_gnn. GNN predictions may not align well with rule-based scores.")

        # Log timing summary
        logging.info("Processing complete. Timing summary:")
        for step, duration in timings.items():
            logging.info(f"{step}: {duration:.2f} seconds")

        print("Pipeline completed successfully.")
        # An edgeless graph is tolerated above, so guard the preview as well.
        # head() runs before to_pandas() so only the previewed rows are copied.
        if G.edgelist:
            print(G.edgelist.edgelist_df.head().to_pandas())

        return results

    except Exception as e:
        # logging.exception records the traceback alongside the message.
        logging.exception(f"Pipeline failed with error: {str(e)}")
        raise

def plot_training_history(results, output_dir=None):
    """Save train/validation curves for loss, MAE and R2 as PNG files.

    One figure is written per metric:
    ``training_validation_loss.png``, ``training_validation_mae.png`` and
    ``training_validation_r2.png``.

    Args:
        results: Mapping with the per-epoch sequences ``train_losses``,
            ``val_losses``, ``train_maes``, ``val_maes``, ``train_r2s`` and
            ``val_r2s``.
        output_dir: Directory to write the PNG files to. Created if missing.
            ``None`` (default) writes to the current working directory.

    Returns:
        None.

    Raises:
        KeyError: If ``results`` lacks one of the expected keys.
    """
    # (train key, validation key, legend suffix, y-axis label / title suffix, file name)
    panels = (
        ('train_losses', 'val_losses', 'Loss', 'Loss', 'training_validation_loss.png'),
        ('train_maes', 'val_maes', 'MAE', 'MAE', 'training_validation_mae.png'),
        ('train_r2s', 'val_r2s', 'R2', 'R2 Score', 'training_validation_r2.png'),
    )

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    for train_key, val_key, legend_suffix, axis_label, file_name in panels:
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.plot(results[train_key], label=f'Train {legend_suffix}')
        ax.plot(results[val_key], label=f'Val {legend_suffix}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(axis_label)
        ax.legend()
        ax.set_title(f'Training and Validation {axis_label}')

        target = file_name if output_dir is None else os.path.join(output_dir, file_name)
        fig.savefig(target)
        # Close explicitly so figures do not accumulate in the kernel.
        plt.close(fig)

# Entry point: run the full pipeline and then save the training-curve plots.
if __name__ == "__main__":
    # force_recompute_graph=True is forwarded by main() to
    # build_graph(force_recompute=...). `results` stays bound at module level
    # after this runs, so it remains available for further inspection.
    results = main(force_recompute_graph=True)
    plot_training_history(results)

2025-04-27 18:44:03,055 - INFO - Ensured subgraph directory exists: /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/subgraphs
2025-04-27 18:44:03,056 - INFO - Stage 1: Loading and preparing data...


Starting load_and_prepare_data...


Loading files:   0%|          | 0/8 [00:00<?, ?it/s]2025-04-27 18:44:03,224 - INFO - Loaded neighborhoods with shape (456, 57)
Loading files:  12%|█▎        | 1/8 [00:00<00:01,  6.05it/s]2025-04-27 18:44:04,431 - INFO - Loaded buildings with shape (74306, 9)
Loading files:  25%|██▌       | 2/8 [00:01<00:04,  1.29it/s]2025-04-27 18:44:04,613 - INFO - Loaded roads with shape (81444, 2)
Loading files:  38%|███▊      | 3/8 [00:01<00:02,  1.98it/s]2025-04-27 18:44:04,650 - INFO - Loaded trees with shape (5019, 12)
2025-04-27 18:44:04,726 - INFO - Loaded transit with shape (29892, 11)
Loading files:  62%|██████▎   | 5/8 [00:01<00:00,  3.98it/s]2025-04-27 18:44:05,355 - INFO - Loaded urban_masterplan with shape (15521, 15)
Loading files:  75%|███████▌  | 6/8 [00:02<00:00,  2.78it/s]2025-04-27 18:44:05,872 - INFO - Loaded accidents with shape (56133, 8)
Loading files:  88%|████████▊ | 7/8 [00:02<00:00,  2.46it/s]2025-04-27 18:44:05,881 - INFO - Columns in population_df after loading: ['LIE_NAM


--- Percentage Calculation Process ---

Neighborhood: 板溪里 (Index: 373)
Total unique master plan area: 63777.59 m²
Area of City_Open_Area (priority 10): 478.13 m²
Area of Education (priority 6): 8173.36 m²
Area of Commercial (priority 4): 23017.42 m²
Area of Residential (priority 3): 32108.69 m²

Percentages:
City_Open_Area: 0.75%
Education: 12.82%
Commercial: 36.09%
Residential: 50.34%
Sum of percentages: 100.00%

Neighborhood: 芝山里 (Index: 39)
Total unique master plan area: 1061285.95 m²
Area of Education (priority 6): 80442.15 m²
Area of Residential (priority 3): 196204.41 m²
Area of Natural (priority 2): 775753.11 m²
Area of River (priority 1): 2754.93 m²
Area of Government (priority 1): 6131.35 m²

Percentages:
Education: 7.58%
Residential: 18.49%
Natural: 73.10%
River: 0.26%
Government: 0.58%
Sum of percentages: 100.00%

Neighborhood: 和平里 (Index: 340)
Total unique master plan area: 98073.11 m²
Area of City_Open_Area (priority 10): 4608.53 m²
Area of Public_Transportation (priority

2025-04-27 18:44:36,257 - INFO - Finished loading and preparing data.
2025-04-27 18:44:36,297 - INFO - Computing correlation between road types and accident density...
2025-04-27 18:44:36,318 - INFO - Roads CRS: {"$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", "type": "ProjectedCRS", "name": "TWD97 / TM2 zone 121", "base_crs": {"name": "TWD97", "datum": {"type": "GeodeticReferenceFrame", "name": "Taiwan Datum 1997", "ellipsoid": {"name": "GRS 1980", "semi_major_axis": 6378137, "inverse_flattening": 298.257222101}}, "coordinate_system": {"subtype": "ellipsoidal", "axis": [{"name": "Geodetic latitude", "abbreviation": "Lat", "direction": "north", "unit": "degree"}, {"name": "Geodetic longitude", "abbreviation": "Lon", "direction": "east", "unit": "degree"}]}, "id": {"authority": "EPSG", "code": 3824}}, "conversion": {"name": "Taiwan 2-degree TM zone 121", "method": {"name": "Transverse Mercator", "id": {"authority": "EPSG", "code": 9807}}, "parameters": [{"name": "Latitu


--- Data Structure Summary ---

Dataset: neighborhoods
Shape: (456, 68)
Columns and Data Types:
LIE_NAME                            object
SECT_NAME                           object
2024population                       int32
land_use_city_open_area_count        int32
land_use_city_open_area_area_m2    float64
                                    ...   
transit_count                        int64
accident_count                       int64
road_density                       float64
total_population                     int64
elderly_percentage                 float64
Length: 68, dtype: object
Missing values (total): 0
Missing values per column:
LIE_NAME                           0
SECT_NAME                          0
2024population                     0
land_use_city_open_area_count      0
land_use_city_open_area_area_m2    0
                                  ..
transit_count                      0
accident_count                     0
road_density                       0
total_population  

See: https://matplotlib.org/stable/tutorials/intermediate/legend_guide.html#implementing-a-custom-legend-handler
  plt.legend()
2025-04-27 18:44:37,758 - INFO - Overlay plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/roads_neighborhoods_overlap.png
2025-04-27 18:44:38,201 - INFO - Assigning accidents to nearest road...
2025-04-27 18:44:42,736 - INFO - Matched 56133 accidents out of 56133
2025-04-27 18:44:42,740 - INFO - Reassigning 4991 accidents from footway/cycleway...
2025-04-27 18:44:42,835 - INFO - Reassigned 1429 accidents to wider roads
2025-04-27 18:44:42,849 - INFO - Accidents by road type:
class
bridleway            0
cycleway           247
footway           3315
living_street       79
motorway           109
path                86
pedestrian          84
primary           6535
residential      10110
secondary        16180
service           5011
steps               47
tertiary          9135
track                8
trunk             2493
unclassified      166

Road type counts:
class
service          21204
footway          16755
residential      14861
tertiary          5113
secondary         3869
path              3610
steps             2968
unclassified      1894
primary           1209
cycleway           825
track              716
trunk              593
motorway           313
pedestrian         297
living_street      264
unknown             54
bridleway            2
Name: count, dtype: int64

--- Road Type Accident Density Summary ---
            class    length_m  accident_count  accident_density  width_rank
1        cycleway   263682.07             245              0.26           1
2         footway  1767503.72            3235              0.73           1
3   living_street    23929.78              77              1.58           3
4        motorway   215317.82             109              2.74           5
5            path   720602.60              84              0.09           1
6      pedestrian    31830.26              82              

2025-04-27 18:44:45,640 - INFO - Road-neighborhood join resulted in 86725 matches with columns: ['geometry', 'class', 'length_m', 'width_rank', 'accident_density', 'index_right', 'LIE_NAME']
2025-04-27 18:44:45,642 - INFO - Non-NaN LIE_NAME count: 62277
2025-04-27 18:44:45,643 - INFO - Non-NaN accident_density count: 86725
2025-04-27 18:44:45,646 - INFO - Unique LIE_NAME values: 456
2025-04-27 18:44:45,652 - INFO - Number of neighborhoods with calculated avg_accident_density: 456
2025-04-27 18:44:45,653 - INFO - NaN in avg_accident_density: 0
2025-04-27 18:44:45,655 - INFO - Assigned avg_road_accident_density to 456 neighborhoods
2025-04-27 18:44:45,657 - INFO - Avg road accident density stats:
count    456.000000
mean       8.928906
std        5.840511
min        0.010199
25%        4.955368
50%        7.926718
75%       12.076502
max       33.147588
Name: avg_road_accident_density, dtype: float64
2025-04-27 18:44:45,819 - INFO - Bar chart saved to /home/johnny/Iaacthesis/projects/Geo

Bar chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_bar.png


2025-04-27 18:44:46,254 - INFO - Box chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_box.png


Box chart saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_box.png


2025-04-27 18:44:46,509 - INFO - Scatter plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_scatter.png
2025-04-27 18:44:46,512 - INFO - Top 3 road types by accident density:
        class  accident_density
7     primary             44.39
9   secondary             36.32
14      trunk             11.69
2025-04-27 18:44:46,513 - INFO - Computing pedestrian road density per neighborhood...


Scatter plot saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/road_type_accident_scatter.png
Top 3 road types by accident density:
        class  accident_density
7     primary             44.39
9   secondary             36.32
14      trunk             11.69
Starting compute_pedestrian_road_density...


2025-04-27 18:44:46,920 - INFO - pedestrian_road_density stats:
count    456.000000
mean      15.841759
std       10.276229
min        0.000000
25%        8.202007
50%       15.006599
75%       21.558429
max       57.267153
Name: pedestrian_road_density, dtype: float64
2025-04-27 18:44:46,924 - INFO - Stage 2: Building city graph...
2025-04-27 18:44:46,925 - INFO - Dataset neighborhoods column types:
LIE_NAME                            object
SECT_NAME                           object
2024population                       int32
land_use_city_open_area_count        int32
land_use_city_open_area_area_m2    float64
                                    ...   
road_density                       float64
total_population                     int64
elderly_percentage                 float64
avg_road_accident_density          float64
pedestrian_road_density            float64
Length: 70, dtype: object
2025-04-27 18:44:46,935 - INFO - Dataset buildings column types:
full_id       object
osm_id     

Starting build_graph...


Neighborhood nodes: 100%|██████████| 456/456 [00:00<00:00, 12142.81it/s]
2025-04-27 18:44:46,994 - INFO - Adding building nodes...
Building nodes: 100%|██████████| 74306/74306 [00:03<00:00, 21517.15it/s]
2025-04-27 18:44:50,453 - INFO - Adding road nodes...
Road nodes: 100%|██████████| 81444/81444 [00:03<00:00, 21348.82it/s]
2025-04-27 18:44:54,941 - INFO - Converting GeoDataFrames to cudf for GPU processing...
2025-04-27 18:44:55,067 - INFO - Extracting bounding box coordinates...
2025-04-27 18:44:57,026 - INFO - neighborhoods_cudf['min_x'] dtype: float64
2025-04-27 18:44:57,027 - INFO - buildings_cudf['min_x'] dtype: float64
2025-04-27 18:44:57,027 - INFO - roads_cudf['min_x'] dtype: float64
2025-04-27 18:44:57,027 - INFO - Creating edges using GPU-accelerated spatial joins...
2025-04-27 18:44:57,028 - INFO - Computing neighborhood-neighborhood edges...
Neighborhood-Neighborhood edges:   0%|          | 0/456 [00:00<?, ?it/s]2025-04-27 18:44:57,063 - INFO - init
Neighborhood-Neighborh

Starting compute_walkability_scores...


2025-04-27 18:45:13,915 - INFO - Finished computing walkability scores.
2025-04-27 18:45:13,918 - INFO - Preparing data for GNN training...


Starting prepare_gnn_data...


Normalizing features by node type:   0%|          | 0/3 [00:00<?, ?it/s]2025-04-27 18:45:13,994 - INFO - Node type neighborhood: 456 nodes, feature shape: (456, 127)
2025-04-27 18:45:14,265 - INFO - Node type building: 74306 nodes, feature shape: (74306, 127)
Normalizing features by node type:  67%|██████▋   | 2/3 [00:00<00:00,  5.88it/s]2025-04-27 18:45:14,438 - INFO - Node type road: 81444 nodes, feature shape: (81444, 127)
Normalizing features by node type: 100%|██████████| 3/3 [00:00<00:00,  5.78it/s]
2025-04-27 18:45:14,529 - INFO - Edge index created with 270909 edges
2025-04-27 18:45:14,540 - INFO - Prepared GNN data: 156206 nodes, 270909 edges
2025-04-27 18:45:14,540 - INFO - Feature matrix shape: torch.Size([156206, 127])
2025-04-27 18:45:14,541 - INFO - Label tensor shape: torch.Size([156206, 1])
2025-04-27 18:45:14,544 - INFO - Stage 4: Training GNN model...


Starting train_gnn_model...


2025-04-27 18:45:14,834 - INFO - Target (walkability_score) distribution for neighborhood nodes:
count    456.000000
mean       0.359348
std        0.040287
min        0.233758
25%        0.334115
50%        0.359866
75%        0.383302
max        0.593854
dtype: float64
Training epochs:   0%|          | 0/500 [00:00<?, ?it/s]2025-04-27 18:45:15,554 - INFO - Epoch 0, Train Loss: 0.0046, Train MAE: 0.0556, Train R2: -1.6796, Val Loss: 0.0069, Val MAE: 0.0766, Val R2: -4.4969
Training epochs:   2%|▏         | 10/500 [00:02<01:56,  4.21it/s]2025-04-27 18:45:17,846 - INFO - Epoch 10, Train Loss: 0.0016, Train MAE: 0.0306, Train R2: 0.0589, Val Loss: 0.0025, Val MAE: 0.0422, Val R2: -0.9846
Training epochs:   4%|▍         | 20/500 [00:04<01:43,  4.66it/s]2025-04-27 18:45:20,010 - INFO - Epoch 20, Train Loss: 0.0013, Train MAE: 0.0261, Train R2: 0.2636, Val Loss: 0.0011, Val MAE: 0.0257, Val R2: 0.0966
Training epochs:   6%|▌         | 30/500 [00:07<01:41,  4.62it/s]2025-04-27 18:45:22,163 -

Starting predict_walkability...


Normalizing features by node type:   0%|          | 0/3 [00:00<?, ?it/s]2025-04-27 18:45:35,106 - INFO - Node type neighborhood: 456 nodes, feature shape: (456, 127)
2025-04-27 18:45:35,403 - INFO - Node type building: 74306 nodes, feature shape: (74306, 127)
Normalizing features by node type:  67%|██████▋   | 2/3 [00:00<00:00,  5.26it/s]2025-04-27 18:45:35,580 - INFO - Node type road: 81444 nodes, feature shape: (81444, 127)
Normalizing features by node type: 100%|██████████| 3/3 [00:00<00:00,  5.33it/s]
2025-04-27 18:45:35,665 - INFO - Edge index created with 270909 edges
2025-04-27 18:45:35,668 - INFO - Prepared GNN data: 156206 nodes, 270909 edges
2025-04-27 18:45:35,669 - INFO - Feature matrix shape: torch.Size([156206, 127])
2025-04-27 18:45:35,669 - INFO - Label tensor shape: torch.Size([156206, 1])
2025-04-27 18:45:36,139 - INFO - GNN walkability category thresholds - low: 0.3505, high: 0.3690
2025-04-27 18:45:36,265 - INFO - Finished predicting walkability scores.
2025-04-27 1

Starting create_interactive_map...
User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


2025-04-27 18:45:42,863 - INFO - Interactive map generated and saved as /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html
2025-04-27 18:45:42,992 - INFO - Final validation - Walkability scores in neighborhood nodes:
2025-04-27 18:45:42,994 - INFO - Walkability score distribution:
count    456.000000
mean       0.359348
std        0.040287
min        0.233758
25%        0.334115
50%        0.359866
75%        0.383302
max        0.593854
Name: walkability_score, dtype: float64
2025-04-27 18:45:42,995 - INFO - Walkability GNN distribution:
count    456.000000
mean       0.360371
std        0.022180
min        0.290742
25%        0.346118
50%        0.360622
75%        0.374672
max        0.420831
Name: walkability_gnn, dtype: float64
2025-04-27 18:45:42,995 - INFO - Walkability category distribution:
walkability_category
high      155
low       151
medium    150
Name: count, dtype: int64
2025-04-27 18:45:42,996 - INFO - Number of neighborhood nodes with n

Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html!
Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html!
Pipeline completed successfully.
   src  dst
0    0    1
1    0    2
2    0    3
3    0    4
4    0    5
