In [86]:
import cudf, cugraph
print(cudf.__version__, cugraph.__version__)

25.02.02 25.02.00


Cell 1: Imports ,Global Constants

In [87]:
import geopandas as gpd
import pandas as pd
import cudf
import cugraph
from tqdm import tqdm
import logging
import os
import numpy as np
from scipy.stats import pearsonr
from shapely import make_valid
from shapely.errors import GEOSException
from shapely.geometry import Point
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from IPython.display import Image, display
from sklearn.cluster import KMeans
import plotly.express as px  
import plotly.graph_objects as go  
import cupy
import hashlib
import json
import networkx as nx
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, BatchNorm
from keplergl import KeplerGl

# Try to import CuSpatial for GPU-accelerated spatial operations
try:
    import cuspatial
    CUSPATIAL_AVAILABLE = True
except ImportError:
    logging.warning("CuSpatial not available, falling back to CPU-based computation for intersection counts.")
    CUSPATIAL_AVAILABLE = False

# Ensure plots are displayed inline
%matplotlib inline

# Set the font to support Traditional Chinese
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Noto Sans CJK TC', 'Noto Serif CJK TC', 'Noto Sans Mono CJK TC', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# Define file paths
BASE_DIR = "/home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data"
LANDUSE_NDVI_PATH = os.path.join(BASE_DIR, "neighborhoods_with_ndvi_numerical.geojson")
OSM_BUILDINGS_PATH = os.path.join(BASE_DIR, "Taipei_Buildings_fulldata.geojson")
OSM_ROADS_PATH = os.path.join(BASE_DIR, "taipei_segments_cleaned_verified.geoparquet")
OSM_TREES_PATH = os.path.join(BASE_DIR, "taipei_land.geoparquet")
OSM_TRANSIT_PATH = os.path.join(BASE_DIR, "taipei_infrastructure.geoparquet")
URBAN_MASTERPLAN_PATH = os.path.join(BASE_DIR, "Taipei_urban_masterplan.geojson")
ACCIDENTS_PATH = os.path.join(BASE_DIR, "2023_accidents.geojson")
POPULATION_PATH = os.path.join(BASE_DIR, "population.json")
SUBGRAPH_DIR = os.path.join(BASE_DIR, "subgraphs")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
INTERSECTION_CACHE_PATH = os.path.join(BASE_DIR, "neighborhoods_with_intersections.geoparquet")
GRAPH_NODES_CACHE_PATH = os.path.join(BASE_DIR, "graph_nodes.parquet")
GRAPH_EDGES_CACHE_PATH = os.path.join(BASE_DIR, "graph_edges.parquet")
GRAPH_NODE_ID_CACHE_PATH = os.path.join(BASE_DIR, "graph_node_id_to_index.json")
GRAPH_DATA_HASH_PATH = os.path.join(BASE_DIR, "graph_data_hash.txt")

# Ensure directories exist
os.makedirs(SUBGRAPH_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Define category priority (higher index = higher priority)
CATEGORY_PRIORITY = {
    'City_Open_Area': 10,
    'Pedestrian': 9,
    'Public_Transportation': 8,
    'Amenity': 7,
    'Education': 6,
    'Medical': 5,
    'Commercial': 4,
    'Residential': 3,
    'Natural': 2,
    'Road': 1,
    'River': 1,
    'Infrastructure': 1,
    'Government': 1,
    'Special_Zone': 1,
    'Military': 1,
    'Industrial': 1,
    'Agriculture': 1
}

# Define land use weights for walkability scoring
land_use_weights = {
    'city_open_area': 0.8,
    'commercial': 0.7,
    'infrastructure': 0.4,
    'government': 0.5,
    'public_transportation': 0.8,
    'education': 0.7,
    'medical': 0.6,
    'amenity': 0.8,
    'road': 0.3,
    'pedestrian': 1.0,
    'natural': 0.7,
    'special_zone': 0.4,
    'river': 0.7,
    'military': 0.2,
    'residential': 0.6,
    'industrial': 0.3,
    'agriculture': 0.4
}

Cell 2: Utility Functions

In [88]:
def print_data_structure(data_dict):
    print("\n--- Detailed Data Structure Overview ---")
    for key, df in data_dict.items():
        if isinstance(df, (gpd.GeoDataFrame, pd.DataFrame, cudf.DataFrame)):
            df = df.to_pandas() if isinstance(df, cudf.DataFrame) else df
            print(f"\nDataset: {key}")
            print(f"Shape: {df.shape}")
            print(f"Columns: {list(df.columns)}")
            print(f"Data types:\n{df.dtypes}")
            print(f"Missing values per column:\n{df.isnull().sum()}")
            if not df.select_dtypes(include=['float64', 'int64']).empty:
                summary = df.describe().round(2)
                print(f"Summary statistics for numerical columns:\n{summary}")
            print("Sample data (first 5 rows):")
            sample = df.head(5).copy()
            for col in sample.select_dtypes(include=['float64', 'int64']).columns:
                sample[col] = sample[col].round(2)
            print(sample)
    print("--- End of Detailed Data Structure Overview ---\n")

def fix_geometry(geom, buffer_size=1e-5):
    if geom is None or geom.is_empty:
        return geom
    geom = make_valid(geom)
    if not geom.is_valid:
        geom = geom.buffer(buffer_size)
        geom = make_valid(geom)
    if not geom.is_valid:
        logging.warning(f"Geometry remains invalid after fixing: {geom.bounds}")
    return geom

def print_percentage_calculation(neighborhoods_gdf, urban_masterplan_gdf, sample_size=3):
    print("\n--- Percentage Calculation Process ---")
    sample_neighborhoods = neighborhoods_gdf.sample(min(sample_size, len(neighborhoods_gdf)), random_state=42)
    
    for idx, row in sample_neighborhoods.iterrows():
        lie_name = row['LIE_NAME']
        print(f"\nNeighborhood: {lie_name} (Index: {idx})")
        
        neighborhood_geom = fix_geometry(row['geometry'])
        if not neighborhood_geom.is_valid:
            print(f"Neighborhood geometry is invalid after fixing: {lie_name}")
            continue
        
        relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
        if relevant_masterplan.empty:
            print("No master plan polygons intersect with this neighborhood.")
            continue
        
        temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs='EPSG:3826')
        intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
        if intersected.empty:
            print("No valid intersections after overlay.")
            continue
        
        intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
        intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty]
        if intersected.empty:
            print("No valid geometries after fixing intersected polygons.")
            continue
        
        intersected['priority'] = intersected['Category'].map(CATEGORY_PRIORITY)
        intersected = intersected.sort_values(by='priority', ascending=False)
        
        total_area_geom = intersected.geometry.union_all()
        total_area_geom = fix_geometry(total_area_geom)
        if not total_area_geom.is_valid or total_area_geom.is_empty:
            print("Total area geometry is invalid after fixing.")
            continue
        total_area = total_area_geom.area
        print(f"Total unique master plan area: {total_area:.2f} m²")
        
        remaining_geom = total_area_geom
        category_areas = {}
        unique_categories = intersected['Category'].unique()
        
        for category in sorted(unique_categories, key=lambda x: CATEGORY_PRIORITY.get(x, 0), reverse=True):
            category_rows = intersected[intersected['Category'] == category]
            category_geom = category_rows.geometry.union_all()
            category_geom = fix_geometry(category_geom)
            if not category_geom.is_valid or category_geom.is_empty:
                print(f"Geometry for category {category} is invalid after fixing.")
                category_areas[category] = 0.0
                continue
            
            try:
                category_area_geom = category_geom.intersection(remaining_geom)
                category_area_geom = fix_geometry(category_area_geom)
                if not category_area_geom.is_valid or category_area_geom.is_empty:
                    print(f"Intersection geometry for category {category} is invalid after fixing.")
                    category_areas[category] = 0.0
                    continue
                
                category_area = category_area_geom.area
                category_areas[category] = category_area
                print(f"Area of {category} (priority {CATEGORY_PRIORITY.get(category, 0)}): {category_area:.2f} m²")
                
                remaining_geom = remaining_geom.difference(category_area_geom)
                remaining_geom = fix_geometry(remaining_geom)
                if not remaining_geom.is_valid or remaining_geom.is_empty:
                    print(f"Remaining geometry is invalid after subtracting {category}.")
                    break
            except GEOSException as e:
                print(f"Topology error for category {category}: {e}")
                category_areas[category] = 0.0
                continue
        
        print("\nPercentages:")
        total_percentage = 0.0
        for category, area in category_areas.items():
            percentage = (area / total_area * 100) if total_area > 0 else 0.0
            total_percentage += percentage
            print(f"{category}: {percentage:.2f}%")
        print(f"Sum of percentages: {total_percentage:.2f}%")
    print("--- End of Percentage Calculation Process ---\n")
    
def compute_data_hash(data_dict):
    """Compute a hash of the input data for cache validation."""
    hasher = hashlib.sha256()
    
    for key, df in data_dict.items():
        if isinstance(df, (gpd.GeoDataFrame, pd.DataFrame, cudf.DataFrame)):
            df = df.to_pandas() if isinstance(df, cudf.DataFrame) else df
            # Hash the shape
            hasher.update(str(df.shape).encode('utf-8'))
            # Hash the column names
            hasher.update(str(sorted(df.columns)).encode('utf-8'))
            # Hash a sample of the data (first 5 rows, serialized as string)
            sample = df.head(5).to_json()
            hasher.update(sample.encode('utf-8'))
    
    return hasher.hexdigest()

Cell 3: Walkability Computation Functions

In [89]:
def compute_walkability_components(neighborhoods_gdf, sample_size=5):
    sample_gdf = neighborhoods_gdf.sample(min(sample_size, len(neighborhoods_gdf)), random_state=42)
    
    components = {
        'LIE_NAME': [],
        'land_use_diversity': [],
        'green_space_score': [],
        'transit_score': [],
        'road_connectivity': [],
        'safety_score': [],
        'elderly_accessibility': [],
        'walkability_score': []
    }
    
    ndvi_min, ndvi_max = neighborhoods_gdf['ndvi_mean'].min(), neighborhoods_gdf['ndvi_mean'].max()
    tree_min, tree_max = neighborhoods_gdf['tree_count'].min(), neighborhoods_gdf['tree_count'].max()
    transit_min, transit_max = neighborhoods_gdf['transit_count'].min(), neighborhoods_gdf['transit_count'].max()
    
    # Check if intersection_density exists, default to 0 if missing
    if 'intersection_density' in neighborhoods_gdf.columns:
        intersection_density_min = neighborhoods_gdf['intersection_density'].min()
        intersection_density_max = neighborhoods_gdf['intersection_density'].max()
    else:
        logging.warning("'intersection_density' column missing in neighborhoods_gdf. Defaulting to 0.")
        intersection_density_min = 0
        intersection_density_max = 1  # Avoid division by zero
    
    accident_min, accident_max = neighborhoods_gdf['accident_count'].min(), neighborhoods_gdf['accident_count'].max()
    
    for idx, row in sample_gdf.iterrows():
        land_use_cols = [f"land_use_{category.lower()}_percent" for category in CATEGORY_PRIORITY.keys()]
        land_use_values = [row.get(col, 0.0) / 100 for col in land_use_cols if col in row]
        land_use_values = [v for v in land_use_values if v > 0]
        if land_use_values:
            entropy = -np.sum([p * np.log2(p) for p in land_use_values])
            max_entropy = np.log2(len(land_use_values)) if len(land_use_values) > 0 else 1
            land_use_diversity = entropy / max_entropy if max_entropy > 0 else 0
        else:
            land_use_diversity = 0
        
        ndvi_normalized = ((row['ndvi_mean'] - ndvi_min) / (ndvi_max - ndvi_min + 1e-6))
        tree_normalized = ((row['tree_count'] - tree_min) / (tree_max - tree_min + 1e-6))
        open_area = row.get('land_use_city_open_area_percent', 0.0) / 100
        green_space_score = (ndvi_normalized + tree_normalized + open_area) / 3
        
        transit_raw = (row['transit_count'] - transit_min) / (transit_max - transit_min + 1e-6)
        transit_score = np.log1p(transit_raw * 10) / np.log1p(10)
        
        # Safely access intersection_density
        intersection_density = row.get('intersection_density', 0.0)
        intersection_density_normalized = (intersection_density - intersection_density_min) / (intersection_density_max - intersection_density_min + 1e-6)
        road_connectivity = np.log1p(intersection_density_normalized * 10) / np.log1p(10)
        
        accident_density = row['accident_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        accident_density_max = (accident_max / neighborhoods_gdf['area_km2'].min()) if neighborhoods_gdf['area_km2'].min() > 0 else 1
        safety_score = 1 - (accident_density / (accident_density_max + 1e-6))
        
        elderly_accessibility = row['elderly_percentage'] / 100
        
        base_score = (
            0.3 * land_use_diversity +
            0.3 * green_space_score +
            0.2 * transit_score +
            0.2 * road_connectivity
        )
        
        safety_modifier = 0.5 + 0.5 * safety_score
        elderly_modifier = 1 + elderly_accessibility
        walkability_score = base_score * safety_modifier * elderly_modifier
        
        walkability_score = 1 / (1 + np.exp(-5 * (walkability_score - 1)))
        
        components['LIE_NAME'].append(row['LIE_NAME'])
        components['land_use_diversity'].append(land_use_diversity)
        components['green_space_score'].append(green_space_score)
        components['transit_score'].append(transit_score)
        components['road_connectivity'].append(road_connectivity)
        components['safety_score'].append(safety_score)
        components['elderly_accessibility'].append(elderly_accessibility)
        components['walkability_score'].append(walkability_score)
    
    return pd.DataFrame(components)

def compute_walkability_components_all(neighborhoods_df):
    components = {
        'LIE_NAME': [],
        'land_use_diversity': [],
        'green_space_score': [],
        'transit_score': [],
        'road_connectivity': [],
        'safety_score': [],
        'elderly_accessibility': [],
        'walkability_score': []
    }
    
    # Precompute min/max for normalization
    ndvi_min, ndvi_max = neighborhoods_df['ndvi_mean'].min(), neighborhoods_df['ndvi_mean'].max()
    tree_min, tree_max = neighborhoods_df['tree_count'].min(), neighborhoods_df['tree_count'].max()
    transit_min, transit_max = neighborhoods_df['transit_count'].min(), neighborhoods_df['transit_count'].max()
    
    # Handle missing intersection_density
    if 'intersection_density' in neighborhoods_df.columns:
        intersection_density_min = neighborhoods_df['intersection_density'].min()
        intersection_density_max = neighborhoods_df['intersection_density'].max()
    else:
        logging.warning("'intersection_density' column missing. Defaulting to 0.")
        intersection_density_min = 0
        intersection_density_max = 1  # Avoid division by zero
    
    accident_min, accident_max = neighborhoods_df['accident_count'].min(), neighborhoods_df['accident_count'].max()
    
    for idx, row in neighborhoods_df.iterrows():
        # Compute land use diversity
        land_use_cols = [f"land_use_{category.lower()}_percent" for category in CATEGORY_PRIORITY.keys()]
        land_use_values = [row.get(col, 0.0) / 100 for col in land_use_cols if col in row]
        land_use_values = [v for v in land_use_values if v > 0]
        if land_use_values:
            entropy = -np.sum([p * np.log2(p) for p in land_use_values])
            max_entropy = np.log2(len(land_use_values)) if len(land_use_values) > 0 else 1
            land_use_diversity = entropy / max_entropy if max_entropy > 0 else 0
        else:
            land_use_diversity = 0
        
        # Compute green space score
        ndvi_normalized = ((row['ndvi_mean'] - ndvi_min) / (ndvi_max - ndvi_min + 1e-6))
        tree_normalized = ((row['tree_count'] - tree_min) / (tree_max - tree_min + 1e-6))
        open_area = row.get('land_use_city_open_area_percent', 0.0) / 100
        green_space_score = (ndvi_normalized + tree_normalized + open_area) / 3
        
        # Compute transit score
        transit_raw = (row['transit_count'] - transit_min) / (transit_max - transit_min + 1e-6)
        transit_score = np.log1p(transit_raw * 10) / np.log1p(10)
        
        # Compute road connectivity
        intersection_density = row.get('intersection_density', 0.0)
        intersection_density_normalized = (intersection_density - intersection_density_min) / (intersection_density_max - intersection_density_min + 1e-6)
        road_connectivity = np.log1p(intersection_density_normalized * 10) / np.log1p(10)
        
        # Compute safety score
        accident_density = row['accident_count'] / row['area_km2'] if row['area_km2'] > 0 else 0
        accident_density_max = (accident_max / neighborhoods_df['area_km2'].min()) if neighborhoods_df['area_km2'].min() > 0 else 1
        safety_score = 1 - (accident_density / (accident_density_max + 1e-6))
        
        # Compute elderly accessibility
        elderly_accessibility = row['elderly_percentage'] / 100
        
        # Compute base walkability score
        base_score = (
            0.3 * land_use_diversity +
            0.3 * green_space_score +
            0.2 * transit_score +
            0.2 * road_connectivity
        )
        
        # Apply modifiers
        safety_modifier = 0.5 + 0.5 * safety_score
        elderly_modifier = 1 + elderly_accessibility
        walkability_score = base_score * safety_modifier * elderly_modifier
        
        # Normalize with sigmoid
        walkability_score = 1 / (1 + np.exp(-5 * (walkability_score - 1)))
        
        components['LIE_NAME'].append(row['LIE_NAME'])
        components['land_use_diversity'].append(land_use_diversity)
        components['green_space_score'].append(green_space_score)
        components['transit_score'].append(transit_score)
        components['road_connectivity'].append(road_connectivity)
        components['safety_score'].append(safety_score)
        components['elderly_accessibility'].append(elderly_accessibility)
        components['walkability_score'].append(walkability_score)
    
    walkability_df = pd.DataFrame(components)
    print("Walkability score distribution:")
    print(walkability_df['walkability_score'].describe())
    
    corr, p_value = pearsonr(walkability_df['walkability_score'], neighborhoods_df['transit_count'])
    logging.info(f"Correlation between walkability score and transit count: {corr:.2f} (p-value: {p_value:.2f})")
    
    return walkability_df

Cell 4 Main Data Loading and Processing

In [90]:
def load_and_prepare_data():
    logging.info("Stage 1: Loading and preparing data...")
    with tqdm(total=8, desc="Loading files") as pbar:
        neighborhoods_gdf = gpd.read_file(
            LANDUSE_NDVI_PATH,
            encoding='utf-8-sig',
            columns=['LIE_NAME', 'geometry', 'land_use_residential_percent', 'land_use_commercial_percent',
                     'land_use_education_percent', 'ndvi_mean']
        ).to_crs('EPSG:3826')
        # Check for duplicate LIE_NAME and log without deduplicating
        duplicates = neighborhoods_gdf['LIE_NAME'].duplicated().sum()
        if duplicates > 0:
            logging.warning(f"Found {duplicates} duplicate LIE_NAME values. Preserving all entries for node creation...")
        pbar.update(1)

        buildings_gdf = gpd.read_file(OSM_BUILDINGS_PATH, columns=['geometry', 'building']).to_crs('EPSG:3826')
        buildings_gdf['area_m2'] = buildings_gdf.geometry.area
        pbar.update(1)

        roads_gdf = gpd.read_parquet(OSM_ROADS_PATH, columns=['geometry', 'class']).to_crs('EPSG:3826')
        roads_gdf['length_m'] = roads_gdf.geometry.length
        logging.info(f"Found {roads_gdf['class'].isnull().sum()} roads with missing 'class' values.")
        roads_gdf['class'] = roads_gdf['class'].fillna('unknown')
        pbar.update(1)

        trees_gdf = gpd.read_parquet(OSM_TREES_PATH, columns=['geometry', 'subtype', 'class']).to_crs('EPSG:3826')
        trees_gdf = trees_gdf[trees_gdf['subtype'] == 'tree']
        pbar.update(1)

        transit_gdf = gpd.read_parquet(OSM_TRANSIT_PATH, columns=['geometry', 'class']).to_crs('EPSG:3826')
        transit_gdf = transit_gdf[transit_gdf['class'].isin(['stop_position', 'bus_stop'])]
        pbar.update(1)

        urban_masterplan_gdf = gpd.read_file(URBAN_MASTERPLAN_PATH, columns=['geometry', 'Category']).to_crs('EPSG:3826')
        if 'area' not in urban_masterplan_gdf.columns:
            urban_masterplan_gdf['area'] = urban_masterplan_gdf.geometry.area
        pbar.update(1)

        accidents_gdf = gpd.read_file(ACCIDENTS_PATH, columns=['geometry']).to_crs('EPSG:3826')
        pbar.update(1)

        population_df = pd.read_json(POPULATION_PATH, encoding='utf-8')
        population_df.rename(columns={'District': 'LIE_NAME', 'Total_Population': 'total_population',
                                      'Elderly_Percentage': 'elderly_percentage'}, inplace=True)
        pbar.update(1)

    logging.info("Validating and fixing geometries...")
    neighborhoods_gdf['geometry'] = neighborhoods_gdf['geometry'].apply(fix_geometry)
    urban_masterplan_gdf['geometry'] = urban_masterplan_gdf['geometry'].apply(fix_geometry)
    roads_gdf['geometry'] = roads_gdf['geometry'].apply(fix_geometry)
    accidents_gdf['geometry'] = accidents_gdf['geometry'].apply(fix_geometry)

    invalid_neighborhoods = neighborhoods_gdf[~neighborhoods_gdf.geometry.is_valid]
    if not invalid_neighborhoods.empty:
        logging.warning(f"Found {len(invalid_neighborhoods)} invalid geometries in neighborhoods_gdf after fixing.")
    invalid_masterplan = urban_masterplan_gdf[~urban_masterplan_gdf.geometry.is_valid]
    if not invalid_masterplan.empty:
        logging.warning(f"Found {len(invalid_masterplan)} invalid geometries in urban_masterplan_gdf after fixing.")
    invalid_roads = roads_gdf[~roads_gdf.geometry.is_valid]
    if not invalid_roads.empty:
        logging.warning(f"Found {len(invalid_roads)} invalid geometries in roads_gdf after fixing.")
    invalid_accidents = accidents_gdf[~accidents_gdf.geometry.is_valid]
    if not invalid_accidents.empty:
        logging.warning(f"Found {len(invalid_accidents)} invalid geometries in accidents_gdf after fixing.")

    logging.info("Performing spatial joins and aggregations...")
    neighborhoods_gdf['area_km2'] = neighborhoods_gdf.geometry.area / 1e6

    # Spatial joins
    tree_counts = gpd.sjoin(neighborhoods_gdf, trees_gdf, how='left', predicate='contains')
    tree_counts = tree_counts.groupby(level=0).size().reindex(neighborhoods_gdf.index, fill_value=0)
    neighborhoods_gdf['tree_count'] = tree_counts

    transit_counts = gpd.sjoin(neighborhoods_gdf, transit_gdf, how='left', predicate='contains')
    transit_counts = transit_counts.groupby(level=0).size().reindex(neighborhoods_gdf.index, fill_value=0)
    neighborhoods_gdf['transit_count'] = transit_counts

    accident_counts = gpd.sjoin(neighborhoods_gdf, accidents_gdf, how='left', predicate='contains')
    accident_counts = accident_counts.groupby(level=0).size().reindex(neighborhoods_gdf.index, fill_value=0)
    neighborhoods_gdf['accident_count'] = accident_counts

    road_lengths = gpd.sjoin(roads_gdf, neighborhoods_gdf, how='left', predicate='intersects')
    road_lengths = road_lengths.groupby('index_right')['length_m'].sum().reindex(neighborhoods_gdf.index, fill_value=0)
    neighborhoods_gdf['road_density'] = road_lengths / (neighborhoods_gdf['area_km2'] * 1000)

    # Compute intersection counts using compute_intersection_counts
    logging.info("Computing intersection counts...")
    neighborhoods_gdf = compute_intersection_counts(neighborhoods_gdf, roads_gdf)

    # Merge population data
    neighborhoods_gdf = neighborhoods_gdf.merge(
        population_df[['LIE_NAME', 'total_population', 'elderly_percentage']],
        on='LIE_NAME',
        how='left'
    )
    neighborhoods_gdf['total_population'] = neighborhoods_gdf['total_population'].fillna(0)
    neighborhoods_gdf['elderly_percentage'] = neighborhoods_gdf['elderly_percentage'].fillna(0)

    # Compute land use percentages
    logging.info("Computing land use percentages...")
    for idx, row in neighborhoods_gdf.iterrows():
        neighborhood_geom = fix_geometry(row['geometry'])
        if not neighborhood_geom.is_valid:
            logging.warning(f"Invalid geometry for neighborhood {row['LIE_NAME']} after fixing.")
            continue
        
        relevant_masterplan = urban_masterplan_gdf[urban_masterplan_gdf.intersects(neighborhood_geom)]
        if relevant_masterplan.empty:
            continue
        
        temp_gdf = gpd.GeoDataFrame({'geometry': [neighborhood_geom]}, crs='EPSG:3826')
        intersected = gpd.overlay(temp_gdf, relevant_masterplan, how='intersection', keep_geom_type=False)
        if intersected.empty:
            continue
        
        intersected['geometry'] = intersected['geometry'].apply(fix_geometry)
        intersected = intersected[intersected.geometry.is_valid & ~intersected.geometry.is_empty]
        if intersected.empty:
            continue
        
        intersected['priority'] = intersected['Category'].map(CATEGORY_PRIORITY)
        intersected = intersected.sort_values(by='priority', ascending=False)
        
        total_area_geom = intersected.geometry.union_all()
        total_area_geom = fix_geometry(total_area_geom)
        if not total_area_geom.is_valid or total_area_geom.is_empty:
            continue
        total_area = total_area_geom.area
        
        remaining_geom = total_area_geom
        category_areas = {}
        unique_categories = intersected['Category'].unique()
        
        for category in sorted(unique_categories, key=lambda x: CATEGORY_PRIORITY.get(x, 0), reverse=True):
            category_rows = intersected[intersected['Category'] == category]
            category_geom = category_rows.geometry.union_all()
            category_geom = fix_geometry(category_geom)
            if not category_geom.is_valid or category_geom.is_empty:
                category_areas[category] = 0.0
                continue
            
            try:
                category_area_geom = category_geom.intersection(remaining_geom)
                category_area_geom = fix_geometry(category_area_geom)
                if not category_area_geom.is_valid or category_area_geom.is_empty:
                    category_areas[category] = 0.0
                    continue
                
                category_area = category_area_geom.area
                category_areas[category] = category_area
                remaining_geom = remaining_geom.difference(category_area_geom)
                remaining_geom = fix_geometry(remaining_geom)
                if not remaining_geom.is_valid or remaining_geom.is_empty:
                    break
            except GEOSException as e:
                logging.warning(f"Topology error for category {category} in neighborhood {row['LIE_NAME']}: {e}")
                category_areas[category] = 0.0
                continue
        
        for category in CATEGORY_PRIORITY.keys():
            col_name = f"land_use_{category.lower()}_percent"
            area = category_areas.get(category, 0.0)
            percentage = (area / total_area * 100) if total_area > 0 else 0.0
            neighborhoods_gdf.at[idx, col_name] = percentage

    # Print data structure
    print_data_structure({
        'neighborhoods': neighborhoods_gdf,
        'buildings': buildings_gdf,
        'roads': roads_gdf,
        'trees': trees_gdf,
        'transit': transit_gdf,
        'urban_masterplan': urban_masterplan_gdf,
        'accidents': accidents_gdf,
        'population': population_df
    })

    return {
        'neighborhoods': neighborhoods_gdf,
        'buildings': buildings_gdf,
        'roads': roads_gdf,
        'trees': trees_gdf,
        'transit': transit_gdf,
        'urban_masterplan': urban_masterplan_gdf,
        'accidents': accidents_gdf,
        'population': population_df
    }

Cell 5 compute_intersection_counts

In [91]:
def compute_intersection_counts(neighborhoods_gdf, roads_gdf):
    # Use CPU-based computation with GeoPandas
    logging.info("Computing intersection counts per neighborhood using CPU (GeoPandas)...")
    
    # Step 1: Build endpoint-to-road mapping with a progress bar
    endpoint_to_roads = {}
    for road_idx, geom in tqdm(roads_gdf.geometry.items(), total=len(roads_gdf), desc="Building endpoint-to-road mapping"):
        if geom.geom_type == 'LineString':
            start = geom.coords[0]
            end = geom.coords[-1]
            endpoint_to_roads.setdefault(start, []).append(road_idx)
            endpoint_to_roads.setdefault(end, []).append(road_idx)
        elif geom.geom_type == 'MultiLineString':
            for line in geom.geoms:
                start = line.coords[0]
                end = line.coords[-1]
                endpoint_to_roads.setdefault(start, []).append(road_idx)
                endpoint_to_roads.setdefault(end, []).append(road_idx)

    # Step 2: Identify intersections with a progress bar
    intersections = []
    for node in tqdm(endpoint_to_roads.keys(), desc="Identifying intersections"):
        if len(endpoint_to_roads[node]) > 2:  # Intersection if more than 2 roads meet
            intersections.append(Point(node))

    # Step 3: Create intersections GeoDataFrame and perform spatial join
    logging.info("Creating intersections GeoDataFrame and performing spatial join...")
    intersections_gdf = gpd.GeoDataFrame(geometry=intersections, crs='EPSG:3826')
    intersection_counts = gpd.sjoin(neighborhoods_gdf, intersections_gdf, how='left', predicate='contains')
    intersection_counts = intersection_counts.groupby(level=0).size().reindex(neighborhoods_gdf.index, fill_value=0)
    neighborhoods_gdf['intersection_count'] = intersection_counts
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_count'] / neighborhoods_gdf['area_km2']
    neighborhoods_gdf['intersection_density'] = neighborhoods_gdf['intersection_density'].fillna(0.0)
    logging.info("Spatial join completed.")
    
    return neighborhoods_gdf

Cell 6: Graph Construction (build_graph)

In [92]:
import cudf
import cugraph
import logging
import os
import json
from tqdm import tqdm
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point

def build_graph(data, force_recompute=False):
    """
    Build a cuGraph Graph object from geospatial data, optimized for speed with caching support.
    
    Parameters:
    - data: Dictionary containing GeoDataFrames/DataFrames for 'neighborhoods', 'buildings',
            'roads', 'trees', 'transit', 'urban_masterplan', 'accidents', 'population'.
    - force_recompute: If True, rebuild the graph instead of loading from cache.
    
    Returns:
    - G: cuGraph Graph object with nodes and edges.
    """
    logging.info("Stage 2: Building city graph...")
    G = cugraph.Graph(directed=False)
    
    # Compute data hash for cache validation
    current_hash = compute_data_hash(data)
    
    # Check cache
    if not force_recompute and os.path.exists(GRAPH_NODES_CACHE_PATH) and os.path.exists(GRAPH_EDGES_CACHE_PATH) and os.path.exists(GRAPH_DATA_HASH_PATH):
        with open(GRAPH_DATA_HASH_PATH, 'r') as f:
            cached_hash = f.read().strip()
        if cached_hash == current_hash:
            logging.info("Loading graph from cache...")
            try:
                nodes_df = cudf.read_parquet(GRAPH_NODES_CACHE_PATH)
                edges_df = cudf.read_parquet(GRAPH_EDGES_CACHE_PATH)
                G._nodes = nodes_df
                if not edges_df.empty:
                    G.from_cudf_edgelist(edges_df, source='src', destination='dst')
                logging.info(f"City graph loaded from cache: {len(nodes_df)} nodes, {len(edges_df)} edges")
                return G
            except Exception as e:
                logging.warning(f"Failed to load cached graph: {e}. Recomputing...")
    
    # Initialize node and edge lists
    nodes = []
    edges = []
    node_id_to_index = {}
    current_idx = 0
    
    # Add neighborhood nodes
    logging.info("Adding neighborhood nodes...")
    neighborhoods_gdf = data['neighborhoods']
    for idx, row in neighborhoods_gdf.iterrows():
        node_id = f"neighborhood_{idx}"
        node_data = {
            'vertex': node_id,
            'type': 'neighborhood',
            'LIE_NAME': row['LIE_NAME'],
            'ndvi_mean': row['ndvi_mean'],
            'tree_count': row['tree_count'],
            'transit_count': row['transit_count'],
            'accident_count': row['accident_count'],
            'road_density': row['road_density'],
            'intersection_count': row.get('intersection_count', 0),
            'intersection_density': row.get('intersection_density', 0.0),
            'total_population': row['total_population'],
            'elderly_percentage': row['elderly_percentage'],
            'area_km2': row['area_km2']
        }
        for category in CATEGORY_PRIORITY.keys():
            col_name = f"land_use_{category.lower()}_percent"
            node_data[col_name] = row.get(col_name, 0.0)
        
        nodes.append(node_data)
        node_id_to_index[node_id] = current_idx
        current_idx += 1
    
    # Add building nodes
    logging.info("Adding building nodes...")
    buildings_gdf = data['buildings']
    for idx, row in buildings_gdf.iterrows():
        node_id = f"building_{idx}"
        node_data = {
            'vertex': node_id,
            'type': 'building',
            'building': row['building'] if pd.notna(row['building']) else 'unknown',
            'area_m2': row['area_m2']
        }
        nodes.append(node_data)
        node_id_to_index[node_id] = current_idx
        current_idx += 1
    
    # Add road nodes
    logging.info("Adding road nodes...")
    roads_gdf = data['roads']
    for idx, row in roads_gdf.iterrows():
        node_id = f"road_{idx}"
        node_data = {
            'vertex': node_id,
            'type': 'road',
            'class': row['class'],
            'length_m': row['length_m']
        }
        nodes.append(node_data)
        node_id_to_index[node_id] = current_idx
        current_idx += 1
    
    # Create nodes DataFrame
    nodes_df = cudf.DataFrame(nodes)
    logging.info(f"Created {len(nodes_df)} nodes, including {len(nodes_df[nodes_df['type'] == 'neighborhood'])} neighborhoods")
    
    # Build edges with spatial indexing
    logging.info("Building edges based on spatial proximity...")
    neighborhoods_gdf = neighborhoods_gdf.copy()
    buildings_gdf = buildings_gdf.copy()
    roads_gdf = data['roads'].copy()
    
    neighborhoods_gdf['geometry'] = neighborhoods_gdf['geometry'].apply(fix_geometry)
    buildings_gdf['geometry'] = buildings_gdf['geometry'].apply(fix_geometry)
    roads_gdf['geometry'] = roads_gdf['geometry'].apply(fix_geometry)
    
    neighborhood_sindex = neighborhoods_gdf.sindex
    building_sindex = buildings_gdf.sindex
    road_sindex = roads_gdf.sindex
    
    logging.info("Computing neighborhood-neighborhood edges...")
    for i, row_i in tqdm(neighborhoods_gdf.iterrows(), total=len(neighborhoods_gdf), desc="Neighborhood-neighborhood edges"):
        node_i = f"neighborhood_{i}"
        geom_i = row_i['geometry']
        possible_matches = list(neighborhood_sindex.query(geom_i, predicate='touches'))
        for j in possible_matches:
            if i < j:
                node_j = f"neighborhood_{j}"
                geom_j = neighborhoods_gdf.iloc[j]['geometry']
                try:
                    if geom_i.touches(geom_j):
                        edges.append({'src': node_i, 'dst': node_j})
                except Exception as e:
                    logging.warning(f"Error checking touches between {node_i} and {node_j}: {e}")
    
    logging.info("Computing neighborhood-building edges...")
    if CUSPATIAL_AVAILABLE:
        logging.info("Using cuspatial for neighborhood-building edges...")
        building_points = buildings_gdf['geometry'].centroid
        building_cudf = cudf.DataFrame({
            'x': building_points.x,
            'y': building_points.y,
            'idx': buildings_gdf.index
        })
        for i, row_i in tqdm(neighborhoods_gdf.iterrows(), total=len(neighborhoods_gdf), desc="Neighborhood-building edges (cuspatial)"):
            node_i = f"neighborhood_{i}"
            geom_i = row_i['geometry']
            try:
                poly_gdf = gpd.GeoSeries([geom_i], crs='EPSG:3826')
                hits = cuspatial.point_in_polygon(
                    points_x=building_cudf['x'],
                    points_y=building_cudf['y'],
                    poly_gs=poly_gdf
                )
                hit_indices = building_cudf[hits]['idx'].to_pandas()
                for j in hit_indices:
                    edges.append({'src': node_i, 'dst': f"building_{j}"})
            except Exception as e:
                logging.warning(f"cuspatial error for neighborhood {node_i}: {e}. Falling back to geopandas...")
                possible_matches = list(building_sindex.query(geom_i, predicate='contains'))
                for j in possible_matches:
                    node_j = f"building_{j}"
                    geom_j = buildings_gdf.iloc[j]['geometry']
                    try:
                        if geom_i.contains(geom_j):
                            edges.append({'src': node_i, 'dst': node_j})
                    except Exception as e:
                        logging.warning(f"Error checking containment between {node_i} and {node_j}: {e}")
    else:
        logging.info("Using geopandas for neighborhood-building edges...")
        for i, row_i in tqdm(neighborhoods_gdf.iterrows(), total=len(neighborhoods_gdf), desc="Neighborhood-building edges"):
            node_i = f"neighborhood_{i}"
            geom_i = row_i['geometry']
            possible_matches = list(building_sindex.query(geom_i, predicate='contains'))
            for j in possible_matches:
                node_j = f"building_{j}"
                geom_j = buildings_gdf.iloc[j]['geometry']
                try:
                    if geom_i.contains(geom_j):
                        edges.append({'src': node_i, 'dst': node_j})
                except Exception as e:
                    logging.warning(f"Error checking containment between {node_i} and {node_j}: {e}")
    
    logging.info("Computing neighborhood-road edges...")
    for i, row_i in tqdm(neighborhoods_gdf.iterrows(), total=len(neighborhoods_gdf), desc="Neighborhood-road edges"):
        node_i = f"neighborhood_{i}"
        geom_i = row_i['geometry']
        possible_matches = list(road_sindex.query(geom_i, predicate='intersects'))
        for j in possible_matches:
            node_j = f"road_{j}"
            geom_j = roads_gdf.iloc[j]['geometry']
            try:
                if geom_i.intersects(geom_j):
                    edges.append({'src': node_i, 'dst': node_j})
            except Exception as e:
                logging.warning(f"Error checking intersection between {node_i} and {node_j}: {e}")
    
    edges_df = cudf.DataFrame(edges)
    
    valid_nodes = set(nodes_df['vertex'].to_pandas())
    edges_df = edges_df[edges_df['src'].isin(valid_nodes) & edges_df['dst'].isin(valid_nodes)]
    
    G._nodes = nodes_df
    if not edges_df.empty:
        G.from_cudf_edgelist(edges_df, source='src', destination='dst')
    else:
        logging.warning("No valid edges created. Graph will have nodes but no edges.")
    
    logging.info("Saving graph data to cache...")
    try:
        nodes_df.to_parquet(GRAPH_NODES_CACHE_PATH)
        edges_df.to_parquet(GRAPH_EDGES_CACHE_PATH)
        with open(GRAPH_DATA_HASH_PATH, 'w') as f:
            f.write(current_hash)
        with open(GRAPH_NODE_ID_CACHE_PATH, 'w') as f:
            json.dump(node_id_to_index, f)
        logging.info("Successfully saved graph data to cache.")
    except Exception as e:
        logging.error(f"Failed to save graph data to cache: {e}")
    
    logging.info(f"City graph constructed: {len(nodes_df)} nodes, {len(edges_df)} edges")
    return G

Cell 7: Rule-Based Walkability Scores (compute_walkability_scores)

In [93]:
def compute_walkability_scores(G):
    logging.info("Stage 3: Calculating rule-based walkability scores...")
    nodes_df = G._nodes.to_pandas()
    
    # Filter neighborhood nodes
    neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood']
    
    # Compute walkability components
    walkability_components = compute_walkability_components_all(neighborhood_nodes)
    
    # Initialize walkability_score in nodes_df
    nodes_df['walkability_score'] = np.nan
    
    # Assign walkability scores using LIE_NAME for matching
    lie_name_to_score = dict(zip(walkability_components['LIE_NAME'], walkability_components['walkability_score']))
    for lie_name, score in lie_name_to_score.items():
        mask = nodes_df['LIE_NAME'] == lie_name
        if mask.any():
            nodes_df.loc[mask, 'walkability_score'] = score
        else:
            logging.warning(f"No node found for LIE_NAME {lie_name} when assigning walkability score.")
    
    # Convert back to cudf
    G._nodes = cudf.from_pandas(nodes_df)
    
    logging.info("Rule-based walkability scores calculated and added to graph.")
    return G

Cell 8:Subgraph Functionality

In [94]:
import networkx as nx
import os

def create_neighborhood_subgraphs(G, data, subgraph_dir=SUBGRAPH_DIR):
    """
    Create and save subgraphs for each neighborhood, handling duplicate LIE_NAME values.
    
    Parameters:
    - G: cuGraph Graph object
    - data: dictionary containing the original GeoDataFrames (e.g., 'neighborhoods')
    - subgraph_dir: directory to save the subgraphs (default: SUBGRAPH_DIR)
    """
    os.makedirs(subgraph_dir, exist_ok=True)
    
    # Convert cuGraph to NetworkX
    edges_df = G.edgelist.edgelist_df.to_pandas()
    edge_columns = [col for col in edges_df.columns if col not in ['src', 'dst']]
    edge_attr = edge_columns if edge_columns else None
    nx_G = nx.from_pandas_edgelist(
        edges_df,
        source='src',
        target='dst',
        edge_attr=edge_attr
    )
    nx.set_node_attributes(nx_G, G._nodes.to_pandas().set_index('vertex').to_dict('index'))
    
    # Create a mapping from LIE_NAME to node_id(s)
    nodes_df = G._nodes.to_pandas()
    neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood'][['vertex', 'LIE_NAME']]
    logging.info(f"Found {len(neighborhood_nodes)} neighborhood nodes in graph")
    lie_name_to_node_id = {}
    for _, row in neighborhood_nodes.iterrows():
        lie_name_to_node_id.setdefault(row['LIE_NAME'], []).append(row['vertex'])
    
    # Log missing LIE_NAME values
    data_lie_names = set(data['neighborhoods']['LIE_NAME'])
    graph_lie_names = set(lie_name_to_node_id.keys())
    missing_names = data_lie_names - graph_lie_names
    if missing_names:
        logging.warning(f"Missing LIE_NAME values in graph: {missing_names}")
    
    # Iterate over neighborhoods
    for idx, row in data['neighborhoods'].iterrows():
        lie_name = row['LIE_NAME']
        node_ids = lie_name_to_node_id.get(lie_name, [])
        
        if not node_ids:
            logging.warning(f"No nodes found for {lie_name} (expected ID: neighborhood_{idx}). Skipping...")
            continue
        
        for node_id in node_ids:
            if node_id not in nx_G:
                logging.warning(f"Node {node_id} for {lie_name} not found in graph. Skipping...")
                continue
            
            # Get all nodes connected to the neighborhood node
            connected_nodes = list(nx_G.neighbors(node_id))
            subgraph_nodes = [node_id] + connected_nodes
            
            # Create subgraph
            subgraph = nx_G.subgraph(subgraph_nodes).copy()
            
            # Save subgraph with unique filename
            subgraph_path = os.path.join(subgraph_dir, f"{lie_name}_{node_id}_subgraph.gpickle")
            nx.write_gpickle(subgraph, subgraph_path)
            logging.info(f"Saved subgraph for {lie_name} ({node_id}) to {subgraph_path}")

def load_neighborhood_subgraph(lie_name, subgraph_dir=SUBGRAPH_DIR):
    """
    Load a subgraph for a specific neighborhood.
    
    Parameters:
    - lie_name: name of the neighborhood
    - subgraph_dir: directory where subgraphs are saved (default: SUBGRAPH_DIR)
    
    Returns:
    - NetworkX Graph object of the subgraph, or None if not found
    """
    # Check for subgraphs matching lie_name
    matching_files = [f for f in os.listdir(subgraph_dir) if f.startswith(f"{lie_name}_neighborhood_") and f.endswith('_subgraph.gpickle')]
    if matching_files:
        # Load the first matching subgraph (or implement logic to select one if multiple exist)
        subgraph_path = os.path.join(subgraph_dir, matching_files[0])
        subgraph = nx.read_gpickle(subgraph_path)
        logging.info(f"Loaded subgraph for {lie_name} from {subgraph_path}")
        return subgraph
    else:
        logging.warning(f"Subgraph for {lie_name} not found in {subgraph_dir}")
        return None

Cell 8: GNN Model Definition and Data Preparation (WalkabilityGNN and prepare_gnn_data)

In [95]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, BatchNorm
from torch_geometric.data import Data

class WalkabilityGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads=4):
        super(WalkabilityGNN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels, heads=heads)
        self.bn2 = BatchNorm(hidden_channels * heads)
        self.conv3 = GCNConv(hidden_channels * heads, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)
        self.fc = torch.nn.Linear(hidden_channels, out_channels)
        self.dropout = torch.nn.Dropout(p=0.3)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.conv3(x, edge_index)
        x = self.bn3(x)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.fc(x)
        return x

Cell 10 prepare_gnn_data

In [96]:
import torch
from torch_geometric.data import Data
import numpy as np
import logging
import json

def prepare_gnn_data(G):
    logging.info("Preparing GNN data...")
    nodes_df = G._nodes.to_pandas()
    edges_df = G.edgelist.edgelist_df.to_pandas()
    
    # Load node_id_to_index from cache
    try:
        with open(GRAPH_NODE_ID_CACHE_PATH, 'r') as f:
            node_id_to_index = json.load(f)
    except Exception as e:
        logging.warning(f"Failed to load node_id_to_index: {e}. Rebuilding mapping...")
        node_id_to_index = {row['vertex']: idx for idx, row in nodes_df.iterrows()}
    
    # Define feature columns for the GNN
    feature_cols = [
        'land_use_residential_percent', 'land_use_commercial_percent', 'land_use_education_percent',
        'land_use_city_open_area_percent', 'land_use_public_transportation_percent', 'land_use_pedestrian_percent',
        'ndvi_mean', 'tree_count', 'transit_count', 'accident_count', 'road_density', 'intersection_density',
        'total_population', 'elderly_percentage'
    ]
    
    # Fill missing feature values with 0 for non-neighborhood nodes
    for col in feature_cols:
        if col not in nodes_df.columns:
            nodes_df[col] = 0.0
        else:
            nodes_df[col] = nodes_df[col].fillna(0.0)
    
    # Normalize features
    for col in feature_cols:
        if nodes_df[col].std() > 0:
            nodes_df[col] = (nodes_df[col] - nodes_df[col].mean()) / nodes_df[col].std()
    
    # Create feature matrix
    feature_matrix = nodes_df[feature_cols].values
    feature_matrix = np.nan_to_num(feature_matrix, nan=0.0)
    
    # Ensure edge indices match node indices
    edges_df['src_idx'] = edges_df['src'].map(node_id_to_index)
    edges_df['dst_idx'] = edges_df['dst'].map(node_id_to_index)
    
    # Drop any edges with invalid indices
    edges_df = edges_df.dropna(subset=['src_idx', 'dst_idx'])
    
    # Convert edge indices to integer
    edges_df['src_idx'] = edges_df['src_idx'].astype(int)
    edges_df['dst_idx'] = edges_df['dst_idx'].astype(int)
    
    # Create edge index tensor
    edge_index = torch.tensor(
        [edges_df['src_idx'].values, edges_df['dst_idx'].values],
        dtype=torch.long
    )
    
    # Handle edge weights (optional)
    edge_attr = None
    if 'weight' in edges_df.columns:
        edge_attr = torch.tensor(edges_df['weight'].values, dtype=torch.float)
    
    # Extract labels (walkability scores)
    y = torch.tensor(nodes_df['walkability_score'].fillna(0.0).values, dtype=torch.float)
    
    # Define node type mapping
    node_type_mapping = {
        'neighborhood': 0,
        'building': 1,
        'road': 2,
        'tree': 3,
        'transit': 4
    }
    # Create node_type tensor
    node_type = nodes_df['type'].map(node_type_mapping).fillna(-1).astype(int).values
    node_type = torch.tensor(node_type, dtype=torch.long)
    
    # Create PyTorch Geometric Data object
    data = Data(
        x=torch.tensor(feature_matrix, dtype=torch.float),
        edge_index=edge_index,
        edge_attr=edge_attr,
        y=y,
        node_type=node_type
    )
    
    logging.info("GNN data prepared.")
    return data

Cell 11: GNN Training and Prediction (train_gnn_model and predict_walkability)

In [97]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
import logging

class WalkabilityGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads):
        super(WalkabilityGNN, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads)
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1)
    
    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr).relu()
        x = self.conv2(x, edge_index, edge_attr)
        return x

def train_gnn_model(data):
    logging.info("Stage 4: Training GNN model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = WalkabilityGNN(
        in_channels=data.x.shape[1],
        hidden_channels=128,
        out_channels=1,
        heads=4
    ).to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)

    model.train()
    for epoch in range(200):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.edge_attr).squeeze()
        # Compute loss only on neighborhood nodes (node_type == 0)
        mask = data.node_type == 0
        loss = F.mse_loss(out[mask], data.y[mask])
        loss.backward()
        optimizer.step()
        scheduler.step(loss)
        if epoch % 10 == 0:
            logging.info(f"Epoch {epoch}, Loss: {loss.item():.4f}")

    model.eval()
    logging.info("Finished training GNN model.")
    return model

def predict_walkability(G, model):
    logging.info("Predicting walkability with GNN...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    data = prepare_gnn_data(G)
    data = data.to(device)

    with torch.no_grad():
        predictions = model(data.x, data.edge_index, data.edge_attr).squeeze()

    predictions = predictions.cpu().numpy()
    # Normalize predictions to [0, 1] using sigmoid
    predictions = 1 / (1 + np.exp(-predictions))

    # Update G._nodes with GNN predictions
    nodes_df = G._nodes.to_pandas()
    nodes_df['walkability_gnn'] = predictions
    G._nodes = cudf.from_pandas(nodes_df)

    logging.info("Finished predicting walkability with GNN.")
    return G

Cell 12: Interactive Map Generation (create_interactive_map)

In [98]:
def create_interactive_map(G, data):
    logging.info("Generating interactive Kepler.gl map...")
    nodes_df = G._nodes.to_pandas()
    neighborhoods_gdf = data['neighborhoods'].copy()

    # Standardize 'LIE_NAME'
    nodes_df['LIE_NAME'] = nodes_df['LIE_NAME'].astype(str).str.strip()
    neighborhoods_gdf['LIE_NAME'] = neighborhoods_gdf['LIE_NAME'].astype(str).str.strip()

    # Filter neighborhood nodes
    neighborhood_nodes = nodes_df[nodes_df['type'] == 'neighborhood'].copy()

    # Log LIE_NAME and geometry details
    nodes_lie_names = set(neighborhood_nodes['LIE_NAME'])
    gdf_lie_names = set(neighborhoods_gdf['LIE_NAME'])
    logging.info(f"Neighborhood nodes count: {len(neighborhood_nodes)}")
    logging.info(f"Neighborhoods_gdf count: {len(neighborhoods_gdf)}")
    logging.info(f"Sample LIE_NAME in nodes_df: {list(nodes_lie_names)[:5]}")
    logging.info(f"Sample LIE_NAME in neighborhoods_gdf: {list(gdf_lie_names)[:5]}")
    logging.info(f"Common LIE_NAMEs: {len(nodes_lie_names & gdf_lie_names)}")
    logging.info(f"Nodes LIE_NAMEs not in GDF: {list(nodes_lie_names - gdf_lie_names)}")
    logging.info(f"GDF LIE_NAMEs not in nodes: {list(gdf_lie_names - nodes_lie_names)}")
    logging.info(f"Nodes nulls: {neighborhood_nodes.isna().sum().to_dict()}")
    logging.info(f"GDF geometry nulls: {neighborhoods_gdf['geometry'].isna().sum()}")

    # Merge starting with neighborhoods_gdf to ensure all neighborhoods
    map_data = neighborhoods_gdf[['LIE_NAME', 'geometry']].merge(
        neighborhood_nodes[['LIE_NAME', 'walkability_score', 'walkability_gnn']],
        on='LIE_NAME',
        how='left'
    )

    # Log merge results
    logging.info(f"Merged map_data rows: {len(map_data)}")
    logging.info(f"Walkability score nulls: {map_data['walkability_score'].isna().sum()}")
    logging.info(f"Walkability GNN nulls: {map_data['walkability_gnn'].isna().sum()}")

    # Handle missing scores
    map_data['walkability_score'] = map_data['walkability_score'].fillna(0)
    map_data['walkability_gnn'] = map_data['walkability_gnn'].fillna(0)

    # Create GeoDataFrame
    map_data = gpd.GeoDataFrame(map_data, geometry='geometry', crs='EPSG:3826')

    # Convert to GeoJSON for Kepler.gl
    map_data['geometry'] = map_data['geometry'].to_crs('EPSG:4326')  # Kepler.gl requires WGS84
    kepler_data = {
        'neighborhoods': map_data[['LIE_NAME', 'walkability_score', 'walkability_gnn', 'geometry']].to_json()
    }

    # Enhanced Kepler.gl config for polygons
    config = {
        "version": "v1",
        "config": {
            "visState": {
                "layers": [
                    {
                        "id": "neighborhoods",
                        "type": "geojson",
                        "config": {
                            "dataId": "neighborhoods",
                            "label": "Neighborhoods",
                            "color": [18, 147, 154],
                            "columns": {
                                "geojson": "geometry"
                            },
                            "isVisible": True,
                            "visConfig": {
                                "opacity": 0.7,
                                "strokeOpacity": 0.9,
                                "thickness": 1,
                                "strokeColor": [255, 255, 255],
                                "colorRange": {
                                    "name": "Global Warming",
                                    "type": "sequential",
                                    "colors": [
                                        "#5A1846", "#900C3F", "#C70039",
                                        "#E3611C", "#F1920E", "#FFC107"
                                    ]
                                },
                                "strokeColorRange": {
                                    "name": "Global Warming",
                                    "type": "sequential",
                                    "colors": [
                                        "#5A1846", "#900C3F", "#C70039",
                                        "#E3611C", "#F1920E", "#FFC107"
                                    ]
                                },
                                "colorField": {
                                    "name": "walkability_gnn",
                                    "type": "real"
                                },
                                "colorScale": "quantile"
                            }
                        },
                        "visualChannels": {
                            "colorField": {
                                "name": "walkability_gnn",
                                "type": "real"
                            },
                            "colorScale": "quantile"
                        }
                    }
                ],
                "interactionConfig": {
                    "tooltip": {
                        "fieldsToShow": {
                            "neighborhoods": [
                                {"name": "LIE_NAME", "format": None},
                                {"name": "walkability_score", "format": "{:.3f}"},
                                {"name": "walkability_gnn", "format": "{:.3f}"}
                            ]
                        },
                        "enabled": True
                    }
                }
            },
            "mapState": {
                "latitude": 25.0330,
                "longitude": 121.5654,
                "zoom": 11
            },
            "mapStyle": {
                "styleType": "dark"
            }
        }
    }

    # Generate map
    from keplergl import KeplerGl
    map_1 = KeplerGl(height=800, data=kepler_data, config=config)
    map_path = os.path.join(BASE_DIR, 'taipei_walkability_map.html')
    map_1.save_to_html(file_name=map_path)
    logging.info(f"Interactive map generated and saved as {map_path}")
    print(f"Map saved to {map_path}!")

Cell 13: Main Execution (main)

In [99]:
def main(force_recompute_graph=False):
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    os.makedirs(SUBGRAPH_DIR, exist_ok=True)
    logging.info(f"Ensured subgraph directory exists: {SUBGRAPH_DIR}")

    print("Starting load_and_prepare_data...")
    data = load_and_prepare_data()
    print("Finished load_and_prepare_data")

    print("Starting build_graph...")
    G = build_graph(data, force_recompute=force_recompute_graph)
    print("Finished build_graph")

    print("Starting create_neighborhood_subgraphs...")
    create_neighborhood_subgraphs(G, data)
    print("Finished create_neighborhood_subgraphs")

    print(G._nodes.to_pandas().columns)

    print("Starting compute_walkability_scores...")
    G = compute_walkability_scores(G)
    print("Finished compute_walkability_scores")

    print("Starting prepare_gnn_data...")
    data_gnn = prepare_gnn_data(G)
    print("Finished prepare_gnn_data")

    print("Starting train_gnn_model...")
    model = train_gnn_model(data_gnn)
    print("Finished train_gnn_model")

    print("Starting predict_walkability...")
    G = predict_walkability(G, model)
    print("Finished predict_walkability")

    print("Starting create_interactive_map...")
    create_interactive_map(G, data)
    print("Finished create_interactive_map")

    logging.info("Processing complete.")
    print(G.edgelist.edgelist_df.to_pandas().head())

if __name__ == "__main__":
    main(force_recompute_graph=True)

2025-04-16 15:05:27,604 - INFO - Ensured subgraph directory exists: /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/subgraphs
2025-04-16 15:05:27,604 - INFO - Stage 1: Loading and preparing data...


Starting load_and_prepare_data...


Loading files:  25%|██▌       | 2/8 [00:01<00:03,  1.66it/s]2025-04-16 15:05:29,712 - INFO - Found 604 roads with missing 'class' values.
Loading files: 100%|██████████| 8/8 [00:03<00:00,  2.27it/s]
2025-04-16 15:05:31,125 - INFO - Validating and fixing geometries...
2025-04-16 15:05:33,564 - INFO - Performing spatial joins and aggregations...
2025-04-16 15:05:33,976 - INFO - Computing intersection counts...
2025-04-16 15:05:33,976 - INFO - Computing intersection counts per neighborhood using CPU (GeoPandas)...
Building endpoint-to-road mapping: 100%|██████████| 81444/81444 [00:01<00:00, 60799.42it/s]
Identifying intersections: 100%|██████████| 101237/101237 [00:00<00:00, 1034840.25it/s]
2025-04-16 15:05:35,418 - INFO - Creating intersections GeoDataFrame and performing spatial join...
2025-04-16 15:05:35,444 - INFO - Spatial join completed.
2025-04-16 15:05:35,453 - INFO - Computing land use percentages...
2025-04-16 15:05:52,294 - INFO - Stage 2: Building city graph...
2025-04-16 15:


--- Detailed Data Structure Overview ---

Dataset: neighborhoods
Shape: (460, 29)
Columns: ['LIE_NAME', 'land_use_commercial_percent', 'land_use_education_percent', 'land_use_residential_percent', 'ndvi_mean', 'geometry', 'area_km2', 'tree_count', 'transit_count', 'accident_count', 'road_density', 'intersection_count', 'intersection_density', 'total_population', 'elderly_percentage', 'land_use_city_open_area_percent', 'land_use_pedestrian_percent', 'land_use_public_transportation_percent', 'land_use_amenity_percent', 'land_use_medical_percent', 'land_use_natural_percent', 'land_use_road_percent', 'land_use_river_percent', 'land_use_infrastructure_percent', 'land_use_government_percent', 'land_use_special_zone_percent', 'land_use_military_percent', 'land_use_industrial_percent', 'land_use_agriculture_percent']
Data types:
LIE_NAME                                    object
land_use_commercial_percent                float64
land_use_education_percent                 float64
land_use_resi

2025-04-16 15:05:53,667 - INFO - Adding road nodes...
2025-04-16 15:05:55,922 - INFO - Created 156210 nodes, including 460 neighborhoods
2025-04-16 15:05:55,922 - INFO - Building edges based on spatial proximity...
2025-04-16 15:05:56,931 - INFO - Computing neighborhood-neighborhood edges...
Neighborhood-neighborhood edges: 100%|██████████| 460/460 [00:00<00:00, 1238.24it/s]
2025-04-16 15:05:57,304 - INFO - Computing neighborhood-building edges...
2025-04-16 15:05:57,305 - INFO - Using cuspatial for neighborhood-building edges...
Neighborhood-building edges (cuspatial): 100%|██████████| 460/460 [00:04<00:00, 102.98it/s]
2025-04-16 15:06:01,838 - INFO - Computing neighborhood-road edges...
Neighborhood-road edges: 100%|██████████| 460/460 [00:03<00:00, 136.22it/s]
2025-04-16 15:06:06,041 - INFO - Saving graph data to cache...
2025-04-16 15:06:06,480 - INFO - Successfully saved graph data to cache.
2025-04-16 15:06:06,481 - INFO - City graph constructed: 156210 nodes, 142735 edges


Finished build_graph
Starting create_neighborhood_subgraphs...


2025-04-16 15:06:07,719 - INFO - Found 460 neighborhood nodes in graph
2025-04-16 15:06:08,107 - INFO - Stage 3: Calculating rule-based walkability scores...


Finished create_neighborhood_subgraphs
Index(['vertex', 'type', 'LIE_NAME', 'ndvi_mean', 'tree_count',
       'transit_count', 'accident_count', 'road_density', 'intersection_count',
       'intersection_density', 'total_population', 'elderly_percentage',
       'area_km2', 'land_use_city_open_area_percent',
       'land_use_pedestrian_percent', 'land_use_public_transportation_percent',
       'land_use_amenity_percent', 'land_use_education_percent',
       'land_use_medical_percent', 'land_use_commercial_percent',
       'land_use_residential_percent', 'land_use_natural_percent',
       'land_use_road_percent', 'land_use_river_percent',
       'land_use_infrastructure_percent', 'land_use_government_percent',
       'land_use_special_zone_percent', 'land_use_military_percent',
       'land_use_industrial_percent', 'land_use_agriculture_percent',
       'building', 'area_m2', 'class', 'length_m'],
      dtype='object')
Starting compute_walkability_scores...


2025-04-16 15:06:08,244 - INFO - Correlation between walkability score and transit count: 0.37 (p-value: 0.00)


Walkability score distribution:
count    460.000000
mean       0.067557
std        0.030006
min        0.010300
25%        0.046112
50%        0.062484
75%        0.083860
max        0.206594
Name: walkability_score, dtype: float64


2025-04-16 15:06:09,263 - INFO - Rule-based walkability scores calculated and added to graph.
2025-04-16 15:06:09,265 - INFO - Preparing GNN data...


Finished compute_walkability_scores
Starting prepare_gnn_data...


2025-04-16 15:06:09,642 - INFO - GNN data prepared.
2025-04-16 15:06:09,651 - INFO - Stage 4: Training GNN model...


Finished prepare_gnn_data
Starting train_gnn_model...



The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.

2025-04-16 15:06:10,593 - INFO - Epoch 0, Loss: 11.3845
2025-04-16 15:06:11,438 - INFO - Epoch 10, Loss: 2.0383
2025-04-16 15:06:12,269 - INFO - Epoch 20, Loss: 0.4416
2025-04-16 15:06:13,104 - INFO - Epoch 30, Loss: 0.2968
2025-04-16 15:06:13,941 - INFO - Epoch 40, Loss: 0.1801
2025-04-16 15:06:14,781 - INFO - Epoch 50, Loss: 0.1270
2025-04-16 15:06:15,620 - INFO - Epoch 60, Loss: 0.1030
2025-04-16 15:06:16,462 - INFO - Epoch 70, Loss: 0.0843
2025-04-16 15:06:17,301 - INFO - Epoch 80, Loss: 0.0709
2025-04-16 15:06:18,135 - INFO - Epoch 90, Loss: 0.0607
2025-04-16 15:06:18,971 - INFO - Epoch 100, Loss: 0.0525
2025-04-16 15:06:19,815 - INFO - Epoch 110, Loss: 0.0460
2025-04-16 15:06:20,650 - INFO - Epoch 120, Loss: 0.0405
2025-04-16 15:06:21,484 - INFO - Epoch 130, Loss: 0.0359
2025-04-16 15:06:22,324 - INFO - Epoch 140, Loss: 0.0320
2025-04-16 15:06:23,160 - INFO - Epoch 150, Loss: 0.0288
2025-

Finished train_gnn_model
Starting predict_walkability...


2025-04-16 15:06:26,063 - INFO - GNN data prepared.
2025-04-16 15:06:26,269 - INFO - Finished predicting walkability with GNN.
2025-04-16 15:06:26,272 - INFO - Generating interactive Kepler.gl map...
2025-04-16 15:06:26,395 - INFO - Neighborhood nodes count: 460
2025-04-16 15:06:26,396 - INFO - Neighborhoods_gdf count: 460
2025-04-16 15:06:26,396 - INFO - Sample LIE_NAME in nodes_df: ['和德里', '正守里', '樟樹里', '新起里', '古風里']
2025-04-16 15:06:26,396 - INFO - Sample LIE_NAME in neighborhoods_gdf: ['和德里', '正守里', '樟樹里', '新起里', '古風里']
2025-04-16 15:06:26,397 - INFO - Common LIE_NAMEs: 454
2025-04-16 15:06:26,397 - INFO - Nodes LIE_NAMEs not in GDF: []
2025-04-16 15:06:26,397 - INFO - GDF LIE_NAMEs not in nodes: []
2025-04-16 15:06:26,399 - INFO - Nodes nulls: {'vertex': 0, 'type': 0, 'LIE_NAME': 0, 'ndvi_mean': 0, 'tree_count': 0, 'transit_count': 0, 'accident_count': 0, 'road_density': 0, 'intersection_count': 0, 'intersection_density': 0, 'total_population': 0, 'elderly_percentage': 0, 'area_km

Finished predict_walkability
Starting create_interactive_map...


2025-04-16 15:06:26,529 - INFO - Interactive map generated and saved as /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html
2025-04-16 15:06:26,533 - INFO - Processing complete.


User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html!
Map saved to /home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data/taipei_walkability_map.html!
Finished create_interactive_map
     src     dst
0  95155   33544
1  95155   69494
2  95155  113759
3  95155   97446
4  95155   23593
