When you provide solution, you should write the full sections of code, not vague instructions.

In [16]:
import geopandas as gpd
import cudf
import cuspatial
from shapely.geometry import Point
import pandas as pd
import os
from tqdm import tqdm
import pickle
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
import numpy as np
from random import shuffle
import shutil
from multiprocessing import Pool
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,  # Show info messages and above (e.g., warnings, errors)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Include timestamp and level
)

# Define file paths (customize these based on your system)
BASE_DIR = "/home/johnny/Iaacthesis/projects/Geojson/GNN_Read_data"
LANDUSE_NDVI_PATH = os.path.join(BASE_DIR, "neighborhoods_with_ndvi_numerical.geojson")
OSM_BUILDINGS_PATH = os.path.join(BASE_DIR, "Taipei_Buildings_fulldata.geojson")
OSM_ROADS_PATH = os.path.join(BASE_DIR, "taipei_segments_cleaned_verified.geoparquet")
OSM_TREES_PATH = os.path.join(BASE_DIR, "taipei_land.geoparquet")
OSM_TRANSIT_PATH = os.path.join(BASE_DIR, "taipei_infrastructure.geoparquet")
URBAN_MASTERPLAN_PATH = os.path.join(BASE_DIR, "Taipei_urban_masterplan.geojson")
ACCIDENTS_PATH = os.path.join(BASE_DIR, "2023_accidents.geojson")
POPULATION_PATH = os.path.join(BASE_DIR, "population.json")
SUBGRAPH_DIR = os.path.join(BASE_DIR, "subgraphs")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")

# Ensure output directories exist
os.makedirs(SUBGRAPH_DIR, exist_ok=True)
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Helper Functions
def check_geometries(gdf, name):
    """Check for invalid geometries in a GeoDataFrame."""
    # It's good practice to fix or remove invalid geometries before spatial operations
    # gdf.geometry = gdf.geometry.buffer(0) # A common trick to try and fix invalid geoms
    invalid_geoms = gdf[~gdf.geometry.is_valid]
    if not invalid_geoms.empty:
        print(f"Warning: {len(invalid_geoms)} invalid geometries found in {name}. Consider cleaning them.")
        # Optionally remove or fix them here
        # gdf = gdf[gdf.geometry.is_valid]
    return gdf

def print_data_structure(data_dict):
    """Print the structure and summary of datasets in a dictionary."""
    print("\n--- Data Structure Overview ---")
    for key, df in data_dict.items():
        if isinstance(df, (gpd.GeoDataFrame, pd.DataFrame)):
            print(f"\nDataset: {key}")
            print(f"Shape: {df.shape}")
            print(f"Columns: {list(df.columns)}")
            print(f"Data types:\n{df.dtypes}")
            print(f"Missing values per column:\n{df.isnull().sum()}")
            if not df.select_dtypes(include=['float64', 'int64']).empty:
                print(f"Summary statistics:\n{df.describe()}")
            print("Sample data (first row):")
            print(df.head(1))
        else:
            print(f"\nDataset: {key} - Not a DataFrame or GeoDataFrame")
    print("--- End of Data Structure Overview ---\n")

def load_and_prepare_data():
    """Load and prepare geospatial datasets with optimized memory usage."""
    logging.info("Stage 1: Loading and preparing data...")
    with tqdm(total=8, desc="Loading files") as pbar:
        # Load datasets
        neighborhoods_gdf = gpd.read_file(
            LANDUSE_NDVI_PATH,
            encoding='utf-8-sig',
            columns=['LIE_NAME', 'geometry', 'land_use_residential_percent', 'land_use_commercial_percent',
                     'land_use_education_percent', 'ndvi_mean']
        ).to_crs('EPSG:3826')
        pbar.update(1)

        buildings_gdf = gpd.read_file(OSM_BUILDINGS_PATH, columns=['geometry', 'building']).to_crs('EPSG:3826')
        buildings_gdf['area_m2'] = buildings_gdf.geometry.area
        pbar.update(1)

        roads_gdf = gpd.read_parquet(OSM_ROADS_PATH, columns=['geometry', 'class']).to_crs('EPSG:3826')
        roads_gdf['length_m'] = roads_gdf.geometry.length
        missing_road_types = roads_gdf['class'].isnull().sum()
        if missing_road_types > 0:
            logging.info(f"Found {missing_road_types} roads with missing 'class' values.")
        pbar.update(1)

        trees_gdf = gpd.read_parquet(OSM_TREES_PATH, columns=['geometry', 'subtype', 'class']).to_crs('EPSG:3826')
        trees_gdf = trees_gdf[trees_gdf['subtype'] == 'tree']
        pbar.update(1)

        transit_gdf = gpd.read_parquet(OSM_TRANSIT_PATH, columns=['geometry', 'class']).to_crs('EPSG:3826')
        transit_gdf = transit_gdf[transit_gdf['class'].isin(['stop_position', 'bus_stop'])]
        pbar.update(1)

        urban_masterplan_gdf = gpd.read_file(URBAN_MASTERPLAN_PATH, columns=['geometry', 'Category']).to_crs('EPSG:3826')
        pbar.update(1)

        accidents_gdf = gpd.read_file(ACCIDENTS_PATH, columns=['geometry']).to_crs('EPSG:3826')
        pbar.update(1)

        population_df = pd.read_json(POPULATION_PATH)
        population_df.rename(columns={'District': 'LIE_NAME', 'Total_Population': 'total_population',
                                      'Elderly_Percentage': 'elderly_percentage'}, inplace=True)
        pbar.update(1)

    # Print master plan categories and road types categories
    print("Master Plan Categories:", urban_masterplan_gdf['Category'].unique().tolist())
    print("Road Types Categories:", roads_gdf['class'].unique().tolist())

    # Merge population data
    neighborhoods_gdf['LIE_NAME'] = neighborhoods_gdf['LIE_NAME'].str.strip().str.lower()
    population_df['LIE_NAME'] = population_df['LIE_NAME'].str.strip().str.lower()
    neighborhoods_gdf = neighborhoods_gdf.merge(
        population_df[['LIE_NAME', 'total_population', 'elderly_percentage']], on='LIE_NAME', how='left'
    )

    # Compute tree and transit counts
    tree_counts = gpd.sjoin(neighborhoods_gdf, trees_gdf, how='left', predicate='contains')
    neighborhoods_gdf['tree_count'] = tree_counts.groupby(tree_counts.index).size().reindex(neighborhoods_gdf.index, fill_value=0)
    transit_counts = gpd.sjoin(neighborhoods_gdf, transit_gdf, how='left', predicate='contains')
    neighborhoods_gdf['transit_count'] = transit_counts.groupby(transit_counts.index).size().reindex(neighborhoods_gdf.index, fill_value=0)

    # Compute accident counts per neighborhood
    accident_counts = gpd.sjoin(neighborhoods_gdf, accidents_gdf, how='left', predicate='contains')
    neighborhoods_gdf['accident_count'] = accident_counts.groupby(accident_counts.index).size().reindex(neighborhoods_gdf.index, fill_value=0)

    # Compute road density
    road_lengths = gpd.sjoin(roads_gdf, neighborhoods_gdf, how='inner', predicate='intersects')
    road_lengths_grouped = road_lengths.groupby('index_right')['length_m'].sum()
    neighborhoods_gdf['road_length'] = road_lengths_grouped.reindex(neighborhoods_gdf.index, fill_value=0)
    neighborhoods_gdf['road_density'] = neighborhoods_gdf['road_length'] / neighborhoods_gdf.geometry.area

    # Inspect unique categories in urban_masterplan_gdf
    unique_categories = urban_masterplan_gdf['Category'].unique()
    logging.info(f"Unique categories in urban_masterplan_gdf: {unique_categories}")

    # Use actual categories from the data
    amenity_categories = ['City_Open_Area', 'Education', 'Commercial']
    for category in amenity_categories:
        amenity_gdf = urban_masterplan_gdf[urban_masterplan_gdf['Category'] == category]
        amenity_counts = gpd.sjoin(neighborhoods_gdf, amenity_gdf, how='left', predicate='contains')
        column_name = f"{category.lower().replace('_', '')}_count"
        neighborhoods_gdf[column_name] = amenity_counts.groupby(amenity_counts.index).size().reindex(neighborhoods_gdf.index, fill_value=0)
        if neighborhoods_gdf[column_name].min() > 0:
            logging.warning(f"No neighborhoods with zero {column_name}. Check data or computation.")

    data_dict = {
        'neighborhoods': neighborhoods_gdf,
        'buildings': buildings_gdf,
        'roads': roads_gdf,
        'trees': trees_gdf,
        'transit': transit_gdf,
        'urban_masterplan': urban_masterplan_gdf,
        'accidents': accidents_gdf,
        'population': population_df
    }
    print_data_structure(data_dict)
    return data_dict

# Stage 2: Graph Construction
import geopandas as gpd
import cudf
import cuspatial
from shapely.geometry import Point
import pandas as pd
import os
from tqdm import tqdm
import pickle
import numpy as np

# Define the directory for storing subgraphs globally
SUBGRAPH_DIR = "subgraphs"

import geopandas as gpd
import cudf
import pandas as pd
import os
from tqdm import tqdm
import pickle
from multiprocessing import Pool

SUBGRAPH_DIR = "subgraphs"

def build_subgraph_for_neighborhood(args):
    """Helper function to build or load a subgraph for a single neighborhood using pandas."""
    (idx, neighborhoods_gdf, buildings_gdf, roads_gdf, trees_gdf, transit_gdf, feature_cols, all_columns, SUBGRAPH_DIR) = args
    lie_name = neighborhoods_gdf['LIE_NAME'].iloc[idx]
    subgraph_path = os.path.join(SUBGRAPH_DIR, f"subgraph_{lie_name}.pkl")

# Check if subgraph exists and is loadable
    if os.path.exists(subgraph_path):
        try:
            with open(subgraph_path, 'rb') as f:
                loaded_data = pickle.load(f)
                logging.info(f"Loaded existing subgraph for: {lie_name}")
                return lie_name, loaded_data  # Return pandas DataFrames
        except Exception as e:
            logging.warning(f"Failed to load subgraph {subgraph_path}. Regenerating. Error: {e}")

    # If subgraph doesn’t exist or loading failed, compute it
    logging.info(f"Computing subgraph for: {lie_name}")
    try:
        all_nodes = []
        all_edges = []

        # Neighborhood node
        neighborhood_node_id = f"neighborhood_{lie_name}"
        neighborhood_data = neighborhoods_gdf.loc[idx]
        node_data = {
            'vertex': neighborhood_node_id, 'type': 'neighborhood', 'lie_name': lie_name,
            **{col: float(neighborhood_data.get(col, 0.0)) for col in feature_cols if col not in ['tree_count', 'transit_count', 'accident_count']},
            'tree_count': int(neighborhood_data.get('tree_count', 0)),
            'transit_count': int(neighborhood_data.get('transit_count', 0)),
            'accident_count': int(neighborhood_data.get('accident_count', 0)),
            'building_type': None, 'road_class': None, 'class': None
        }
        all_nodes.append(node_data)

        # Buffer and filter
        buffer_distance_meters = 200
        neigh_geom_proj = neighborhoods_gdf.geometry.iloc[idx]
        neigh_buffer_proj = neigh_geom_proj.buffer(buffer_distance_meters) if neigh_geom_proj.is_valid else None

        if neigh_buffer_proj:
            relevant_buildings = buildings_gdf[buildings_gdf.geometry.within(neigh_buffer_proj)]
            relevant_roads = roads_gdf[roads_gdf.geometry.intersects(neigh_buffer_proj)]
            relevant_trees = trees_gdf[trees_gdf.geometry.within(neigh_buffer_proj)]
            relevant_transit = transit_gdf[transit_gdf.geometry.within(neigh_buffer_proj)]
        else:
            relevant_buildings = gpd.GeoDataFrame(geometry=[])
            relevant_roads = gpd.GeoDataFrame(geometry=[])
            relevant_trees = gpd.GeoDataFrame(geometry=[])
            relevant_transit = gpd.GeoDataFrame(geometry=[])

        # Building nodes
        for b_idx, building in relevant_buildings.iterrows():
            node_id = f"building_{b_idx}"
            node_data = {
                'vertex': node_id, 'type': 'building', 'lie_name': lie_name,
                'area_m2': float(building.get('area_m2', 0.0)), 'building_type': building.get('building', None),
                **{col: 0.0 for col in feature_cols if col != 'area_m2'},
                'road_class': None, 'class': None
            }
            all_nodes.append(node_data)

        # Road nodes
        for r_idx, road in relevant_roads.iterrows():
            geom = road.geometry
            if geom and not geom.is_empty and geom.geom_type in ['LineString', 'MultiLineString']:
                start_id = f"road_start_{r_idx}"
                end_id = f"road_end_{r_idx}"
                node_data_start = {
                    'vertex': start_id, 'type': 'road', 'lie_name': lie_name,
                    'length_m': float(road.get('length_m', 0.0)), 'accident_count': int(road.get('accident_count', 0)),
                    'road_class': road.get('class', None),
                    **{col: 0.0 for col in feature_cols if col not in ['length_m', 'accident_count']},
                    'building_type': None, 'class': None
                }
                node_data_end = node_data_start.copy()
                node_data_end['vertex'] = end_id
                all_nodes.extend([node_data_start, node_data_end])

        # Tree nodes
        for t_idx, tree in relevant_trees.iterrows():
            node_id = f"tree_{t_idx}"
            node_data = {
                'vertex': node_id, 'type': 'tree', 'lie_name': lie_name,
                **{col: 0.0 for col in feature_cols},
                'building_type': None, 'road_class': None, 'class': tree.get('class', None)
            }
            all_nodes.append(node_data)

        # Transit nodes
        for tr_idx, transit in relevant_transit.iterrows():
            node_id = f"transit_{tr_idx}"
            node_data = {
                'vertex': node_id, 'type': 'transit', 'lie_name': lie_name,
                **{col: 0.0 for col in feature_cols},
                'building_type': None, 'road_class': None, 'class': transit.get('class', None)
            }
            all_nodes.append(node_data)

        # Create edges based on spatial proximity
        if neigh_buffer_proj:
            # Neighborhood to buildings
            for b_idx, building in relevant_buildings.iterrows():
                building_geom = building.geometry
                if building_geom and building_geom.is_valid:
                    distance = neigh_geom_proj.distance(building_geom)
                    if distance < 100:
                        all_edges.append({'src': neighborhood_node_id, 'dst': f"building_{b_idx}", 'weight': float(distance)})

            # Buildings to roads
            for b_idx, building in relevant_buildings.iterrows():
                building_geom = building.geometry
                if not (building_geom and building_geom.is_valid):
                    continue
                for r_idx, road in relevant_roads.iterrows():
                    road_geom = road.geometry
                    if not (road_geom and road_geom.is_valid):
                        continue
                    distance = building_geom.distance(road_geom)
                    if distance < 50:
                        all_edges.append({'src': f"building_{b_idx}", 'dst': f"road_start_{r_idx}", 'weight': float(distance)})
                        all_edges.append({'src': f"building_{b_idx}", 'dst': f"road_end_{r_idx}", 'weight': float(distance)})

        # Convert to DataFrames
        nodes_pd_df = pd.DataFrame(all_nodes)
        edges_pd_df = pd.DataFrame(all_edges, columns=['src', 'dst', 'weight']) if all_edges else pd.DataFrame(columns=['src', 'dst', 'weight'])
        for col in all_columns:
            if col not in nodes_pd_df.columns:
                nodes_pd_df[col] = 0.0 if col in feature_cols else None

        # Save to disk
        with open(subgraph_path, 'wb') as f:
            pickle.dump({'nodes': nodes_pd_df, 'edges': edges_pd_df}, f)
        logging.info(f"Saved subgraph for: {lie_name}")
        return lie_name, {'nodes': nodes_pd_df, 'edges': edges_pd_df}
    except Exception as e:
        logging.error(f"Error building subgraph for {lie_name}: {e}")
        return lie_name, None

def build_graph(data):
    """Build subgraphs for each neighborhood with accident counts using parallel processing."""
    logging.info("Stage 2: Building graph network...")
    subgraphs = {}
    target_crs = 'EPSG:3826'

    # Extract GeoDataFrames
    neighborhoods_gdf = data['neighborhoods'].to_crs(target_crs)
    buildings_gdf = data['buildings'].to_crs(target_crs)
    roads_gdf = data['roads'].to_crs(target_crs)
    trees_gdf = data['trees'].to_crs(target_crs)
    transit_gdf = data['transit'].to_crs(target_crs)
    accidents_gdf = data['accidents'].to_crs(target_crs)

    # Calculate accident counts per road
    logging.info("Computing accident counts per road...")
    road_accidents = gpd.sjoin(roads_gdf, accidents_gdf, how='left', predicate='intersects')
    roads_gdf['accident_count'] = road_accidents.groupby(road_accidents.index).size().reindex(roads_gdf.index, fill_value=0)

    feature_cols = [
        'land_use_residential_percent', 'land_use_commercial_percent',
        'land_use_education_percent', 'ndvi_mean', 'tree_count', 'transit_count',
        'area_m2', 'length_m', 'total_population', 'elderly_percentage', 'accident_count'
    ]
    all_columns = ['vertex', 'type', 'lie_name'] + feature_cols + ['building_type', 'road_class', 'class']

    os.makedirs(SUBGRAPH_DIR, exist_ok=True)

    # Prepare arguments for multiprocessing
    args_list = [(idx, neighborhoods_gdf, buildings_gdf, roads_gdf, trees_gdf, transit_gdf, feature_cols, all_columns, SUBGRAPH_DIR) for idx in range(len(neighborhoods_gdf))]

    # Use a pool of workers to build subgraphs in parallel
    with Pool(processes=4) as pool:  # Adjust 'processes' based on your CPU cores
        results = pool.map(build_subgraph_for_neighborhood, args_list)

    # Collect results into subgraphs dictionary
    for lie_name, subgraph_data in tqdm(results, desc="Collecting subgraphs"):
        if subgraph_data is not None:
            # Convert to cudf in the main process
            nodes_df = cudf.from_pandas(subgraph_data['nodes'])
            edges_df = cudf.from_pandas(subgraph_data['edges']) if not subgraph_data['edges'].empty else cudf.DataFrame(columns=['src', 'dst', 'weight'])
            subgraphs[lie_name] = {'nodes': nodes_df, 'edges': edges_df}
        else:
            logging.warning(f"Skipping subgraph for {lie_name} due to error.")

    logging.info(f"Finished building/loading {len(subgraphs)} subgraphs.")
    return subgraphs, []

# Stage 3: Rule-Based Walkability Calculation
import pandas as pd
import logging

def compute_walkability_scores(nodes_df):
    """Compute enhanced rule-based walkability scores for neighborhood nodes."""
    if not isinstance(nodes_df, pd.DataFrame):
        nodes_df = nodes_df.to_pandas()

    neighborhood_mask = nodes_df['type'] == 'neighborhood'
    neighborhood_df = nodes_df[neighborhood_mask].copy()

    if len(neighborhood_df) == 0:
        logging.warning("No neighborhood nodes found.")
        nodes_df['walkability_rule'] = 0.0
        return nodes_df

    # Land use score
    land_use_cols = [col for col in neighborhood_df.columns if 'land_use' in col.lower()]
    land_use_weights = {
        'land_use_residential_percent': 0.25,
        'land_use_commercial_percent': 0.20,
        'land_use_education_percent': 0.15
    }
    land_use_score = sum(
        (neighborhood_df[col].fillna(0.0) * land_use_weights.get(col, 0.1)) / 100
        for col in land_use_cols
    )

    # Normalized feature scores
    ndvi_score = ((neighborhood_df['ndvi_mean'] - neighborhood_df['ndvi_mean'].min()) /
                  (neighborhood_df['ndvi_mean'].max() - neighborhood_df['ndvi_mean'].min() + 1e-6)) * 0.3
    tree_score = ((neighborhood_df['tree_count'] - neighborhood_df['tree_count'].min()) /
                  (neighborhood_df['tree_count'].max() - neighborhood_df['tree_count'].min() + 1e-6)) * 0.2
    transit_score = ((neighborhood_df['transit_count'] - neighborhood_df['transit_count'].min()) /
                     (neighborhood_df['transit_count'].max() - neighborhood_df['transit_count'].min() + 1e-6)) * 0.2

    # Penalty factors
    elderly_factor = (neighborhood_df['elderly_percentage'].fillna(0.0) / 100) * 0.1
    accident_factor = ((neighborhood_df['accident_count'] - neighborhood_df['accident_count'].min()) /
                       (neighborhood_df['accident_count'].max() - neighborhood_df['accident_count'].min() + 1e-6)) * 0.1

    # Amenity score (using updated columns)
    amenity_cols = ['cityopenarea_count', 'education_count', 'commercial_count']
    amenity_score = sum(
        (neighborhood_df[col] / (neighborhood_df[col].max() + 1e-6)) * 0.05
        for col in amenity_cols if col in neighborhood_df.columns and neighborhood_df[col].max() > 0
    )

    # Road density score
    road_density_score = ((neighborhood_df['road_density'] - neighborhood_df['road_density'].min()) /
                          (neighborhood_df['road_density'].max() - neighborhood_df['road_density'].min() + 1e-6)) * 0.1

    # Final walkability score
    walkability = (land_use_score + ndvi_score + tree_score + transit_score +
                   amenity_score + road_density_score - elderly_factor - accident_factor).clip(lower=0.0, upper=1.0)
    nodes_df['walkability_rule'] = 0.0
    nodes_df.loc[neighborhood_mask, 'walkability_rule'] = walkability

    return nodes_df

def calculate_walkability(subgraphs, neighborhoods_gdf):
    """Calculate walkability scores for all subgraphs and prepare for visualization."""
    logging.info("Stage 3: Calculating rule-based walkability scores...")
    walkability_scores = {}

    for lie_name, subgraph_data in tqdm(subgraphs.items(), desc="Calculating walkability"):
        nodes_df = subgraph_data['nodes']
        nodes_df = compute_walkability_scores(nodes_df)
        subgraph_data['nodes'] = nodes_df

        neighborhood_node = nodes_df[nodes_df['type'] == 'neighborhood']
        if not neighborhood_node.empty:
            walkability_scores[lie_name] = neighborhood_node['walkability_rule'].iloc[0]

    neighborhoods_gdf['walkability_rule'] = neighborhoods_gdf['LIE_NAME'].map(walkability_scores).fillna(0.0)
    neighborhoods_gdf.to_file("walkability_scores.geojson", driver="GeoJSON")
    logging.info("Walkability scores saved to walkability_scores.geojson")
    return subgraphs

# Stage 4: GNN Model with GAT and Validation
class GATWalkability(torch.nn.Module):
    """Graph Attention Network (GAT) model for walkability prediction."""
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GATWalkability, self).__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads=4, dropout=0.6)
        self.conv2 = GATConv(hidden_channels * 4, out_channels, heads=1, concat=False)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

def prepare_gnn_data(subgraphs, train_indices, val_indices):
    """Prepare data for GNN training."""
    feature_cols = [
        'land_use_residential_percent', 'land_use_commercial_percent', 'land_use_education_percent',
        'ndvi_mean', 'tree_count', 'transit_count', 'area_m2', 'length_m', 'total_population',
        'elderly_percentage', 'accident_count'
    ]

    train_nodes = cudf.concat([subgraphs[lie_name]['nodes'] for lie_name in train_indices])

    for col in feature_cols:
        if col not in train_nodes.columns:
            train_nodes[col] = 0.0

    min_values = train_nodes[feature_cols].min()
    max_values = train_nodes[feature_cols].max()

    data_list = []
    for lie_name in subgraphs.keys():
        nodes_df = subgraphs[lie_name]['nodes']
        edges_df = subgraphs[lie_name]['edges']

        if edges_df is None or not isinstance(edges_df, cudf.DataFrame):
            edges_df = cudf.DataFrame(columns=['src', 'dst', 'weight'])

        vertex_list = nodes_df['vertex'].to_arrow().to_pylist()
        node_mapping = {vertex: idx for idx, vertex in enumerate(vertex_list)}

        for col in feature_cols:
            if col not in nodes_df.columns:
                nodes_df[col] = 0.0

        scaled_nodes_df = nodes_df[feature_cols].copy()
        for col in feature_cols:
            if max_values[col] > min_values[col]:
                scaled_nodes_df[col] = (scaled_nodes_df[col] - min_values[col]) / (max_values[col] - min_values[col])
            else:
                scaled_nodes_df[col] = 0

        x = torch.tensor(scaled_nodes_df.to_pandas().values, dtype=torch.float)

        if len(edges_df) > 0:
            src_list = edges_df['src'].to_arrow().to_pylist()
            dst_list = edges_df['dst'].to_arrow().to_pylist()
            src_idx = [node_mapping[src] for src in src_list]
            dst_idx = [node_mapping[dst] for dst in dst_list]
            edge_index = torch.tensor([src_idx, dst_idx], dtype=torch.long)
        else:
            edge_index = torch.tensor([[], []], dtype=torch.long)

        y = torch.tensor(nodes_df['walkability_rule'].to_pandas().values, dtype=torch.float)
        data_list.append(Data(x=x, edge_index=edge_index, y=y))

    return data_list

def train_gnn_model(data_list):
    """Train the GNN model with validation, early stopping, and LR scheduling."""
    logging.info("Stage 4: Training GNN model...")
    model = GATWalkability(in_channels=11, hidden_channels=64, out_channels=1)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
    criterion = torch.nn.MSELoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5)

    indices = list(range(len(data_list)))
    shuffle(indices)
    train_data = [data_list[i] for i in indices[:int(0.8 * len(data_list))]]
    val_data = [data_list[i] for i in indices[int(0.8 * len(data_list)):]]

    best_val_loss = float('inf')
    patience = 10
    counter = 0

    for epoch in range(200):
        model.train()
        train_loss = 0
        for data in train_data:
            optimizer.zero_grad()
            out = model(data.x, data.edge_index)
            loss = criterion(out, data.y.view(-1, 1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_data)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data in val_data:
                out = model(data.x, data.edge_index)
                val_loss += criterion(out, data.y.view(-1, 1)).item()
        val_loss /= len(val_data)

        logging.info(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}")
        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, 'best_model.pth'))
            logging.info(f"New best model saved with validation loss: {best_val_loss:.4f}")
        else:
            counter += 1
        if counter >= patience:
            logging.info("Early stopping triggered")
            break

    model.load_state_dict(torch.load(os.path.join(CHECKPOINT_DIR, 'best_model.pth')))
    logging.info(f"Training completed. Best validation loss: {best_val_loss:.4f}")
    return model

def predict_walkability(subgraphs, model):
    """Predict walkability scores using the trained GNN model."""
    for lie_name, subgraph_data in subgraphs.items():
        nodes_df = subgraph_data['nodes'].to_pandas()
        data = prepare_gnn_data({lie_name: subgraph_data}, [lie_name], [])[0]
        with torch.no_grad():
            pred = model(data.x, data.edge_index)
        nodes_df['walkability_gnn'] = pred.numpy().flatten()
        subgraph_data['nodes'] = cudf.from_pandas(nodes_df)
    return subgraphs

# Main Execution Flow
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

def main():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    SUBGRAPH_DIR = "subgraphs"
    os.makedirs(SUBGRAPH_DIR, exist_ok=True)
    logging.info(f"Ensured subgraph directory exists: {SUBGRAPH_DIR}")

    # Load and prepare data
    data = load_and_prepare_data()

    # Build graph (use your existing function)
    subgraphs, _ = build_graph(data)

    # Calculate walkability
    subgraphs = calculate_walkability(subgraphs, data['neighborhoods'])

    logging.info("Processing complete. Check walkability_scores.geojson for results.")

if __name__ == "__main__":
    main()

2025-04-09 16:59:54,541 - INFO - Ensured subgraph directory exists: subgraphs
2025-04-09 16:59:54,542 - INFO - Stage 1: Loading and preparing data...
Loading files:  25%|██▌       | 2/8 [00:01<00:03,  1.60it/s]2025-04-09 16:59:56,609 - INFO - Found 604 roads with missing 'class' values.
Loading files: 100%|██████████| 8/8 [00:03<00:00,  2.27it/s]


Master Plan Categories: ['City_Open_Area', 'Commercial', 'Infrastructure', 'Government', 'Public_Transportation', 'Education', 'Medical', 'Amenity', 'Road', 'Pedestrian', 'Natural', 'Special_Zone', 'River', 'Military', 'Residential', 'Industrial', 'Agriculture']
Road Types Categories: [None, 'service', 'track', 'path', 'steps', 'footway', 'residential', 'unclassified', 'secondary', 'tertiary', 'pedestrian', 'living_street', 'primary', 'unknown', 'cycleway', 'motorway', 'trunk', 'bridleway']


2025-04-09 16:59:58,471 - INFO - Unique categories in urban_masterplan_gdf: ['City_Open_Area' 'Commercial' 'Infrastructure' 'Government'
 'Public_Transportation' 'Education' 'Medical' 'Amenity' 'Road'
 'Pedestrian' 'Natural' 'Special_Zone' 'River' 'Military' 'Residential'
 'Industrial' 'Agriculture']
2025-04-09 16:59:58,555 - INFO - Stage 2: Building graph network...
2025-04-09 16:59:58,563 - INFO - Computing accident counts per road...



--- Data Structure Overview ---

Dataset: neighborhoods
Shape: (460, 16)
Columns: ['LIE_NAME', 'land_use_commercial_percent', 'land_use_education_percent', 'land_use_residential_percent', 'ndvi_mean', 'geometry', 'total_population', 'elderly_percentage', 'tree_count', 'transit_count', 'accident_count', 'road_length', 'road_density', 'cityopenarea_count', 'education_count', 'commercial_count']
Data types:
LIE_NAME                          object
land_use_commercial_percent      float64
land_use_education_percent       float64
land_use_residential_percent     float64
ndvi_mean                        float64
geometry                        geometry
total_population                   int64
elderly_percentage               float64
tree_count                         int64
transit_count                      int64
accident_count                     int64
road_length                      float64
road_density                     float64
cityopenarea_count                 int64
education_count  

2025-04-09 16:59:59,604 - INFO - Loaded existing subgraph for: 湖田里
2025-04-09 16:59:59,612 - INFO - Loaded existing subgraph for: 菁山里
2025-04-09 16:59:59,614 - INFO - Loaded existing subgraph for: 大屯里
2025-04-09 16:59:59,617 - INFO - Loaded existing subgraph for: 平等里
2025-04-09 16:59:59,620 - INFO - Loaded existing subgraph for: 泉源里
2025-04-09 16:59:59,627 - INFO - Loaded existing subgraph for: 湖山里
2025-04-09 16:59:59,629 - INFO - Loaded existing subgraph for: 秀山里
2025-04-09 16:59:59,634 - INFO - Loaded existing subgraph for: 陽明里
2025-04-09 16:59:59,638 - INFO - Loaded existing subgraph for: 溪山里
2025-04-09 16:59:59,641 - INFO - Loaded existing subgraph for: 稻香里
2025-04-09 16:59:59,646 - INFO - Loaded existing subgraph for: 開明里
2025-04-09 16:59:59,649 - INFO - Loaded existing subgraph for: 中和里
2025-04-09 16:59:59,652 - INFO - Loaded existing subgraph for: 中心里
2025-04-09 16:59:59,653 - INFO - Loaded existing subgraph for: 桃源里
2025-04-09 16:59:59,656 - INFO - Loaded existing subgraph for:

KeyError: 'road_density'

In [None]:
import React, { useState, useEffect } from 'react';
import KeplerGl from 'kepler.gl';

const MapWithWalkability = () => {
  const [mapData, setMapData] = useState(null);

  useEffect(() => {
    fetch('/walkability_scores.geojson')
      .then(response => response.json())
      .then(data => {
        setMapData({
          data: data,
          config: {
            version: 'v1',
            config: {
              visState: {
                layers: [
                  {
                    type: 'geojson',
                    config: {
                      dataId: 'walkability_data',
                      label: 'Walkability Scores',
                      columns: { geojson: 'geometry' },
                      isVisible: true,
                      visConfig: {
                        opacity: 0.8,
                        colorRange: {
                          name: 'Global Warming',
                          type: 'sequential',
                          category: 'Uber',
                          colors: ['#5A1846', '#900C3F', '#C70039', '#E3611C', '#F1920E', '#FFC300']
                        },
                        filled: true,
                        colorField: {
                          name: 'walkability_rule',
                          type: 'real'
                        },
                        colorScale: 'quantile'
                      }
                    }
                  }
                ]
              }
            }
          }
        });
      })
      .catch(error => console.error('Error loading walkability scores:', error));
  }, []);

  return (
    <div>
      <KeplerGl
        id="map"
        mapboxApiAccessToken="YOUR_MAPBOX_TOKEN"  // Replace with your actual Mapbox token
        width={800}
        height={600}
        mapData={mapData ? mapData.data : null}
        config={mapData ? mapData.config : {}}
      />
    </div>
  );
};

export default MapWithWalkability;