整個城市(成功)

In [2]:
import geopandas as gpd
import networkx as nx
import os
from shapely.geometry import Point, LineString
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle
import pyproj
from shapely.ops import unary_union, linemerge
import hashlib
from keplergl import KeplerGl
import json
import webbrowser

# Locations of intermediate artifacts and inputs shared by the functions below.
CHECKPOINT_DIR = r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\checkpoints"
SUBGRAPH_DIR = r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\Neighborhood_subgraph"
CONFIG_PATH = r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\kepler.gl.json"
TEMP_HTML_PATH = r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\temp_walkability_map.html"
URBAN_MASTERPLAN_PATH = r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\Taipei_urban_masterplan.geojson"

# exist_ok=True makes the creation idempotent and removes the window between
# an "exists?" check and the actual mkdir.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SUBGRAPH_DIR, exist_ok=True)

# NOTE(review): this token is committed in source. Set the MAPBOX_ACCESS_TOKEN
# environment variable to override it without editing the file; the literal is
# kept as the default so existing runs behave as before.
MAPBOX_ACCESS_TOKEN = os.environ.get(
    "MAPBOX_ACCESS_TOKEN",
    "pk.eyJ1Ijoiam9obm55Ym95NjY5NyIsImEiOiJjbTgyMzZkbjUxZHF2MmlzYTByc3pxZmw0In0.7RASB6M_AczC7q8dvFPWBQ",
)

def twd97_to_wgs84(x, y):
    """Transform an EPSG:3826 (TWD97) coordinate to EPSG:4326.

    Args:
        x: Easting in EPSG:3826.
        y: Northing in EPSG:3826.

    Returns:
        A ``(lat, lon)`` tuple in EPSG:4326.
    """
    to_wgs84 = pyproj.Transformer.from_crs("EPSG:3826", "EPSG:4326", always_xy=True)
    # The transformer is created with always_xy=True, so the result is
    # unpacked as (lon, lat); callers receive the swapped (lat, lon) order.
    lon, lat = to_wgs84.transform(x, y)
    result = (lat, lon)
    return result

def polyline_to_points(linestring, spacing=7):
    """Sample points along a line geometry at a fixed interval.

    Sampling starts at distance 0 and advances by ``spacing`` while the
    running distance does not exceed the line length, so the far end of the
    line is only included when the length is an exact multiple of ``spacing``.

    Args:
        linestring: A geometry exposing ``geom_type``; ``LineString`` and
            ``MultiLineString`` are sampled, anything else yields no points.
        spacing: Distance between consecutive samples, in the units of the
            geometry's coordinates. Must be positive.

    Returns:
        A list of the points returned by ``interpolate`` (empty for
        unsupported or zero-length geometries).

    Raises:
        ValueError: If ``spacing`` is not positive (the sampling loop would
            otherwise never terminate).
    """
    if spacing <= 0:
        raise ValueError(f"spacing must be positive, got {spacing!r}")

    if linestring.geom_type == 'MultiLineString':
        linestring = linemerge(linestring)
        if linestring.geom_type == 'MultiLineString':
            # The parts could not be merged into one line; sample every part
            # on its own instead of discarding the whole geometry.
            points = []
            for part in linestring.geoms:
                points.extend(polyline_to_points(part, spacing))
            return points

    if linestring.geom_type != 'LineString':
        return []

    total_length = linestring.length
    if total_length <= 0:
        return []

    points = []
    distance = 0
    while distance <= total_length:
        points.append(linestring.interpolate(distance))
        distance += spacing
    return points

def compute_data_hash(df):
    """Return an MD5 hex digest of the non-geometry contents of ``df``.

    A column named ``geometry`` is excluded when present. The remaining
    values are cast to ``str`` and hashed per row with
    ``pd.util.hash_pandas_object``; the raw bytes of those row hashes are
    fed to MD5.

    Args:
        df: A pandas (or geopandas) DataFrame.

    Returns:
        The 32-character hexadecimal MD5 digest.
    """
    if 'geometry' in df.columns:
        attributes = df.drop(columns=['geometry'])
    else:
        attributes = df.copy()

    # Stringify first and fill afterwards, in that order, so the digest
    # stays comparable with hashes stored by earlier runs.
    as_text = attributes.astype(str)
    as_text = as_text.fillna('missing')

    row_hashes = pd.util.hash_pandas_object(as_text)
    digest = hashlib.md5(row_hashes.values.tobytes())
    return digest.hexdigest()

def _land_use_columns(category):
    """Return the ``(m2, percent)`` column names for one masterplan category."""
    slug = category.lower()
    return f"land_use_{slug}_m2", f"land_use_{slug}_percent"


def _read_layers(paths, desc, label, vector_kwargs, masterplan_kwargs):
    """Read the six input layers, applying the tree and transit row filters.

    ``vector_kwargs`` is forwarded to ``gpd.read_file`` for the neighborhood
    and building files, ``masterplan_kwargs`` for the urban masterplan file.
    ``label`` is appended to the column-listing message.
    """
    with tqdm(total=6, desc=desc) as pbar:
        neighborhoods = gpd.read_file(paths['neighborhoods'], **vector_kwargs)
        print(f"Columns in neighborhoods after loading{label}:", neighborhoods.columns.tolist())
        pbar.update(1)
        buildings = gpd.read_file(paths['buildings'], **vector_kwargs)
        pbar.update(1)
        roads = gpd.read_parquet(paths['roads'])
        pbar.update(1)
        trees = gpd.read_parquet(paths['trees'])
        trees = trees[trees['subtype'] == 'tree']
        pbar.update(1)
        transit = gpd.read_parquet(paths['transit'])
        transit = transit[transit['class'].isin(['stop_position', 'bus_stop'])]
        pbar.update(1)
        urban_masterplan = gpd.read_file(paths['urban_masterplan'], **masterplan_kwargs)
        urban_masterplan = urban_masterplan.to_crs('EPSG:3826')  # Convert to TWD97
        urban_masterplan['geometry'] = urban_masterplan.geometry.make_valid()  # Ensure valid geometries
        pbar.update(1)
    return neighborhoods, buildings, roads, trees, transit, urban_masterplan


def _stringify_object_columns(gdf):
    """Return ``gdf`` with NaN replaced by None and object columns stringified.

    Columns holding lists, dicts or arrays are converted element-wise with
    ``str`` (None is kept); other object columns are cast to ``str``. The
    ``geometry`` column is left untouched.
    """
    gdf = gdf.replace({np.nan: None})
    for column in gdf.select_dtypes(include=['object']).columns:
        if column == 'geometry':
            continue
        holds_containers = gdf[column].apply(lambda x: isinstance(x, (list, dict, np.ndarray))).any()
        if holds_containers:
            gdf[column] = gdf[column].apply(lambda x: str(x) if x is not None else None)
        else:
            gdf[column] = gdf[column].astype(str).replace('nan', None)
    return gdf


def _count_intersecting(features, polygons):
    """Return, for each polygon, the number of ``features`` intersecting it."""
    return [int(features.intersects(polygon).sum()) for polygon in polygons]


def load_data():
    """Load, clean and filter every input layer for the whole city.

    Steps, in order: read the six layers (retrying with ``errors='ignore'``
    after a ``UnicodeDecodeError``), stringify transit/tree attributes,
    reproject everything to EPSG:3826, drop invalid geometries, turn tree
    polylines into points every 7 units, count trees and transit points per
    neighborhood, prorate urban-masterplan areas onto neighborhoods, clip
    buildings/roads/trees/transit to the neighborhoods' union buffered by 50,
    extract road endpoints, and write six GeoJSON checkpoints to
    ``CHECKPOINT_DIR``.

    Returns:
        ``(selected_neighborhoods, filtered_buildings, filtered_roads,
        road_points, filtered_trees, filtered_transit, urban_masterplan)``
        where ``road_points`` is a list of ``(node_id, Point)`` tuples.

    Raises:
        FileNotFoundError: If any input file is missing.
    """
    print("Checkpoint 1: Loading data for the entire city...")
    paths = {
        'neighborhoods': r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\neighborhoods_with_ndvi_numerical.geojson",
        'buildings': r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\Taipei_Buildings_fulldata.geojson",
        'roads': r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\taipei_segments_cleaned_verified.geoparquet",
        'trees': r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\taipei_land.geoparquet",
        'transit': r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data\taipei_infrastructure.geoparquet",
        'urban_masterplan': URBAN_MASTERPLAN_PATH,
    }

    for path in paths.values():
        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {path}")

    try:
        layers = _read_layers(paths, "Loading files", "", {'encoding': 'utf-8-sig'}, {})
    except UnicodeDecodeError as e:
        print(f"UTF-8-SIG decoding failed: {e}. Attempting with errors='ignore'...")
        # NOTE(review): whether gpd.read_file honours errors='ignore' depends
        # on the installed I/O engine - confirm before relying on this path.
        lenient_kwargs = {'encoding': 'utf-8-sig', 'errors': 'ignore'}
        try:
            layers = _read_layers(paths, "Loading files (fallback)", " (fallback)", lenient_kwargs, lenient_kwargs)
            print("Loaded with errors ignored. Some data may be incomplete.")
        except Exception as e2:
            print(f"Final loading attempt failed: {e2}. Please check file encoding or integrity.")
            raise
    neighborhoods, buildings, roads, trees, transit, urban_masterplan = layers

    # Clean transit and tree attributes for JSON compliance
    transit = _stringify_object_columns(transit)
    trees = _stringify_object_columns(trees)

    neighborhoods = neighborhoods.to_crs('EPSG:3826')
    buildings = buildings.to_crs('EPSG:3826')
    roads = roads.to_crs('EPSG:3826')
    trees = trees.to_crs('EPSG:3826')
    transit = transit.to_crs('EPSG:3826')
    urban_masterplan = urban_masterplan.to_crs('EPSG:3826')

    neighborhoods = neighborhoods[neighborhoods.geometry.is_valid]
    buildings = buildings[buildings.geometry.is_valid].copy()
    roads = roads[roads.geometry.is_valid]
    trees = trees[trees.geometry.is_valid]
    transit = transit[transit.geometry.is_valid]
    urban_masterplan = urban_masterplan[urban_masterplan.geometry.is_valid]

    buildings['area_m2'] = buildings.geometry.area.round(1)
    buildings['building'] = buildings['building'].fillna('unknown').replace('yes', 'unknown')

    # Trees mapped as lines become a row of points, one every 7 units.
    tree_points = trees[trees.geometry.geom_type == 'Point']
    tree_polylines = trees[trees.geometry.geom_type.isin(['LineString', 'MultiLineString'])]
    converted_points = []
    for geom in tree_polylines.geometry:
        for point in polyline_to_points(geom, spacing=7):
            converted_points.append({'geometry': point})

    if converted_points:
        converted_points_gdf = gpd.GeoDataFrame(converted_points, crs='EPSG:3826')
        all_trees = pd.concat([tree_points[['geometry']], converted_points_gdf], ignore_index=True)
    else:
        all_trees = tree_points[['geometry']].copy()
    all_trees = gpd.GeoDataFrame(all_trees, crs='EPSG:3826')

    print("Building type distribution after cleaning:")
    print(buildings['building'].value_counts())
    print(f"Warren: Total number of neighborhoods in dataset: {len(neighborhoods)}")
    print(f"Total number of tree points loaded: {len(tree_points)}")
    print(f"Total number of tree polylines loaded: {len(tree_polylines)}")
    print(f"Total number of trees after converting polylines: {len(all_trees)}")
    print(f"Total number of transit points loaded: {len(transit)}")
    print(f"Total number of urban masterplan polygons loaded: {len(urban_masterplan)}")

    # Use all neighborhoods for the whole city
    selected_neighborhoods = neighborhoods.copy()
    print(f"Analyzing all {len(selected_neighborhoods)} neighborhoods in the dataset.")

    selected_neighborhoods['tree_count'] = _count_intersecting(all_trees, selected_neighborhoods.geometry)
    selected_neighborhoods['transit_count'] = _count_intersecting(transit, selected_neighborhoods.geometry)

    # Segregate urban masterplan by neighborhood boundaries
    unique_categories = urban_masterplan['Category'].unique()
    land_use_columns = [(category, *_land_use_columns(category)) for category in unique_categories]
    for _, attr_name_m2, attr_name_percent in land_use_columns:
        selected_neighborhoods[attr_name_m2] = 0.0
        selected_neighborhoods[attr_name_percent] = 0.0

    # Accumulate by row index rather than by LIE_NAME so that two
    # neighborhoods sharing a name are never credited with each other's area.
    neighborhood_geoms = list(selected_neighborhoods.geometry.items())
    for _, plan_row in tqdm(urban_masterplan.iterrows(), total=len(urban_masterplan),
                            desc="Segregating urban masterplan polygons"):
        masterplan_geom = plan_row['geometry']
        masterplan_area = masterplan_geom.area
        if masterplan_area <= 0:
            # A zero-area polygon cannot contribute a positive intersection.
            continue
        attr_name_m2, _ = _land_use_columns(plan_row['Category'])
        declared_area = plan_row['Area']
        for neigh_idx, neigh_geom in neighborhood_geoms:
            if not masterplan_geom.intersects(neigh_geom):
                continue
            intersection_area = masterplan_geom.intersection(neigh_geom).area
            if intersection_area <= 0:
                continue
            # Share of the polygon's declared 'Area' proportional to the overlap.
            prorated_area = (intersection_area / masterplan_area) * declared_area
            selected_neighborhoods.at[neigh_idx, attr_name_m2] += prorated_area

    # Calculate percentages for each category
    neighborhood_area = selected_neighborhoods.geometry.area
    for _, attr_name_m2, attr_name_percent in land_use_columns:
        share = selected_neighborhoods[attr_name_m2] / neighborhood_area * 100
        selected_neighborhoods[attr_name_percent] = share.where(neighborhood_area > 0, 0.0)

    print("Tree, transit, and urban masterplan counts per neighborhood:")
    for _, row in selected_neighborhoods.iterrows():
        print(f"- {row['LIE_NAME']}: {row['tree_count']} trees, {row['transit_count']} transit points")
        for category, attr_name_m2, attr_name_percent in land_use_columns:
            print(f"  - {category} Area: {row[attr_name_m2]:.2f}m² ({row[attr_name_percent]:.2f}%)")

    city_center_boundary = unary_union(selected_neighborhoods['geometry'])
    buffer_distance = 50
    buffered_geom = city_center_boundary.buffer(buffer_distance)

    filtered_buildings = buildings[buildings.intersects(buffered_geom)]
    filtered_roads = roads[roads.intersects(buffered_geom)].copy()
    filtered_trees = all_trees[all_trees.intersects(buffered_geom)]
    filtered_transit = transit[transit.intersects(buffered_geom)]

    filtered_roads['length_m'] = filtered_roads.geometry.length.round(1)
    print("Road length distribution (in meters):")
    print(filtered_roads['length_m'].describe())

    # Node ids embed the road's index label; MultiLineString parts add a
    # trailing part number.
    road_points = []
    for idx, geom in tqdm(filtered_roads.geometry.items(), total=len(filtered_roads),
                          desc="Extracting road endpoints"):
        if geom.geom_type == 'LineString':
            road_points.append((f"road_start_{idx}", Point(geom.coords[0])))
            road_points.append((f"road_end_{idx}", Point(geom.coords[-1])))
        elif geom.geom_type == 'MultiLineString':
            for i, line in enumerate(geom.geoms):
                road_points.append((f"road_start_{idx}_{i}", Point(line.coords[0])))
                road_points.append((f"road_end_{idx}_{i}", Point(line.coords[-1])))

    filtered_buildings = filtered_buildings[filtered_buildings.geometry.is_valid]
    filtered_roads = filtered_roads[filtered_roads.geometry.is_valid]
    filtered_trees = filtered_trees[filtered_trees.geometry.is_valid]
    filtered_transit = filtered_transit[filtered_transit.geometry.is_valid]
    urban_masterplan = urban_masterplan[urban_masterplan.geometry.is_valid]

    checkpoint_layers = [
        ("selected_neighborhoods_filtered.geojson", selected_neighborhoods),
        ("selected_buildings_filtered.geojson", filtered_buildings),
        ("selected_roads_filtered.geojson", filtered_roads),
        ("selected_trees_filtered.geojson", filtered_trees),
        ("selected_transit_filtered.geojson", filtered_transit),
        ("selected_urban_masterplan_filtered.geojson", urban_masterplan),
    ]
    for file_name, layer in checkpoint_layers:
        layer.to_file(os.path.join(CHECKPOINT_DIR, file_name), driver='GeoJSON')

    print(f"Data loaded and filtered for the entire city. Neighborhoods: {len(selected_neighborhoods)}, Buildings: {len(filtered_buildings)}, Roads: {len(filtered_roads)}, Trees: {len(filtered_trees)}, Transit Points: {len(filtered_transit)}, Urban Masterplan Polygons: {len(urban_masterplan)}")
    return selected_neighborhoods, filtered_buildings, filtered_roads, road_points, filtered_trees, filtered_transit, urban_masterplan

def _connect_nearby_road_nodes(graph, road_entries, class_weights, tolerance):
    """Add a 'road' edge between every pair of road nodes within ``tolerance``.

    ``road_entries`` is a list of ``(node_id, geometry, road_class)``. The edge
    weight is the distance scaled by the mean of the two class weights;
    classes missing from ``class_weights`` use the 'unknown' weight.
    """
    from itertools import combinations

    default_weight = class_weights['unknown']
    # NOTE(review): every pair is tested, so cost grows quadratically with
    # the number of road nodes.
    for (node1_id, geom1, class1), (node2_id, geom2, class2) in combinations(road_entries, 2):
        distance = geom1.distance(geom2)
        if distance <= tolerance:
            weight1 = class_weights.get(class1, default_weight)
            weight2 = class_weights.get(class2, default_weight)
            weight = distance * (weight1 + weight2) / 2
            graph.add_edge(node1_id, node2_id, weight=weight, type='road')


def build_graph(neighborhoods, buildings, roads, road_points, trees, transit, urban_masterplan):
    """Build one subgraph per neighborhood plus a city-wide road network.

    Each subgraph holds the neighborhood node, the buildings, road endpoints,
    trees and transit points intersecting a 200-unit buffer around the
    neighborhood, and the urban-masterplan polygons intersecting the
    neighborhood itself, joined by distance-weighted edges. Every subgraph is
    pickled to ``SUBGRAPH_DIR`` and the road network to ``CHECKPOINT_DIR``.

    Args:
        neighborhoods: GeoDataFrame with ``LIE_NAME``, ``SECT_NAME``,
            ``2024population``, NDVI, count and ``land_use_*`` columns.
        buildings: GeoDataFrame of building footprints and attributes.
        roads: GeoDataFrame indexed by the integer labels embedded in the
            road node ids; must contain ``length_m``.
        road_points: List of ``(node_id, Point)`` with ids shaped like
            ``road_start_<index>`` / ``road_end_<index>[_<part>]``.
        trees: GeoDataFrame of tree points.
        transit: GeoDataFrame of transit points with a ``class`` column.
        urban_masterplan: GeoDataFrame with ``Category`` and ``Area``.

    Returns:
        ``(subgraphs, road_network)``: a dict keyed by ``LIE_NAME`` and the
        combined road ``nx.Graph``.
    """
    print("Checkpoint 2: Building graph network for the entire city with urban masterplan...")
    subgraphs = {}
    road_network = nx.Graph()

    # Loop-invariant configuration. Defining it before the neighborhood loop
    # also keeps it available to the city-wide pass when there are no
    # neighborhoods at all.
    road_class_weights = {
        'footway': 0.5, 'pedestrian': 0.5, 'cycleway': 0.6, 'steps': 0.6, 'living_street': 0.6,
        'path': 0.7, 'track': 0.7, 'residential': 0.8, 'service': 1.0, 'unclassified': 1.2,
        'tertiary': 1.5, 'secondary': 2.0, 'primary': 2.5, 'highway': 3.5, 'motorway': 2.5,
        'trunk': 2.5, 'unknown': 1.0
    }
    tolerance = 10
    buffer_distance = 200

    land_use_attrs = []
    for category in urban_masterplan['Category'].unique():
        slug = category.lower()
        land_use_attrs.append((f"land_use_{slug}_m2", f"land_use_{slug}_percent"))

    class_column = next(
        (col for col in roads.columns if col.lower() in ['class', 'road_class', 'highway']),
        None,
    )
    if class_column is None:
        # Without a class column every road is treated as 'unknown' instead
        # of looking up a column that does not exist.
        print("No road classification column found; treating every road as 'unknown'.")
    else:
        print(f"Using column '{class_column}' for road classification.")

    road_attr_cache = {}

    def _road_attributes(road_node):
        """Return ``(road_class, length_m)`` for a road endpoint node id."""
        if road_node not in road_attr_cache:
            road_idx = int(road_node.split('_')[2])
            road_class = 'unknown'
            if class_column is not None:
                raw_class = roads.loc[road_idx, class_column]
                if pd.notna(raw_class):
                    road_class = raw_class
            raw_length = roads.loc[road_idx, 'length_m']
            length_m = raw_length if pd.notna(raw_length) else 0.0
            road_attr_cache[road_node] = (road_class, length_m)
        return road_attr_cache[road_node]

    def _or_default(value, default):
        """Return ``value`` unless pandas considers it missing."""
        return value if pd.notna(value) else default

    for _, neighborhood_row in tqdm(neighborhoods.iterrows(), total=len(neighborhoods), desc="Processing neighborhoods"):
        lie_name = neighborhood_row['LIE_NAME']
        G_sub = nx.Graph()

        neigh_node = f"neighborhood_{lie_name}"
        G_sub.add_node(neigh_node,
                       type='neighborhood',
                       lie_name=lie_name,
                       sect_name=neighborhood_row['SECT_NAME'],
                       population=neighborhood_row['2024population'],
                       land_use_residential_percent=neighborhood_row['land_use_residential_percent'],
                       land_use_commercial_percent=neighborhood_row['land_use_commercial_percent'],
                       land_use_education_percent=neighborhood_row['land_use_education_percent'],
                       ndvi_mean=neighborhood_row['ndvi_mean'],
                       ndvi_median=neighborhood_row['ndvi_median'],
                       tree_count=neighborhood_row['tree_count'],
                       transit_count=neighborhood_row['transit_count'],
                       geometry=neighborhood_row['geometry'])
        # Add urban masterplan attributes
        for attr_name_m2, attr_name_percent in land_use_attrs:
            G_sub.nodes[neigh_node][attr_name_m2] = _or_default(neighborhood_row[attr_name_m2], 0.0)
            G_sub.nodes[neigh_node][attr_name_percent] = _or_default(neighborhood_row[attr_name_percent], 0.0)

        neigh_geom = neighborhood_row['geometry']
        neigh_buffer = neigh_geom.buffer(buffer_distance)

        building_nodes = {}
        for idx, row in buildings[buildings.intersects(neigh_buffer)].iterrows():
            building_node = f"building_{idx}"
            G_sub.add_node(building_node,
                           type='building',
                           building_type=_or_default(row['building'], 'unknown'),
                           area_m2=_or_default(row['area_m2'], 0.0),
                           age=_or_default(row['屋齡'], '<NA>'),
                           height=_or_default(row['建物高度'], '<NA>'),
                           floors=_or_default(row['地上層數'], '<NA>'),
                           structure_type=_or_default(row['構造種類'], 'Unknown'),
                           usage_zone=_or_default(row['使用分區'], 'Unknown'),
                           geometry=row['geometry'])
            building_nodes[building_node] = row['geometry']

        road_nodes = {}
        road_classes = {}
        for road_node, geom in road_points:
            if not geom.intersects(neigh_buffer):
                continue
            road_class, length_m = _road_attributes(road_node)
            G_sub.add_node(road_node, type='road', road_class=road_class, length_m=length_m, geometry=geom)
            road_nodes[road_node] = geom
            road_classes[road_node] = road_class
            road_network.add_node(road_node, geometry=geom, road_class=road_class, length_m=length_m)

        tree_nodes = {}
        for idx, row in trees[trees.intersects(neigh_buffer)].iterrows():
            tree_node = f"tree_{idx}"
            G_sub.add_node(tree_node, type='tree', geometry=row['geometry'])
            tree_nodes[tree_node] = row['geometry']

        transit_nodes = {}
        for idx, row in transit[transit.intersects(neigh_buffer)].iterrows():
            transit_node = f"transit_{idx}"
            # 'class' is a Python keyword, so it is passed through a dict.
            G_sub.add_node(transit_node, type='transit', **{'class': row['class'], 'geometry': row['geometry']})
            transit_nodes[transit_node] = row['geometry']

        # Add urban masterplan nodes and connections
        for idx, row in urban_masterplan.iterrows():
            masterplan_geom = row['geometry']
            if not masterplan_geom.intersects(neigh_geom):
                continue
            plan_node = f"urban_plan_{idx}"
            G_sub.add_node(plan_node,
                           type='urban_plan',
                           category=row['Category'],
                           area=row['Area'],
                           geometry=masterplan_geom)
            plan_centroid = masterplan_geom.centroid
            # Connect to neighborhood
            G_sub.add_edge(plan_node, neigh_node,
                           weight=neigh_geom.centroid.distance(plan_centroid), type='urban_plan')

            # Connect to other node types within buffer
            urban_plan_buffer = masterplan_geom.buffer(50)  # 50-meter buffer
            for building_node, build_geom in building_nodes.items():
                if urban_plan_buffer.intersects(build_geom):
                    G_sub.add_edge(plan_node, building_node,
                                   weight=plan_centroid.distance(build_geom), type='urban_plan_to_building')
            for road_node, road_geom in road_nodes.items():
                if isinstance(road_geom, Point) and urban_plan_buffer.intersects(road_geom):
                    G_sub.add_edge(plan_node, road_node,
                                   weight=plan_centroid.distance(road_geom), type='urban_plan_to_road')
            for tree_node, tree_geom in tree_nodes.items():
                if urban_plan_buffer.intersects(tree_geom):
                    G_sub.add_edge(plan_node, tree_node,
                                   weight=plan_centroid.distance(tree_geom), type='urban_plan_to_tree')
            for transit_node, transit_geom in transit_nodes.items():
                if urban_plan_buffer.intersects(transit_geom):
                    G_sub.add_edge(plan_node, transit_node,
                                   weight=plan_centroid.distance(transit_geom), type='urban_plan_to_transit')

        for building_node, build_geom in building_nodes.items():
            G_sub.add_edge(neigh_node, building_node, weight=neigh_geom.distance(build_geom), type='walk')

        for road_node, road_geom in road_nodes.items():
            if isinstance(road_geom, Point):
                G_sub.add_edge(neigh_node, road_node, weight=neigh_geom.distance(road_geom), type='walk')

        # Buildings link to road endpoints lying within 50 units of them.
        for building_node, build_geom in building_nodes.items():
            build_buffer = build_geom.buffer(50)
            for road_node, road_geom in road_nodes.items():
                if isinstance(road_geom, Point) and build_buffer.intersects(road_geom):
                    G_sub.add_edge(building_node, road_node, weight=build_geom.distance(road_geom), type='walk')

        for tree_node, tree_geom in tree_nodes.items():
            G_sub.add_edge(neigh_node, tree_node, weight=neigh_geom.distance(tree_geom), type='natural')

        for transit_node, transit_geom in transit_nodes.items():
            G_sub.add_edge(neigh_node, transit_node, weight=neigh_geom.distance(transit_geom), type='transit')

        _connect_nearby_road_nodes(
            G_sub,
            [(road_node, geom, road_classes[road_node]) for road_node, geom in road_nodes.items()],
            road_class_weights,
            tolerance,
        )

        subgraphs[lie_name] = G_sub
        print(f"Subgraph for {lie_name}: {G_sub.number_of_nodes()} nodes, {G_sub.number_of_edges()} edges")

        subgraph_path = os.path.join(SUBGRAPH_DIR, f"subgraph_{lie_name}.pkl")
        with open(subgraph_path, 'wb') as f:
            pickle.dump(G_sub, f)

    print("Building road network for inter-neighborhood connections...")
    _connect_nearby_road_nodes(
        road_network,
        [(node, data['geometry'], data['road_class']) for node, data in road_network.nodes(data=True)],
        road_class_weights,
        tolerance,
    )

    with open(os.path.join(CHECKPOINT_DIR, "road_network.pkl"), 'wb') as f:
        pickle.dump(road_network, f)

    print(f"Total road network nodes: {road_network.number_of_nodes()}, edges: {road_network.number_of_edges()}")
    return subgraphs, road_network

def calculate_walkability(subgraphs, neighborhoods, road_network):
    """Score every neighborhood node and persist the results.

    The score combines land-use percentages, mean NDVI, tree count, transit
    count and the ``land_use_city_open_area_m2`` attribute, capped at 1.0.
    It is stored on the neighborhood node, written into the ``walkability``
    column of ``neighborhoods`` (matched on ``LIE_NAME``), and each subgraph
    is pickled to ``SUBGRAPH_DIR`` with a ``_with_walkability`` suffix. All
    scores are also pickled to ``CHECKPOINT_DIR``.

    Args:
        subgraphs: Dict mapping ``LIE_NAME`` to its ``nx.Graph``.
        neighborhoods: DataFrame updated in place with the scores.
        road_network: Accepted for signature compatibility; not used here.

    Returns:
        The same ``subgraphs`` dict, with node attributes updated.
    """
    print("Checkpoint 3: Calculating walkability scores for the entire city...")

    def _present_or(attrs, key, fallback):
        # Missing values (as judged by pandas) fall back to a neutral default.
        raw = attrs[key]
        return raw if pd.notna(raw) else fallback

    walkability_scores = {}
    progress = tqdm(subgraphs.items(), total=len(subgraphs), desc="Calculating walkability")
    for lie_name, G_sub in progress:
        for node in G_sub.nodes():
            attrs = G_sub.nodes[node]
            if attrs.get('type') != 'neighborhood':
                continue

            residential = _present_or(attrs, 'land_use_residential_percent', 0.0)
            commercial = _present_or(attrs, 'land_use_commercial_percent', 0.0)
            education = _present_or(attrs, 'land_use_education_percent', 0.0)
            ndvi = _present_or(attrs, 'ndvi_mean', 0.0)
            tree_count = _present_or(attrs, 'tree_count', 0)
            transit_count = _present_or(attrs, 'transit_count', 0)

            # Urban masterplan contribution: open area saturates at 10000.
            open_area = attrs.get('land_use_city_open_area_m2', 0.0)
            open_space_score = min(1.0, open_area / 10000) * 0.2

            land_use_score = (residential * 0.4 + commercial * 0.3 + education * 0.2) / 100
            ndvi_score = ndvi * 0.5
            tree_score = min(1.0, tree_count / 100) * 0.2
            transit_score = min(1.0, transit_count / 20) * 0.2

            total = land_use_score + (ndvi_score * 0.4) + tree_score + transit_score + open_space_score
            walkability = min(1.0, total)

            G_sub.nodes[node]['walkability'] = walkability
            walkability_scores[lie_name] = walkability

            same_name = neighborhoods['LIE_NAME'] == lie_name
            neighborhoods.loc[same_name, 'walkability'] = walkability

        target = os.path.join(SUBGRAPH_DIR, f"subgraph_{lie_name}_with_walkability.pkl")
        with open(target, 'wb') as handle:
            pickle.dump(G_sub, handle)

    scores_path = os.path.join(CHECKPOINT_DIR, "walkability_scores.pkl")
    with open(scores_path, 'wb') as handle:
        pickle.dump(walkability_scores, handle)

    print("Walkability scores calculated for the entire city.")
    return subgraphs

def update_subgraphs(changed_neighborhoods, neighborhoods, buildings, roads, road_points, trees, transit, urban_masterplan):
    """Load cached subgraphs and rebuild the ones that are stale or missing.

    A neighborhood is rebuilt with ``build_graph`` when its
    ``_with_walkability`` pickle is absent or when it is selected for
    recomputation. The selection is:

    * every neighborhood, if no stored data hashes exist, or if a hash
      differs and ``changed_neighborhoods`` is empty;
    * exactly ``changed_neighborhoods`` when it is non-empty;
    * nothing otherwise.

    Args:
        changed_neighborhoods: ``LIE_NAME`` values to force-rebuild. May be
            ``None`` or empty; a single string is treated as one name.
        neighborhoods, buildings, roads, road_points, trees, transit,
        urban_masterplan: Inputs forwarded to ``build_graph``.

    Returns:
        ``(subgraphs, road_network)``. ``road_network`` is ``None`` only when
        no cached network existed and nothing was rebuilt.
    """
    print("Updating subgraphs for the entire city...")
    subgraphs = {}
    road_network_path = os.path.join(CHECKPOINT_DIR, "road_network.pkl")
    road_network = None

    # NOTE: pickle.load runs arbitrary code embedded in the file; these
    # checkpoints must only ever be files written by this pipeline.
    if os.path.exists(road_network_path):
        with open(road_network_path, 'rb') as f:
            road_network = pickle.load(f)

    data_hashes = {
        'neighborhoods': compute_data_hash(neighborhoods),
        'buildings': compute_data_hash(buildings),
        'roads': compute_data_hash(roads),
        'trees': compute_data_hash(trees),
        'transit': compute_data_hash(transit),
        'urban_masterplan': compute_data_hash(urban_masterplan)
    }
    hash_path = os.path.join(CHECKPOINT_DIR, "data_hashes.pkl")

    all_names = list(neighborhoods['LIE_NAME'])
    # Normalise the request so that None cannot break the membership test
    # below and a bare string is not matched by substring.
    if changed_neighborhoods is None:
        requested = []
    elif isinstance(changed_neighborhoods, str):
        requested = [changed_neighborhoods]
    else:
        requested = list(changed_neighborhoods)

    changed = False
    to_recompute = requested
    if os.path.exists(hash_path):
        with open(hash_path, 'rb') as f:
            stored_hashes = pickle.load(f)
        data_changed = any(data_hashes[key] != stored_hashes.get(key) for key in data_hashes)
        if data_changed or requested:
            changed = True
            # NOTE(review): when the data changed but an explicit list was
            # given, only that list is rebuilt while the new hashes are still
            # stored - confirm this is the intended trade-off.
            to_recompute = requested if requested else all_names
    else:
        changed = True
        to_recompute = all_names

    recompute = set(to_recompute)
    for lie_name in all_names:
        subgraph_path = os.path.join(SUBGRAPH_DIR, f"subgraph_{lie_name}_with_walkability.pkl")
        if os.path.exists(subgraph_path) and lie_name not in recompute:
            with open(subgraph_path, 'rb') as f:
                subgraphs[lie_name] = pickle.load(f)
            continue

        print(f"Recomputing subgraph for {lie_name}...")
        temp_neighborhoods = neighborhoods[neighborhoods['LIE_NAME'] == lie_name]
        temp_subgraphs, temp_road_network = build_graph(temp_neighborhoods, buildings, roads, road_points, trees, transit, urban_masterplan)
        subgraphs[lie_name] = temp_subgraphs[lie_name]
        if road_network is None:
            road_network = temp_road_network
        else:
            road_network.add_nodes_from(temp_road_network.nodes(data=True))
            road_network.add_edges_from(temp_road_network.edges(data=True))

    if road_network is not None:
        with open(road_network_path, 'wb') as f:
            pickle.dump(road_network, f)
    else:
        # Never persist None as if it were a road network.
        print("No road network available: nothing cached and nothing recomputed.")

    if changed:
        with open(hash_path, 'wb') as f:
            pickle.dump(data_hashes, f)

    return subgraphs, road_network

def reconstruct_full_graph(subgraphs, road_network):
    """Combine all neighborhood subgraphs and the road network into one graph.

    Args:
        subgraphs: Dict of per-neighborhood graphs; only its values are used.
        road_network: Graph of road nodes and edges, added after the
            subgraphs.

    Returns:
        A new ``nx.Graph`` holding the nodes and edges (with their data) of
        every input graph.
    """
    print("Reconstructing full graph from subgraphs for the entire city...")

    # Subgraphs are merged in dict order, the road network last.
    components = list(subgraphs.values())
    components.append(road_network)

    full_graph = nx.Graph()
    for component in components:
        full_graph.add_nodes_from(component.nodes(data=True))
        full_graph.add_edges_from(component.edges(data=True))

    node_total = full_graph.number_of_nodes()
    edge_total = full_graph.number_of_edges()
    print(f"Full graph reconstructed: {node_total} nodes, {edge_total} edges")
    return full_graph

def create_interactive_map(G, neighborhoods, buildings, roads, trees, transit, urban_masterplan):
    """Build the city-level Kepler.gl map and save it as an HTML file.

    The six input layers are reprojected to EPSG:4326, trimmed to the columns
    used for display, and added to a Kepler.gl map. If no saved Kepler
    configuration exists at ``CONFIG_PATH``, a temporary HTML map is opened in
    the browser and the function blocks on ``input()`` until the user has
    exported one. The configuration is then loaded, its layers are forced
    visible and pointed at the ``geometry`` column, and the final map is
    written to ``walkability_map_city_level.html``.

    Args:
        G: Graph whose nodes with ``type == 'neighborhood'`` carry a
            ``lie_name`` attribute and optionally ``walkability`` and
            ``transit_count`` (both default to 0 when absent).
        neighborhoods: Geo table with at least ``LIE_NAME`` and ``geometry``.
        buildings: Geo table with the building columns selected below.
        roads: Geo table with ``class``, ``length_m`` and ``geometry``.
        trees: Geo table; only ``geometry`` is used.
        transit: Geo table with ``id``, ``class`` and ``geometry``.
        urban_masterplan: Geo table with ``Category``, ``Area`` and ``geometry``.

    Raises:
        ValueError: If the Mapbox token is empty or does not contain ``"pk."``.
        FileNotFoundError: If no configuration file exists after the prompt.
        Exception: Any other failure is printed and re-raised unchanged.
    """
    from pathlib import Path  # local import: only needed to build a valid file URI

    print("Generating interactive Kepler.gl map for the entire city...")
    try:
        # Verify Mapbox token
        if not MAPBOX_ACCESS_TOKEN or "pk." not in MAPBOX_ACCESS_TOKEN:
            raise ValueError("Invalid or missing Mapbox access token. Please provide a valid token.")

        neighborhoods_wgs84 = neighborhoods.to_crs('EPSG:4326')
        buildings_wgs84 = buildings.to_crs('EPSG:4326')
        roads_wgs84 = roads.to_crs('EPSG:4326')
        trees_wgs84 = trees.to_crs('EPSG:4326')
        transit_wgs84 = transit.to_crs('EPSG:4326')
        urban_masterplan_wgs84 = urban_masterplan.to_crs('EPSG:4326')

        # Debug: Inspect datasets
        debug_frames = (
            ("Neighborhoods", neighborhoods_wgs84),
            ("Buildings", buildings_wgs84),
            ("Roads", roads_wgs84),
            ("Trees", trees_wgs84),
            ("Transit", transit_wgs84),
            ("Urban Masterplan", urban_masterplan_wgs84),
        )
        for label, frame in debug_frames:
            print(f"{label} columns:", frame.columns.tolist())
            print(f"{label} sample:", frame.head(1))

        # Include all columns for table view
        all_cols = neighborhoods_wgs84.columns.tolist()
        neighborhoods_data = neighborhoods_wgs84[all_cols].copy()

        # LIE_NAME is not modified inside the loop, so strip it once instead
        # of re-stripping the whole column twice per neighbourhood node.
        stripped_names = neighborhoods_data['LIE_NAME'].str.strip()
        for _, attrs in G.nodes(data=True):
            if attrs.get('type') != 'neighborhood':
                continue
            row_mask = stripped_names == attrs['lie_name'].strip()
            neighborhoods_data.loc[row_mask, 'walkability'] = round(attrs.get('walkability', 0), 2)
            neighborhoods_data.loc[row_mask, 'transit_count'] = attrs.get('transit_count', 0)

        # Only the original columns are coerced; walkability and transit_count
        # were added after all_cols was captured and are left as assigned.
        numeric_cols = [col for col in all_cols if col not in ['geometry', 'LIE_NAME', 'SECT_NAME']]
        for col in numeric_cols:
            neighborhoods_data[col] = pd.to_numeric(neighborhoods_data[col], errors='coerce').round(2)

        # Remove 'type' column if it exists
        if 'type' in neighborhoods_data.columns:
            neighborhoods_data = neighborhoods_data.drop(columns=['type'])

        # Prepare GeoJSON data with consistent structure
        geojson_data = {
            'type': 'FeatureCollection',
            'features': []
        }
        all_fields = set()
        for _, row in neighborhoods_data.iterrows():
            properties = row.drop('geometry').to_dict()
            properties.pop('type', None)
            # Missing and zero-valued properties are omitted from each feature.
            filtered_properties = {k: v for k, v in properties.items() if pd.notna(v) and v != 0}
            geometry = row['geometry']
            geojson_data['features'].append({
                'type': 'Feature',
                'properties': filtered_properties,
                'geometry': geometry.__geo_interface__ if geometry is not None else None
            })
            all_fields.update(filtered_properties.keys())

        # Diagnostic only: the sample neighbourhood may be absent from the
        # dataset, so look it up without indexing into a possibly empty list.
        sample_properties = next(
            (f['properties'] for f in geojson_data['features'] if f['properties'].get('LIE_NAME') == '和安里'),
            None
        )
        if sample_properties is not None:
            print("GeoJSON properties for 和安里:", sample_properties)
        else:
            print("GeoJSON properties for 和安里: neighborhood not found in dataset")

        fields_to_show = sorted(all_fields)
        print("Fields to show (sorted):", fields_to_show)

        # Prepare buildings data with original column names for consistency with config
        buildings_data = buildings_wgs84[['full_id', 'osm_id', 'building', '屋齡', '建物高度', '地上層數', '構造種類', '使用分區', 'geometry', 'area_m2']].copy()
        # Unparseable or missing heights / floor counts fall back to 10 and 3.
        buildings_data['建物高度'] = pd.to_numeric(buildings_data['建物高度'], errors='coerce').fillna(10).round(1)
        buildings_data['地上層數'] = pd.to_numeric(buildings_data['地上層數'], errors='coerce').fillna(3).round(0)
        buildings_data = buildings_data[buildings_data['geometry'].notna()]

        roads_data = roads_wgs84[['class', 'length_m', 'geometry']].copy()
        roads_data['class'] = roads_data['class'].fillna('unknown')
        roads_data['length_m'] = roads_data['length_m'].round(1)
        roads_data = roads_data[roads_data['geometry'].notna()]

        trees_data = trees_wgs84[['geometry']].copy()
        trees_data['height_m'] = 10
        trees_data = trees_data[trees_data['geometry'].notna()]

        transit_data = transit_wgs84[['id', 'class', 'geometry']].copy()
        transit_data['size'] = 10
        transit_data = transit_data[transit_data['geometry'].notna()]

        # Prepare urban masterplan data
        urban_masterplan_data = urban_masterplan_wgs84[['Category', 'Area', 'geometry']].copy()
        urban_masterplan_data = urban_masterplan_data[urban_masterplan_data['geometry'].notna()]

        # Dataset names must match the dataId values referenced by the saved config.
        datasets = {
            "Neighborhoods": geojson_data,
            "Buildings": buildings_data,
            "Roads": roads_data,
            "Trees": trees_data,
            "Transit": transit_data,
            "Urban_Masterplan": urban_masterplan_data,
        }

        # Initialize Kepler.gl map
        map_1 = KeplerGl(height=600, width=800, mapbox_api_access_token=MAPBOX_ACCESS_TOKEN)

        # Add data to the map with matching names
        for dataset_name, dataset in datasets.items():
            map_1.add_data(data=dataset, name=dataset_name)

        # Check if a saved configuration exists
        if not os.path.exists(CONFIG_PATH):
            print("No saved configuration found. Generating a temporary HTML for customization...")
            # Save a temporary HTML file for customization
            map_1.save_to_html(file_name=TEMP_HTML_PATH)

            # Open the temporary HTML in the default browser. Path.as_uri()
            # yields a well-formed file:/// URI (forward slashes, escaping)
            # even for Windows drive-letter paths.
            print(f"Opening {TEMP_HTML_PATH} in your browser. Please customize the map, then export the configuration as kepler.gl.json.")
            webbrowser.open(Path(os.path.abspath(TEMP_HTML_PATH)).as_uri())

            # Prompt the user to save the configuration
            print("After customizing, click the 'Share' button in the Kepler.gl interface, then 'Export Configuration' to save as 'kepler.gl.json' in:")
            print(r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data")
            print("Press Enter once you have saved the configuration file (kepler.gl.json), or Ctrl+C to cancel.")
            input()  # Wait for user input to proceed

            # Check if the configuration file was saved
            if not os.path.exists(CONFIG_PATH):
                raise FileNotFoundError(f"Configuration file {CONFIG_PATH} not found. Please save the configuration and try again.")

        # Load the saved configuration. The encoding is explicit because the
        # config references non-ASCII field names and the platform default
        # encoding (e.g. a Windows code page) may not be UTF-8.
        print(f"Loading configuration from {CONFIG_PATH}...")
        with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
            custom_config = json.load(f)

        # Force the known layers visible and point them at the prepared
        # geometry column; layers bound to any other dataset are left as-is.
        for layer in custom_config['config']['visState']['layers']:
            layer_config = layer['config']
            data_id = layer_config['dataId']
            if data_id not in datasets:
                continue
            layer_config['isVisible'] = True
            layer_config['columns'] = {'geojson': 'geometry'}
            if data_id == "Buildings":
                channels = layer.setdefault('visualChannels', {})
                channels['colorField'] = {'name': 'building', 'type': 'string'}
                channels['heightField'] = {'name': '建物高度', 'type': 'real'}
            elif data_id == "Urban_Masterplan":
                channels = layer.setdefault('visualChannels', {})
                channels['colorField'] = {'name': 'Category', 'type': 'string'}
                channels['opacityField'] = {'name': 'Area', 'type': 'real'}

        # Re-initialize the map with the updated configuration
        map_1 = KeplerGl(
            height=600,
            width=800,
            mapbox_api_access_token=MAPBOX_ACCESS_TOKEN,
            config=custom_config
        )

        # Add data again to the map with the loaded configuration
        for dataset_name, dataset in datasets.items():
            map_1.add_data(data=dataset, name=dataset_name)

        # Save the final HTML with the applied configuration
        output_path = os.path.join(r"D:\IAAC\Thesis\Python\MLloading\Geojson\GNN_Read_data", "walkability_map_city_level.html")
        map_1.save_to_html(file_name=output_path)
        print(f"Interactive Kepler.gl map saved successfully at {output_path}")

        # Clean up the temporary HTML file
        if os.path.exists(TEMP_HTML_PATH):
            os.remove(TEMP_HTML_PATH)
            print(f"Temporary file {TEMP_HTML_PATH} removed.")

    except Exception as e:
        print(f"Error generating Kepler.gl map: {e}")
        raise

def main():
    """Run the city-wide pipeline: load data, update subgraphs, score, merge, map.

    Any exception raised by a pipeline stage is caught here and reported with
    a single printed message; nothing is re-raised to the caller.
    """
    print("Starting walkability graph network analysis for the entire city...")
    try:
        (neighborhoods, buildings, roads, road_points,
         trees, transit, urban_masterplan) = load_data()

        # No neighbourhood is flagged as changed by hand for this run.
        changed_neighborhoods = []
        subgraphs, road_network = update_subgraphs(
            changed_neighborhoods,
            neighborhoods,
            buildings,
            roads,
            road_points,
            trees,
            transit,
            urban_masterplan,
        )

        scored_subgraphs = calculate_walkability(subgraphs, neighborhoods, road_network)
        full_graph = reconstruct_full_graph(scored_subgraphs, road_network)
        create_interactive_map(
            full_graph, neighborhoods, buildings, roads, trees, transit, urban_masterplan
        )
        print("Analysis completed successfully for the entire city.")
    except Exception as err:
        print(f"Error during analysis: {err}")

# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting walkability graph network analysis for the entire city...
Checkpoint 1: Loading data for the entire city...


Loading files:  17%|█▋        | 1/6 [00:00<00:01,  3.42it/s]

Columns in neighborhoods after loading: ['LIE_NAME', 'SECT_NAME', '2024population', 'land_use_city_open_area_count', 'land_use_city_open_area_area_m2', 'land_use_city_open_area_percent', 'land_use_commercial_count', 'land_use_commercial_area_m2', 'land_use_commercial_percent', 'land_use_infrastructure_count', 'land_use_infrastructure_area_m2', 'land_use_infrastructure_percent', 'land_use_government_count', 'land_use_government_area_m2', 'land_use_government_percent', 'land_use_public_transportation_count', 'land_use_public_transportation_area_m2', 'land_use_public_transportation_percent', 'land_use_education_count', 'land_use_education_area_m2', 'land_use_education_percent', 'land_use_medical_count', 'land_use_medical_area_m2', 'land_use_medical_percent', 'land_use_amenity_count', 'land_use_amenity_area_m2', 'land_use_amenity_percent', 'land_use_road_count', 'land_use_road_area_m2', 'land_use_road_percent', 'land_use_pedestrian_count', 'land_use_pedestrian_area_m2', 'land_use_pedestria

Loading files: 100%|██████████| 6/6 [00:07<00:00,  1.17s/it]


Building type distribution after cleaning:
building
apartments     23519
residential    18958
unknown        17957
house           4305
school          1569
               ...  
social             1
civil              1
entrance           1
supermarket        1
r                  1
Name: count, Length: 80, dtype: int64
Warren: Total number of neighborhoods in dataset: 456
Total number of tree points loaded: 2289
Total number of tree polylines loaded: 1106
Total number of trees after converting polylines: 17786
Total number of transit points loaded: 6845
Total number of urban masterplan polygons loaded: 15521
Analyzing all 456 neighborhoods in the dataset.


Segregating urban masterplan polygons: 15521it [05:29, 47.12it/s]


Tree, transit, and urban masterplan counts per neighborhood:
- 湖田里: 9 trees, 47 transit points
  - City_Open_Area Area: 0.00m² (0.00%)
  - Commercial Area: 0.00m² (0.00%)
  - Infrastructure Area: 0.00m² (0.00%)
  - Government Area: 0.00m² (0.00%)
  - Public_Transportation Area: 0.00m² (0.00%)
  - Education Area: 0.00m² (0.00%)
  - Medical Area: 0.00m² (0.00%)
  - Amenity Area: 0.00m² (0.00%)
  - Road Area: 0.00m² (0.00%)
  - Pedestrian Area: 0.00m² (0.00%)
  - Natural Area: 16242197.97m² (99.50%)
  - Special_Zone Area: 0.00m² (0.00%)
  - River Area: 0.00m² (0.00%)
  - Military Area: 0.00m² (0.00%)
  - Residential Area: 0.00m² (0.00%)
  - Industrial Area: 0.00m² (0.00%)
  - Agriculture Area: 0.00m² (0.00%)
- 菁山里: 0 trees, 59 transit points
  - City_Open_Area Area: 0.00m² (0.00%)
  - Commercial Area: 0.00m² (0.00%)
  - Infrastructure Area: 6758.17m² (0.06%)
  - Government Area: 72517.11m² (0.62%)
  - Public_Transportation Area: 0.00m² (0.00%)
  - Education Area: 11065.27m² (0.09%)
  - Me

Extracting road endpoints: 100%|██████████| 56170/56170 [00:03<00:00, 15873.56it/s]


Data loaded and filtered for the entire city. Neighborhoods: 456, Buildings: 74305, Roads: 56170, Trees: 14896, Transit Points: 4328, Urban Masterplan Polygons: 15521
Updating subgraphs for the entire city...
Checkpoint 3: Calculating walkability scores for the entire city...


Calculating walkability: 100%|██████████| 454/454 [00:07<00:00, 58.35it/s]


Walkability scores calculated for the entire city.
Reconstructing full graph from subgraphs for the entire city...
Full graph reconstructed: 221461 nodes, 2451587 edges
Generating interactive Kepler.gl map for the entire city...
Neighborhoods columns: ['LIE_NAME', 'SECT_NAME', '2024population', 'land_use_city_open_area_count', 'land_use_city_open_area_area_m2', 'land_use_city_open_area_percent', 'land_use_commercial_count', 'land_use_commercial_area_m2', 'land_use_commercial_percent', 'land_use_infrastructure_count', 'land_use_infrastructure_area_m2', 'land_use_infrastructure_percent', 'land_use_government_count', 'land_use_government_area_m2', 'land_use_government_percent', 'land_use_public_transportation_count', 'land_use_public_transportation_area_m2', 'land_use_public_transportation_percent', 'land_use_education_count', 'land_use_education_area_m2', 'land_use_education_percent', 'land_use_medical_count', 'land_use_medical_area_m2', 'land_use_medical_percent', 'land_use_amenity_coun