In [None]:
import os, geopandas as gpd, folium, pandas as pd, matplotlib.pyplot as plt, osmnx as ox
# from rpy2.robjects.lib.ggplot2 import layer
from shapely.validation import make_valid
from tqdm import tqdm
# Render floats with two decimals in every DataFrame display below.
pd.options.display.float_format = '{:.2f}'.format
# Echo the working directory; all data paths in this notebook are relative to it.
os.getcwd()

__original dataset__

In [None]:
# nc_gdf_older = gpd.read_file("../../../Data/Original_dataset/Archive/Mecklenburg_2023_Buildings_sddncgov/Mecklenburg_2023_Buildings.gdb",layer="S_BUILDING_FP")
# nc_gdf = gpd.read_file("../../../Data/Original_dataset/Archive/NC_Buildings_sddncgov_1125/00_AllCounties_Building_Footprints/meck_only_buildings.gpkg").to_crs(meck.crs)
# meck_gdf = gpd.read_file("../../../Data/Original_dataset/Archive/Buildings_footprint_meck_01_24/Buildings.shp").to_crs(nc_gdf.crs)
# osm_gdf = gpd.read_file("../../../Data/Original_dataset/Archive/OSM_Buildings/OSM_Buildings_Meck.gpkg").to_crs(nc_gdf.crs)
# meck_unqiue = gpd.read_file("../../../Data/Final_dataset/ABT/outputs_unique_buildings/unique_meck_buildings.gpkg", layer = "unique_meck_buildings").to_crs(nc_gdf.crs)
# osm_unqiue = gpd.read_file("../../../Data/Final_dataset/ABT/outputs_unique_buildings/unique_osm_buildings.gpkg", layer = "unique_osm_buildings").to_crs(nc_gdf.crs)

__Finding nearby OSM building footprints in the 2000 m buffer area around the county__


In [None]:
# NOTE(review): `meck` is only loaded in the "importing main datasets" cell further down,
# so on a fresh kernel that cell has to run before this one.

# Collapse the county boundary to a single geometry in EPSG:3857 and buffer it outward by 2000 CRS units.
meck_geom = meck.to_crs(3857).dissolve().geometry.iloc[0]
outer_buffer_3857 = meck_geom.buffer(2000)

# OSMnx is queried with a WGS84 (EPSG:4326) shapely polygon, so reproject the buffer before use.
outer_buffer_poly = (
    gpd.GeoDataFrame(geometry=[outer_buffer_3857], crs=3857)
    .to_crs(4326)
    .geometry.iloc[0]
)

# Cache responses, log to the console, and raise the Overpass timeout/size limits for this large area.
ox.settings.use_cache = True
ox.settings.log_console = True
ox.settings.overpass_settings = "[out:json][timeout:1800][maxsize:2147483648]"

# Every OSM feature carrying a `building` tag inside the buffered polygon.
bldg_osm = ox.features_from_polygon(outer_buffer_poly, tags={"building": True})

print("Total features from OSM:", len(bldg_osm))
print(bldg_osm.geom_type.value_counts())

# Keep only polygonal buildings and move them to EPSG:3857 to match the county/buffer geometries.
bldg_osm = bldg_osm.loc[
    bldg_osm.geometry.geom_type.isin(["Polygon", "MultiPolygon"])
].copy().to_crs(3857)

# Classify each building by its centroid: inside the buffer but NOT inside Mecklenburg itself.
bldg_osm["centroid"] = bldg_osm.geometry.centroid
mask_inside_meck = bldg_osm["centroid"].within(meck_geom)
mask_inside_outer = bldg_osm["centroid"].within(outer_buffer_3857)
bldg_neighbors = bldg_osm[mask_inside_outer & ~mask_inside_meck].drop(columns=["centroid"])

print("Border-neighbor buildings:", len(bldg_neighbors))

# Save to GPKG
# NOTE(review): the file name says "0p5mi" but the buffer above is 2000 units — confirm which is intended.
bldg_neighbors.to_file("osm_buildings_meck_border_0p5mi.gpkg",layer="bldg_neighbors")

In [None]:
# Append the border-neighbour buildings to the in-county OSM layer and write the combined layer out.
# NOTE(review): `osm_gdf_updated` is loaded in a later cell from the very file written below, so this
# cell needs that cell to have run first, and re-running it after a reload would concatenate the
# border buildings onto a layer that already contains them.
bldg_neighbors = bldg_neighbors.to_crs(meck.crs)
osm_plus_neighbors = pd.concat([osm_gdf_updated, bldg_neighbors], ignore_index=True)
merged_osm = gpd.GeoDataFrame(osm_plus_neighbors, crs=meck.crs)
merged_osm.to_file("../../../Data/Final_dataset/ABT/outputs_building_overlap/split_by_source/Buildings_osm.gpkg")

__importing main datasets__

In [None]:
# County boundary; its CRS is the common CRS for every building layer loaded below.
meck = gpd.read_file("../../../Data/Original_dataset/Archive/mecklenburgcounty_boundary/MecklenburgCounty_Boundary.shp")
# nc_gdf_updated_oldver = gpd.read_file("../../../Data/Final_dataset/ABT/outputs_building_overlap/split_by_source/Buildings_nc.gpkg").to_crs(meck.crs)

# Read the three building layers (NC, Mecklenburg, OSM — in that order) and reproject each to the county CRS.
nc_gdf_updated, meck_gdf_updated, osm_gdf_updated = (
    gpd.read_file(source_path).to_crs(meck.crs)
    for source_path in (
        "../../../Data/Original_dataset/Archive/NC_Buildings_sddncgov_1125/00_AllCounties_Building_Footprints/meck_only_buildings.gpkg",
        "../../../Data/Final_dataset/ABT/outputs_building_overlap/split_by_source/Buildings_meck.gpkg",
        "../../../Data/Final_dataset/ABT/outputs_building_overlap/split_by_source/Buildings_osm.gpkg",
    )
)

__Algorithm__

In [None]:
# Keep only footprints whose area exceeds 100 in every layer.
# `.area` is expressed in squared CRS units; the threshold is 100 sq ft only if meck.crs is in feet — TODO confirm.
nc_gdf_updated, meck_gdf_updated, osm_gdf_updated = (
    footprints[footprints.geometry.area > 100]
    for footprints in (nc_gdf_updated, meck_gdf_updated, osm_gdf_updated)
)

In [None]:
# Drop attribute columns that hold no value at all (every entry missing) in each layer.
nc_gdf_updated, meck_gdf_updated, osm_gdf_updated = (
    footprints.dropna(axis=1, how='all')
    for footprints in (nc_gdf_updated, meck_gdf_updated, osm_gdf_updated)
)

In [None]:
# Base layer (NCEM): a copy of the NC footprints, tagged with their source.
final_gdf = nc_gdf_updated.assign(source_flag="NCEM_BASE")

In [None]:
# Work on a copy of the OSM layer so the loaded frame stays untouched.
osm = osm_gdf_updated.copy()

# `building` tag values that are excluded from the OSM layer.
invalid_building_types = [
    "roof", "retail;roof", "construction", "ruins", "abandoned", "silo",
    "storage_tank", "static_caravan", "container", "guardhouse", "bell_tower",
    "carport", "shed", "garage", "garages", "boathouse", "cabin", "barn",
    "stable", "pavilion", "hangar", "riding_hall", "transportation",
    "sports_locker_room", "locker_room", "allotment_house", "hut", "service",
    "public", "shelter", "greenhouse",
]

# Drop the excluded building types, then anything whose geometry is invalid or empty.
osm = osm.loc[~osm['building'].isin(invalid_building_types)]
osm = osm.loc[osm.is_valid & ~osm.geometry.is_empty]

# Derived attributes:
#   building_use    - `building:use`, falling back to the plain `building` tag
#   building_levels - `building:levels` as a number (non-numeric text becomes NaN)
#   height_ft       - `height` as a number times 3.28084 (meters -> ft; non-numeric text becomes NaN)
osm = osm.assign(
    building_use=osm['building:use'].fillna(osm['building']),
    building_levels=pd.to_numeric(osm['building:levels'], errors='coerce'),
    height_ft=pd.to_numeric(osm['height'], errors='coerce') * 3.28084,
)

def semantic_score(row):
    """Count how many descriptive attributes a footprint row carries (0-5).

    One point each for a non-null ``building_use``, ``name``, ``height_ft`` and
    ``building_levels``, plus one point when ``source`` mentions "microsoft"
    (case-insensitive).

    Parameters
    ----------
    row : pandas.Series or dict-like
        A single record; any object offering ``.get(key)`` works.

    Returns
    -------
    int
        The attribute score. A field that is absent from ``row`` counts the same
        as a null value instead of raising ``KeyError`` (attribute columns can be
        missing when an all-null column was dropped from the layer).
    """
    descriptive_fields = ('building_use', 'name', 'height_ft', 'building_levels')
    score = sum(1 for field in descriptive_fields if pd.notna(row.get(field)))

    # `source` is optional; NaN/None never match and are skipped explicitly.
    source = row.get('source')
    if pd.notna(source) and 'microsoft' in str(source).lower():
        score += 1
    return score

# Score every remaining OSM footprint; axis=1 hands each row to semantic_score.
osm['semantic_score'] = osm.apply(semantic_score, axis=1)

In [None]:
# Mecklenburg County footprints restricted to rows with a valid, non-empty geometry.
meck_bf = meck_gdf_updated.loc[
    meck_gdf_updated.is_valid & ~meck_gdf_updated.geometry.is_empty
].copy()

In [None]:
# NCEM base layer, tagged with its source (same construction as the earlier base-layer cell).
final_gdf = nc_gdf_updated.assign(source_flag='NCEM_BASE')

# County footprints whose recorded overlap with both other sources is exactly zero.
meck_unique = meck_bf.loc[
    meck_bf['overlap_nc'].eq(0) & meck_bf['overlap_osm'].eq(0)
].assign(source_flag='MECK_UNIQUE')

# OSM footprints whose recorded overlap with both other sources is exactly zero.
osm_unique = osm.loc[
    osm['overlap_nc'].eq(0) & osm['overlap_meck'].eq(0)
].assign(source_flag='OSM_UNIQUE')

# --- PRINT COUNTS ---
print("NCEM_BASE:", len(final_gdf))
print("MECK_UNIQUE:", len(meck_unique))
print("OSM_UNIQUE:", len(osm_unique))

In [None]:
# OSM footprints with at most 0.2 overlap against both other sources and a semantic score of 3 or more.
# NOTE(review): the `<= 0.2` bounds also admit zero-overlap rows, which are selected as OSM_UNIQUE too.
osm_low = osm.loc[
    osm['overlap_nc'].le(0.2)
    & osm['overlap_meck'].le(0.2)
    & osm['semantic_score'].ge(3)
].assign(source_flag='OSM_LOW')

print("OSM_LOW:", len(osm_low))

In [None]:
# Add OSM footprints that are not entirely within the county boundary
# (a footprint straddling the boundary is not "within" it, so it is selected too).
# `.copy()` gives an independent frame so the flag assignment does not act on a slice of `osm`.
osm_outside = osm[~osm.geometry.within(meck.unary_union)].copy()
# Tag the rows like every other layer; without a flag they would carry a missing
# source_flag through the merge and into the exported dataset.
osm_outside['source_flag'] = 'OSM_OUTSIDE'

In [None]:
# Stack all candidate layers into one frame with a fresh 0..n-1 index.
merged = pd.concat([final_gdf, meck_unique, osm_unique, osm_low,osm_outside], ignore_index=True)
# Keep valid, non-empty geometries; `.copy()` so the column assignments below act on an independent frame.
# NOTE(review): invalid geometries are dropped *before* buffer(0) runs, so buffer(0) never gets to
# repair them — confirm this order is intended.
merged = merged[merged.is_valid & (~merged.geometry.is_empty)].copy()
merged["geometry"] = merged.buffer(0)
merged["area_sqft"] = merged.geometry.area  # squared CRS units; sq ft presumes a foot-based CRS — TODO confirm

# Higher number wins when near-duplicates are resolved. Rows from the outside-county OSM layer
# (flagged "OSM_OUTSIDE" or carrying no flag at all) get the lowest priority explicitly instead of
# NaN, which keeps them last in the priority sort and leaves no missing values in the column.
priority = {"NCEM_BASE": 3, "MECK_UNIQUE": 2, "OSM_LOW": 1, "OSM_UNIQUE": 1, "OSM_OUTSIDE": 0}
merged["priority"] = merged["source_flag"].map(priority).fillna(0)

In [None]:
def remove_significant_overlaps(gdf, overlap_threshold=0.8):
    """Drop lower-ranked footprints that largely coincide with a higher-ranked one.

    Rows are ordered by ``priority`` and then ``semantic_score`` (both descending)
    and visited in that order. For each surviving row, every later row whose
    bounding box intersects it is compared; the later row is removed when the
    intersection area divided by the smaller of the two areas reaches
    ``overlap_threshold``.

    Parameters
    ----------
    gdf : GeoDataFrame
        Must provide ``geometry``, ``priority`` and ``semantic_score`` columns
        and a spatial index (``sindex``).
    overlap_threshold : float, default 0.8
        Minimum intersection / smaller-area ratio that counts as a duplicate.

    Returns
    -------
    GeoDataFrame
        The surviving rows in sorted order with a fresh 0..n-1 index.

    Notes
    -----
    Missing, empty or zero-area geometries are never compared (the ratio is
    undefined for them, and dividing by a zero area would raise
    ``ZeroDivisionError``); such rows are kept as they are.
    """
    gdf = gdf.sort_values(["priority", "semantic_score"], ascending=[False, False]).reset_index(drop=True)
    geometries = gdf.geometry
    sindex = gdf.sindex
    kept, removed = [], set()

    # After reset_index the labels from .items() equal row positions, which is what sindex returns.
    for i, geom_i in geometries.items():
        if i in removed:
            continue
        kept.append(i)

        # Nothing to compare against for a degenerate footprint.
        if geom_i is None or geom_i.is_empty:
            continue
        area_i = geom_i.area
        if area_i <= 0:
            continue

        for j in sindex.intersection(geom_i.bounds):
            # Only look "down" the ranking, and never at rows already discarded.
            if j <= i or j in removed:
                continue
            geom_j = geometries.iloc[j]
            if geom_j is None or geom_j.is_empty:
                continue
            smaller_area = min(area_i, geom_j.area)
            if smaller_area <= 0:
                continue
            if geom_i.intersection(geom_j).area / smaller_area >= overlap_threshold:
                removed.add(j)

    print(f"Removed {len(removed):,} near-duplicate polygons")
    return gdf.loc[kept].reset_index(drop=True)

# Deduplicate the combined layer with the default 0.8 overlap threshold; `merged` is rebound to the result.
merged = remove_significant_overlaps(merged)

__Removing footprints of 200 sq ft or less that do not share a boundary with any other footprint__

In [None]:
# Ensure clean index: row labels must equal row positions, because the spatial index answers with positions.
merged = merged.reset_index(drop=True)

# Create spatial index *without overwriting the GeoDataFrame*
sindex = merged.sindex
footprints = merged.geometry

shared_boundary = []

for i, geom in footprints.items():
    # Candidate neighbours come from a bounding-box lookup. The row itself is skipped by comparing
    # positions (no list.remove, which raises when the row's own box is not among the candidates),
    # and any() stops at the first neighbour that touches instead of testing them all.
    shared_boundary.append(
        any(
            j != i and geom.touches(footprints.iloc[j])
            for j in sindex.intersection(geom.bounds)
        )
    )

# Explicit bool dtype keeps the negation below well-defined even for an empty frame.
merged["shares_boundary"] = pd.Series(shared_boundary, index=merged.index, dtype=bool)

# Footprints of area 200 or less (squared CRS units) that touch no other footprint are treated as errors.
merged_error = merged[
    (merged.geometry.area <= 200) &
    (~merged["shares_boundary"])
]

In [None]:
# Write the flagged small, isolated footprints to their own GeoPackage for inspection.
out_path_error = "../../../Data/Final_dataset/ABT/outputs_building_overlap/errors.gpkg"
merged_error.to_file(out_path_error, driver="GPKG")

In [None]:
# Final validity / non-empty check on the deduplicated layer.
merged = merged[merged.is_valid & (~merged.geometry.is_empty)]
# Remove the rows flagged as errors. `merged_error` was selected before the filter above, so some of
# its labels may no longer exist in `merged`; errors="ignore" skips those instead of raising KeyError.
final = merged.drop(merged_error.index, errors="ignore")
out_path = "../../../Data/Final_dataset/ABT/outputs_building_overlap/Final_Buildings_SemanticIntegrated.gpkg"
final.to_file(out_path)
print(f"✅ Final dataset saved with {len(final):,} polygons")