In [None]:
# Sanity check: report whether each dataset's overlap column against itself
# is entirely NaN. These checks used to be bare expressions whose results were
# discarded, so nothing was shown when the cell ran as a whole.
print("overlap_nc all NaN in NCEM dataset:", nc_gdf_updated["overlap_nc"].isna().all())
print("overlap_meck all NaN in Mecklenburg dataset:", meck_gdf_updated["overlap_meck"].isna().all())
print("overlap_osm all NaN in OSM dataset:", osm_gdf_updated["overlap_osm"].isna().all())

# Summary statistics of each dataset's overlap ratios with the other two.
print("=== NCEM dataset overlaps ===")
print(nc_gdf_updated[["overlap_meck", "overlap_osm"]].describe())

print("\n=== Mecklenburg dataset overlaps ===")
print(meck_gdf_updated[["overlap_nc", "overlap_osm"]].describe())

print("\n=== OSM dataset overlaps ===")
print(osm_gdf_updated[["overlap_nc", "overlap_meck"]].describe())
# Prepare data with clear, descriptive labels (one series per directed
# dataset pair; insertion order defines the left-to-right box order).
data = {
    "NCEM overlap with Meck": nc_gdf_updated["overlap_meck"],
    "NCEM overlap with OSM": nc_gdf_updated["overlap_osm"],
    "Meck overlap with NCEM": meck_gdf_updated["overlap_nc"],
    "Meck overlap with OSM": meck_gdf_updated["overlap_osm"],
    "OSM overlap with NCEM": osm_gdf_updated["overlap_nc"],
    "OSM overlap with Meck": osm_gdf_updated["overlap_meck"],
}

# Figure setup
plt.figure(figsize=(16, 9))  # larger and presentation-friendly
# Flier markers take their visible colour from ``markeredgecolor``; a plain
# ``color`` entry only affects the (invisible) connecting line, so the
# outliers would stay black instead of gray.
flierprops = dict(marker='o', markersize=2, alpha=0.3, markeredgecolor='gray')

# Boxplot with enhanced styling.
# NaN entries are dropped at plot time only (``data`` itself is untouched):
# Matplotlib's boxplot does not skip NaN, so a single missing value would
# turn that series' quartiles into NaN and leave its box empty.
plt.boxplot(
    [series.dropna() for series in data.values()],
    labels=list(data),
    patch_artist=True,
    flierprops=flierprops,
    boxprops=dict(facecolor="#AEDFF7", color="black"),
    medianprops=dict(color="#D22B2B", linewidth=1.8),
    whiskerprops=dict(color="gray"),
    capprops=dict(color="gray")
)

# Titles and labels
plt.title("Building Footprint Overlap Ratios Between NCEM, Mecklenburg, and OSM Datasets",
          fontsize=17, weight="bold", pad=15)
plt.ylabel("Overlap Ratio (0–1)", fontsize=13)
plt.xticks(rotation=25, ha='right', fontsize=11)
plt.grid(axis="y", linestyle="--", alpha=0.6)

plt.tight_layout()
plt.show()
# --- Filter last four comparisons: 0 < overlap ≤ 0.2 ---
def _low_overlap_values(frame, column, lower=0, upper=0.2):
    """Return ``frame[column]`` restricted to ``lower < value <= upper``.

    NaN entries never satisfy either comparison, so they are excluded too.
    """
    values = frame[column]
    return values[(values > lower) & (values <= upper)]


filtered_data = {
    "Meck overlap with NCEM": _low_overlap_values(meck_gdf_updated, "overlap_nc"),
    "Meck overlap with OSM": _low_overlap_values(meck_gdf_updated, "overlap_osm"),
    "OSM overlap with NCEM": _low_overlap_values(osm_gdf_updated, "overlap_nc"),
    "OSM overlap with Meck": _low_overlap_values(osm_gdf_updated, "overlap_meck"),
}

# --- Count how many buildings per comparison ---
counts = {label: len(values) for label, values in filtered_data.items()}

# --- Plot setup ---
plt.figure(figsize=(14, 8))
# ``markeredgecolor`` (not ``color``) is what actually colours flier markers.
flierprops = dict(marker='o', markersize=2, alpha=0.3, markeredgecolor='gray')

# --- Boxplot ---
box = plt.boxplot(
    list(filtered_data.values()),
    labels=list(filtered_data),
    patch_artist=True,
    flierprops=flierprops,
    boxprops=dict(facecolor="#AEDFF7", color="black"),
    medianprops=dict(color="#D22B2B", linewidth=1.8),
    whiskerprops=dict(color="gray"),
    capprops=dict(color="gray")
)

# --- Annotate frequency above each box ---
# Boxes sit at x = 1, 2, ...; the text is placed just above the 0.2 cut-off
# and stays visible because the y-axis is extended to 0.22 below.
for position, n in enumerate(counts.values(), start=1):
    plt.text(position, 0.205, f"n = {n:,}", ha='center', va='bottom',
             fontsize=11, fontweight='bold', color='dimgray')

# --- Titles and formatting ---
plt.title("Building Footprint Overlaps (0% < Overlap ≤ 20%) — Mecklenburg & OSM Sources",
          fontsize=17, weight="bold", pad=15)
plt.ylabel("Overlap Ratio (0–0.2)", fontsize=13)
plt.xticks(rotation=20, ha='right', fontsize=11)
plt.ylim(0, 0.22)
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

# --- Filter each comparison (0 < overlap ≤ 0.2) ---
def _rows_in_low_overlap_band(gdf, column):
    """Return a copy of the rows of ``gdf`` where 0 < ``gdf[column]`` <= 0.2.

    Rows whose value is NaN fail both comparisons and are left out.
    """
    in_band = (gdf[column] > 0) & (gdf[column] <= 0.2)
    return gdf.loc[in_band].copy()


meck_on_nc_gdf = _rows_in_low_overlap_band(meck_gdf_updated, "overlap_nc")
meck_on_osm_gdf = _rows_in_low_overlap_band(meck_gdf_updated, "overlap_osm")
osm_on_nc_gdf = _rows_in_low_overlap_band(osm_gdf_updated, "overlap_nc")
osm_on_meck_gdf = _rows_in_low_overlap_band(osm_gdf_updated, "overlap_meck")

# --- Optional: save each to file ---
output_dir = "../../../Data/Final_dataset/ABT/outputs_building_overlap/output_less20"
os.makedirs(output_dir, exist_ok=True)

# One GeoPackage per directed comparison; file names mirror the plot labels.
low_overlap_exports = (
    ("Meck_overlap_with_NCEM_0to20.gpkg", meck_on_nc_gdf),
    ("Meck_overlap_with_OSM_0to20.gpkg", meck_on_osm_gdf),
    ("OSM_overlap_with_NCEM_0to20.gpkg", osm_on_nc_gdf),
    ("OSM_overlap_with_Meck_0to20.gpkg", osm_on_meck_gdf),
)
for export_name, export_gdf in low_overlap_exports:
    export_gdf.to_file(os.path.join(output_dir, export_name), driver="GPKG")

print("✅ Extracted and saved low-overlap GeoPackages successfully.")
print("Counts:")
print(f"Meck→NCEM: {len(meck_on_nc_gdf):,}")
print(f"Meck→OSM: {len(meck_on_osm_gdf):,}")
print(f"OSM→NCEM: {len(osm_on_nc_gdf):,}")
print(f"OSM→Meck: {len(osm_on_meck_gdf):,}")

# Drop all completely empty (all-NaN) columns from every dataset, then rebind
# the cleaned frames to their original names.
gdfs = [nc_gdf, meck_gdf, osm_gdf, meck_unqiue, osm_unqiue]
gdfs = [gdf.dropna(axis=1, how='all') for gdf in gdfs]
nc_gdf, meck_gdf, osm_gdf, meck_unqiue, osm_unqiue = gdfs

# Choose the dataset to profile only AFTER the cleanup: binding it earlier
# would keep a reference to the old frame that still has the empty columns.
gdf_selected = osm_gdf

non_geom_cols = [c for c in gdf_selected.columns if c != "geometry"]

# --- Loop through all attributes ---
for col in non_geom_cols:
    series = gdf_selected[col].dropna()

    # Skip completely empty columns
    if series.empty:
        continue

    # Detect type. Boolean columns count as "numeric" for pandas, but they are
    # flags rather than continuous measurements, so treat them as categorical.
    is_continuous = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )
    if is_continuous:
        col_type = "Continuous"
        print(f"\n🧮 {col}  →  {col_type}")
        print(series.describe().to_string())
    else:
        col_type = "Categorical"
        unique_vals = series.unique()
        nunique = len(unique_vals)
        print(f"\n🔠 {col}  →  {col_type} ({nunique} unique values)")
        # Only show up to first 20 unique values
        sample_vals = unique_vals[:20]
        print(f"Sample values: {sample_vals}")