In [1]:
import sys
import os

project_root = os.path.abspath(os.path.join(os.getcwd(), '../../..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from syngrid.data_processor.processing.building_processor import BuildingHeuristicsProcessor
import geopandas as gpd

In [2]:
# Analyze building results.
# The path is built from `project_root` (resolved in the first cell) instead of
# a user-specific absolute path, so the notebook also runs on other machines.
# NOTE(review): assumes `project_root` is the repository root that contains the
# `syngrid` package, as the import in the first cell implies — confirm.
buildings_filepath = os.path.join(
    project_root, 'syngrid', 'data_processor', 'output',
    'MA', 'Middlesex_County', 'Cambridge_city', 'OSM', 'buildings.geojson'
)
gdf = gpd.read_file(buildings_filepath)

In [None]:
def count_geometry_types(gdf):
    """
    Tally the features of a GeoDataFrame by geometry type.

    One line per geometry type is printed, and the tallies are returned as a
    dict mapping the type name to its count. A missing (None) or zero-length
    input prints nothing and yields the sentinel {"Empty GeoDataFrame": 0}.
    """
    has_rows = gdf is not None and len(gdf) != 0
    if not has_rows:
        return {"Empty GeoDataFrame": 0}

    # value_counts() orders the types from most to least frequent
    counts_by_type = gdf.geometry.geom_type.value_counts().to_dict()

    # Report each tally
    for type_name in counts_by_type:
        print(f"Found {counts_by_type[type_name]} features of type {type_name}")

    return counts_by_type

import matplotlib.pyplot as plt
import contextily as cx
import geopandas as gpd
from shapely.geometry import Point, Polygon
import numpy as np

def plot_mixed_geometries(gdf, title="Points and Polygons", figsize=(12, 10), 
                         point_color='red', polygon_color='blue', alpha=0.6, 
                         basemap=True, point_size=50):
    """
    Plot points and polygons from a GeoDataFrame in different colors on a map.

    When both points and polygons are present, a caption below the plot
    reports how many points lie inside at least one polygon.

    Parameters:
    -----------
    gdf : GeoDataFrame
        Input GeoDataFrame with mixed geometry types
    title : str
        Plot title
    figsize : tuple
        Figure size as (width, height)
    point_color : str
        Color for point geometries
    polygon_color : str
        Color for polygon geometries
    alpha : float
        Transparency level (0-1)
    basemap : bool
        Whether to add a contextily basemap. Ignored (with a printed notice)
        when gdf has no CRS, because the data cannot be reprojected then.
    point_size : int
        Size of point markers

    Returns:
    --------
    fig, ax : matplotlib figure and axis objects, or (None, None) when gdf is
        None or has no rows
    """
    # Ensure we have a valid GeoDataFrame
    if gdf is None or len(gdf) == 0:
        print("Empty GeoDataFrame provided")
        return None, None

    # Only needed for the legend proxy handles built below
    from matplotlib.lines import Line2D
    from matplotlib.patches import Patch

    # Without a CRS the data cannot be reprojected for the basemap; fall back
    # to a plain plot instead of failing inside to_crs().
    if basemap and gdf.crs is None:
        print("GeoDataFrame has no CRS defined. Plotting without a basemap.")
        basemap = False

    # Ensure the GeoDataFrame is in a Web Mercator projection for contextily basemap
    if basemap and gdf.crs != "EPSG:3857":
        gdf_web_mercator = gdf.to_crs("EPSG:3857")
    else:
        gdf_web_mercator = gdf.copy()

    # Create figure and axis
    fig, ax = plt.subplots(figsize=figsize)

    # Separate points and polygons
    geom_types = gdf_web_mercator.geometry.geom_type
    points = gdf_web_mercator[geom_types == 'Point']
    polygons = gdf_web_mercator[geom_types.isin(['Polygon', 'MultiPolygon'])]

    # Get count of each type for the legend
    points_count = len(points)
    polygon_count = len(polygons)

    # Plot the data. Polygon layers drawn by GeoDataFrame.plot are not picked
    # up by ax.legend(), so explicit proxy handles are collected for both layers.
    legend_handles = []
    if not polygons.empty:
        polygons.plot(ax=ax, color=polygon_color, alpha=alpha)
        legend_handles.append(
            Patch(facecolor=polygon_color, alpha=alpha, label=f'Polygons ({polygon_count})')
        )

    if not points.empty:
        points.plot(ax=ax, color=point_color, markersize=point_size, alpha=alpha)
        legend_handles.append(
            Line2D([], [], linestyle='none', marker='o', color=point_color,
                   alpha=alpha, label=f'Points ({points_count})')
        )

    # Add a basemap if requested
    if basemap:
        cx.add_basemap(ax, source=cx.providers.OpenStreetMap.Mapnik)

    # Set plot title and add legend (skipped when neither layer was drawn)
    ax.set_title(title, fontsize=15)
    if legend_handles:
        ax.legend(handles=legend_handles, fontsize=12)

    # Remove axis labels for map
    ax.set_axis_off()

    plt.tight_layout()

    # Add text showing how many points are inside polygons
    if not points.empty and not polygons.empty:
        # Materialise the polygons once rather than re-iterating the
        # GeoSeries for every point.
        polygon_geoms = list(polygons.geometry)
        points_inside = sum(
            1 for point in points.geometry
            if any(polygon.contains(point) for polygon in polygon_geoms)
        )

        # points is non-empty here, so the division is safe
        percent_inside = (points_inside / points_count) * 100
        plt.figtext(0.5, 0.01, 
                   f"{points_inside} of {points_count} points ({percent_inside:.1f}%) are inside polygons",
                   ha="center", fontsize=12, bbox={"facecolor":"white", "alpha":0.7, "pad":5})

    return fig, ax

# Example usage: draw the point and polygon features of the loaded gdf.
# plot_mixed_geometries returns (None, None) when gdf is empty.
fig, ax = plot_mixed_geometries(gdf, title="Buildings and POIs")
plt.show()

# Print the per-geometry-type tallies; the returned dict is the cell's output
count_geometry_types(gdf)


In [None]:
gdf.head(20)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from ydata_profiling import ProfileReport


def explore_geodataframe(gdf, exclude_cols=None, max_categories=10, figsize=(18, 12)):
    """
    Comprehensive exploration of a GeoDataFrame, analyzing categorical distributions and null values.

    Prints and displays, in order: null-value counts, per-column categorical
    distributions, numeric summary statistics, presence of OSM-style tags
    (columns whose name contains ':'), geometry types and, when a 'building'
    column exists, building types.

    Parameters:
    -----------
    gdf : GeoDataFrame
        The GeoDataFrame to analyze
    exclude_cols : list
        List of columns to exclude from analysis (e.g., ['geometry', 'id', 'element']).
        Defaults to ['geometry', 'id', 'element', 'index']; names that are not
        columns of gdf are ignored.
    max_categories : int
        Maximum number of categories to display in distribution plots
    figsize : tuple
        Figure size for plots

    Returns:
    --------
    dict : One entry per categorical column with the keys 'values' (value-count
        table, or None when the column is entirely null), 'non_null_percent'
        and 'unique_values'. Entirely-null columns also keep the legacy
        'distribution': None key. An empty dict is returned when gdf is None
        or has no rows.
    """
    # Nothing to analyse; this also keeps the percentage maths below from
    # dividing by zero.
    if gdf is None or len(gdf) == 0:
        print("Empty GeoDataFrame provided")
        return {}

    if exclude_cols is None:
        exclude_cols = ['geometry', 'id', 'element', 'index']

    # Create a copy without excluded columns (only those actually present)
    dropped_cols = [col for col in exclude_cols if col in gdf.columns]
    df_analysis = gdf.drop(columns=dropped_cols)
    n_rows = len(df_analysis)

    # Basic information: report the number of columns really dropped
    print(f"Dataset has {len(gdf)} rows and {len(df_analysis.columns)} columns (excluding {len(dropped_cols)} specified columns)")

    # 1. Null values analysis
    print("\n--- NULL VALUES ANALYSIS ---")
    null_counts = df_analysis.isnull().sum().sort_values(ascending=False)
    null_percent = (null_counts / n_rows * 100).round(2)
    null_df = pd.DataFrame({'Count': null_counts, 'Percent': null_percent})

    # Only show columns with nulls
    null_df = null_df[null_df['Count'] > 0]
    if len(null_df) > 0:
        display(null_df)

        # Calculate plot height based on number of columns (at least 8 inches tall)
        plot_height = max(8, len(null_df) * 0.4)
        plt.figure(figsize=(figsize[0], plot_height))

        # Create the horizontal bar plot
        bars = plt.barh(null_df.index, null_df['Percent'], color='skyblue')

        # Add percentage labels to the right of each bar
        for bar in bars:
            width = bar.get_width()
            plt.text(width + 1, bar.get_y() + bar.get_height()/2, 
                    f'{width:.1f}%', 
                    va='center', fontweight='bold')

        plt.title('Percentage of NULL Values by Column', fontsize=14)
        plt.xlabel('Percent (%)', fontsize=12)
        plt.xlim(0, min(100, null_df['Percent'].max() * 1.15))  # Add some space for labels
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
    else:
        print("No NULL values found in any column!")

    # 2. Categorical columns analysis
    categorical_cols = df_analysis.select_dtypes(include=['object', 'category']).columns
    print(f"\n--- CATEGORICAL COLUMNS ANALYSIS ({len(categorical_cols)} columns) ---")

    results = {}
    for col in categorical_cols:
        non_null_count = df_analysis[col].count()

        # Entirely-null column: record it with the same keys as every other
        # column so callers can read the results uniformly.
        if non_null_count == 0:
            results[col] = {
                'values': None,
                'distribution': None,
                'non_null_percent': 0.0,
                'unique_values': 0
            }
            continue

        # Get value counts and their share of all rows
        val_counts = df_analysis[col].value_counts().reset_index()
        val_counts.columns = [col, 'Count']
        val_counts['Percent'] = (val_counts['Count'] / n_rows * 100).round(2)

        # Fill rate and cardinality, computed once and reused below
        non_null_percent = round(float(non_null_count) / n_rows * 100, 2)
        n_unique = df_analysis[col].nunique()

        # Store in results
        results[col] = {
            'values': val_counts,
            'non_null_percent': non_null_percent,
            'unique_values': n_unique
        }

        # Display information
        print(f"\nColumn: {col}")
        print(f"Fill rate: {non_null_percent}% ({non_null_count} / {n_rows})")
        print(f"Unique values: {n_unique}")

        # Display top categories if not too many
        if n_unique < 50:
            display(val_counts.head(max_categories))

            # Plot if not too many categories
            if n_unique <= max_categories:
                plt.figure(figsize=(figsize[0]//2, figsize[1]//3))
                sns.barplot(x='Count', y=col, data=val_counts)
                plt.title(f'Distribution of {col}')
                plt.tight_layout()
                plt.show()
        else:
            print(f"Too many unique values ({n_unique}) to display")

    # 3. Numeric columns analysis
    numeric_cols = df_analysis.select_dtypes(include=['int64', 'float64']).columns
    print(f"\n--- NUMERIC COLUMNS ANALYSIS ({len(numeric_cols)} columns) ---")

    if len(numeric_cols) > 0:
        # Display summary statistics
        stats = df_analysis[numeric_cols].describe().T
        stats['fill_rate'] = (df_analysis[numeric_cols].count() / n_rows * 100).round(2)
        display(stats)

        # Plot distributions for at most three columns. squeeze=False always
        # yields an array of axes, even for a single subplot.
        plotted_cols = numeric_cols[:3]
        fig, axes = plt.subplots(nrows=len(plotted_cols), ncols=1, figsize=figsize, squeeze=False)
        for ax, col in zip(axes.ravel(), plotted_cols):
            try:
                sns.histplot(df_analysis[col].dropna(), kde=True, ax=ax)
                ax.set_title(f'Distribution of {col}')
            except Exception as exc:
                # Best effort: one unplottable column must not abort the report
                print(f"Could not plot {col}: {exc}")

        plt.tight_layout()
        plt.show()

    # 4. OSM tag analysis (for OSM specific data)
    osm_tags = [col for col in df_analysis.columns if ':' in col]
    if len(osm_tags) > 0:
        print(f"\n--- OSM TAGS ANALYSIS ({len(osm_tags)} tags) ---")
        # Display the top OSM tags by presence
        tag_counts = pd.Series({tag: df_analysis[tag].count() for tag in osm_tags})
        tag_percent = (tag_counts / n_rows * 100).round(2)
        tag_df = pd.DataFrame({'Count': tag_counts, 'Percent': tag_percent}).sort_values('Count', ascending=False)

        display(tag_df.head(20))

        # Plot top 15 tags
        top_tags = tag_df.head(15)
        plt.figure(figsize=(figsize[0], figsize[1]//2))
        sns.barplot(x='Percent', y=top_tags.index, data=top_tags)
        plt.title('Top 15 OSM Tags by Presence')
        plt.xlabel('Percent (%)')
        plt.tight_layout()
        plt.show()

    # 5. Geometry type analysis (uses gdf itself, since 'geometry' may be excluded above)
    print("\n--- GEOMETRY TYPE ANALYSIS ---")
    geom_counts = gdf.geometry.geom_type.value_counts()
    geom_percent = (geom_counts / len(gdf) * 100).round(2)
    geom_df = pd.DataFrame({'Count': geom_counts, 'Percent': geom_percent})
    display(geom_df)

    # Plot geometry types
    plt.figure(figsize=(figsize[0]//2, figsize[1]//3))
    plt.pie(geom_df['Count'], labels=geom_df.index, autopct='%1.1f%%')
    plt.title('Geometry Types')
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

    # 6. Building type analysis (if applicable)
    if 'building' in df_analysis.columns:
        print("\n--- BUILDING TYPE ANALYSIS ---")
        bldg_counts = df_analysis['building'].value_counts().reset_index()
        bldg_counts.columns = ['building', 'Count']
        # Share is relative to rows that have a building value, not to all rows
        bldg_counts['Percent'] = (bldg_counts['Count'] / df_analysis['building'].count() * 100).round(2)

        display(bldg_counts.head(max_categories))

        # Plot building types
        if len(bldg_counts) <= max_categories:
            plt.figure(figsize=(figsize[0], figsize[1]//2))
            sns.barplot(x='Count', y='building', data=bldg_counts)
            plt.title('Building Types')
            plt.tight_layout()
            plt.show()

    return results

# Example usage (currently disabled):
# analysis_results = explore_geodataframe(gdf, exclude_cols=['geometry', 'id', 'element'])


# Build a ydata-profiling report from the attribute columns of gdf
from ydata_profiling import ProfileReport

# Drop the geometry column so only the attribute columns are profiled
df = gdf.drop(columns='geometry')

# Generate the profile report and write it as HTML; the relative file name
# resolves against the current working directory
profile = ProfileReport(df, title="Profiling Report", explorative=True)
profile.to_file("profiling_report.html")





In [None]:
# Keep the rows that carry an amenity value and whose building value is exactly 'yes'
gdf_filtered = gdf[gdf['amenity'].notna() & (gdf['building'] == 'yes')].copy()

# Candidate columns to show; any that gdf_filtered does not have are left out
cols_to_show_base = ['amenity', 'building', 'shop', 'office']
cols_to_show_existing = [name for name in cols_to_show_base if name in gdf_filtered.columns]
print(gdf_filtered[cols_to_show_existing])



In [None]:
from pyrosm import OSM
import geopandas as gpd
from pyrosm import get_data
# Input handed to OSM(...) in the next cell.
# NOTE(review): get_data presumably fetches (or reuses a cached copy of) the
# "Massachusetts" extract and returns its file path — confirm against pyrosm docs.
fp = get_data("Massachusetts")
# Boundary file of the target region; only its first geometry is used as the clipping polygon
bounding_box = gpd.read_file("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/CENSUS/target_region_boundary.geojson")
polygon = bounding_box.geometry.iloc[0]

In [None]:
# Initialize the OSM parser object, restricted to the target-region polygon
import time 
start_time = time.time()  # start timestamp; not read again within this cell
osm = OSM(fp,bounding_box=polygon)
# Extract the buildings and draw them with the default plot settings
buildings = osm.get_buildings()
buildings.plot()

In [None]:
ax = buildings.plot(column="building", figsize=(12,12), legend=True, legend_kwds=dict(loc='upper left', ncol=3, bbox_to_anchor=(1, 1)))


In [None]:
# Filter passed to get_pois, keyed on the "amenity" tag
custom_filter = {'amenity': True}
pois = osm.get_pois(custom_filter=custom_filter)

# POI type used for colouring the plot: a copy of the "amenity" column only
# (no "shop" information is combined here)
pois["poi_type"] = pois["amenity"]

# Plot the POIs coloured by type, with the legend placed outside the axes
ax = pois.plot(column='poi_type', markersize=3, figsize=(12,12), legend=True, legend_kwds=dict(loc='upper left', ncol=5, bbox_to_anchor=(1, 1)))

In [None]:
landuse = osm.get_landuse()
landuse.plot(column='landuse', legend=True, figsize=(10,6))

In [None]:
network = osm.get_network(network_type="all")
network.plot()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import contextily as cx
import geopandas as gpd
from shapely.errors import GEOSException # Import for handling potential geometry errors

buildings = gpd.read_file("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/unassigned_buildings.geojson")
def plot_buildings_with_floor_area(
    gdf: gpd.GeoDataFrame,
    floor_area_column: str = 'floor_area',
    title: str = "Building Footprints with Floor Area",
    figsize: tuple = (15, 15),
    polygon_color: str = 'blue',
    alpha: float = 0.5,
    basemap_provider: cx.providers = cx.providers.OpenStreetMap.Mapnik,
    label_fontsize: int = 8,
    label_color: str = 'black',
    label_background_alpha: float = 0.7,
    label_bbox_pad: float = 0.2
) -> None:
    """
    Plots building polygons from a GeoDataFrame on a contextily basemap,
    labeling each building with its floor area.

    Invalid input (wrong type, empty frame, missing column, missing CRS) is
    reported with a printed message and nothing is plotted. Labeling is best
    effort: a building whose label cannot be placed is reported and skipped.

    Parameters:
    -----------
    gdf : GeoDataFrame
        Input GeoDataFrame with building geometries and a floor area column.
        Must have a valid CRS.
    floor_area_column : str, optional
        The name of the column in gdf containing the floor area values.
        Values are converted to numbers; anything that cannot be converted
        is labeled "N/A". Defaults to 'floor_area'.
    title : str, optional
        Plot title. Defaults to "Building Footprints with Floor Area".
    figsize : tuple, optional
        Figure size as (width, height). Defaults to (15, 15).
    polygon_color : str, optional
        Color for polygon geometries. Defaults to 'blue'.
    alpha : float, optional
        Transparency level for polygons (0-1). Defaults to 0.5.
    basemap_provider : contextily.providers object, optional
        The contextily basemap provider.
        Defaults to cx.providers.OpenStreetMap.Mapnik.
    label_fontsize : int, optional
        Font size for the floor area labels. Defaults to 8.
    label_color : str, optional
        Color for the floor area labels. Defaults to 'black'.
    label_background_alpha : float, optional
        Alpha for the label background box (0-1). Defaults to 0.7.
    label_bbox_pad : float, optional
        Padding for the label background box. Defaults to 0.2.

    Returns:
    --------
    None
        Displays the plot.
    """
    if not isinstance(gdf, gpd.GeoDataFrame):
        print("Input must be a GeoDataFrame.")
        return
    if gdf.empty:
        print("GeoDataFrame is empty. Nothing to plot.")
        return

    if floor_area_column not in gdf.columns:
        print(f"Column '{floor_area_column}' not found in GeoDataFrame.")
        return
    if gdf.crs is None:
        print("GeoDataFrame must have a Coordinate Reference System (CRS) defined.")
        return

    # Ensure the GeoDataFrame is in Web Mercator projection for contextily
    gdf_web_mercator = gdf.to_crs(epsg=3857)

    # Convert the floor areas once up front so that values stored as text are
    # still formatted as numbers; unconvertible values become NaN ("N/A").
    floor_areas = pd.to_numeric(gdf_web_mercator[floor_area_column], errors='coerce')

    fig, ax = plt.subplots(figsize=figsize)

    # Plot polygons
    gdf_web_mercator.plot(
        ax=ax,
        color=polygon_color,
        alpha=alpha,
        edgecolor='black',
        linewidth=0.5
    )

    # Add basemap; a tile failure must not prevent the footprints from showing
    try:
        cx.add_basemap(ax, source=basemap_provider, zoom='auto')
    except Exception as e:
        print(f"Could not add basemap: {e}. Plotting without it.")

    # Styling shared by every label box
    label_box = dict(
        boxstyle='round,pad=' + str(label_bbox_pad),
        fc='white',
        alpha=label_background_alpha,
        ec='none'  # No edge color for bbox
    )

    # Add floor area labels
    for idx, geom, floor_area in zip(gdf_web_mercator.index, gdf_web_mercator.geometry, floor_areas):
        # Rows without a usable geometry have nowhere to put a label
        if geom is None or geom.is_empty:
            continue
        try:
            # Use a representative point for labeling
            point_for_label = geom.representative_point()
            if point_for_label.is_empty:
                # Fallback to centroid if representative_point is empty (should be rare)
                point_for_label = geom.centroid
            if point_for_label.is_empty:
                continue

            label = f"{floor_area:,.0f} m²" if pd.notnull(floor_area) else "N/A"
            ax.text(
                point_for_label.x,
                point_for_label.y,
                label,
                fontsize=label_fontsize,
                color=label_color,
                ha='center',
                va='center',
                bbox=dict(label_box)
            )
        except GEOSException:
            print(f"Warning: Could not generate a representative point for geometry at index {idx}. Skipping label.")
        except Exception as e:
            print(f"An unexpected error occurred while labeling geometry at index {idx}: {e}")

    ax.set_title(title, fontsize=18)
    ax.set_axis_off()
    plt.tight_layout()
    plt.show()


# Randomly sample up to 100 buildings (fixed seed so the same rows are drawn
# on every run). Capping at len(buildings) avoids the ValueError that sample()
# raises when asked for more rows than the frame holds.
samples = buildings.sample(n=min(100, len(buildings)), random_state=1)

plot_buildings_with_floor_area(samples)

In [None]:

# Blocks and buildings intersection:
import matplotlib.pyplot as plt
import pandas as pd
import contextily as cx
import geopandas as gpd
from shapely.errors import GEOSException # Import for handling potential geometry errors

# Load the data
unassigned_buildings = gpd.read_file("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/unassigned_buildings.geojson")
census_blocks = gpd.read_file("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/CENSUS/target_region_blocks.geojson")

def plot_unassigned_buildings_with_census_blocks(
    buildings_gdf: gpd.GeoDataFrame,
    census_blocks_gdf: gpd.GeoDataFrame,
    title: str = "Unassigned Buildings and Census Blocks",
    figsize: tuple = (15, 15),
    block_color: str = 'red',
    building_color: str = 'blue',
    block_alpha: float = 0.3,
    building_alpha: float = 0.8,
    basemap_provider: cx.providers = cx.providers.OpenStreetMap.Mapnik
) -> None:
    """
    Plots census blocks and unassigned buildings on a contextily basemap to visualize
    spatial relationships and identify why buildings couldn't be assigned to blocks.

    Invalid input (wrong type, empty frame, missing CRS) is reported with a
    printed message and nothing is plotted.

    Parameters:
    -----------
    buildings_gdf : GeoDataFrame
        GeoDataFrame with unassigned building geometries.
    census_blocks_gdf : GeoDataFrame
        GeoDataFrame with census block geometries.
    title : str, optional
        Plot title. Defaults to "Unassigned Buildings and Census Blocks".
    figsize : tuple, optional
        Figure size as (width, height). Defaults to (15, 15).
    block_color : str, optional
        Color for census block polygons. Defaults to 'red'.
    building_color : str, optional
        Color for building polygons. Defaults to 'blue'.
    block_alpha : float, optional
        Transparency level for census blocks (0-1). Defaults to 0.3.
    building_alpha : float, optional
        Transparency level for buildings (0-1). Defaults to 0.8.
    basemap_provider : contextily.providers object, optional
        The contextily basemap provider.
        Defaults to cx.providers.OpenStreetMap.Mapnik.

    Returns:
    --------
    None
        Displays the plot.
    """
    # Only needed for the legend proxy handles built below
    from matplotlib.patches import Patch

    if not isinstance(buildings_gdf, gpd.GeoDataFrame):
        print("Buildings input must be a GeoDataFrame.")
        return
    if not isinstance(census_blocks_gdf, gpd.GeoDataFrame):
        print("Census blocks input must be a GeoDataFrame.")
        return
    if buildings_gdf.empty:
        print("Buildings GeoDataFrame is empty. Nothing to plot.")
        return
    if census_blocks_gdf.empty:
        print("Census blocks GeoDataFrame is empty. Nothing to plot.")
        return

    if buildings_gdf.crs is None:
        print("Buildings GeoDataFrame must have a Coordinate Reference System (CRS) defined.")
        return
    if census_blocks_gdf.crs is None:
        print("Census blocks GeoDataFrame must have a Coordinate Reference System (CRS) defined.")
        return

    # Ensure both GeoDataFrames are in Web Mercator projection for contextily
    buildings_web_mercator = buildings_gdf.to_crs(epsg=3857)
    blocks_web_mercator = census_blocks_gdf.to_crs(epsg=3857)

    fig, ax = plt.subplots(figsize=figsize)

    # Plot census blocks first (background layer)
    blocks_web_mercator.plot(
        ax=ax,
        color=block_color,
        alpha=block_alpha,
        edgecolor='darkred',
        linewidth=1.0
    )

    # Plot buildings on top
    buildings_web_mercator.plot(
        ax=ax,
        color=building_color,
        alpha=building_alpha,
        edgecolor='darkblue',
        linewidth=0.5
    )

    # Add basemap; a tile failure must not prevent the layers from showing
    try:
        cx.add_basemap(ax, source=basemap_provider, zoom='auto')
    except Exception as e:
        print(f"Could not add basemap: {e}. Plotting without it.")

    # Add legend. Polygon layers drawn by GeoDataFrame.plot are not picked up
    # by ax.legend(), so matching proxy patches are passed explicitly.
    legend_handles = [
        Patch(facecolor=block_color, edgecolor='darkred', alpha=block_alpha,
              label='Census Blocks'),
        Patch(facecolor=building_color, edgecolor='darkblue', alpha=building_alpha,
              label='Unassigned Buildings'),
    ]
    ax.legend(handles=legend_handles, loc='upper right', fontsize=12, framealpha=0.9)

    # Add title and statistics (counts are taken from the input frames)
    stats_text = f"Census Blocks: {len(census_blocks_gdf)}\nUnassigned Buildings: {len(buildings_gdf)}"
    ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=12,
            verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    ax.set_title(title, fontsize=18)
    ax.set_axis_off()
    plt.tight_layout()
    plt.show()

# NOTE(review): a function with this name is also defined in an earlier cell of
# this notebook; running this cell replaces that definition.
def plot_buildings_with_floor_area(
    gdf: gpd.GeoDataFrame,
    floor_area_column: str = 'floor_area',
    title: str = "Building Footprints with Floor Area",
    figsize: tuple = (15, 15),
    polygon_color: str = 'blue',
    alpha: float = 0.5,
    basemap_provider: cx.providers = cx.providers.OpenStreetMap.Mapnik,
    label_fontsize: int = 8,
    label_color: str = 'black',
    label_background_alpha: float = 0.7,
    label_bbox_pad: float = 0.2
) -> None:
    """
    Plots building polygons from a GeoDataFrame on a contextily basemap,
    labeling each building with its floor area.

    Invalid input (wrong type, empty frame, missing column, missing CRS) is
    reported with a printed message and nothing is plotted. Labeling is best
    effort: a building whose label cannot be placed is reported and skipped.

    Parameters:
    -----------
    gdf : GeoDataFrame
        Input GeoDataFrame with building geometries and a floor area column.
        Must have a valid CRS.
    floor_area_column : str, optional
        The name of the column in gdf containing the floor area values.
        Values are converted to numbers; anything that cannot be converted
        is labeled "N/A". Defaults to 'floor_area'.
    title : str, optional
        Plot title. Defaults to "Building Footprints with Floor Area".
    figsize : tuple, optional
        Figure size as (width, height). Defaults to (15, 15).
    polygon_color : str, optional
        Color for polygon geometries. Defaults to 'blue'.
    alpha : float, optional
        Transparency level for polygons (0-1). Defaults to 0.5.
    basemap_provider : contextily.providers object, optional
        The contextily basemap provider.
        Defaults to cx.providers.OpenStreetMap.Mapnik.
    label_fontsize : int, optional
        Font size for the floor area labels. Defaults to 8.
    label_color : str, optional
        Color for the floor area labels. Defaults to 'black'.
    label_background_alpha : float, optional
        Alpha for the label background box (0-1). Defaults to 0.7.
    label_bbox_pad : float, optional
        Padding for the label background box. Defaults to 0.2.

    Returns:
    --------
    None
        Displays the plot.
    """
    if not isinstance(gdf, gpd.GeoDataFrame):
        print("Input must be a GeoDataFrame.")
        return
    if gdf.empty:
        print("GeoDataFrame is empty. Nothing to plot.")
        return

    if floor_area_column not in gdf.columns:
        print(f"Column '{floor_area_column}' not found in GeoDataFrame.")
        return
    if gdf.crs is None:
        print("GeoDataFrame must have a Coordinate Reference System (CRS) defined.")
        return

    # Ensure the GeoDataFrame is in Web Mercator projection for contextily
    gdf_web_mercator = gdf.to_crs(epsg=3857)

    # Convert the floor areas once up front so that values stored as text are
    # still formatted as numbers; unconvertible values become NaN ("N/A").
    floor_areas = pd.to_numeric(gdf_web_mercator[floor_area_column], errors='coerce')

    fig, ax = plt.subplots(figsize=figsize)

    # Plot polygons
    gdf_web_mercator.plot(
        ax=ax,
        color=polygon_color,
        alpha=alpha,
        edgecolor='black',
        linewidth=0.5
    )

    # Add basemap; a tile failure must not prevent the footprints from showing
    try:
        cx.add_basemap(ax, source=basemap_provider, zoom='auto')
    except Exception as e:
        print(f"Could not add basemap: {e}. Plotting without it.")

    # Styling shared by every label box
    label_box = dict(
        boxstyle='round,pad=' + str(label_bbox_pad),
        fc='white',
        alpha=label_background_alpha,
        ec='none'  # No edge color for bbox
    )

    # Add floor area labels
    for idx, geom, floor_area in zip(gdf_web_mercator.index, gdf_web_mercator.geometry, floor_areas):
        # Rows without a usable geometry have nowhere to put a label
        if geom is None or geom.is_empty:
            continue
        try:
            # Use a representative point for labeling
            point_for_label = geom.representative_point()
            if point_for_label.is_empty:
                # Fallback to centroid if representative_point is empty (should be rare)
                point_for_label = geom.centroid
            if point_for_label.is_empty:
                continue

            label = f"{floor_area:,.0f} m²" if pd.notnull(floor_area) else "N/A"
            ax.text(
                point_for_label.x,
                point_for_label.y,
                label,
                fontsize=label_fontsize,
                color=label_color,
                ha='center',
                va='center',
                bbox=dict(label_box)
            )
        except GEOSException:
            print(f"Warning: Could not generate a representative point for geometry at index {idx}. Skipping label.")
        except Exception as e:
            print(f"An unexpected error occurred while labeling geometry at index {idx}: {e}")

    ax.set_title(title, fontsize=18)
    ax.set_axis_off()
    plt.tight_layout()
    plt.show()

# Overview map: every unassigned building drawn over the census blocks
print(f"Plotting {len(unassigned_buildings)} unassigned buildings with {len(census_blocks)} census blocks")
plot_unassigned_buildings_with_census_blocks(unassigned_buildings, census_blocks)

# Detailed view: with ten or fewer buildings plot them all; otherwise plot a
# fixed-seed random sample of ten
if len(unassigned_buildings) <= 10:
    plot_buildings_with_floor_area(unassigned_buildings, title="All Unassigned Buildings with Floor Area")
else:
    samples = unassigned_buildings.sample(n=10, random_state=1)
    plot_buildings_with_floor_area(samples, title="Sample of Unassigned Buildings with Floor Area")

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx

def plot_buildings(
    geojson_path: str,
    title: str = "Building Footprints",
    figsize: tuple = (12, 12),
    polygon_color: str = 'blue',
    alpha: float = 0.6,
    basemap_provider=ctx.providers.OpenStreetMap.Mapnik
) -> None:
    """
    Reads a GeoJSON file and plots building footprints on a web map using Contextily.

    A file that cannot be read, or that contains no features, is reported
    with a printed message and nothing is plotted. A file without CRS
    information is treated as WGS84 longitude/latitude.

    Parameters:
    -----------
    geojson_path : str
        Path to the GeoJSON file containing building polygons.
    title : str, optional
        Plot title. Defaults to "Building Footprints".
    figsize : tuple, optional
        Figure size as (width, height). Defaults to (12, 12).
    polygon_color : str, optional
        Color for building polygons. Defaults to 'blue'.
    alpha : float, optional
        Polygon transparency. Defaults to 0.6.
    basemap_provider : contextily provider, optional
        Tile provider. Defaults to OpenStreetMap.Mapnik.

    Returns:
    --------
    None
    """

    # Step 1: Load data
    try:
        gdf = gpd.read_file(geojson_path)
    except Exception as e:
        print(f"Error reading GeoJSON: {e}")
        return

    if gdf.empty:
        print("GeoDataFrame is empty. Nothing to plot.")
        return

    # Step 2: Ensure a CRS is set. to_crs() cannot reproject data without one;
    # GeoJSON coordinates are WGS84 lon/lat by specification (RFC 7946).
    if gdf.crs is None:
        print("No CRS found in file. Assuming WGS84 (EPSG:4326).")
        gdf = gdf.set_crs(epsg=4326)

    # Step 3: Reproject to Web Mercator
    gdf_web_mercator = gdf.to_crs(epsg=3857)

    # Step 4: Plot
    fig, ax = plt.subplots(figsize=figsize)
    gdf_web_mercator.plot(
        ax=ax,
        color=polygon_color,
        alpha=alpha,
        edgecolor='black',
        linewidth=0.5
    )

    # A tile failure must not prevent the footprints from showing
    try:
        ctx.add_basemap(ax, source=basemap_provider, crs=gdf_web_mercator.crs)
    except Exception as e:
        print(f"Could not add basemap: {e}. Plotting without basemap.")

    ax.set_title(title, fontsize=16)
    ax.set_axis_off()
    plt.tight_layout()
    plt.show()

path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/MICROSOFT_BUILDINGS/microsoft_buildings_25_017_11000.geojson"

plot_buildings(path)

In [None]:
import folium

gdf = gpd.read_file(path)
# Centre the map on the centroid of all footprints merged together.
# GeoPandas 1.0 deprecates `unary_union` in favour of `union_all()`; use the
# new method when it exists and fall back to the old attribute otherwise.
merged_footprints = gdf.union_all() if hasattr(gdf, "union_all") else gdf.unary_union
centroid = merged_footprints.centroid
# folium expects [latitude, longitude], i.e. [y, x]
m = folium.Map(location=[centroid.y, centroid.x], zoom_start=12)
folium.GeoJson(gdf).add_to(m)
m.save("debug_map.html")

In [None]:
import geopandas as gpd
path_ms = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/buildings_with_ms_height.geojson"
gdf = gpd.read_file(path_ms)


In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
import pandas as pd
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
from shapely.errors import GEOSException
import seaborn as sns

def plot_buildings_by_height_with_stats_separate(
    gdf: gpd.GeoDataFrame,
    height_column: str = 'height',
    floor_area_column: str = 'floor_area',
    title: str = "Buildings Colored by Height",
    figsize: tuple = (15, 15),
    alpha: float = 0.7,
    basemap_provider=ctx.providers.OpenStreetMap.Mapnik,
    label_fontsize: int = 6,
    label_color: str = 'black',
    label_background_alpha: float = 0.8,
    n_bins: int = 5,
    colormap: str = 'viridis',
    show_labels: bool = True,
    sample_for_labels: int = None
) -> dict:
    """
    Plot building polygons colored by height and print/plot height statistics.

    Up to three separate figures are shown: the map (optionally with floor-area
    labels), a 2x2 statistics dashboard (histogram, box plot, coverage pie, bar
    chart per height bin) and, when the floor-area column exists, a height vs
    floor-area scatter plot.

    Parameters:
    -----------
    gdf : GeoDataFrame
        Input GeoDataFrame with building geometries, height, and floor area columns.
        It is copied; the caller's frame is not modified.
    height_column : str
        Column name containing height values. Values are parsed to metres:
        plain numbers and "m"/"meters" suffixes are taken as metres, "ft"/"feet"
        suffixes are converted to metres. Values outside 0.5-500 m become NaN.
    floor_area_column : str
        Column name containing floor area values for labels
    title : str
        Plot title
    figsize : tuple
        Figure size of the map as (width, height)
    alpha : float
        Transparency level for polygons (0-1)
    basemap_provider : contextily provider
        Basemap tile provider
    label_fontsize : int
        Font size for floor area labels
    label_color : str
        Color for floor area labels
    label_background_alpha : float
        Alpha for label background boxes
    n_bins : int
        Number of bins for height categorization
    colormap : str
        Matplotlib colormap name for height coloring
    show_labels : bool
        Whether to show floor area labels
    sample_for_labels : int or None
        If provided, randomly sample this many buildings for labeling (to avoid clutter)

    Returns:
    --------
    dict : Statistical analysis results with keys 'total_buildings',
        'buildings_with_height', 'height_coverage_percent', 'height_stats' and
        'outliers'. Returns {} for an empty input and
        {"error": "No valid height data"} when no height could be parsed.
    """

    if gdf.empty:
        print("GeoDataFrame is empty. Nothing to plot.")
        return {}

    # Create working copy so the parsed heights never overwrite the caller's data
    buildings = gdf.copy()

    feet_to_meters = 0.3048
    # Longest suffix first: stripping 'm' before 'meters' would leave "eters" behind
    # and make every "... meters" value unparseable.
    unit_factors = (
        ('meters', 1.0),
        ('feet', feet_to_meters),
        ('ft', feet_to_meters),
        ('m', 1.0),
    )

    def parse_height(height_val):
        """Parse one raw height value into metres, or NaN when it is missing/implausible."""
        try:
            if height_val is None or pd.isna(height_val):
                return np.nan
            text = str(height_val).strip().lower()
            factor = 1.0
            for suffix, unit_factor in unit_factors:
                if text.endswith(suffix):
                    text = text[:-len(suffix)].strip()
                    factor = unit_factor
                    break
            height_m = float(text) * factor
        except (ValueError, TypeError):
            return np.nan
        # Sanity check: reasonable building height
        return height_m if 0.5 <= height_m <= 500.0 else np.nan

    # Parse height values
    buildings[height_column] = buildings[height_column].apply(parse_height)

    # Statistical Analysis
    print("="*60)
    print("BUILDING HEIGHT STATISTICAL ANALYSIS")
    print("="*60)

    # Basic statistics
    total_buildings = len(buildings)
    buildings_with_height = buildings[height_column].notna().sum()
    height_coverage = (buildings_with_height / total_buildings) * 100

    print(f"Total buildings: {total_buildings:,}")
    print(f"Buildings with height data: {buildings_with_height:,}")
    print(f"Height data coverage: {height_coverage:.1f}%")
    print(f"Buildings with missing height: {total_buildings - buildings_with_height:,} ({100-height_coverage:.1f}%)")

    # Height statistics for valid values
    height_data = buildings[height_column].dropna()
    if len(height_data) > 0:
        min_height = height_data.min()
        max_height = height_data.max()
        q25 = height_data.quantile(0.25)
        q75 = height_data.quantile(0.75)

        print(f"\nHeight Statistics (meters):")
        print(f"  Mean: {height_data.mean():.2f}m")
        print(f"  Median: {height_data.median():.2f}m")
        print(f"  Std Dev: {height_data.std():.2f}m")
        print(f"  Min: {min_height:.2f}m")
        print(f"  Max: {max_height:.2f}m")
        print(f"  25th percentile: {q25:.2f}m")
        print(f"  75th percentile: {q75:.2f}m")

        # Outlier analysis using IQR method
        iqr = q75 - q25
        lower_bound = q25 - 1.5 * iqr
        upper_bound = q75 + 1.5 * iqr
        outliers = height_data[(height_data < lower_bound) | (height_data > upper_bound)]

        print(f"\nOutlier Analysis (IQR method):")
        print(f"  IQR: {iqr:.2f}m")
        print(f"  Lower bound: {lower_bound:.2f}m")
        print(f"  Upper bound: {upper_bound:.2f}m")
        print(f"  Number of outliers: {len(outliers)} ({len(outliers)/len(height_data)*100:.1f}%)")
        if len(outliers) > 0:
            print(f"  Outlier range: {outliers.min():.2f}m - {outliers.max():.2f}m")

        # Bin edges shared by the printed distribution and the map/bar-chart categories.
        if max_height > min_height:
            bins = np.linspace(min_height, max_height, n_bins + 1)
        else:
            # All heights identical: pad the range so the edges stay strictly increasing
            # (1 m per bin), otherwise pd.cut rejects the duplicate edges and labels.
            half_width = n_bins / 2
            bins = np.linspace(min_height - half_width, max_height + half_width, n_bins + 1)

        # Height distribution by bins
        print(f"\nHeight Distribution:")
        height_counts, _ = np.histogram(height_data, bins=bins)
        for lower_edge, upper_edge, count in zip(bins[:-1], bins[1:], height_counts):
            percentage = (count / len(height_data)) * 100
            print(f"  {lower_edge:.1f}m - {upper_edge:.1f}m: {count} buildings ({percentage:.1f}%)")

    # Floor area statistics for context
    if floor_area_column in buildings.columns:
        area_data = buildings[floor_area_column].dropna()
        if len(area_data) > 0:
            print(f"\nFloor Area Statistics (m²):")
            print(f"  Mean: {area_data.mean():.0f}m²")
            print(f"  Median: {area_data.median():.0f}m²")
            print(f"  Min: {area_data.min():.0f}m²")
            print(f"  Max: {area_data.max():.0f}m²")

    # Now create the plots
    print(f"\n{'='*60}")
    print("CREATING VISUALIZATION")
    print("="*60)

    has_height = buildings[height_column].notna()
    if not has_height.any():
        print("No buildings with valid height data to plot.")
        return {"error": "No valid height data"}

    # Project to Web Mercator once, then split by height availability
    if buildings.crs != "EPSG:3857":
        buildings_web_mercator = buildings.to_crs("EPSG:3857")
    else:
        buildings_web_mercator = buildings.copy()
    with_height_web = buildings_web_mercator[has_height].copy()
    without_height_web = buildings_web_mercator[~has_height].copy()

    # Categorise heights into the bins computed above
    with_height_web['height_bin'] = pd.cut(
        with_height_web[height_column],
        bins=bins,
        labels=[f"{bins[i]:.1f}-{bins[i+1]:.1f}m" for i in range(len(bins)-1)],
        include_lowest=True
    )

    # PLOT 1: Main map visualization
    print("Creating main map plot...")
    fig1, ax1 = plt.subplots(1, 1, figsize=figsize)

    # Plot buildings without height in gray
    if len(without_height_web) > 0:
        without_height_web.plot(
            ax=ax1,
            color='lightgray',
            alpha=alpha*0.5,
            edgecolor='gray',
            linewidth=0.3,
            label=f'No height data ({len(without_height_web)})'
        )

    # Plot buildings with height using colormap
    with_height_web.plot(
        ax=ax1,
        column=height_column,
        cmap=colormap,
        alpha=alpha,
        edgecolor='black',
        linewidth=0.3,
        legend=True,
        legend_kwds={'label': 'Height (m)', 'shrink': 0.8}
    )

    # Add basemap (best effort: tiles need network access)
    try:
        ctx.add_basemap(ax1, source=basemap_provider, crs=buildings_web_mercator.crs)
    except Exception as e:
        print(f"Could not add basemap: {e}")

    # Add floor area labels if requested
    if show_labels and floor_area_column in buildings.columns:
        # Sample buildings for labeling if requested
        if sample_for_labels and len(with_height_web) > sample_for_labels:
            buildings_to_label = with_height_web.sample(n=sample_for_labels, random_state=42)
        else:
            buildings_to_label = with_height_web

        print(f"Adding floor area labels to {len(buildings_to_label)} buildings...")

        for _, row in buildings_to_label.iterrows():
            try:
                # representative_point() is guaranteed to lie inside the polygon
                point_for_label = row.geometry.representative_point()
                if point_for_label.is_empty:
                    continue
                floor_area = row[floor_area_column]
                if pd.notnull(floor_area):
                    ax1.text(
                        point_for_label.x,
                        point_for_label.y,
                        f"{floor_area:,.0f}m²",
                        fontsize=label_fontsize,
                        color=label_color,
                        ha='center',
                        va='center',
                        bbox=dict(
                            boxstyle='round,pad=0.2',
                            fc='white',
                            alpha=label_background_alpha,
                            ec='none'
                        )
                    )
            except GEOSException:
                # Skip geometries GEOS cannot handle rather than aborting the whole map
                continue

    ax1.set_title(f"{title}\n{buildings_with_height:,} buildings with height data", fontsize=16, pad=20)
    ax1.set_axis_off()

    plt.tight_layout()
    plt.show()

    # PLOT 2: Statistical analysis dashboard
    print("Creating statistical analysis plots...")
    fig2, axes2 = plt.subplots(2, 2, figsize=(16, 12))
    ax_hist, ax_box = axes2[0, 0], axes2[0, 1]
    ax_pie, ax_bins = axes2[1, 0], axes2[1, 1]

    # Distribution histogram
    ax_hist.hist(height_data, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
    ax_hist.axvline(height_data.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {height_data.mean():.1f}m')
    ax_hist.axvline(height_data.median(), color='orange', linestyle='--', linewidth=2, label=f'Median: {height_data.median():.1f}m')
    ax_hist.set_xlabel('Height (meters)', fontsize=12)
    ax_hist.set_ylabel('Number of Buildings', fontsize=12)
    ax_hist.set_title('Height Distribution', fontsize=14, fontweight='bold')
    ax_hist.legend()
    ax_hist.grid(True, alpha=0.3)

    # Box plot
    bp = ax_box.boxplot(height_data, patch_artist=True)
    bp['boxes'][0].set_facecolor('lightblue')
    ax_box.set_title('Height Distribution (Box Plot)', fontsize=14, fontweight='bold')
    ax_box.set_ylabel('Height (m)', fontsize=12)
    ax_box.grid(True, alpha=0.3)

    # Add outlier information
    if len(outliers) > 0:
        ax_box.text(0.02, 0.98, f'Outliers: {len(outliers)} ({len(outliers)/len(height_data)*100:.1f}%)',
                    transform=ax_box.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # Coverage pie chart
    missing_height_count = total_buildings - buildings_with_height
    coverage_data = [buildings_with_height, missing_height_count]
    coverage_labels = [f'With Height\n({buildings_with_height:,})', f'Missing Height\n({missing_height_count:,})']
    colors = ['#66b3ff', '#ffcc99']
    ax_pie.pie(coverage_data, labels=coverage_labels, autopct='%1.1f%%', startangle=90, colors=colors)
    ax_pie.set_title('Height Data Coverage', fontsize=14, fontweight='bold')

    # Height bins bar chart. 'height_bin' lives on the reprojected frame; reading it
    # from the un-projected subset raised KeyError.
    bin_counts = with_height_web['height_bin'].value_counts().sort_index()
    bars = ax_bins.bar(range(len(bin_counts)), bin_counts.values, color='lightcoral', edgecolor='black')
    ax_bins.set_xticks(range(len(bin_counts)))
    ax_bins.set_xticklabels(bin_counts.index, rotation=45, ha='right')
    ax_bins.set_xlabel('Height Bins', fontsize=12)
    ax_bins.set_ylabel('Number of Buildings', fontsize=12)
    ax_bins.set_title('Buildings by Height Range', fontsize=14, fontweight='bold')
    ax_bins.grid(True, alpha=0.3, axis='y')

    # Add value labels on bars
    for bar, count in zip(bars, bin_counts.values):
        bar_height = bar.get_height()
        ax_bins.text(bar.get_x() + bar.get_width()/2., bar_height + bar_height*0.01,
                     f'{count}', ha='center', va='bottom', fontsize=10)

    plt.suptitle('Building Height Statistical Analysis', fontsize=16, fontweight='bold', y=0.95)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()

    # PLOT 3: Height vs Floor Area scatter plot (if floor area data is available)
    if floor_area_column in buildings.columns:
        valid_both = buildings[has_height & buildings[floor_area_column].notna()]
        if len(valid_both) > 0:
            print("Creating height vs floor area correlation plot...")
            fig3, ax3 = plt.subplots(1, 1, figsize=(12, 8))

            scatter = ax3.scatter(valid_both[floor_area_column], valid_both[height_column],
                                  alpha=0.6, c=valid_both[height_column], cmap=colormap, s=30)
            ax3.set_xlabel('Floor Area (m²)', fontsize=12)
            ax3.set_ylabel('Height (m)', fontsize=12)
            ax3.set_title('Building Height vs Floor Area Correlation', fontsize=14, fontweight='bold')
            ax3.grid(True, alpha=0.3)

            # Add colorbar
            cbar = plt.colorbar(scatter, ax=ax3)
            cbar.set_label('Height (m)', fontsize=12)

            # Calculate and display correlation
            correlation = valid_both[height_column].corr(valid_both[floor_area_column])
            ax3.text(0.02, 0.98, f'Correlation: {correlation:.3f}',
                     transform=ax3.transAxes, verticalalignment='top',
                     bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
                     fontsize=12)

            plt.tight_layout()
            plt.show()

    # Return statistical summary (at least one valid height exists past the early return)
    stats_summary = {
        'total_buildings': total_buildings,
        'buildings_with_height': buildings_with_height,
        'height_coverage_percent': height_coverage,
        'height_stats': {
            'mean': height_data.mean(),
            'median': height_data.median(),
            'std': height_data.std(),
            'min': min_height,
            'max': max_height,
            'q25': q25,
            'q75': q75
        },
        'outliers': {
            'count': len(outliers),
            'percentage': len(outliers)/len(height_data)*100,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound
        }
    }

    return stats_summary

# Example usage with the separate plots version.
# NOTE: this rebinds the notebook-level `path`; `stats` receives the summary dict returned by the call.
path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/buildings_with_ms_height.geojson"
buildings_gdf = gpd.read_file(path)

stats = plot_buildings_by_height_with_stats_separate(
    gdf=buildings_gdf,
    height_column='height',
    floor_area_column='floor_area',
    show_labels=True,
    sample_for_labels=50,  # Only label 50 random buildings to avoid clutter
    n_bins=6,
    colormap='plasma',
    figsize=(14, 14)
)

In [None]:
# Inspect the buildings whose `height` is null in the buildings_with_ms_height output.
osm_path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/buildings_with_ms_height.geojson"
ms_buildings_path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/MICROSOFT_BUILDINGS/ms_buildings_output.geojson"

osm_gdf = gpd.read_file(osm_path)
# Not used further in this cell; kept so the notebook-level `ms_gdf` stays defined as before.
ms_gdf = gpd.read_file(ms_buildings_path)


# Filter buildings with null height
null_height_buildings = osm_gdf[osm_gdf['height'].isna()]

# Report both counts: the previous "Total buildings" label actually showed the null-height count.
print(f"Total buildings: {len(osm_gdf)}")
print(f"Buildings with null height: {len(null_height_buildings)}")

# Plot the null height buildings
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
null_height_buildings.plot(ax=ax, color='red', markersize=10)
ax.set_title(f"Buildings with null height ({len(null_height_buildings)} of {len(osm_gdf)})")
plt.show()



In [None]:
import pandas as pd
from pathlib import Path
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the input and output paths
input_file = Path("/Users/magic-rabbit/Documents/MA-data/NREL_Residential_typology.tsv")
output_dir = Path("county_data")
output_dir.mkdir(exist_ok=True)

# Process the file in chunks so the whole TSV never has to fit in memory
chunk_size = 100000
# Counties already written during THIS run; the first write for a county replaces any
# stale file from an earlier run, later chunks are merged into it.
processed_counties = set()

logger.info("Starting to process the TSV file...")

for chunk in pd.read_csv(input_file, sep="\t", chunksize=chunk_size):
    # Convert county to string for consistent naming
    chunk['in.county'] = chunk['in.county'].astype(str)
    # Log a summary only; logging the chunk itself dumps a DataFrame repr per chunk
    logger.info("Read chunk with %d rows and %d columns", chunk.shape[0], chunk.shape[1])

    # Process each county in the current chunk
    for county_id, group in chunk.groupby('in.county'):
        output_file = output_dir / f"county_{county_id}.parquet"
        if county_id in processed_counties:
            # A county can span several chunks. Skipping it here (as before) silently
            # dropped every row after its first chunk, so merge with what is on disk.
            group = pd.concat([pd.read_parquet(output_file), group], ignore_index=True)
            group.to_parquet(output_file, index=False)
            logger.info(f"Appended data for county {county_id} ({len(group)} rows total)")
        else:
            group.to_parquet(output_file, index=False)
            processed_counties.add(county_id)
            logger.info(f"Saved data for county {county_id}")

    # Optional: Add a progress indicator
    logger.info(f"Processed {len(processed_counties)} unique counties so far")

logger.info("Finished processing all counties")

In [None]:


# Load a single county's data
import pandas as pd

# Load one county file (county G0100810 is used as the example)
county_data = pd.read_parquet("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/notebooks/county_data/county_G0100810.parquet")

# Display basic information about the data
print("Data shape:", county_data.shape)
print("\nColumns:", county_data.columns.tolist())
print("\nFirst few rows:")
display(county_data.head())


# Location-related columns (county, state, PUMA and region identifiers) to inspect side by side
columns_to_show = [
    'in.county',
    'in.state',
    'in.puma',
    'in.ahs_region',
    'in.american_housing_survey_region',
    'in.resstock_county_id',
    'in.resstock_puma_id',
    'in.nhgis_county_gisjoin',
    'in.nhgis_puma_gisjoin',
    'in.state_name'
]

# Display 3 randomly sampled rows of the selected columns (fixed seed, so the sample is repeatable)
print(county_data[columns_to_show].sample(n=3, random_state=1))

# Show some basic statistics
print("\nBasic statistics:")
display(county_data.describe())

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
from shapely.geometry import Point, Polygon
import numpy as np

def plot_height_data_gaps_limited(osm_buildings_path, ms_buildings_path, max_overlaps=100, output_path=None):
    """
    Plot the first N overlaps between OSM buildings lacking height and MS buildings.

    OSM buildings whose 'height' is null are matched against MS footprints that
    intersect them. Each pair is colored by whether the MS footprint's centroid
    lies inside the OSM polygon, and all MS centroids are drawn as red X markers.
    A short summary of the within/outside split is printed afterwards.

    Parameters:
    -----------
    osm_buildings_path : str
        Path to OSM buildings with height data
    ms_buildings_path : str
        Path to MS buildings data
    max_overlaps : int
        Maximum number of OSM-MS intersection pairs to collect and plot (default 100)
    output_path : str, optional
        Path to save the plot

    Returns:
    --------
    None
    """
    # Legend helpers are only needed by this function, so they stay function-local.
    from matplotlib.patches import Patch
    from matplotlib.lines import Line2D

    # Load data
    print("Loading building data...")
    osm_gdf = gpd.read_file(osm_buildings_path)
    ms_gdf = gpd.read_file(ms_buildings_path)

    # Filter OSM buildings with null height
    null_height_osm = osm_gdf[osm_gdf['height'].isna()].copy()
    print(f"OSM buildings without height data: {len(null_height_osm)}")
    print(f"Total MS buildings: {len(ms_gdf)}")

    if len(null_height_osm) == 0:
        print("No OSM buildings without height data found!")
        return

    # Ensure same CRS for spatial operations
    if null_height_osm.crs != ms_gdf.crs:
        print("Aligning CRS for spatial operations...")
        ms_gdf = ms_gdf.to_crs(null_height_osm.crs)

    # Find MS buildings that intersect with null height OSM buildings (limited)
    print(f"Finding first {max_overlaps} intersecting MS buildings...")
    intersecting_pairs = []

    # Create spatial index for efficiency
    ms_sindex = ms_gdf.sindex

    for idx, osm_building in null_height_osm.iterrows():
        if len(intersecting_pairs) >= max_overlaps:
            break

        osm_geometry = osm_building.geometry
        # Rows without a usable geometry have no bounds to query the index with
        if osm_geometry is None or osm_geometry.is_empty:
            continue

        # Bounding-box candidates from the spatial index (positional indices, hence iloc)
        possible_matches_idx = list(ms_sindex.intersection(osm_geometry.bounds))
        possible_matches = ms_gdf.iloc[possible_matches_idx]

        # Check actual intersections
        intersecting_ms = possible_matches[possible_matches.intersects(osm_geometry)]

        for ms_idx, ms_building in intersecting_ms.iterrows():
            if len(intersecting_pairs) >= max_overlaps:
                break

            ms_centroid = ms_building.geometry.centroid
            intersecting_pairs.append({
                'osm_idx': idx,
                'ms_idx': ms_idx,
                'osm_geometry': osm_geometry,
                'ms_geometry': ms_building.geometry,
                'ms_centroid': ms_centroid,
                'ms_height': ms_building.get('height', 'Unknown'),
                'ms_confidence': ms_building.get('confidence', 'Unknown'),
                'centroid_within': osm_geometry.contains(ms_centroid)
            })

    print(f"Found {len(intersecting_pairs)} OSM-MS building intersection pairs (limited to {max_overlaps})")

    if len(intersecting_pairs) == 0:
        print("No intersections found between null height OSM buildings and MS buildings")
        return

    # Get unique OSM buildings from the pairs for plotting
    unique_osm_indices = list(set(p['osm_idx'] for p in intersecting_pairs))
    selected_osm_buildings = null_height_osm.loc[unique_osm_indices]

    # Convert to Web Mercator for contextily basemap
    selected_osm_web = selected_osm_buildings.to_crs(epsg=3857)

    # Create figure
    fig, ax = plt.subplots(1, 1, figsize=(15, 15))

    # Plot selected OSM buildings without height (light gray background)
    selected_osm_web.plot(ax=ax, color='lightgray', alpha=0.6,
                          edgecolor='gray', linewidth=0.5)

    def pair_geometries_web(key, crs):
        """Collect one geometry per pair and reproject them to Web Mercator in a single call."""
        return gpd.GeoSeries([pair[key] for pair in intersecting_pairs], crs=crs).to_crs(epsg=3857)

    # One reprojection per layer instead of three tiny GeoSeries per pair
    osm_geoms_web = pair_geometries_web('osm_geometry', null_height_osm.crs)
    ms_geoms_web = pair_geometries_web('ms_geometry', ms_gdf.crs)
    ms_centroids_web = pair_geometries_web('ms_centroid', ms_gdf.crs)
    centroid_within = np.array([bool(pair['centroid_within']) for pair in intersecting_pairs])

    # OSM buildings: green if the MS centroid is within, red if outside.
    # MS buildings: blue if their centroid is within, orange if outside.
    layers = [
        (osm_geoms_web[centroid_within], dict(color='green', alpha=0.7, edgecolor='darkgreen', linewidth=2)),
        (osm_geoms_web[~centroid_within], dict(color='red', alpha=0.7, edgecolor='darkred', linewidth=2)),
        (ms_geoms_web[centroid_within], dict(color='blue', alpha=0.5, edgecolor='darkblue', linewidth=1)),
        (ms_geoms_web[~centroid_within], dict(color='orange', alpha=0.5, edgecolor='darkorange', linewidth=1)),
    ]
    for layer, style in layers:
        if not layer.empty:
            layer.plot(ax=ax, **style)

    # Plot all MS building centroids as red X markers
    ms_centroids_web.plot(ax=ax, color='red', marker='x', markersize=5, alpha=1.0)

    # Add basemap (best effort: tiles need network access)
    try:
        ctx.add_basemap(ax, crs='EPSG:3857', source=ctx.providers.OpenStreetMap.Mapnik, alpha=0.7)
    except Exception as e:
        print(f"Could not add basemap: {e}")

    # Create custom legend
    legend_elements = [
        Patch(facecolor='lightgray', alpha=0.6, label='OSM buildings without height'),
        Patch(facecolor='green', alpha=0.7, label='OSM buildings (centroid within)'),
        Patch(facecolor='red', alpha=0.7, label='OSM buildings (centroid outside)'),
        Patch(facecolor='blue', alpha=0.5, label='MS buildings (centroid within)'),
        Patch(facecolor='orange', alpha=0.5, label='MS buildings (centroid outside)'),
        Line2D([0], [0], marker='x', color='red', linestyle='None',
               markersize=10, label='MS building centroids')
    ]
    ax.legend(handles=legend_elements, loc='upper right', fontsize=10)

    # Set title and labels
    ax.set_title(f'OSM Buildings Missing Height Data vs MS Buildings (First {len(intersecting_pairs)} overlaps)',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Longitude', fontsize=12)
    ax.set_ylabel('Latitude', fontsize=12)

    # Remove axis ticks for cleaner look
    ax.set_xticks([])
    ax.set_yticks([])

    plt.tight_layout()

    if output_path:
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to: {output_path}")

    plt.show()

    # Print summary statistics
    total = len(intersecting_pairs)
    centroids_within_count = int(centroid_within.sum())
    centroids_outside_count = total - centroids_within_count

    print("\n" + "="*60)
    # Report the number actually analysed; the header used to say 100 regardless of max_overlaps
    print(f"SUMMARY STATISTICS (First {total} overlaps)")
    print("="*60)

    print(f"Total intersections analyzed: {total}")
    print(f"MS centroids WITHIN OSM buildings: {centroids_within_count} ({centroids_within_count/total*100:.1f}%)")
    print(f"MS centroids OUTSIDE OSM buildings: {centroids_outside_count} ({centroids_outside_count/total*100:.1f}%)")

# Usage: analyse up to 100 OSM/MS overlaps and save the figure next to the notebook
plot_height_data_gaps_limited(
    osm_buildings_path="/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/buildings_with_ms_height.geojson",
    ms_buildings_path="/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/MICROSOFT_BUILDINGS/ms_buildings_output.geojson",
    max_overlaps=100,
    output_path="height_data_gap_analysis_100.png"
)

In [None]:
import pandas as pd
from pathlib import Path
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define the input and output paths
input_file = Path("/Users/magic-rabbit/Documents/MA-data/NREL_Residential_typology.tsv")
output_dir = Path("county_data")
output_dir.mkdir(exist_ok=True)

# Process the file in chunks so the whole TSV never has to fit in memory
chunk_size = 100000
# Counties already written during THIS run; the first write for a county replaces any
# stale file from an earlier run, later chunks are merged into it.
processed_counties = set()

logger.info("Starting to process the TSV file...")

for chunk in pd.read_csv(input_file, sep="\t", chunksize=chunk_size):
    # Convert county to string for consistent naming
    chunk['in.county'] = chunk['in.county'].astype(str)
    # Log a summary only; logging the chunk itself dumps a DataFrame repr per chunk
    logger.info("Read chunk with %d rows and %d columns", chunk.shape[0], chunk.shape[1])

    # Process each county in the current chunk
    for county_id, group in chunk.groupby('in.county'):
        output_file = output_dir / f"county_{county_id}.parquet"
        if county_id in processed_counties:
            # A county can span several chunks. Skipping it here (as before) silently
            # dropped every row after its first chunk, so merge with what is on disk.
            group = pd.concat([pd.read_parquet(output_file), group], ignore_index=True)
            group.to_parquet(output_file, index=False)
            logger.info(f"Appended data for county {county_id} ({len(group)} rows total)")
        else:
            group.to_parquet(output_file, index=False)
            processed_counties.add(county_id)
            logger.info(f"Saved data for county {county_id}")

    # Optional: Add a progress indicator
    logger.info(f"Processed {len(processed_counties)} unique counties so far")

logger.info("Finished processing all counties")

In [None]:


# Load a single county's data
import pandas as pd

# Load one county file (county G0100810 is used as the example)
county_data = pd.read_parquet("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/notebooks/county_data/county_G0100810.parquet")

# Display basic information about the data
print("Data shape:", county_data.shape)
print("\nColumns:", county_data.columns.tolist())
print("\nFirst few rows:")
display(county_data.head())


# Location-related columns (county, state, PUMA and region identifiers) to inspect side by side
columns_to_show = [
    'in.county',
    'in.state',
    'in.puma',
    'in.ahs_region',
    'in.american_housing_survey_region',
    'in.resstock_county_id',
    'in.resstock_puma_id',
    'in.nhgis_county_gisjoin',
    'in.nhgis_puma_gisjoin',
    'in.state_name'
]

# Display 3 randomly sampled rows of the selected columns (fixed seed, so the sample is repeatable)
print(county_data[columns_to_show].sample(n=3, random_state=1))

# Show some basic statistics
print("\nBasic statistics:")
display(county_data.describe())

In [None]:
import pandas as pd

def analyze_and_generate_res_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Analyze a ResStock-style building dataset and return weighted distribution
    of simplified residential building types.

    Prints an exploratory summary, drops rows lacking a weight or building
    type, maps each raw type label to a simplified integer code and sums the
    weights per code. The input frame is not modified.

    Parameters:
    -----------
    df : pd.DataFrame
        Input DataFrame with building metadata. Must contain the columns
        "weight" and "in.geometry_building_type_acs".

    Returns:
    --------
    res_types_df : pd.DataFrame
        Columns "Building_Type" (int code: 1=SFH, 2=TH, 3=MFH, 4=AB) and
        "Stati_res_percent" (share of the summed weight, in percent). Labels
        matching no keyword are excluded before the shares are computed.
    """
    type_column = "in.geometry_building_type_acs"
    weight_column = "weight"

    # Keyword -> simplified code, checked in this order (first match wins).
    # NOTE(review): "2-4" and "5+" must literally occur in the labels of this column;
    # check them against the value counts printed below, otherwise multi-family rows
    # fall through as unmapped and are dropped.
    keyword_codes = (
        ("Detached", 1),  # SFH
        ("Attached", 2),  # TH
        ("2-4", 3),       # MFH
        ("5+", 4),        # AB
    )

    def map_building_type(bt):
        """Return the simplified code for a raw type label, or None for unused/other types."""
        label = str(bt)  # tolerate non-string labels instead of raising on `in`
        for keyword, code in keyword_codes:
            if keyword in label:
                return code
        return None

    # === Exploratory Summary ===
    print("=== Dataset Overview ===")
    df.info()  # info() prints by itself and returns None; wrapping it in print() added a stray "None"
    print("\n=== Unique Building Types (ACS Classification) ===")
    print(df[type_column].value_counts(dropna=False))
    print("\n=== Weight Column Summary ===")
    print(df[weight_column].describe())

    # === Data Cleaning ===
    df_clean = df.dropna(subset=[weight_column, type_column])

    # === Mapping Raw Types to Simplified Codes ===
    # assign() builds a new frame, avoiding SettingWithCopyWarning on the dropna() result.
    # Casting after dropna keeps the codes integer even when some labels were unmapped.
    df_typed = (
        df_clean
        .assign(Building_Type=df_clean[type_column].map(map_building_type))
        .dropna(subset=["Building_Type"])
        .astype({"Building_Type": int})
    )

    # === Weighted Share Calculation ===
    grouped = df_typed.groupby("Building_Type")[weight_column].sum().reset_index()
    total_weight = grouped[weight_column].sum()
    grouped["Stati_res_percent"] = grouped[weight_column] / total_weight * 100

    return grouped[["Building_Type", "Stati_res_percent"]]

# Run on the county loaded earlier; as the cell's last expression, the returned frame is displayed.
analyze_and_generate_res_types(county_data)

In [None]:
import pandas as pd

def analyze_and_generate_res_types(df: pd.DataFrame) -> pd.DataFrame:
    """
    Analyze a ResStock-style building dataset and return the weighted
    distribution of simplified residential building types.

    NOTE(review): the preceding notebook cell defines a function with the
    same name; when the cells run in order this definition shadows it.
    Consider keeping only one of the two.

    The function first prints an exploratory summary (frame info, value counts
    of the raw building-type column, statistics of the weight column). It then
    drops rows with a missing weight or building type, maps every raw type
    label to a simplified integer code by substring match, discards labels
    that match no rule, and computes each code's share of the summed weights.

    Simplified codes:
        1 = SFH (label contains "Detached")
        2 = TH  (label contains "Attached")
        3 = MFH (label contains "2-4")
        4 = AB  (label contains "5+")

    NOTE(review): the markers above must occur literally in the labels of
    ``in.geometry_building_type_acs``; compare them with the value counts
    printed by this function to confirm that no residential category is
    silently dropped.

    Parameters:
    -----------
    df : pd.DataFrame
        Input DataFrame. Must contain the columns ``weight`` and
        ``in.geometry_building_type_acs``. It is not modified.

    Returns:
    --------
    res_types_df : pd.DataFrame
        One row per simplified building type with the columns
        ``Building_Type`` (integer code) and ``Stati_res_percent``
        (share of the total weight, in percent).
    """
    type_col = "in.geometry_building_type_acs"

    # === Exploratory Summary ===
    print("=== Dataset Overview ===")
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()
    print("\n=== Unique Building Types (ACS Classification) ===")
    print(df[type_col].value_counts(dropna=False))
    print("\n=== Weight Column Summary ===")
    print(df["weight"].describe())

    # === Data Cleaning ===
    df_clean = df.dropna(subset=["weight", type_col])

    # === Mapping Raw Types to Simplified Codes ===
    def map_building_type(bt: str) -> "int | None":
        if "Detached" in bt:
            return 1  # SFH
        elif "Attached" in bt:
            return 2  # TH
        elif "2-4" in bt:
            return 3  # MFH
        elif "5+" in bt:
            return 4  # AB
        else:
            return None  # Unused/Other types

    # assign() builds a new frame instead of writing into the dropna() result,
    # which avoids pandas' SettingWithCopyWarning. Unmapped labels become NaN
    # and turn the code column into floats, so the codes are cast back to int
    # once those rows have been removed.
    df_typed = (
        df_clean
        .assign(Building_Type=df_clean[type_col].apply(map_building_type))
        .dropna(subset=["Building_Type"])
        .astype({"Building_Type": int})
    )

    # === Weighted Share Calculation ===
    grouped = df_typed.groupby("Building_Type")["weight"].sum().reset_index()
    total_weight = grouped["weight"].sum()
    grouped["Stati_res_percent"] = grouped["weight"] / total_weight * 100

    return grouped[["Building_Type", "Stati_res_percent"]]

# Run the analysis on `county_data`; as the last expression of the cell, the
# returned summary DataFrame is rendered as the cell output.
# NOTE(review): the preceding cell already makes this exact call.
analyze_and_generate_res_types(county_data)

In [None]:
# Visualize the buildings assigned to a single census block together with
# that block's boundary on an OpenStreetMap basemap.
import contextily as ctx

# GEOID20 of the census block to inspect. Defined once so that the boundary
# filter, the building filter and the plot title always refer to the same block.
TARGET_BLOCK_GEOID = '250173547001005'

census_block_path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/CENSUS/target_region_blocks.geojson"
census_block_gdf = gpd.read_file(census_block_path)

# Filter census block by GEOID20
census_block_boundary = census_block_gdf[census_block_gdf['GEOID20'] == TARGET_BLOCK_GEOID]

# Load buildings with ids
buildings_with_ids_path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/06_residential_buildings_with_building_type.geojson"
buildings_with_ids_gdf = gpd.read_file(buildings_with_ids_path)
# This is the size of the full file, before any per-block filtering.
print(f"Total number of buildings loaded: {len(buildings_with_ids_gdf)}")

# Filter buildings by census block ID
buildings_in_block = buildings_with_ids_gdf[buildings_with_ids_gdf['census_block_id'] == TARGET_BLOCK_GEOID]
print(f"Number of buildings in the census block: {len(buildings_in_block)}")

# Convert to Web Mercator for contextily
census_block_mercator = census_block_boundary.to_crs(epsg=3857)
buildings_mercator = buildings_in_block.to_crs(epsg=3857)

# Plot
fig, ax = plt.subplots(figsize=(12, 10))

# Plot census block boundary in blue
census_block_mercator.plot(ax=ax, color='blue', alpha=0.3, edgecolor='blue', linewidth=3)

# Plot buildings in red
buildings_mercator.plot(ax=ax, color='red', alpha=0.8)

# Add basemap
ctx.add_basemap(ax, crs=census_block_mercator.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

# The title is derived from the filter value so it cannot name a different block.
ax.set_title(f'Buildings in Census Block {TARGET_BLOCK_GEOID}')
plt.show()

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx

# Overview map: every building listed in the unassigned-buildings file drawn
# on top of all census blocks of the target region.

# Read unassigned buildings
unassigned_buildings_path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/unassigned_buildings.geojson"
unassigned_buildings_gdf = gpd.read_file(unassigned_buildings_path)

# Read census blocks
census_block_path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/CENSUS/target_region_blocks.geojson"
census_blocks_gdf = gpd.read_file(census_block_path)

# Convert to Web Mercator (EPSG:3857) for contextily
unassigned_buildings_mercator = unassigned_buildings_gdf.to_crs(epsg=3857)
census_blocks_mercator = census_blocks_gdf.to_crs(epsg=3857)

# Plot
fig, ax = plt.subplots(figsize=(15, 12))

# Plot census blocks in semi-transparent blue (alpha=0.3) underneath
census_blocks_mercator.plot(ax=ax, color='blue', alpha=0.3, edgecolor='darkblue', linewidth=0.5)

# Plot unassigned buildings in red on top (drawn after the blocks)
unassigned_buildings_mercator.plot(ax=ax, color='red', alpha=0.8, markersize=20)

# Add basemap, using the CRS of the reprojected census blocks
ctx.add_basemap(ax, crs=census_blocks_mercator.crs.to_string(), source=ctx.providers.OpenStreetMap.Mapnik)

ax.set_title(f'Unassigned Buildings ({len(unassigned_buildings_gdf)}) and Census Blocks', fontsize=14)
plt.show()

# Report the sizes of both inputs as plain text below the figure
print(f"Total unassigned buildings: {len(unassigned_buildings_gdf)}")
print(f"Total census blocks: {len(census_blocks_gdf)}")