In [11]:
import sys
import os

# Resolve the directory three levels above the notebook's working directory and
# make it the first import location, unless it is already on the import path.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from syngrid.data_processor.processing.building_processor import BuildingHeuristicsProcessor
import geopandas as gpd

In [12]:
# Load the building features stored in the Cambridge (MA) OSM output folder.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that exact directory exists; consider building it from a configurable base dir.

buildings_filepath = '/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/OSM/buildings.geojson'
gdf = gpd.read_file(buildings_filepath)

In [None]:
def count_geometry_types(gdf):
    """
    Tally the features of a GeoDataFrame by geometry type.

    Prints one "Found <count> features of type <type>" line per geometry type
    and returns the tally as a dict mapping type name to count. For a missing
    (None) or zero-length input nothing is printed and the sentinel mapping
    {"Empty GeoDataFrame": 0} is returned instead.
    """
    # Nothing to count: hand back the sentinel mapping.
    if gdf is None or len(gdf) == 0:
        return {"Empty GeoDataFrame": 0}

    type_counts = gdf.geometry.geom_type.value_counts().to_dict()

    # Report every geometry type together with its number of features.
    for geom_type in type_counts:
        print(f"Found {type_counts[geom_type]} features of type {geom_type}")

    return type_counts

import matplotlib.pyplot as plt
import contextily as cx
import geopandas as gpd
from shapely.geometry import Point, Polygon
import numpy as np

def _count_points_within_polygons(points, polygons):
    """Return how many point features lie inside at least one polygon feature."""
    # The polygons' spatial index narrows every lookup to candidate polygons and
    # then applies the exact "within" test (point within polygon, i.e. polygon
    # contains point), instead of testing every point against every polygon.
    polygon_index = polygons.sindex
    return sum(
        1
        for point in points.geometry
        if len(polygon_index.query(point, predicate="within")) > 0
    )


def plot_mixed_geometries(gdf, title="Points and Polygons", figsize=(12, 10),
                          point_color='red', polygon_color='blue', alpha=0.6,
                          basemap=True, point_size=50):
    """
    Plot points and polygons from a GeoDataFrame in different colors on a map.

    Only 'Point' and 'Polygon'/'MultiPolygon' geometries are drawn; other
    geometry types are ignored. When both points and polygons are present, a
    footer states how many points fall inside at least one polygon.

    Parameters:
    -----------
    gdf : GeoDataFrame
        Input GeoDataFrame with mixed geometry types
    title : str
        Plot title
    figsize : tuple
        Figure size as (width, height)
    point_color : str
        Color for point geometries
    polygon_color : str
        Color for polygon geometries
    alpha : float
        Transparency level (0-1)
    basemap : bool
        Whether to add a contextily basemap. Ignored (with a printed notice)
        when the GeoDataFrame has no CRS, because the data cannot be
        reprojected to Web Mercator in that case.
    point_size : int
        Size of point markers

    Returns:
    --------
    fig, ax : matplotlib figure and axis objects, or (None, None) when the
        input is None or has no rows
    """
    # Ensure we have a valid GeoDataFrame
    if gdf is None or len(gdf) == 0:
        print("Empty GeoDataFrame provided")
        return None, None

    # Imported here because they are only needed for the legend proxy artists.
    from matplotlib.lines import Line2D
    from matplotlib.patches import Patch

    # Without a CRS the data cannot be reprojected for the tile basemap
    # (to_crs would raise), so fall back to a plain plot.
    if basemap and gdf.crs is None:
        print("GeoDataFrame has no CRS; plotting without basemap")
        basemap = False

    # contextily tiles are in Web Mercator, so reproject when a basemap is drawn
    if basemap and gdf.crs != "EPSG:3857":
        gdf_web_mercator = gdf.to_crs("EPSG:3857")
    else:
        gdf_web_mercator = gdf

    # Separate points and polygons (the boolean selections create new frames)
    geom_types = gdf_web_mercator.geometry.geom_type
    points = gdf_web_mercator[geom_types == 'Point']
    polygons = gdf_web_mercator[geom_types.isin(['Polygon', 'MultiPolygon'])]

    # Create figure and axis
    fig, ax = plt.subplots(figsize=figsize)

    # Polygons are drawn by geopandas as a patch collection, which matplotlib's
    # automatic legend cannot build an entry for, so the legend is assembled
    # from explicit proxy artists instead of `label=` keywords.
    legend_handles = []
    if not polygons.empty:
        polygons.plot(ax=ax, color=polygon_color, alpha=alpha)
        legend_handles.append(
            Patch(facecolor=polygon_color, alpha=alpha,
                  label=f'Polygons ({len(polygons)})')
        )

    if not points.empty:
        points.plot(ax=ax, color=point_color, markersize=point_size, alpha=alpha)
        legend_handles.append(
            Line2D([], [], linestyle='none', marker='o', color=point_color,
                   alpha=alpha, label=f'Points ({len(points)})')
        )

    # Add a basemap if requested; a tile download failure should not lose the plot
    if basemap:
        try:
            cx.add_basemap(ax, source=cx.providers.OpenStreetMap.Mapnik)
        except Exception as e:
            print(f"Could not add basemap: {e}. Plotting without it.")

    # Set plot title and add legend (only when something was actually drawn)
    ax.set_title(title, fontsize=15)
    if legend_handles:
        ax.legend(handles=legend_handles, fontsize=12)

    # Remove axis labels for map
    ax.set_axis_off()

    plt.tight_layout()

    # Add text showing how many points are inside polygons
    if not points.empty and not polygons.empty:
        points_inside = _count_points_within_polygons(points, polygons)
        percent_inside = points_inside / len(points) * 100
        fig.text(0.5, 0.01,
                 f"{points_inside} of {len(points)} points ({percent_inside:.1f}%) are inside polygons",
                 ha="center", fontsize=12, bbox={"facecolor":"white", "alpha":0.7, "pad":5})

    return fig, ax

# Example usage: map the loaded features, colouring points and polygons differently.
fig, ax = plot_mixed_geometries(gdf, title="Buildings and POIs")
plt.show()

# Prints one line per geometry type and returns the tally as a dict.
count_geometry_types(gdf)


In [None]:
# Preview the first 20 rows of the loaded features.
gdf.head(20)

In [None]:
# Run the project's BuildingHeuristicsProcessor.add_floor_area on the loaded features
# (presumably returns the features with floor-area information added — confirm
# against the processor's implementation).
# NOTE(review): output_dir is an absolute, machine-specific path.
# NOTE(review): `buildings` is rebound by later cells (osm.get_buildings() and a
# gpd.read_file call), so its content depends on which cell ran last.
BHP = BuildingHeuristicsProcessor(
    output_dir="/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/OSM/"
)
buildings = BHP.add_floor_area(gdf)


In [None]:
# Preview the first 40 rows of the add_floor_area result.
buildings.head(40)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

def _report_null_values(df_analysis, figsize):
    """Tabulate and chart the NULL count/percentage of every column that contains NULLs."""
    print("\n--- NULL VALUES ANALYSIS ---")
    null_counts = df_analysis.isnull().sum().sort_values(ascending=False)
    null_percent = (null_counts / len(df_analysis) * 100).round(2)
    null_df = pd.DataFrame({'Count': null_counts, 'Percent': null_percent})

    # Only show columns with nulls
    null_df = null_df[null_df['Count'] > 0]
    if null_df.empty:
        print("No NULL values found in any column!")
        return

    display(null_df)

    # One bar per column: grow the figure with the number of bars (at least 8 inches tall)
    plot_height = max(8, len(null_df) * 0.4)
    fig, ax = plt.subplots(figsize=(figsize[0], plot_height))
    bars = ax.barh(null_df.index, null_df['Percent'], color='skyblue')

    # Add percentage labels to the right of each bar
    for bar in bars:
        width = bar.get_width()
        ax.text(width + 1, bar.get_y() + bar.get_height() / 2,
                f'{width:.1f}%',
                va='center', fontweight='bold')

    ax.set_title('Percentage of NULL Values by Column', fontsize=14)
    ax.set_xlabel('Percent (%)', fontsize=12)
    ax.set_xlim(0, min(100, null_df['Percent'].max() * 1.15))
    ax.grid(axis='x', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()


def _summarize_categorical_columns(df_analysis, max_categories, figsize):
    """Print/plot value distributions of object and category columns; return the per-column summary dict."""
    categorical_cols = df_analysis.select_dtypes(include=['object', 'category']).columns
    print(f"\n--- CATEGORICAL COLUMNS ANALYSIS ({len(categorical_cols)} columns) ---")

    results = {}
    total_rows = len(df_analysis)
    for col in categorical_cols:
        column = df_analysis[col]

        # All-null columns have nothing to display; record them with the same
        # keys as every other column so callers can read the result uniformly
        # ('distribution' is kept for backward compatibility).
        if column.isnull().all():
            results[col] = {
                'values': None,
                'distribution': None,
                'non_null_percent': 0.0,
                'unique_values': 0,
            }
            continue

        # Value counts, with each value's share of ALL rows (nulls included)
        val_counts = column.value_counts().reset_index()
        val_counts.columns = [col, 'Count']
        val_counts['Percent'] = (val_counts['Count'] / total_rows * 100).round(2)

        # Fill rate = share of rows with a non-null value
        filled = column.count()
        non_null_percent = round(filled / total_rows * 100, 2)
        unique_values = column.nunique()

        results[col] = {
            'values': val_counts,
            'non_null_percent': non_null_percent,
            'unique_values': unique_values
        }

        print(f"\nColumn: {col}")
        print(f"Fill rate: {non_null_percent}% ({filled} / {total_rows})")
        print(f"Unique values: {unique_values}")

        if unique_values < 50:
            # Table is limited to the top categories
            display(val_counts.head(max_categories))

            # A bar chart is only readable when every category fits
            if unique_values <= max_categories:
                fig, ax = plt.subplots(figsize=(figsize[0] // 2, figsize[1] // 3))
                sns.barplot(x='Count', y=col, data=val_counts, ax=ax)
                ax.set_title(f'Distribution of {col}')
                fig.tight_layout()
                plt.show()
        else:
            print(f"Too many unique values ({unique_values}) to display")

    return results


def _report_numeric_columns(df_analysis, figsize):
    """Show summary statistics for numeric columns and histograms of the first three."""
    # 'number' covers every numeric dtype (int32, float32, nullable Int64, ...),
    # not just int64/float64.
    numeric_cols = df_analysis.select_dtypes(include='number').columns
    print(f"\n--- NUMERIC COLUMNS ANALYSIS ({len(numeric_cols)} columns) ---")
    if len(numeric_cols) == 0:
        return

    stats = df_analysis[numeric_cols].describe().T
    stats['fill_rate'] = (df_analysis[numeric_cols].count() / len(df_analysis) * 100).round(2)
    display(stats)

    # Limit to 3 plots; squeeze=False always yields a 2-D axes array, so a
    # single subplot needs no special-casing.
    plotted_cols = numeric_cols[:3]
    fig, axes = plt.subplots(nrows=len(plotted_cols), ncols=1, figsize=figsize, squeeze=False)
    for ax, col in zip(axes[:, 0], plotted_cols):
        try:
            sns.histplot(df_analysis[col].dropna(), kde=True, ax=ax)
            ax.set_title(f'Distribution of {col}')
        except Exception as exc:
            # Best effort: one unplottable column must not abort the report,
            # but say why it was skipped.
            print(f"Could not plot {col}: {exc}")

    fig.tight_layout()
    plt.show()


def _report_osm_tags(df_analysis, figsize):
    """Rank the columns whose name contains ':' by how many rows carry a value."""
    # Non-string column labels cannot contain ':' and would break the `in` test
    osm_tags = [col for col in df_analysis.columns if isinstance(col, str) and ':' in col]
    if not osm_tags:
        return

    print(f"\n--- OSM TAGS ANALYSIS ({len(osm_tags)} tags) ---")
    tag_counts = pd.Series({tag: df_analysis[tag].count() for tag in osm_tags})
    tag_percent = (tag_counts / len(df_analysis) * 100).round(2)
    tag_df = pd.DataFrame({'Count': tag_counts, 'Percent': tag_percent}).sort_values('Count', ascending=False)

    display(tag_df.head(20))

    # Plot top 15 tags
    top_tags = tag_df.head(15)
    fig, ax = plt.subplots(figsize=(figsize[0], figsize[1] // 2))
    sns.barplot(x='Percent', y=top_tags.index, data=top_tags, ax=ax)
    ax.set_title('Top 15 OSM Tags by Presence')
    ax.set_xlabel('Percent (%)')
    fig.tight_layout()
    plt.show()


def _report_geometry_types(gdf, figsize):
    """Tabulate the geometry types of the original GeoDataFrame and draw them as a pie chart."""
    print("\n--- GEOMETRY TYPE ANALYSIS ---")
    geom_counts = gdf.geometry.geom_type.value_counts()
    geom_percent = (geom_counts / len(gdf) * 100).round(2)
    geom_df = pd.DataFrame({'Count': geom_counts, 'Percent': geom_percent})
    display(geom_df)

    fig, ax = plt.subplots(figsize=(figsize[0] // 2, figsize[1] // 3))
    ax.pie(geom_df['Count'], labels=geom_df.index, autopct='%1.1f%%')
    ax.set_title('Geometry Types')
    ax.axis('equal')
    fig.tight_layout()
    plt.show()


def _report_building_types(df_analysis, max_categories, figsize):
    """Tabulate (and, if few enough, plot) the values of the 'building' column when present."""
    if 'building' not in df_analysis.columns:
        return

    print("\n--- BUILDING TYPE ANALYSIS ---")
    bldg_counts = df_analysis['building'].value_counts().reset_index()
    bldg_counts.columns = ['building', 'Count']
    # Unlike the categorical section, the share here is of NON-NULL rows
    bldg_counts['Percent'] = (bldg_counts['Count'] / df_analysis['building'].count() * 100).round(2)

    display(bldg_counts.head(max_categories))

    if len(bldg_counts) <= max_categories:
        fig, ax = plt.subplots(figsize=(figsize[0], figsize[1] // 2))
        sns.barplot(x='Count', y='building', data=bldg_counts, ax=ax)
        ax.set_title('Building Types')
        fig.tight_layout()
        plt.show()


def explore_geodataframe(gdf, exclude_cols=None, max_categories=10, figsize=(18, 12)):
    """
    Comprehensive exploration of a GeoDataFrame, analyzing categorical distributions and null values.

    Runs six report sections in order, each printing/displaying its findings:
    NULL values, categorical columns, numeric columns, columns whose name
    contains ':' (OSM tags), geometry types, and the 'building' column.

    Parameters:
    -----------
    gdf : GeoDataFrame
        The GeoDataFrame to analyze
    exclude_cols : list
        List of columns to exclude from analysis (e.g., ['geometry', 'id', 'element']).
        Defaults to ['geometry', 'id', 'element', 'index']; names that are not
        columns of gdf are ignored.
    max_categories : int
        Maximum number of categories to display in distribution plots
    figsize : tuple
        Figure size for plots

    Returns:
    --------
    dict : One entry per categorical (object/category) column, each a dict with
        'values' (value-count table, or None when the column is entirely
        null), 'non_null_percent' and 'unique_values'. Empty when gdf is None
        or has no rows.
    """
    # Same guard as the other helpers in this notebook: percentages are
    # undefined for zero rows and None has no columns to inspect.
    if gdf is None or len(gdf) == 0:
        print("Empty GeoDataFrame provided")
        return {}

    if exclude_cols is None:
        exclude_cols = ['geometry', 'id', 'element', 'index']

    # Work on a copy without the excluded columns (only those that exist)
    dropped_cols = [col for col in exclude_cols if col in gdf.columns]
    df_analysis = gdf.drop(columns=dropped_cols)

    # Report the number of columns actually removed, not the number requested
    print(f"Dataset has {len(gdf)} rows and {len(df_analysis.columns)} columns (excluding {len(dropped_cols)} specified columns)")

    _report_null_values(df_analysis, figsize)
    results = _summarize_categorical_columns(df_analysis, max_categories, figsize)
    _report_numeric_columns(df_analysis, figsize)
    _report_osm_tags(df_analysis, figsize)
    # Geometry types come from the original frame: 'geometry' is normally excluded above
    _report_geometry_types(gdf, figsize)
    _report_building_types(df_analysis, max_categories, figsize)

    return results

# Example usage: profile the loaded features, leaving the 'geometry', 'id' and
# 'element' columns out of the column-level statistics.
analysis_results = explore_geodataframe(gdf, exclude_cols=['geometry', 'id', 'element'])


In [None]:
# Keep the features that have an amenity value and whose building value is 'yes'.
has_amenity = gdf['amenity'].notna()
is_building_yes = gdf['building'] == 'yes'
gdf_filtered = gdf[has_amenity & is_building_yes].copy()

# Show only those tag columns of interest that are present in the filtered frame.
cols_to_show_base = ['amenity', 'building', 'shop', 'office']
cols_to_show_existing = [
    name for name in cols_to_show_base if name in gdf_filtered.columns
]
print(gdf_filtered[cols_to_show_existing])



In [1]:
from pyrosm import OSM
import geopandas as gpd
from pyrosm import get_data
# `fp` is handed to the OSM parser in the next cell (presumably the local path
# of the Massachusetts extract fetched by pyrosm — confirm against the pyrosm docs).
fp = get_data("Massachusetts")
# NOTE(review): absolute, machine-specific path to the census-block boundary file.
bounding_box = gpd.read_file("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/Census/25_017_11000_blocks_boundary.geojson")
# Only the first geometry of the boundary file is used as the bounding polygon.
polygon = bounding_box.geometry.iloc[0]

In [None]:
# Initialize the OSM parser object, restricted to the boundary polygon
import time 
# NOTE(review): start_time is recorded but never read in the visible cells.
start_time = time.time()
osm = OSM(fp,bounding_box=polygon)
# NOTE(review): rebinds `buildings`, which an earlier cell assigned from
# BHP.add_floor_area(gdf).
buildings = osm.get_buildings()
buildings.plot()

In [None]:
# Colour the buildings by their `building` value; the three-column legend is
# anchored with its upper-left corner at axes position (1, 1).
ax = buildings.plot(column="building", figsize=(12,12), legend=True, legend_kwds=dict(loc='upper left', ncol=3, bbox_to_anchor=(1, 1)))


In [None]:
# Request POIs filtered on the `amenity` key
# (presumably True means "any value of that key" — confirm against the pyrosm docs).
custom_filter = {'amenity': True}
pois = osm.get_pois(custom_filter=custom_filter)

# POI type is taken from the "amenity" column alone; "shop" is neither part of
# the filter above nor merged in here.
pois["poi_type"] = pois["amenity"]

# Plot the POIs coloured by type, with a five-column legend anchored at axes position (1, 1)
ax = pois.plot(column='poi_type', markersize=3, figsize=(12,12), legend=True, legend_kwds=dict(loc='upper left', ncol=5, bbox_to_anchor=(1, 1)))

In [None]:
# Land-use features from the parser, coloured by their `landuse` value.
landuse = osm.get_landuse()
landuse.plot(column='landuse', legend=True, figsize=(10,6))

In [None]:
# Network features requested with network_type="all", drawn with default styling.
network = osm.get_network(network_type="all")
network.plot()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import contextily as cx
import geopandas as gpd
from shapely.errors import GEOSException # Import for handling potential geometry errors

# NOTE(review): rebinds `buildings` (previously assigned from osm.get_buildings())
# and reads from an absolute, machine-specific path.
buildings = gpd.read_file("/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/buildings_with_floor_area.geojson")
def plot_buildings_with_floor_area(
    gdf: gpd.GeoDataFrame,
    floor_area_column: str = 'floor_area',
    title: str = "Building Footprints with Floor Area",
    figsize: tuple = (15, 15),
    polygon_color: str = 'blue',
    alpha: float = 0.5,
    basemap_provider: cx.providers = cx.providers.OpenStreetMap.Mapnik,
    label_fontsize: int = 8,
    label_color: str = 'black',
    label_background_alpha: float = 0.7,
    label_bbox_pad: float = 0.2
) -> None:
    """
    Draw building geometries over a contextily basemap and annotate each one
    with the value of its floor area column.

    The input is validated first; on the first failed check a message is
    printed and nothing is plotted. Geometries are reprojected to EPSG:3857
    before drawing. A basemap failure is reported and the plot continues
    without tiles; a labeling failure is reported per feature and that
    feature's label is skipped.

    Parameters:
    -----------
    gdf : GeoDataFrame
        Building geometries plus a floor area column; must be non-empty and
        have a CRS.
    floor_area_column : str, optional
        Column holding the value shown in each label. Defaults to 'floor_area'.
    title : str, optional
        Plot title. Defaults to "Building Footprints with Floor Area".
    figsize : tuple, optional
        Figure size as (width, height). Defaults to (15, 15).
    polygon_color : str, optional
        Fill color of the geometries. Defaults to 'blue'.
    alpha : float, optional
        Fill transparency (0-1). Defaults to 0.5.
    basemap_provider : contextily.providers object, optional
        Tile source passed to contextily.
        Defaults to cx.providers.OpenStreetMap.Mapnik.
    label_fontsize : int, optional
        Font size of the labels. Defaults to 8.
    label_color : str, optional
        Text color of the labels. Defaults to 'black'.
    label_background_alpha : float, optional
        Transparency of the white label box (0-1). Defaults to 0.7.
    label_bbox_pad : float, optional
        Padding of the label box. Defaults to 0.2.

    Returns:
    --------
    None
        The figure is shown with plt.show().
    """
    # Input validation: the first check that fails decides the message.
    problem = None
    if not isinstance(gdf, gpd.GeoDataFrame):
        problem = "Input must be a GeoDataFrame."
    elif gdf.empty:
        problem = "GeoDataFrame is empty. Nothing to plot."
    elif floor_area_column not in gdf.columns:
        problem = f"Column '{floor_area_column}' not found in GeoDataFrame."
    elif gdf.crs is None:
        problem = "GeoDataFrame must have a Coordinate Reference System (CRS) defined."
    if problem is not None:
        print(problem)
        return

    # contextily works in Web Mercator
    projected = gdf.to_crs(epsg=3857)

    _, ax = plt.subplots(figsize=figsize)

    # Footprints with a thin black outline
    projected.plot(
        ax=ax,
        color=polygon_color,
        alpha=alpha,
        edgecolor='black',
        linewidth=0.5
    )

    # Tiles are optional: report a failure and carry on
    try:
        cx.add_basemap(ax, source=basemap_provider, zoom='auto')
    except Exception as e:
        print(f"Could not add basemap: {e}. Plotting without it.")

    # One text label per feature
    for idx, row in projected.iterrows():
        try:
            # Prefer the representative point; fall back to the centroid only
            # when the representative point comes back empty.
            anchor = row.geometry.representative_point()
            if anchor.is_empty:
                anchor = row.geometry.centroid

            # No usable anchor at all: leave this feature unlabeled
            if anchor.is_empty:
                continue

            floor_area = row[floor_area_column]
            if pd.notnull(floor_area):
                label = f"{floor_area:,.0f} m²"
            else:
                label = "N/A"

            # White rounded box without an outline behind the text
            label_box = dict(
                boxstyle='round,pad=' + str(label_bbox_pad),
                fc='white',
                alpha=label_background_alpha,
                ec='none'
            )
            ax.text(
                anchor.x,
                anchor.y,
                label,
                fontsize=label_fontsize,
                color=label_color,
                ha='center',
                va='center',
                bbox=label_box
            )
        except GEOSException:
            print(f"Warning: Could not generate a representative point for geometry at index {idx}. Skipping label.")
        except Exception as e:
            print(f"An unexpected error occurred while labeling geometry at index {idx}: {e}")

    ax.set_title(title, fontsize=18)
    ax.set_axis_off()
    plt.tight_layout()
    plt.show()


# Randomly sample up to 100 rows from `buildings` (fixed seed, so the same rows
# are drawn on every run). The size is capped at len(buildings) because
# DataFrame.sample raises a ValueError when asked for more rows than exist
# (sampling is without replacement by default).
samples = buildings.sample(n=min(100, len(buildings)), random_state=1)

plot_buildings_with_floor_area(samples)

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx

def plot_buildings(
    geojson_path: str,
    title: str = "Building Footprints",
    figsize: tuple = (12, 12),
    polygon_color: str = 'blue',
    alpha: float = 0.6,
    basemap_provider=ctx.providers.OpenStreetMap.Mapnik
) -> None:
    """
    Reads a GeoJSON file and plots building footprints on a web map using Contextily.

    Nothing is plotted (a message is printed instead) when the file cannot be
    read, contains no features, or has no CRS. A basemap failure is reported
    and the footprints are shown without tiles.

    Parameters:
    -----------
    geojson_path : str
        Path to the GeoJSON file containing building polygons.
    title : str, optional
        Plot title. Defaults to "Building Footprints".
    figsize : tuple, optional
        Figure size as (width, height). Defaults to (12, 12).
    polygon_color : str, optional
        Color for building polygons. Defaults to 'blue'.
    alpha : float, optional
        Polygon transparency. Defaults to 0.6.
    basemap_provider : contextily provider, optional
        Tile provider. Defaults to OpenStreetMap.Mapnik.

    Returns:
    --------
    None
    """

    # Step 1: Load data
    try:
        gdf = gpd.read_file(geojson_path)
    except Exception as e:
        print(f"Error reading GeoJSON: {e}")
        return

    # Step 2: Validate before reprojecting. to_crs raises on data without a
    # CRS, and an empty frame has nothing to draw.
    if gdf.empty:
        print("GeoDataFrame is empty. Nothing to plot.")
        return
    if gdf.crs is None:
        print("GeoDataFrame must have a Coordinate Reference System (CRS) defined.")
        return

    # Step 3: Reproject to Web Mercator, the projection of the basemap tiles
    gdf_web_mercator = gdf.to_crs(epsg=3857)

    # Step 4: Plot footprints with a thin black outline
    fig, ax = plt.subplots(figsize=figsize)
    gdf_web_mercator.plot(
        ax=ax,
        color=polygon_color,
        alpha=alpha,
        edgecolor='black',
        linewidth=0.5
    )

    # Step 5: Basemap is best effort (needs network access to the tile server)
    try:
        ctx.add_basemap(ax, source=basemap_provider, crs=gdf_web_mercator.crs)
    except Exception as e:
        print(f"Could not add basemap: {e}. Plotting without basemap.")

    ax.set_title(title, fontsize=16)
    ax.set_axis_off()
    plt.tight_layout()
    plt.show()

# NOTE(review): absolute, machine-specific path to the Microsoft building
# footprints file for Cambridge; `path` is reused by the folium cell below.
path = "/Users/magic-rabbit/Documents/00_Tech-Repositories/05_MASTER_THESIS/SynGrid/syngrid/data_processor/output/MA/Middlesex_County/Cambridge_city/MICROSOFT_BUILDINGS/microsoft_buildings_25_017_11000.geojson"

plot_buildings(path)

In [None]:
import folium

# NOTE(review): rebinds `gdf` — from here on it holds the features read from
# `path`, not the OSM buildings loaded in the first cells.
gdf = gpd.read_file(path)
# Centre the map on the centroid of the union of all geometries.
# NOTE(review): `unary_union` is deprecated in recent GeoPandas releases in
# favour of `union_all()` — confirm against the installed version.
centroid = gdf.unary_union.centroid
# Location is given as [y, x]; assumes the data is in lon/lat degrees — TODO confirm CRS.
m = folium.Map(location=[centroid.y, centroid.x], zoom_start=12)
folium.GeoJson(gdf).add_to(m)
# Writes the interactive map to debug_map.html (relative to the working directory).
m.save("debug_map.html")