Dependencies

In [None]:
!pip install geopandas
!pip install shapely



Un/zip Files

In [None]:
# Extract the downloaded archive into the working directory.
# NOTE(review): the recorded output of this cell shows unzip rejecting the file
# ("End-of-central-directory signature not found") — confirm the download is a
# real zip archive before relying on this step.
!unzip mean-sea-level-pressure-2020.zip

Archive:  mean-sea-level-pressure-2020.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of mean-sea-level-pressure-2020.zip or
        mean-sea-level-pressure-2020.zip.zip, and cannot find mean-sea-level-pressure-2020.zip.ZIP, period.


In [None]:
# Bundle the merged output directory into a single archive.
# NOTE(review): no earlier cell creates /content/merged; the only code that writes
# into a `merged/` folder is the final loop of this notebook, so presumably this
# cell is meant to run after it — confirm the intended order and path.
!zip -r /content/merged.zip /content/merged

Processing ERA5 to create polygons with attributes

In [None]:
import xarray as xr

# NetCDF file to read, resolved relative to the current working directory
filename = 'data_stream-oper.nc'

# Load the NetCDF dataset
ds = xr.open_dataset(filename)
# Bare last expression so the notebook renders the dataset summary
ds

In [None]:
# Disabled squeeze, kept for reference.
# NOTE(review): the dataset summary printed by this cell lists `msl` under
# "Data variables", not as a dimension, so this squeeze looks inapplicable — confirm.
#ds = ds.squeeze(dim='msl')

# Round latitude and longitude to the nearest degree
ds = ds.assign_coords(
    lat_rounded=ds.latitude.round(0),
    lon_rounded=ds.longitude.round(0)
)
# Group by the rounded coordinates and average each data variable per group
# (the variable here is `msl`, not temperature)
ds_grouped = ds.groupby(['lat_rounded', 'lon_rounded']).mean()
print(ds_grouped)

<xarray.Dataset> Size: 708B
Dimensions:      (lat_rounded: 5, lon_rounded: 5, valid_time: 5)
Coordinates:
  * lat_rounded  (lat_rounded) float64 40B 0.0 1.0 2.0 3.0 4.0
  * lon_rounded  (lon_rounded) float64 40B -120.0 -119.0 -118.0 -117.0 -116.0
    number       int64 8B 0
  * valid_time   (valid_time) datetime64[ns] 40B 2020-05-01 ... 2020-05-02
    expver       (valid_time) <U4 80B '0001' '0001' '0001' '0001' '0001'
Data variables:
    msl          (valid_time, lat_rounded, lon_rounded) float32 500B 1.009e+0...
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2024-11-19T23:30 GRIB to CDM+CF via cfgrib-0.9.1...


In [None]:
# Flatten the grouped dataset into a table, turning the index levels into ordinary columns
df_grouped = ds_grouped.to_dataframe()
df_grouped = df_grouped.reset_index()

# Preview the first ten rows
print(df_grouped.head(10))

   lat_rounded  lon_rounded          valid_time            msl  number expver
0          0.0       -120.0 2020-05-01 00:00:00  100877.593750       0   0001
1          0.0       -120.0 2020-05-01 06:00:00  101223.148438       0   0001
2          0.0       -120.0 2020-05-01 12:00:00  100960.304688       0   0001
3          0.0       -120.0 2020-05-01 18:00:00  101278.570312       0   0001
4          0.0       -120.0 2020-05-02 00:00:00  100917.718750       0   0001
5          0.0       -120.0 2020-05-02 06:00:00  101290.343750       0   0001
6          0.0       -120.0 2020-05-02 12:00:00  101031.335938       0   0001
7          0.0       -120.0 2020-05-02 18:00:00  101348.242188       0   0001
8          0.0       -120.0 2020-05-03 00:00:00  100988.062500       0   0001
9          0.0       -120.0 2020-05-03 06:00:00  101294.656250       0   0001


In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
import numpy as np
from datetime import datetime
from typing import Union, List, Optional, Dict

def create_grid_cell_polygon(longitude: float, latitude: float, cell_size: float = 1.0) -> Polygon:
    """
    Build the square cell of side `cell_size` whose centre is (longitude, latitude).

    Parameters:
        longitude (float): Longitude of the cell centre
        latitude (float): Latitude of the cell centre
        cell_size (float): Side length of the cell, in degrees

    Returns:
        Polygon: Closed square ring around the centre point
    """
    half_size = cell_size / 2
    west, east = longitude - half_size, longitude + half_size
    south, north = latitude - half_size, latitude + half_size

    # Ring order: bottom left -> top left -> top right -> bottom right -> bottom left
    ring = [
        (west, south),
        (west, north),
        (east, north),
        (east, south),
        (west, south),
    ]
    return Polygon(ring)

def process_era5_grid(
    era5_data: pd.DataFrame,
    time_column: str = 'valid_time',
    lat_column: str = 'latitude',
    lon_column: str = 'longitude',
    value_column: str = 'msl',
    grid_cell_size: float = 1.0,
    simplify_tolerance: Optional[float] = 0.01,
    round_decimals: Optional[int] = 1,
    output_filename_pattern: str = 'msl_{}.geojson',
    crs: str = "EPSG:4326"
)  -> Dict[datetime, Dict]:
    """
    Convert ERA5 gridded data into one GeoJSON-like mapping per timestamp.

    For every unique timestamp, each unique (latitude, longitude) pair becomes a
    square polygon carrying the first value recorded for that pair. Pairs whose
    first value is missing are skipped, as are rows with a missing coordinate.
    The input DataFrame is not modified.

    Parameters:
        era5_data (pd.DataFrame): ERA5 data with columns for time, lat, lon, and values
        time_column (str): Name of timestamp column (parsed with pd.to_datetime if needed)
        lat_column (str): Name of latitude column
        lon_column (str): Name of longitude column
        value_column (str): Name of data value column (e.g., temperature, pressure)
        grid_cell_size (float): Size of each grid cell in degrees
        simplify_tolerance (float): Tolerance for simplifying polygon geometries;
            None disables simplification
        round_decimals (int): Number of decimal places to round values; None disables rounding
        output_filename_pattern (str): Currently unused — this function writes no files.
            Retained so existing callers keep working.
        crs (str): Coordinate reference system

    Returns:
        Dict: Maps each unique timestamp (in ascending order) to the
        `__geo_interface__` mapping of that timestamp's GeoDataFrame
    """
    geojson_results = {}

    # Parse timestamps into a separate Series so the caller's DataFrame stays untouched
    timestamps = era5_data[time_column]
    if not pd.api.types.is_datetime64_any_dtype(timestamps):
        timestamps = pd.to_datetime(timestamps)

    unique_timestamps = sorted(timestamps.unique())
    cell_columns = [lat_column, lon_column, value_column]

    for timestamp in unique_timestamps:
        print(f"Processing data for {timestamp}...")

        # Rows belonging to this timestamp
        current_timestamp_data = era5_data.loc[timestamps == timestamp, cell_columns]

        # One row per coordinate pair, keeping the first occurrence; this replaces a
        # per-coordinate re-scan of the frame. The value check runs after
        # de-duplication so a pair whose first value is missing is skipped entirely.
        cells = (
            current_timestamp_data
            .dropna(subset=[lat_column, lon_column])
            .drop_duplicates(subset=[lat_column, lon_column], keep='first')
            .dropna(subset=[value_column])
        )

        grid_latitudes = cells[lat_column].tolist()
        grid_longitudes = cells[lon_column].tolist()
        grid_values = cells[value_column].tolist()
        grid_cells = [
            create_grid_cell_polygon(
                longitude=longitude,
                latitude=latitude,
                cell_size=grid_cell_size
            )
            for latitude, longitude in zip(grid_latitudes, grid_longitudes)
        ]

        # Create GeoDataFrame from processed data (the scalar timestamp is broadcast)
        geodata = gpd.GeoDataFrame(
            {
                time_column: timestamp,
                value_column: grid_values,
                lat_column: grid_latitudes,
                lon_column: grid_longitudes,
                'geometry': grid_cells
            },
            crs=crs
        )

        # Simplify the polygons if requested
        if simplify_tolerance is not None:
            geodata['geometry'] = geodata['geometry'].simplify(
                tolerance=simplify_tolerance,
                preserve_topology=True
            )

        # Round the data values if requested
        if round_decimals is not None:
            geodata[value_column] = geodata[value_column].round(round_decimals)

        # Store the GeoJSON-like mapping for this timestamp
        geojson_results[timestamp] = geodata.__geo_interface__

    return geojson_results

In [None]:
# Run the row-by-row converter on the grouped table.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt; a pivot-based
# variant, process_era5_grid_optimized, is defined later in this notebook.
geojson_results = process_era5_grid(
    era5_data=df_grouped,
    time_column='valid_time',
    lat_column='lat_rounded',
    lon_column='lon_rounded',
    value_column='msl',
    grid_cell_size=1.0,
    round_decimals=1,
    simplify_tolerance = 0.01,
    # Accepted by process_era5_grid but never read there, so no files are written
    output_filename_pattern = 'msl/msl_{}.geojson',
    crs="EPSG:4326"
)

# Bare expression: echoes the whole result dict as the cell output
geojson_results

Processing data for 2020-05-01 00:00:00...
Processing data for 2020-05-01 06:00:00...
Processing data for 2020-05-01 12:00:00...
Processing data for 2020-05-01 18:00:00...
Processing data for 2020-05-02 00:00:00...
Processing data for 2020-05-02 06:00:00...
Processing data for 2020-05-02 12:00:00...
Processing data for 2020-05-02 18:00:00...
Processing data for 2020-05-03 00:00:00...
Processing data for 2020-05-03 06:00:00...
Processing data for 2020-05-03 12:00:00...
Processing data for 2020-05-03 18:00:00...
Processing data for 2020-05-04 00:00:00...
Processing data for 2020-05-04 06:00:00...
Processing data for 2020-05-04 12:00:00...
Processing data for 2020-05-04 18:00:00...
Processing data for 2020-05-05 00:00:00...
Processing data for 2020-05-05 06:00:00...
Processing data for 2020-05-05 12:00:00...
Processing data for 2020-05-05 18:00:00...
Processing data for 2020-05-06 00:00:00...
Processing data for 2020-05-06 06:00:00...
Processing data for 2020-05-06 12:00:00...
Processing 

KeyboardInterrupt: 

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
import numpy as np
from typing import Dict, Optional
from datetime import datetime
from tqdm import tqdm

def validate_era5_data(
    data: pd.DataFrame,
    time_column: str,
    lat_column: str,
    lon_column: str,
    value_column: str
) -> bool:
    """
    Comprehensive validation of ERA5 input data.

    Checks, in order: the input type, presence of the four required columns,
    that the time column can be parsed by pd.to_datetime, that latitudes lie in
    [-90, 90] and longitudes in [-180, 180] (missing coordinates fail these
    checks), and that at most half of the values are NaN. The input is not modified.

    Args:
        data: Input DataFrame
        time_column: Name of timestamp column
        lat_column: Name of latitude column
        lon_column: Name of longitude column
        value_column: Name of value column

    Returns:
        bool: True if data passes all validations

    Raises:
        TypeError: If input is not a DataFrame
        ValueError: If validation fails
    """
    # Type checking
    if not isinstance(data, pd.DataFrame):
        raise TypeError(f"Expected pandas DataFrame, got {type(data)}")

    # Check required columns
    missing_columns = [
        col for col in [time_column, lat_column, lon_column, value_column]
        if col not in data.columns
    ]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Validate data types; chain the parser error so its detail is not lost
    try:
        pd.to_datetime(data[time_column])
    except (TypeError, ValueError) as exc:
        raise ValueError("Invalid timestamp column format") from exc

    # Latitude range check (vectorized; NaN compares as out of range)
    lat_valid = data[lat_column].between(-90, 90)
    if not lat_valid.all():
        invalid_lats = data.loc[~lat_valid, lat_column]
        print(f"WARNING: Invalid latitude values found: {invalid_lats}")
        raise ValueError("Latitude values must be between -90 and 90 degrees")

    # Longitude range check (vectorized; NaN compares as out of range)
    lon_valid = data[lon_column].between(-180, 180)
    if not lon_valid.all():
        invalid_lons = data.loc[~lon_valid, lon_column]
        print(f"WARNING: Invalid longitude values found: {invalid_lons}")
        raise ValueError("Longitude values must be between -180 and 180 degrees")

    # Check for excessive NaN values
    nan_ratio = data[value_column].isna().mean()
    if nan_ratio > 0.5:
        print(f"WARNING: High NaN ratio in value column: {nan_ratio:.2%}")
        raise ValueError("Too many NaN values in the value column")

    return True

def process_era5_grid_optimized(
    era5_data: pd.DataFrame,
    time_column: str = 'valid_time',
    lat_column: str = 'latitude',
    lon_column: str = 'longitude',
    value_column: str = 'msl',
    grid_cell_size: float = 1.0,
    simplify_tolerance: Optional[float] = 0.01,
    round_decimals: Optional[int] = 1,
    crs: str = "EPSG:4326",
    agg_method: str = 'first'
) -> Dict[datetime, Dict]:
    """
    Pivot-based ERA5 grid processor: one GeoJSON-like mapping per timestamp.

    The data is pivoted once into a (lat, lon) x timestamp table, one polygon is
    built per grid cell, and each timestamp column is then joined to those shared
    polygons. The input DataFrame is not modified.

    Args:
        era5_data: Input DataFrame with grid data
        time_column: Timestamp column name
        lat_column: Latitude column name
        lon_column: Longitude column name
        value_column: Value column name
        grid_cell_size: Grid cell size in degrees (must be positive)
        simplify_tolerance: Geometry simplification tolerance; None or 0 skips it
        round_decimals: Decimal places for value rounding; None skips rounding
        crs: Coordinate reference system
        agg_method: Aggregation method for pivot table

    Returns:
        Dictionary of timestamps to GeoJSON data. Timestamps for which the pivot
        holds no values, or whose GeoDataFrame cannot be built, are reported and
        left out.

    Raises:
        TypeError: If era5_data is not a DataFrame
        ValueError: If the data or a parameter fails validation
    """
    # Validate input data
    validate_era5_data(
        era5_data,
        time_column,
        lat_column,
        lon_column,
        value_column
    )

    # Parameter validation
    if not isinstance(grid_cell_size, (int, float)) or grid_cell_size <= 0:
        raise ValueError("Grid cell size must be a positive number")

    if simplify_tolerance is not None and (
        not isinstance(simplify_tolerance, (int, float)) or simplify_tolerance < 0
    ):
        raise ValueError("Simplify tolerance must be a non-negative number")

    geojson_results: Dict[datetime, Dict] = {}

    # An empty frame has nothing to pivot
    if era5_data.empty:
        print(f"Processed {len(geojson_results)} timestamps successfully")
        return geojson_results

    # Parse timestamps on a derived frame so the caller's DataFrame is left untouched
    if pd.api.types.is_datetime64_any_dtype(era5_data[time_column]):
        grid_data = era5_data
    else:
        grid_data = era5_data.assign(
            **{time_column: pd.to_datetime(era5_data[time_column])}
        )
    unique_timestamps = grid_data[time_column].unique()

    print(f"Processing data with {len(unique_timestamps)} unique timestamps...")

    # Rows are (lat, lon) cells, columns are timestamps. The timestamp columns are
    # kept apart from the coordinate columns so the column index stays purely
    # datetime-typed for the per-timestamp lookups below.
    try:
        value_table = (
            grid_data.pivot_table(
                index=[lat_column, lon_column],
                columns=time_column,
                values=value_column,
                aggfunc=agg_method
            )
            .dropna(how='all')
        )
    except Exception as e:
        print(f"ERROR: Pivot table creation failed: {e}")
        raise

    half_size = grid_cell_size / 2

    def safe_create_polygon(lat: float, lon: float) -> Polygon:
        """
        Create polygon with additional error checking.
        """
        try:
            return Polygon([
                (lon - half_size, lat - half_size),
                (lon - half_size, lat + half_size),
                (lon + half_size, lat + half_size),
                (lon + half_size, lat - half_size),
                (lon - half_size, lat - half_size)
            ])
        except Exception as e:
            print(f"ERROR: Polygon creation failed for lat={lat}, lon={lon}: {e}")
            raise

    # One polygon per grid cell, shared by every timestamp
    cell_latitudes = value_table.index.get_level_values(lat_column)
    cell_longitudes = value_table.index.get_level_values(lon_column)
    cells = pd.DataFrame({
        lat_column: cell_latitudes.to_numpy(),
        lon_column: cell_longitudes.to_numpy(),
    })
    cells['geometry'] = [
        safe_create_polygon(lat, lon)
        for lat, lon in zip(cell_latitudes, cell_longitudes)
    ]

    # Process timestamps
    for timestamp in tqdm(unique_timestamps, desc="Processing timestamps"):
        # pivot_table leaves out columns whose entries are all NaN, so a timestamp
        # can be present in the input yet absent from the table
        if timestamp not in value_table.columns:
            print(f"WARNING: No valid values for {timestamp}; skipping")
            continue

        # Attach this timestamp's values to the shared cell geometries
        timestamp_data = cells.assign(
            **{value_column: value_table[timestamp].to_numpy()}
        )

        # Drop cells without a value at this timestamp
        timestamp_data = timestamp_data.dropna(subset=[value_column])

        # Round values if specified
        if round_decimals is not None:
            timestamp_data[value_column] = timestamp_data[value_column].round(round_decimals)

        # Create GeoDataFrame with error handling
        try:
            geodata = gpd.GeoDataFrame(
                timestamp_data,
                geometry='geometry',
                crs=crs
            )
        except Exception as e:
            print(f"ERROR: GeoDataFrame creation failed for {timestamp}: {e}")
            continue

        # Geometry simplification with topology preservation; on failure the
        # unsimplified geometry is kept
        if simplify_tolerance:
            try:
                geodata['geometry'] = geodata['geometry'].simplify(
                    tolerance=simplify_tolerance,
                    preserve_topology=True
                )
            except Exception as e:
                print(f"WARNING: Geometry simplification failed: {e}")

        # Store GeoJSON
        geojson_results[timestamp] = geodata.__geo_interface__

    print(f"Processed {len(geojson_results)} timestamps successfully")
    return geojson_results

In [None]:
geojson_results = process_era5_grid_optimized(
    era5_data=df_grouped,
    time_column='valid_time',
    lat_column='lat_rounded',
    lon_column='lon_rounded',
    value_column='msl',
    grid_cell_size=1.0,
    round_decimals=1,
    simplify_tolerance = 0.01,
    crs="EPSG:4326"
)

# Show only how many timestamps were converted; echoing the full dict would dump
# every feature of every timestamp into the cell output
len(geojson_results)

Processing data for 2020-05-01 00:00:00...
Processing data for 2020-05-01 06:00:00...
Processing data for 2020-05-01 12:00:00...
Processing data for 2020-05-01 18:00:00...
Processing data for 2020-05-02 00:00:00...
Processing data for 2020-05-02 06:00:00...
Processing data for 2020-05-02 12:00:00...
Processing data for 2020-05-02 18:00:00...
Processing data for 2020-05-03 00:00:00...
Processing data for 2020-05-03 06:00:00...
Processing data for 2020-05-03 12:00:00...
Processing data for 2020-05-03 18:00:00...
Processing data for 2020-05-04 00:00:00...
Processing data for 2020-05-04 06:00:00...
Processing data for 2020-05-04 12:00:00...
Processing data for 2020-05-04 18:00:00...
Processing data for 2020-05-05 00:00:00...
Processing data for 2020-05-05 06:00:00...
Processing data for 2020-05-05 12:00:00...
Processing data for 2020-05-05 18:00:00...
Processing data for 2020-05-06 00:00:00...
Processing data for 2020-05-06 06:00:00...
Processing data for 2020-05-06 12:00:00...
Processing 

In [None]:
# Imported here because this cell runs before the later cell that imports json
import json

import pandas as pd

geojson_output_file = 'msl-2020-processed.geojson'

# JSON object keys must be plain strings, but geojson_results is keyed by
# timestamp objects; convert each key to its ISO-8601 text form first.
serializable_results = {
    pd.Timestamp(timestamp).isoformat(): feature_collection
    for timestamp, feature_collection in geojson_results.items()
}

# NOTE(review): the file holds a mapping of timestamp -> FeatureCollection, which
# is not itself a single GeoJSON document despite the .geojson extension.
with open(geojson_output_file, 'w') as f:
    # default=str stringifies values json cannot encode natively (e.g. timestamp-valued properties)
    json.dump(serializable_results, f, indent=2, default=str)  # Use indent for readability

print(f"GeoJSON data written to: {geojson_output_file}")

Reading geojson files to merge polygons and reduce file size

In [None]:
import json
from shapely.geometry import shape, mapping
from shapely.ops import unary_union
import numpy as np

def kelvin_to_celsius(kelvin):
    """Convert a Kelvin reading (number or numeric string) to degrees Celsius."""
    kelvin_value = float(kelvin)
    return kelvin_value - 273.15

def get_temp_bucket(temp_celsius):
    """
    Map a Celsius temperature onto its fixed 0.5°C-wide bucket.

    Buckets start at multiples of 0.5, e.g. 20.3°C falls in 20.0-20.5 and
    20.7°C falls in 20.5-21.0.

    Returns:
        The lower bound of the bucket containing `temp_celsius`
    """
    # Count whole half-degree steps below the temperature, then scale back to degrees
    half_degree_steps = np.floor(temp_celsius * 2)
    return half_degree_steps / 2

def are_polygons_adjacent(geom1, geom2):
    """Report whether `geom1` touches or intersects `geom2`."""
    touching = geom1.touches(geom2)
    # Only consult intersects() when the touches() result is falsy
    return touching if touching else geom1.intersects(geom2)

def merge_temperature_ranges(geojson_data, debug=True):
    """
    Merge adjacent features whose temperatures fall in the same 0.5°C bucket.

    Each feature's 'temperature' property is converted with kelvin_to_celsius and
    assigned a bucket with get_temp_bucket. Within a bucket, features whose
    geometries touch or intersect (directly or through other members) are unioned
    into one feature carrying the group's average temperature.

    Args:
        geojson_data (dict): FeatureCollection-like dict with a 'features' list;
            every feature needs 'geometry' and properties 'temperature' and 'valid_time'
        debug (bool): Whether to print bucket and group information

    Returns:
        dict: FeatureCollection with the merged features. 'name' and 'crs' are
        copied from the input when present.
    """
    features = geojson_data['features']

    # First, group features by temperature buckets
    temp_buckets = {}
    for feature in features:
        temp = kelvin_to_celsius(feature['properties']['temperature'])
        temp_buckets.setdefault(get_temp_bucket(temp), []).append(feature)

    if debug:
        print("\nTemperature buckets:")
        for bucket in sorted(temp_buckets.keys()):
            print(f"{bucket}°C to {bucket+0.5}°C: {len(temp_buckets[bucket])} polygons")

    merged_features = []

    # Process each temperature bucket
    for bucket_temp in sorted(temp_buckets.keys()):
        bucket_features = temp_buckets[bucket_temp]
        if debug:
            print(f"\nProcessing bucket {bucket_temp}°C to {bucket_temp+0.5}°C")

        # Parse every geometry once; the adjacency scan below revisits features
        # on each pass and would otherwise rebuild the same shapes repeatedly
        bucket_geoms = [shape(feature['geometry']) for feature in bucket_features]

        # Keep track of which features in this bucket we've processed
        processed = set()

        # Find connected groups within each temperature bucket
        for i, feature in enumerate(bucket_features):
            if i in processed:
                continue

            current_geom = bucket_geoms[i]
            current_group = [feature]
            processed.add(i)

            # Grow the group until a full pass adds nothing: a polygon may only
            # become adjacent after the merged geometry has expanded
            changed = True
            while changed:
                changed = False
                for j, other_feature in enumerate(bucket_features):
                    if j in processed:
                        continue

                    other_geom = bucket_geoms[j]
                    if are_polygons_adjacent(current_geom, other_geom):
                        current_group.append(other_feature)
                        current_geom = unary_union([current_geom, other_geom])
                        processed.add(j)
                        changed = True

            # Create merged feature for this connected group
            temps_kelvin = [float(f['properties']['temperature']) for f in current_group]
            avg_temp_kelvin = np.mean(temps_kelvin)

            if debug:
                print(f"  Created group of {len(current_group)} polygons")

            merged_feature = {
                "type": "Feature",
                "properties": {
                    "valid_time": current_group[0]['properties']['valid_time'],
                    "temperature": str(kelvin_to_celsius(avg_temp_kelvin)),
                    "latitude": current_geom.centroid.y,
                    "longitude": current_geom.centroid.x,
                    "merged_count": len(current_group),
                    "temp_bucket": f"{bucket_temp:.1f}°C to {bucket_temp+0.5:.1f}°C",
                    "lower_bound_temp": f"{bucket_temp:.1f}",
                    "upper_bound_temp": f"{bucket_temp+0.5:.1f}",
                },
                "geometry": mapping(current_geom)
            }
            merged_features.append(merged_feature)

    # 'name' and 'crs' are optional in the input; copy them only when present
    # instead of failing with KeyError
    output_geojson = {"type": "FeatureCollection"}
    for optional_key in ("name", "crs"):
        if optional_key in geojson_data:
            output_geojson[optional_key] = geojson_data[optional_key]
    output_geojson["features"] = merged_features

    return output_geojson

def process_geojson_file(input_file, output_file, debug=True):
    """
    Process a GeoJSON file with merge_temperature_ranges and save the merged result.

    NOTE: a later cell in this notebook defines another function with this same
    name (taking bucket_key and bucket_size before debug); once that cell has
    run, it replaces this definition.

    Args:
        input_file (str): Path to input GeoJSON file
        output_file (str): Path to save merged GeoJSON
        debug (bool): Whether to print debug information
    """
    with open(input_file, 'r') as f:
        geojson_data = json.load(f)

    merged_geojson = merge_temperature_ranges(geojson_data, debug)

    with open(output_file, 'w') as f:
        json.dump(merged_geojson, f)

    # Print reduction statistics
    original_features = len(geojson_data['features'])
    merged_features = len(merged_geojson['features'])
    # An input without features has nothing to reduce; avoid dividing by zero
    if original_features:
        reduction = (1 - merged_features/original_features) * 100
    else:
        reduction = 0.0

    print(f"\nSummary:")
    print(f"Original features: {original_features}")
    print(f"Merged features: {merged_features}")
    print(f"Reduction: {reduction:.1f}%")
    print(f"Output File: {output_file}")

# Example usage
# process_geojson_file('temperature_20200101_000000.geojson',
#                         'temperature_20200101_000000_merged.geojson')

In [4]:
import json
from typing import Dict, Any, Optional
from shapely.geometry import shape, mapping
from shapely.ops import unary_union
import numpy as np

def get_bucket(value: float, bucket_size: float = 0.5) -> float:
    """
    Return the lower bound of the `bucket_size`-wide bucket containing `value`.

    Negative values are not bucketed individually: they all map to 0.0.
    """
    if not value < 0:
        # Floor-divide to count whole buckets below the value, then scale back
        whole_buckets = value // bucket_size
        return whole_buckets * bucket_size

    # Edge case: every negative value is clamped to the zero bucket
    return 0.0

def simplify_geojson(
    geojson_data: Dict[str, Any],
    bucket_key: str = 'temperature',
    bucket_size: float = 0.5,
    debug: bool = False
) -> Dict[str, Any]:
    """
    Simplify a GeoJSON by merging adjacent features in the same bucket.

    Features are grouped by get_bucket(value, bucket_size) of their `bucket_key`
    property. Within a bucket, features whose geometries touch or intersect
    (directly or through other members) are unioned into one feature carrying the
    group's average value. Features whose value is missing or not numeric are
    reported and left out.

    Args:
        geojson_data: Input GeoJSON dictionary
        bucket_key: Property key to use for bucketing (default: 'temperature')
        bucket_size: Size of bucket intervals (default: 0.5)
        debug: Enable debug printing

    Returns:
        Simplified GeoJSON with merged features; all other top-level keys of the
        input are preserved. An input with no features is returned unchanged.

    Raises:
        ValueError: If the input is not a dict or has no 'features' key
    """
    # Input validation
    if not isinstance(geojson_data, dict):
        raise ValueError("Input must be a GeoJSON dictionary")

    if 'features' not in geojson_data:
        raise ValueError("Input must contain 'features' key")

    features = geojson_data['features']

    if not features:
        print("Warning: No features to process")
        return geojson_data

    # Group features by bucket
    buckets = {}
    for feature in features:
        try:
            value = float(feature['properties'][bucket_key])
            # Bucket with the caller's bucket_size so the grouping matches the
            # lower/upper bounds reported on the merged features
            bucket = get_bucket(value, bucket_size)
        except (KeyError, TypeError, ValueError) as e:
            # TypeError covers a None value or a null 'properties' object
            print(f"Warning: Skipping feature due to error: {e}")
            continue
        buckets.setdefault(bucket, []).append(feature)

    if debug:
        print("\nBucket distribution:")
        for bucket, bucket_features in buckets.items():
            print(f"Bucket {bucket}: {len(bucket_features)} polygons")

    # Merge features within each bucket
    merged_features = []
    for bucket, bucket_features in buckets.items():
        # Parse every geometry once; the adjacency scan below revisits features
        # on each pass and would otherwise rebuild the same shapes repeatedly
        bucket_geoms = [shape(feature['geometry']) for feature in bucket_features]
        processed = set()

        for i, feature in enumerate(bucket_features):
            if i in processed:
                continue

            current_geom = bucket_geoms[i]
            current_group = [feature]
            processed.add(i)

            # Grow the group until a full pass adds nothing: a polygon may only
            # become adjacent after the merged geometry has expanded
            changed = True
            while changed:
                changed = False
                for j, other_feature in enumerate(bucket_features):
                    if j in processed:
                        continue

                    other_geom = bucket_geoms[j]
                    if current_geom.touches(other_geom) or current_geom.intersects(other_geom):
                        current_group.append(other_feature)
                        current_geom = unary_union([current_geom, other_geom])
                        processed.add(j)
                        changed = True

            # Average properties for merged group
            values = [float(f['properties'][bucket_key]) for f in current_group]
            avg_value = np.mean(values)

            # Create merged feature
            merged_feature = {
                "type": "Feature",
                "properties": {
                    bucket_key: str(avg_value),  # Update with average
                    "merged_count": len(current_group),
                    "lower_bound": bucket,
                    "upper_bound": bucket + bucket_size,
                    "bucket_range": f"{bucket} to {bucket + bucket_size}"
                },
                "geometry": mapping(current_geom)
            }
            merged_features.append(merged_feature)

    # Create output GeoJSON
    output_geojson = {
        **geojson_data,  # Preserve original metadata
        "features": merged_features
    }

    # Summary statistics
    if debug:
        original_count = len(features)
        merged_count = len(merged_features)
        reduction = (1 - merged_count/original_count) * 100
        print(f"\nSimplification Summary:")
        print(f"Original features: {original_count}")
        print(f"Merged features: {merged_count}")
        print(f"Reduction: {reduction:.1f}%")

    return output_geojson

def process_geojson_file(
    input_file: str,
    output_file: str,
    bucket_key: str = 'temperature',
    bucket_size: float = 0.5,
    debug: bool = True
) -> None:
    """
    Process a GeoJSON file and save the simplified result.

    NOTE: the third positional parameter is `bucket_key`. An earlier cell in this
    notebook defines a function of the same name whose third parameter is
    `debug`, so pass `debug` by keyword to stay correct with either definition.

    Args:
        input_file: Path to input GeoJSON file
        output_file: Path to save simplified GeoJSON; its parent directory is
            created if it does not exist
        bucket_key: Property key to use for bucketing
        bucket_size: Size of bucket intervals
        debug: Enable debug printing
    """
    import os

    # Load input file
    with open(input_file, 'r') as f:
        geojson_data = json.load(f)

    # Simplify GeoJSON
    simplified_geojson = simplify_geojson(
        geojson_data,
        bucket_key=bucket_key,
        bucket_size=bucket_size,
        debug=debug
    )

    # Make sure the destination folder exists so the write below cannot fail
    # merely because the folder is missing
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save output file
    with open(output_file, 'w') as f:
        json.dump(simplified_geojson, f, indent=2)


In [None]:
# Run process_geojson_file over every 6-hourly file in the date range below;
# file names embed the timestamp as YYYYMMDD_HHMMSS (e.g. 20200516_000000).

from datetime import datetime, timedelta

date_object = datetime(2020, 5, 16, 0, 0)
# NOTE(review): the end bound is in 2021 while the start is in 2020 — confirm that
# a full year of files is intended.
end_date = datetime(2021, 5, 21, 0, 0)

# `<` rather than `!=` so the loop still terminates if the bounds are ever
# changed to values that the 6-hour step does not land on exactly
while date_object < end_date:
    formatted_input = date_object.strftime('content/processed/temperature_%Y%m%d_%H%M%S.geojson')
    formatted_output = date_object.strftime('merged/temperature_%Y%m%d_%H%M%S_merged.geojson')
    # debug is passed by keyword: in the most recent process_geojson_file
    # definition the third positional parameter is bucket_key, so a positional
    # False would be used as the property key and every feature would be skipped
    process_geojson_file(formatted_input, formatted_output, debug=False)
    date_object = date_object + timedelta(hours=6)