In [None]:
!pip install netCDF4

Collecting netCDF4
  Downloading netCDF4-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.4.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading netCDF4-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.4.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cftime, netCDF4
Successfully installed cftime-1.6.4.post1 netCDF4-1.7.2


In [None]:
!pip install cftime



In [None]:
import cftime

print(f"cftime version: {cftime.__version__}")


cftime version: 1.6.4.post1


In [None]:
import xarray as xr
import cftime
import netCDF4

# Open the dataset with decode_times enabled
ds = xr.open_dataset('air.mon.1981-2010.ltm.nc', decode_times=False)

# Convert time values to formatted strings
time_values = ds['time'].values
time_units = ds['time'].attrs['units']  # e.g., "days since 1800-01-01 00:00:0.0"
calendar = ds['time'].attrs.get('calendar', 'standard')

# Convert to datetime objects
dates = netCDF4.num2date(time_values, units=time_units, calendar=calendar)


# Optionally, convert to a list for easier handling
human_readable_dates = dates.tolist()

for date in human_readable_dates[:12]:
    print(date.strftime('%Y-%m-%d'))


0001-01-01
0001-02-01
0001-03-01
0001-04-01
0001-05-01
0001-06-01
0001-07-01
0001-08-01
0001-09-01
0001-10-01
0001-11-01
0001-12-01


In [None]:
air_da = ds['air']

# Convert the DataArray to a Pandas DataFrame
df = air_da.to_dataframe().reset_index()

# Rename columns for clarity (optional but recommended)
df.rename(columns={
    'time': 'Date',
    'level': 'Pressure_Level_hPa',
    'lat': 'Latitude',
    'lon': 'Longitude',
    'air': 'Air_Temperature'
}, inplace=True)

# Display the first few rows of the DataFrame
print(df.head())

       Date  Pressure_Level_hPa  Latitude  Longitude  Air_Temperature
0 -657073.0              1000.0      90.0        0.0       -28.217329
1 -657073.0              1000.0      90.0        2.5       -28.217329
2 -657073.0              1000.0      90.0        5.0       -28.217329
3 -657073.0              1000.0      90.0        7.5       -28.217329
4 -657073.0              1000.0      90.0       10.0       -28.217329
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2144448 entries, 0 to 2144447
Data columns (total 5 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Date                float64
 1   Pressure_Level_hPa  float32
 2   Latitude            float32
 3   Longitude           float32
 4   Air_Temperature     float32
dtypes: float32(4), float64(1)
memory usage: 49.1 MB
None


In [None]:
# Check the memory usage of the DataFrame
print(df.memory_usage(deep=True))

# Convert data types to more efficient types if possible
df['Pressure_Level_hPa'] = df['Pressure_Level_hPa'].astype('float32')
df['Latitude'] = df['Latitude'].astype('float32')
df['Longitude'] = df['Longitude'].astype('float32')
df['Air_Temperature'] = df['Air_Temperature'].astype('float32')

# Verify the updated memory usage
print(df.memory_usage(deep=True))


Index                      128
Date                  17155584
Pressure_Level_hPa     8577792
Latitude               8577792
Longitude              8577792
Air_Temperature        8577792
dtype: int64
Index                      128
Date                  17155584
Pressure_Level_hPa     8577792
Latitude               8577792
Longitude              8577792
Air_Temperature        8577792
dtype: int64


In [None]:
df.rename(columns={
    'time': 'Date',
    'level': 'Pressure_Level_hPa',
    'lat': 'Latitude',
    'lon': 'Longitude',
    'air': 'Air_Temperature'
}, inplace=True)
print(df.head())

       Date  Pressure_Level_hPa  Latitude  Longitude  Air_Temperature
0 -657073.0              1000.0      90.0        0.0       -28.217329
1 -657073.0              1000.0      90.0        2.5       -28.217329
2 -657073.0              1000.0      90.0        5.0       -28.217329
3 -657073.0              1000.0      90.0        7.5       -28.217329
4 -657073.0              1000.0      90.0       10.0       -28.217329


In [None]:
# Step 7: Calculate spatial points per time
num_levels = ds.dims['level']      # 17
num_lats = ds.dims['lat']          # 73
num_lons = ds.dims['lon']          # 144
spatial_points_per_time = num_levels * num_lats * num_lons  # 178,464

# Step 8: Number of time points
num_times = ds.dims['time']        # 12

# Step 9: Expected DataFrame length
expected_length = num_times * spatial_points_per_time  # 2,141,568
actual_length = len(df)
print(f"Expected DataFrame length: {expected_length}")
print(f"Actual DataFrame length: {actual_length}")


Expected DataFrame length: 2144448
Actual DataFrame length: 2144448


  num_levels = ds.dims['level']      # 17
  num_lats = ds.dims['lat']          # 73
  num_lons = ds.dims['lon']          # 144
  num_times = ds.dims['time']        # 12


In [None]:
import numpy as np
# Step 10: Replace 'Date' column
if expected_length == actual_length:
    # Correct replacement
    repeated_dates = np.repeat(human_readable_dates, spatial_points_per_time)
    df['Date'] = repeated_dates
    print(df.head())
else:
    print('ERROR: expected and actual length do not match')

                  Date  Pressure_Level_hPa  Latitude  Longitude  \
0  0001-01-01 00:00:00              1000.0      90.0        0.0   
1  0001-01-01 00:00:00              1000.0      90.0        2.5   
2  0001-01-01 00:00:00              1000.0      90.0        5.0   
3  0001-01-01 00:00:00              1000.0      90.0        7.5   
4  0001-01-01 00:00:00              1000.0      90.0       10.0   

   Air_Temperature  
0       -28.217329  
1       -28.217329  
2       -28.217329  
3       -28.217329  
4       -28.217329  


In [None]:
# Step 11: Convert 'Date' column to string type (if not already)
df['Date'] = df['Date'].astype(str)


print(df[df['Date'] == '0001-01-01 00:00:00'])

                       Date  Pressure_Level_hPa  Latitude  Longitude  \
0       0001-01-01 00:00:00              1000.0      90.0        0.0   
1       0001-01-01 00:00:00              1000.0      90.0        2.5   
2       0001-01-01 00:00:00              1000.0      90.0        5.0   
3       0001-01-01 00:00:00              1000.0      90.0        7.5   
4       0001-01-01 00:00:00              1000.0      90.0       10.0   
...                     ...                 ...       ...        ...   
178699  0001-01-01 00:00:00                10.0     -90.0      347.5   
178700  0001-01-01 00:00:00                10.0     -90.0      350.0   
178701  0001-01-01 00:00:00                10.0     -90.0      352.5   
178702  0001-01-01 00:00:00                10.0     -90.0      355.0   
178703  0001-01-01 00:00:00                10.0     -90.0      357.5   

        Air_Temperature  
0            -28.217329  
1            -28.217329  
2            -28.217329  
3            -28.217329  
4    

In [None]:
!pip install cdsapi



In [None]:
from google.colab import userdata
uid = userdata.get('CDS_UID')
apikey = userdata.get("CDS_TOKEN")
with open("/root/.cdsapirc", "w") as f:
    print("url: https://cds.climate.copernicus.eu/api/retrieve/v1/processes/reanalysis-era5-pressure-levels", file=f)
    print(f"key: {uid}:{apikey}", file=f)


In [None]:
import cdsapi

dataset = "reanalysis-era5-pressure-levels"
request = {
    "product_type": ["reanalysis"],
    "variable": ["temperature"],
    "year": ["2020"],
    "month": [
        "01", "02", "03",
        "04", "05", "06",
        "07", "08", "09",
        "10", "11", "12"
    ],
    "day": [
        "01", "02", "03",
        "04", "05", "06",
        "07", "08", "09",
        "10", "11", "12",
        "13", "14", "15",
        "16", "17", "18",
        "19", "20", "21",
        "22", "23", "24",
        "25", "26", "27",
        "28", "29", "30",
        "31"
    ],
    "time": [
        "00:00", "06:00", "12:00",
        "18:00"
    ],
    "pressure_level": ["850"],
    "data_format": "netcdf",
    "download_format": "unarchived",
    "area": [65, -120, 0, 0]
}
target = 'download.grib'

client = cdsapi.Client()
client.retrieve(dataset, request).download()

#https://cds.climate.copernicus.eu/api/retrieve/v1/processes/reanalysis-era5-pressure-levels/execution


2024-11-16 00:26:08,686 INFO Sending request to https://cds.climate.copernicus.eu/api/retrieve/v1/processes/reanalysis-era5-pressure-levels/resources/reanalysis-era5-pressure-levels
INFO:cdsapi:Sending request to https://cds.climate.copernicus.eu/api/retrieve/v1/processes/reanalysis-era5-pressure-levels/resources/reanalysis-era5-pressure-levels


HTTPError: 404 Client Error: Not Found for url: https://cds.climate.copernicus.eu/api/retrieve/v1/processes/reanalysis-era5-pressure-levels/resources/reanalysis-era5-pressure-levels

In [None]:
!pip install cfgrib

Collecting cfgrib
  Downloading cfgrib-0.9.14.1-py3-none-any.whl.metadata (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting eccodes>=0.9.8 (from cfgrib)
  Downloading eccodes-2.38.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting findlibs (from eccodes>=0.9.8->cfgrib)
  Downloading findlibs-0.0.5.tar.gz (6.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading cfgrib-0.9.14.1-py3-none-any.whl (48 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.7/48.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eccodes-2.38.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
import cfgrib

# Open the GRIB file
ds = cfgrib.open_dataset('2020-temp.grib')

# Access data variables, metadata, etc.
print(ds['time'])

<xarray.DataArray 'time' ()> Size: 8B
[1 values with dtype=datetime64[ns]]
Coordinates:
    number         int64 8B ...
    time           datetime64[ns] 8B ...
    step           timedelta64[ns] 8B ...
    isobaricInhPa  float64 8B ...
    valid_time     datetime64[ns] 8B ...
Attributes:
    long_name:      initial time of forecast
    standard_name:  forecast_reference_time


In [None]:
import xarray as xr
import cftime
import netCDF4

# Open the dataset with decode_times enabled
ds = xr.open_dataset('temp-2020-data.nc', decode_times=False)

print(ds)


<xarray.Dataset> Size: 735MB
Dimensions:         (valid_time: 1464, pressure_level: 1, latitude: 261,
                     longitude: 481)
Coordinates:
    number          int64 8B ...
  * valid_time      (valid_time) int64 12kB 1577836800 1577858400 ... 1609437600
  * pressure_level  (pressure_level) float64 8B 850.0
  * latitude        (latitude) float64 2kB 65.0 64.75 64.5 ... 0.5 0.25 0.0
  * longitude       (longitude) float64 4kB -120.0 -119.8 -119.5 ... -0.25 0.0
    expver          (valid_time) <U4 23kB ...
Data variables:
    t               (valid_time, pressure_level, latitude, longitude) float32 735MB ...
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2024-11-16T00:45 GRIB to CDM+CF via cfgrib-0.9.1...


In [None]:
print(ds['valid_time'])

<xarray.DataArray 'valid_time' (valid_time: 1464)> Size: 12kB
array([1577836800, 1577858400, 1577880000, ..., 1609394400, 1609416000,
       1609437600])
Coordinates:
    number      int64 8B ...
  * valid_time  (valid_time) int64 12kB 1577836800 1577858400 ... 1609437600
    expver      (valid_time) <U4 23kB ...
Attributes:
    long_name:      time
    standard_name:  time
    units:          seconds since 1970-01-01
    calendar:       proleptic_gregorian


In [None]:
# Convert time values to formatted strings
time_values = ds['valid_time'].values
time_units = ds['valid_time'].attrs['units']  # e.g., "days since 1800-01-01 00:00:0.0"
calendar = ds['valid_time'].attrs.get('calendar', 'standard')

# Convert to datetime objects
dates = netCDF4.num2date(time_values, units=time_units, calendar=calendar)


# Optionally, convert to a list for easier handling
human_readable_dates = dates.tolist()

for date in human_readable_dates:
    print(date.strftime('%Y-%m-%d'))

2020-01-01
2020-01-01
2020-01-01
2020-01-01
2020-01-02
2020-01-02
2020-01-02
2020-01-02
2020-01-03
2020-01-03
2020-01-03
2020-01-03
2020-01-04
2020-01-04
2020-01-04
2020-01-04
2020-01-05
2020-01-05
2020-01-05
2020-01-05
2020-01-06
2020-01-06
2020-01-06
2020-01-06
2020-01-07
2020-01-07
2020-01-07
2020-01-07
2020-01-08
2020-01-08
2020-01-08
2020-01-08
2020-01-09
2020-01-09
2020-01-09
2020-01-09
2020-01-10
2020-01-10
2020-01-10
2020-01-10
2020-01-11
2020-01-11
2020-01-11
2020-01-11
2020-01-12
2020-01-12
2020-01-12
2020-01-12
2020-01-13
2020-01-13
2020-01-13
2020-01-13
2020-01-14
2020-01-14
2020-01-14
2020-01-14
2020-01-15
2020-01-15
2020-01-15
2020-01-15
2020-01-16
2020-01-16
2020-01-16
2020-01-16
2020-01-17
2020-01-17
2020-01-17
2020-01-17
2020-01-18
2020-01-18
2020-01-18
2020-01-18
2020-01-19
2020-01-19
2020-01-19
2020-01-19
2020-01-20
2020-01-20
2020-01-20
2020-01-20
2020-01-21
2020-01-21
2020-01-21
2020-01-21
2020-01-22
2020-01-22
2020-01-22
2020-01-22
2020-01-23
2020-01-23
2020-01-23

In [None]:
ds['valid_time'] = human_readable_dates
ds['valid_time'] = ds['valid_time'].astype(str)
print(ds['valid_time'])

<xarray.DataArray 'valid_time' (valid_time: 1464)> Size: 111kB
array(['2020-01-01 00:00:00', '2020-01-01 06:00:00', '2020-01-01 12:00:00',
       ..., '2020-12-31 06:00:00', '2020-12-31 12:00:00',
       '2020-12-31 18:00:00'], dtype='<U19')
Coordinates:
    number      int64 8B ...
  * valid_time  (valid_time) <U19 111kB '2020-01-01 00:00:00' ... '2020-12-31...
    expver      (valid_time) <U4 23kB ...


In [None]:
import pandas as pd
import xarray as xr

# Open the dataset
ds = xr.open_dataset('temp-2020-data.nc')

# Create an empty list to collect DataFrames in chunks
chunks = []

# Define the chunk size (e.g., process by 1000 rows at a time)
chunk_size = 1000
for start in range(0, len(ds['t']['valid_time']), chunk_size):
    chunk = ds['t'].isel(valid_time=slice(start, start + chunk_size))
    # Convert the chunk to a DataFrame and reset the index to include dimensions as columns
    chunk_df = chunk.to_dataframe().reset_index()

    # Append to the list of chunks
    chunks.append(chunk_df)

# Concatenate all chunks into a single DataFrame
final_df = pd.concat(chunks, ignore_index=True)

print(final_df.head())

In [None]:
!pip install geopandas
!pip install shapely



In [None]:
import xarray as xr

# Load the NetCDF dataset
ds = xr.open_dataset('temp-2020-data.nc')

# Remove the singleton 'pressure_level' dimension
ds = ds.squeeze(dim='pressure_level')
# Round latitude and longitude to the nearest degree
ds = ds.assign_coords(
    lat_rounded=ds.latitude.round(0),
    lon_rounded=ds.longitude.round(0)
)
# Group by the rounded coordinates and compute the mean temperature
ds_grouped = ds.groupby(['lat_rounded', 'lon_rounded']).mean()
print(ds_grouped)


<xarray.Dataset> Size: 47MB
Dimensions:         (lat_rounded: 66, lon_rounded: 121, valid_time: 1464)
Coordinates:
  * lat_rounded     (lat_rounded) float64 528B 0.0 1.0 2.0 ... 63.0 64.0 65.0
  * lon_rounded     (lon_rounded) float64 968B -120.0 -119.0 ... -1.0 -0.0
    number          int64 8B 0
  * valid_time      (valid_time) datetime64[ns] 12kB 2020-01-01 ... 2020-12-3...
    pressure_level  float64 8B 850.0
    expver          (valid_time) <U4 23kB '0001' '0001' '0001' ... '0001' '0001'
Data variables:
    t               (valid_time, lat_rounded, lon_rounded) float32 47MB 289.3...
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2024-11-16T00:45 GRIB to CDM+CF via cfgrib-0.9.1...


In [None]:
import geopandas as gpd
from shapely.geometry import Polygon
import pandas as pd
import shapely.ops

def create_polygon(lon, lat, resolution=1.0):
    """
    Create a square polygon centered at (lon, lat) with a given resolution.
    """
    half_res = resolution / 2
    return Polygon([
        (lon - half_res, lat - half_res),
        (lon - half_res, lat + half_res),
        (lon + half_res, lat + half_res),
        (lon + half_res, lat - half_res),
        (lon - half_res, lat - half_res)
    ])

# Convert 'valid_time' to datetime for readability
ds_grouped['valid_time'] = pd.to_datetime(ds_grouped.valid_time.values, unit='s')
# Iterate over each time step
for time in ds_grouped.valid_time.values:
    # Select data for the current time step
    temp_data = ds_grouped.sel(valid_time=time)

    # Initialize lists to store polygon geometries and temperature values
    polygons = []
    temperatures = []
    latitudes = []
    longitudes = []

    # Iterate over each rounded latitude and longitude
    for lat in temp_data.lat_rounded.values:
        for lon in temp_data.lon_rounded.values:
            # Extract temperature value
            temp = temp_data.t.sel(lat_rounded=lat, lon_rounded=lon).values

            # Skip if temperature is NaN
            if pd.isna(temp):
                continue

            # Create polygon
            polygon = create_polygon(lon, lat, resolution=1.0)

            # Append to lists
            polygons.append(polygon)
            temperatures.append(temp)
            latitudes.append(lat)
            longitudes.append(lon)

    # Create a GeoDataFrame
    gdf = gpd.GeoDataFrame({
        'valid_time': pd.to_datetime(time),
        'temperature': temperatures,
        'latitude': latitudes,
        'longitude': longitudes,
        'geometry': polygons
    }, crs="EPSG:4326")  # Assuming WGS84 Latitude/Longitude

    # (Optional) Simplify geometries to reduce file size
    gdf['geometry'] = gdf['geometry'].simplify(tolerance=0.01, preserve_topology=True)

    # (Optional) Round temperature values to the nearest degree
    gdf['temperature'] = gdf['temperature'].round(0)

    # Save to GeoJSON
    output_filename = f'processed/temperature_{pd.to_datetime(time).strftime("%Y%m%d_%H%M%S")}.geojson'
    gdf.to_file(output_filename, driver='GeoJSON')


    print(f'Saved {output_filename}')


Saved processed/temperature_20200101_000000.geojson
Saved processed/temperature_20200101_060000.geojson
Saved processed/temperature_20200101_120000.geojson
Saved processed/temperature_20200101_180000.geojson
Saved processed/temperature_20200102_000000.geojson
Saved processed/temperature_20200102_060000.geojson
Saved processed/temperature_20200102_120000.geojson
Saved processed/temperature_20200102_180000.geojson
Saved processed/temperature_20200103_000000.geojson
Saved processed/temperature_20200103_060000.geojson
Saved processed/temperature_20200103_120000.geojson
Saved processed/temperature_20200103_180000.geojson
Saved processed/temperature_20200104_000000.geojson
Saved processed/temperature_20200104_060000.geojson
Saved processed/temperature_20200104_120000.geojson
Saved processed/temperature_20200104_180000.geojson
Saved processed/temperature_20200105_000000.geojson
Saved processed/temperature_20200105_060000.geojson
Saved processed/temperature_20200105_120000.geojson
Saved proces

In [None]:
!zip -r /content/processed.zip /content/processed

  adding: content/processed/ (stored 0%)
  adding: content/processed/temperature_20200601_060000.geojson (deflated 93%)
  adding: content/processed/temperature_20200328_060000.geojson (deflated 93%)
  adding: content/processed/temperature_20200801_060000.geojson (deflated 93%)
  adding: content/processed/temperature_20200317_000000.geojson (deflated 93%)
  adding: content/processed/temperature_20200110_120000.geojson (deflated 93%)
  adding: content/processed/temperature_20201114_060000.geojson (deflated 93%)
  adding: content/processed/temperature_20200923_120000.geojson (deflated 93%)
  adding: content/processed/temperature_20200629_000000.geojson (deflated 93%)
  adding: content/processed/temperature_20201103_000000.geojson (deflated 93%)
  adding: content/processed/temperature_20201116_180000.geojson (deflated 93%)
  adding: content/processed/temperature_20200709_000000.geojson (deflated 93%)
  adding: content/processed/temperature_20200405_060000.geojson (deflated 93%)
  adding: c

In [None]:
!pip install shapely numpy



In [28]:
import json
from shapely.geometry import shape, mapping
from shapely.ops import unary_union
import numpy as np

def kelvin_to_celsius(kelvin):
    return float(kelvin) - 273.15

def get_temp_bucket(temp_celsius):
    """
    Assigns temperature to a fixed 0.5°C bucket
    Example: 20.3°C -> 20.0-20.5 bucket, 20.7°C -> 20.5-21.0 bucket
    Returns the lower bound of the bucket
    """
    return np.floor(temp_celsius * 2) / 2

def are_polygons_adjacent(geom1, geom2):
    """Check if two polygons are touching"""
    return geom1.touches(geom2) or geom1.intersects(geom2)

def merge_temperature_ranges(geojson_data, debug=True):
    features = geojson_data['features']

    # First, group features by temperature buckets
    temp_buckets = {}
    for feature in features:
        temp = kelvin_to_celsius(float(feature['properties']['temperature']))
        bucket = get_temp_bucket(temp)
        if bucket not in temp_buckets:
            temp_buckets[bucket] = []
        temp_buckets[bucket].append(feature)

    if debug:
        print("\nTemperature buckets:")
        for bucket in sorted(temp_buckets.keys()):
            print(f"{bucket}°C to {bucket+0.5}°C: {len(temp_buckets[bucket])} polygons")

    merged_features = []

    # Process each temperature bucket
    for bucket_temp in sorted(temp_buckets.keys()):
        bucket_features = temp_buckets[bucket_temp]
        if debug:
            print(f"\nProcessing bucket {bucket_temp}°C to {bucket_temp+0.5}°C")

        # Keep track of which features in this bucket we've processed
        processed = set()

        # Find connected groups within each temperature bucket
        for i, feature in enumerate(bucket_features):
            if i in processed:
                continue

            current_geom = shape(feature['geometry'])
            current_group = [feature]
            processed.add(i)

            # Find all adjacent polygons in the same temperature bucket
            changed = True
            while changed:
                changed = False
                for j, other_feature in enumerate(bucket_features):
                    if j in processed:
                        continue

                    other_geom = shape(other_feature['geometry'])
                    if are_polygons_adjacent(current_geom, other_geom):
                        current_group.append(other_feature)
                        current_geom = unary_union([current_geom, other_geom])
                        processed.add(j)
                        changed = True

            # Create merged feature for this connected group
            temps_kelvin = [float(f['properties']['temperature']) for f in current_group]
            avg_temp_kelvin = np.mean(temps_kelvin)

            if debug:
                print(f"  Created group of {len(current_group)} polygons")

            merged_feature = {
                "type": "Feature",
                "properties": {
                    "valid_time": current_group[0]['properties']['valid_time'],
                    "temperature": str(kelvin_to_celsius(avg_temp_kelvin)),
                    "latitude": current_geom.centroid.y,
                    "longitude": current_geom.centroid.x,
                    "merged_count": len(current_group),
                    "temp_bucket": f"{bucket_temp:.1f}°C to {bucket_temp+0.5:.1f}°C",
                    "lower_bound_temp": f"{bucket_temp:.1f}",
                    "upper_bound_temp": f"{bucket_temp+0.5:.1f}",
                },
                "geometry": mapping(current_geom)
            }
            merged_features.append(merged_feature)

    output_geojson = {
        "type": "FeatureCollection",
        "name": geojson_data['name'],
        "crs": geojson_data['crs'],
        "features": merged_features
    }

    return output_geojson

def process_geojson_file(input_file, output_file, debug=True):
    """
    Process a GeoJSON file and save the merged result

    Args:
        input_file (str): Path to input GeoJSON file
        output_file (str): Path to save merged GeoJSON
        debug (bool): Whether to print debug information
    """
    with open(input_file, 'r') as f:
        geojson_data = json.load(f)

    merged_geojson = merge_temperature_ranges(geojson_data, debug)

    with open(output_file, 'w') as f:
        json.dump(merged_geojson, f)

    # Print reduction statistics
    original_features = len(geojson_data['features'])
    merged_features = len(merged_geojson['features'])
    reduction = (1 - merged_features/original_features) * 100

    print(f"\nSummary:")
    print(f"Original features: {original_features}")
    print(f"Merged features: {merged_features}")
    print(f"Reduction: {reduction:.1f}%")
    print(f"Output File: {output_file}")

# Example usage
# process_geojson_file('temperature_20200101_000000.geojson',
#                         'temperature_20200101_000000_merged.geojson')

In [29]:
# prompt: generate a date object for jan 1 2020 00:00 and print out date like this 20200101_000000

from datetime import datetime, timedelta

date_object = datetime(2020, 1, 1, 0, 0)


while date_object != datetime(2021, 1, 1, 0, 0):
    formatted_input = date_object.strftime('content/processed/temperature_%Y%m%d_%H%M%S.geojson')
    formatted_output = date_object.strftime('merged/temperature_%Y%m%d_%H%M%S_merged.geojson')
    process_geojson_file(formatted_input, formatted_output, False)
    date_object = date_object + timedelta(hours=6)


Summary:
Original features: 7986
Merged features: 2580
Reduction: 67.7%
Output File: merged/temperature_20200101_000000_merged.geojson

Summary:
Original features: 7986
Merged features: 2531
Reduction: 68.3%
Output File: merged/temperature_20200101_060000_merged.geojson

Summary:
Original features: 7986
Merged features: 2536
Reduction: 68.2%
Output File: merged/temperature_20200101_120000_merged.geojson

Summary:
Original features: 7986
Merged features: 2614
Reduction: 67.3%
Output File: merged/temperature_20200101_180000_merged.geojson

Summary:
Original features: 7986
Merged features: 2553
Reduction: 68.0%
Output File: merged/temperature_20200102_000000_merged.geojson

Summary:
Original features: 7986
Merged features: 2405
Reduction: 69.9%
Output File: merged/temperature_20200102_060000_merged.geojson

Summary:
Original features: 7986
Merged features: 2435
Reduction: 69.5%
Output File: merged/temperature_20200102_120000_merged.geojson

Summary:
Original features: 7986
Merged feature

KeyboardInterrupt: 

In [25]:
!unzip processed.zip

Archive:  processed.zip
replace content/processed/temperature_20200601_060000.geojson? [y]es, [n]o, [A]ll, [N]one, [r]ename: 