# Try to merge netcdf files

In [2]:
import xarray as xr
import numpy as np
import pandas as pd
import pathlib

In [7]:
file_root_path = (pathlib.Path.cwd().parent.parent / "data/tmp/netcdf_test_files").as_posix() 
print(file_root_path)

C:/Users/PLIU/Documents/git/ConstanceDataPlatform/data/tmp/netcdf_test_files


## 1: Create dummy NetCDF files for 3 days

In [23]:
# Define dimensions
time = pd.date_range("2025-01-01", periods=3)
lat = np.array([10.0, 20.0], dtype=np.float32)
lon = np.array([30.0, 40.0], dtype=np.float32)

# Generate dummy temperature data
temperature_data = np.random.uniform(280, 300, size=(3, 2, 2)).astype(np.float32)
humidity_data = np.random.uniform(30, 80, size=(3, 2, 2)).astype(np.float32)

# Create dataset
ds = xr.Dataset(
    data_vars={
        "temperature": (["time", "lat", "lon"], temperature_data),
        "humidity": (["time", "lat", "lon"], humidity_data),
    },
    coords={
        "time": ("time", time),
        "lat": ("lat", lat),
        "lon": ("lon", lon),
    },
    attrs={
        "title": "Temperature sample Dataset",
        "institution": "CASD Meteorological Center",
        "source": "Simulated data",
        "history": "Created 2025-06-16",
        "Conventions": "CF-1.8"
    }
)

# Add variable attributes
ds["temperature"].attrs = {
    "long_name": "Surface Air Temperature",
    "units": "K",
    "_FillValue": -999.0
}
ds["humidity"].attrs = {
    "long_name": "Surface Air Humidity",
    "units": "percentage",
    "_FillValue": -1.0
}
ds["time"].attrs = {
    "long_name": "time"
}
ds["time"].encoding = {
    "units": "days since 2000-01-01 00:00:00",
    "calendar": "standard"
}

ds["lat"].attrs = {
    "units": "degrees_north",
    "long_name": "latitude"
}
ds["lon"].attrs = {
    "units": "degrees_east",
    "long_name": "longitude"
}

# Save as NetCDF
print(ds)
ds.to_netcdf(f'{file_root_path}/day{i+1}.nc',engine='netcdf4')

In [19]:

lat = np.linspace(-10, 10, 5)  # 5 latitudes
lon = np.linspace(30, 50, 5)   # 5 longitudes
lon2d, lat2d = np.meshgrid(lon, lat)

for i in range(3):
    temperature = 15 + 5 * np.random.rand(5, 5)# Random temp 15–20°C
    humidity = np.random.uniform(30, 80, size=(3, 2, 2)).astype(np.float32)
    print(f"temperature: {temperature}")
    print(f"humidity: {humidity}")
    ds = xr.Dataset(
        data_vars={
            'temperature': (['lat', 'lon'], temperature),
            "humidity": (['lat', 'lon'], humidity),
            
        },
        coords={
            'lat': lat,
            'lon': lon
        }
    )
    print(ds)
    ds.to_netcdf(f'{file_root_path}/day{i+1}.nc',engine='netcdf4')



temperature: [[16.0863181  17.88791981 15.72946617 19.21319504 18.61724826]
 [18.8708541  16.46897315 15.55449772 18.98285372 16.28906369]
 [18.54086359 15.69945871 18.75563647 19.32993382 17.26440456]
 [19.00413436 16.01441074 16.2566889  19.2240957  15.29462334]
 [15.61937012 19.99580718 15.12977254 19.82786799 19.38579257]]
humidity: [[[30.321533 51.648964]
  [49.864017 67.422325]]

 [[60.79177  34.082615]
  [30.572166 55.646294]]

 [[42.721786 45.467155]
  [70.681595 42.165558]]]


ValueError: Variable 'humidity': Could not convert tuple of form (dims, data[, attrs, encoding]): (['lat', 'lon'], array([[[30.321533, 51.648964],
        [49.864017, 67.422325]],

       [[60.79177 , 34.082615],
        [30.572166, 55.646294]],

       [[42.721786, 45.467155],
        [70.681595, 42.165558]]], dtype=float32)) to Variable.

## 2: Merge these files with a time dimension

In [16]:
files = ['day1.nc', 'day2.nc', 'day3.nc']
data_list = []
time_list = pd.date_range("2023-01-01", periods=len(files))

for i, file in enumerate(files):
    ds = xr.open_dataset(f"{file_root_path}/{file}",engine="netcdf4")
    ds = ds.expand_dims(time=[time_list[i]])  # Add time dimension
    data_list.append(ds)

merged = xr.concat(data_list, dim='time')

# Step 3: Save the merged dataset
merged.to_netcdf(f'{file_root_path}/merged_temperature.nc',engine='netcdf4')

# Optional: Print dataset summary
print(merged)

<xarray.Dataset> Size: 704B
Dimensions:      (time: 3, lat: 5, lon: 5)
Coordinates:
  * time         (time) datetime64[ns] 24B 2023-01-01 2023-01-02 2023-01-03
  * lat          (lat) float64 40B -10.0 -5.0 0.0 5.0 10.0
  * lon          (lon) float64 40B 30.0 35.0 40.0 45.0 50.0
Data variables:
    temperature  (time, lat, lon) float64 600B 16.64 19.0 16.15 ... 16.67 15.55


## 3: Advance merge to reduce file size

There are many ways to reduce netcdf file size

- Reduce data precision



In [18]:
files = ['day1.nc', 'day2.nc', 'day3.nc']
data_list = []
time_list = pd.date_range("2023-01-01", periods=len(files))

for i, file in enumerate(files):
    ds = xr.open_dataset(f"{file_root_path}/{file}",engine="netcdf4")
    ds = ds.expand_dims(time=[time_list[i]])  # Add time dimension
    # Optional: ensure only temperature is retained
    ds = ds[['temperature']]
    
    data_list.append(ds)
    data_list.append(ds)

merged = xr.concat(data_list, dim='time')

# Step 3: Save the merged dataset
merged.to_netcdf(f'{file_root_path}/ad_merged_temperature.nc',engine='netcdf4')

# Optional: Print dataset summary
print(merged)

<xarray.Dataset> Size: 1kB
Dimensions:      (time: 6, lat: 5, lon: 5)
Coordinates:
  * time         (time) datetime64[ns] 48B 2023-01-01 2023-01-01 ... 2023-01-03
  * lat          (lat) float64 40B -10.0 -5.0 0.0 5.0 10.0
  * lon          (lon) float64 40B 30.0 35.0 40.0 45.0 50.0
Data variables:
    temperature  (time, lat, lon) float64 1kB 17.49 16.59 18.25 ... 18.32 16.65
