In [1]:
from preprocessing import *

In [2]:
def _get_variables(variables) -> list[str]:
    if isinstance(variables, str):
        return [variables]
    return variables


def compress_nc4(filename, variables, compression_level=9):
    """Write each requested variable to its own zlib-compressed file as packed float16.

    The values of every variable are cast to float16, and each pair of adjacent
    float16 values along the innermost axis is then reinterpreted as a single
    float32. The stored variable therefore has half as many elements along its
    innermost axis; call ``.view("float16")`` on the stored array to unpack it.

    Args:
        filename: Name of the source file, passed to ``open_nc4_dataset``. The
            same name is reused for every output, opened with
            ``folder="compressed/<variable>"`` and ``mode="w"``.
        variables: A single variable name or an iterable of variable names.
        compression_level: Value forwarded as ``complevel`` to ``createVariable``.

    Raises:
        ValueError: If a variable is zero-dimensional or its innermost axis has
            an odd length, since float16 values can then not be paired up.
    """
    with open_nc4_dataset(filename) as dataset:
        for variable in _get_variables(variables):
            data = dataset.variables[variable]

            # Read and pack *before* opening the destination in write mode, so a
            # failure here does not leave a freshly created, empty output behind.
            # NOTE(review): values outside the float16 range (|x| > 65504) overflow
            # to inf in this cast and NumPy emits a RuntimeWarning; presumably this
            # is what happens to the source's large fill values - confirm that inf
            # is an acceptable marker for them.
            data_float16 = np.array(data[:], dtype="float16")
            if data_float16.ndim == 0 or data_float16.shape[-1] % 2:
                raise ValueError(
                    f"cannot pack {variable!r}: innermost axis of shape "
                    f"{data_float16.shape} must have an even length"
                )

            # Two float16 values share one float32 slot, which halves the
            # innermost axis (e.g. lon: 576 -> 288).
            packed_float32 = data_float16.view("float32")

            # One chunk per 2-D slab over the two innermost axes; this equals the
            # former hard-coded (1, 1, 361, 288) for a (8, 42, 361, 576) variable.
            leading_axes = max(packed_float32.ndim - 2, 0)
            chunksizes = (1,) * leading_axes + packed_float32.shape[-2:]

            with open_nc4_dataset(filename, folder=f"compressed/{variable}", mode="w") as dst:
                # Sizes come from the packed array instead of hard-coded constants.
                # Innermost dimension first, matching the previous creation order.
                packed_dims = list(zip(data.dimensions, packed_float32.shape))
                for dim_name, dim_size in reversed(packed_dims):
                    dst.createDimension(dim_name, dim_size)

                dst.createVariable(variable, "f", data.dimensions, chunksizes=chunksizes,
                                   compression="zlib", complevel=compression_level)
                dst[variable].setncatts(data.__dict__)  # copy variable attributes via a dictionary

                dst[variable][:] = packed_float32

                # Round-trip check: show the values as they unpack back to float16.
                print(packed_float32.view("float16"))

In [5]:
# Show the metadata of the source file (helper presumably provided by the
# `preprocessing` star import - confirm). The captured output lists the
# dimensions, coordinates and data variables of the dataset.
print_nc4_metadata("MERRA2_100.tavg3_3d_cld_Np.19800101.nc4")

[21:01:58] LOG: Loading raw/MERRA2_100.tavg3_3d_cld_Np.19800101.nc4
<xarray.Dataset>
Dimensions:    (lon: 576, lat: 361, lev: 42, time: 8)
Coordinates:
  * lon        (lon) float64 -180.0 -179.4 -178.8 -178.1 ... 178.1 178.8 179.4
  * lat        (lat) float64 -90.0 -89.5 -89.0 -88.5 ... 88.5 89.0 89.5 90.0
  * lev        (lev) float64 1e+03 975.0 950.0 925.0 900.0 ... 0.5 0.4 0.3 0.1
  * time       (time) datetime64[ns] 1980-01-01T01:30:00 ... 1980-01-01T22:30:00
Data variables:
    CFCU       (time, lev, lat, lon) float32 ...
    CLOUD      (time, lev, lat, lon) float32 ...
    DTRAIN     (time, lev, lat, lon) float32 ...
    INCLOUDQI  (time, lev, lat, lon) float32 ...
    INCLOUDQL  (time, lev, lat, lon) float32 ...
    QI         (time, lev, lat, lon) float32 ...
    QL         (time, lev, lat, lon) float32 ...
    RH         (time, lev, lat, lon) float32 ...
    TAUCLI     (time, lev, lat, lon) float32 ...
    TAUCLW     (time, lev, lat, lon) float32 ...
Attributes: (12/30)
    Hi

In [3]:
# Compress the relative-humidity variable ("RH") of the sample file.
compress_nc4(
    filename="MERRA2_100.tavg3_3d_cld_Np.19800101.nc4",
    variables="RH",
    compression_level=9,
)


[21:23:24] LOG: Loading raw/MERRA2_100.tavg3_3d_cld_Np.19800101.nc4
[21:23:24] LOG: Loading compressed/RH/MERRA2_100.tavg3_3d_cld_Np.19800101.nc4


  data_float16 = np.array(data[:], dtype="float16")


[[[[      inf       inf       inf ...       inf       inf       inf]
   [      inf       inf       inf ...       inf       inf       inf]
   [      inf       inf       inf ...       inf       inf       inf]
   ...
   [9.878e-01 9.878e-01 9.878e-01 ... 9.883e-01 9.883e-01 9.883e-01]
   [9.712e-01 9.712e-01 9.712e-01 ... 9.717e-01 9.717e-01 9.717e-01]
   [9.697e-01 9.697e-01 9.697e-01 ... 9.697e-01 9.697e-01 9.697e-01]]

  [[      inf       inf       inf ...       inf       inf       inf]
   [      inf       inf       inf ...       inf       inf       inf]
   [      inf       inf       inf ...       inf       inf       inf]
   ...
   [1.000e+00 1.000e+00 1.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]
   [1.000e+00 1.000e+00 1.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]
   [1.000e+00 1.000e+00 1.000e+00 ... 1.000e+00 1.000e+00 1.000e+00]]

  [[      inf       inf       inf ...       inf       inf       inf]
   [      inf       inf       inf ...       inf       inf       inf]
   [      inf   