In [1]:
import os

import pandas as pd
import numpy as np
import xarray as xr
import netCDF4 as nc

from util import *

In [27]:
def open_xarray_dataset(filename, folder="raw") -> xr.Dataset:
    """Open ``<folder>/<filename>`` as an xarray dataset.

    The folder is created when it does not exist yet, and the path being
    opened is reported through ``log`` before the file is read.

    Args:
        filename: Name of the file inside ``folder``.
        folder: Directory that holds the file (default ``"raw"``).

    Returns:
        The dataset returned by ``xr.open_dataset``; it can be used as a
        context manager, which is how the callers in this notebook use it.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(folder, exist_ok=True)

    filepath = f"{folder}/{filename}"

    log(f"Loading {filepath}")
    return xr.open_dataset(filepath)


def open_nc4_dataset(filename, folder="raw", mode="r") -> nc.Dataset:
    """Open ``<folder>/<filename>`` with the netCDF4 library.

    The folder is created when it does not exist yet, so the function can be
    used with ``mode="w"`` to create a new file in a fresh directory. The path
    being opened is reported through ``log``.

    Args:
        filename: Name of the file inside ``folder``.
        folder: Directory that holds (or will hold) the file.
        mode: Access mode forwarded to ``nc.Dataset`` (default ``"r"``).

    Returns:
        The opened ``nc.Dataset``.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(folder, exist_ok=True)

    filepath = f"{folder}/{filename}"

    log(f"Loading {filepath}")
    return nc.Dataset(filepath, mode=mode)


def get_variables(variables) -> list[str]:
    """Normalise a variable selection so that callers can iterate over it.

    A single name passed as a ``str`` is wrapped in a one-element list; any
    other value is handed back unchanged (the very same object).
    """
    return [variables] if isinstance(variables, str) else variables

In [24]:
def save_df_as_parquet(df, output, compression_level=11):
    """Write ``df`` to ``output`` as a Brotli-compressed parquet file.

    Anything that is not already a ``pd.DataFrame`` is first passed to the
    ``pd.DataFrame`` constructor. The file is written with the
    ``fastparquet`` engine, using ``compression_level`` as the Brotli level
    for every column.
    """
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)

    brotli_settings = {"type": "BROTLI", "args": {"level": compression_level}}
    frame.to_parquet(output, engine="fastparquet", compression={"_default": brotli_settings})


def save_nc4_as_parquet(filename, variables, compression_level=11):
    """Export variables of a netCDF file as float16 parquet files.

    The file is opened through ``open_xarray_dataset`` (default folder) and
    flattened with ``Dataset.to_dataframe``. Each requested variable is cast
    to float16, a short precision report is printed, and the column is saved
    to ``dataframes/<var>/<filename without extension>.parquet``.

    Args:
        filename: Name of the source file; its extension is replaced by
            ``.parquet`` for the output name.
        variables: A single variable name or a collection of names.
        compression_level: Brotli level forwarded to ``save_df_as_parquet``.

    Raises:
        KeyError: If a requested variable is not a column of the dataframe.
    """
    # splitext swaps the extension whatever its length; slicing off three
    # characters only worked for extensions such as ".nc4".
    output = os.path.splitext(filename)[0] + ".parquet"

    with open_xarray_dataset(filename) as dataset:
        # NOTE(review): this materialises every variable of the file, not only
        # the requested ones. Selecting a subset first would save memory but
        # may change the flattened row order, so it is left as is.
        df = dataset.to_dataframe()

    for var in get_variables(variables):
        print(f"\tLoading '{var}'")
        # Build a fresh Series with a plain integer index instead of resetting
        # the index of the DataFrame's column in place.
        variable = df[var].reset_index(drop=True)

        variable16 = variable.astype("float16")
        print(f"\tStandard Deviation (SD): '{variable.std()}'")
        print(f"\tSD of float32 - float16: '{(variable - variable16).std()}'")

        print(f"\tSaving '{var}'\n")

        output_dir = f"dataframes/{var}"
        os.makedirs(output_dir, exist_ok=True)

        save_df_as_parquet(variable16, f"{output_dir}/{output}", compression_level)


def print_nc4_metadata(filename):
    """Print the textual summary of a file opened with ``open_xarray_dataset``.

    The dataset is used as a context manager, so it is closed again once the
    summary has been printed.
    """
    with open_xarray_dataset(filename) as ds:
        print(ds)


def compress_nc4(filename, variables, compression_level=9):
    """Write float16-packed, zlib-compressed copies of netCDF variables.

    For every requested variable the values are cast to float16, and each
    pair of neighbouring float16 values along the last axis is reinterpreted
    as one float32. The stored variable therefore has half as many entries on
    its last dimension (e.g. 576 -> 288). The result is written to
    ``compressed/<variable>/<filename>`` together with a copy of the source
    variable's attributes.

    To recover the values, the stored float32 array has to be viewed as
    float16 again.

    Args:
        filename: Name of the source file (opened via ``open_nc4_dataset``).
        variables: A single variable name or a collection of names.
        compression_level: zlib level passed to ``createVariable``.

    Raises:
        KeyError: If a variable is missing from the source file.
        ValueError: If a variable is scalar, has an empty dimension, or has
            an odd-length last dimension (two float16 values are needed per
            stored float32).
    """
    with open_nc4_dataset(filename) as dataset:
        for variable in get_variables(variables):
            # Look up and validate the source before the output file is
            # created, so a bad request does not leave an empty file behind.
            data = dataset.variables[variable]
            shape = data.shape
            if not shape or 0 in shape or shape[-1] % 2:
                raise ValueError(
                    f"Cannot pack '{variable}' with shape {shape}: the last dimension "
                    "must have an even, non-zero length")

            # The last axis is halved because two float16 values share one float32.
            packed_shape = shape[:-1] + (shape[-1] // 2,)
            # One chunk per 2-D slice over the last two axes.
            chunksizes = (1,) * (len(packed_shape) - 2) + packed_shape[-2:]

            with open_nc4_dataset(filename, folder=f"compressed/{variable}", mode="w") as dst:
                # Dimensions are declared innermost-first.
                for dim_name, dim_size in reversed(list(zip(data.dimensions, packed_shape))):
                    dst.createDimension(dim_name, dim_size)

                packed_var = dst.createVariable(variable, "f", data.dimensions, chunksizes=chunksizes,
                                                compression="zlib", complevel=compression_level)
                packed_var.setncatts(data.__dict__)  # copy variable attributes via a dictionary
                # The float32 values are only containers for float16 bit
                # patterns, so keep netCDF4 from applying the copied
                # scale/offset/fill attributes to them on write.
                packed_var.set_auto_maskandscale(False)

                # NOTE(review): np.array() drops any mask on data[:], so masked
                # entries are cast like ordinary values - confirm this is intended.
                data_float16 = np.ascontiguousarray(np.array(data[:]).astype("float16"))
                packed_var[:] = data_float16.view("float32")


In [25]:
# Print the xarray summary (dimensions, coordinates, data variables) of the sample file.
print_nc4_metadata("MERRA2_100.tavg3_3d_cld_Np.19800101.nc4")

[20:41:30] LOG: Loading raw/MERRA2_100.tavg3_3d_cld_Np.19800101.nc4
<xarray.Dataset>
Dimensions:    (lon: 576, lat: 361, lev: 42, time: 8)
Coordinates:
  * lon        (lon) float64 -180.0 -179.4 -178.8 -178.1 ... 178.1 178.8 179.4
  * lat        (lat) float64 -90.0 -89.5 -89.0 -88.5 ... 88.5 89.0 89.5 90.0
  * lev        (lev) float64 1e+03 975.0 950.0 925.0 900.0 ... 0.5 0.4 0.3 0.1
  * time       (time) datetime64[ns] 1980-01-01T01:30:00 ... 1980-01-01T22:30:00
Data variables:
    CFCU       (time, lev, lat, lon) float32 ...
    CLOUD      (time, lev, lat, lon) float32 ...
    DTRAIN     (time, lev, lat, lon) float32 ...
    INCLOUDQI  (time, lev, lat, lon) float32 ...
    INCLOUDQL  (time, lev, lat, lon) float32 ...
    QI         (time, lev, lat, lon) float32 ...
    QL         (time, lev, lat, lon) float32 ...
    RH         (time, lev, lat, lon) float32 ...
    TAUCLI     (time, lev, lat, lon) float32 ...
    TAUCLW     (time, lev, lat, lon) float32 ...
Attributes: (12/30)
    Hi

In [28]:
# Pack the CLOUD variable of the sample file into compressed/CLOUD/ using the default compression level.
compress_nc4("MERRA2_100.tavg3_3d_cld_Np.19800101.nc4", "CLOUD")


[20:41:37] LOG: Loading raw/MERRA2_100.tavg3_3d_cld_Np.19800101.nc4
[20:41:37] LOG: Loading compressed/CLOUD/MERRA2_100.tavg3_3d_cld_Np.19800101.nc4


  data_float16 = np.array(data[:]).astype("float16")
