In [1]:
import os

import pandas as pd
import xarray as xr

from util import *

In [64]:
def save_nc4_as_parquet(filename, variables, compression_level=11):
    """Export variables of a netCDF4 file as float16 Parquet files.

    The file ``raw/{filename}`` is opened with xarray and flattened into a
    DataFrame. Each requested variable is then written, as a single float16
    column with the index dropped, to ``dataframes/{var}/{stem}.parquet``
    (Brotli-compressed, via the fastparquet engine). The standard deviation
    of the variable and of the float32 - float16 difference are printed so
    the precision loss of the downcast can be judged.

    Args:
        filename: Name of the ``.nc4`` file inside the ``raw/`` directory.
        variables: One variable name, or an iterable of variable names.
        compression_level: Brotli compression level passed to fastparquet.

    Raises:
        ValueError: If ``filename`` does not end with ``.nc4``.
        KeyError: If a requested variable is not a column of the DataFrame.
    """
    filepath = f"raw/{filename}"

    if not filename.endswith(".nc4"):
        raise ValueError("Filename must be netCDF4 file")
    # Swap the trailing "nc4" for "parquet"; the dot before it is kept.
    output = filename[:-3] + "parquet"

    if isinstance(variables, str):
        variables = [variables]

    print(f"Converting {filepath}")
    # The context manager closes the dataset even if a variable is missing
    # or a write fails part-way through.
    with xr.open_dataset(filepath) as dataset:
        print("\tLoading DataFrame")
        df = dataset.to_dataframe()

        for var in variables:
            print(f"\tLoading '{var}'")
            # Keep a one-column DataFrame and drop the index, so only the
            # values are stored.
            variable = df[var].to_frame().reset_index(drop=True)

            variable16 = variable.astype("float16")
            print(f"\tStandard Deviation (SD): '{variable.std()}'")
            print(f"\tSD of float32 - float16: '{(variable - variable16).std()}'")

            print(f"\tSaving '{var}'")
            # The per-variable output directory may not exist yet.
            os.makedirs(f"dataframes/{var}", exist_ok=True)
            variable16.to_parquet(
                f"dataframes/{var}/{output}",
                engine="fastparquet",
                compression={
                    "_default": {
                        "type": "BROTLI",
                        "args": {"level": compression_level},
                    }
                },
            )

            print()

        print("\tClosing dataset")

In [65]:
# Export the "RH" variable of this file to Parquet (default compression level).
save_nc4_as_parquet(
    filename="MERRA2_100.tavg3_3d_cld_Np.19800101.nc4",
    variables="RH",
)

Converting raw/MERRA2_100.tavg3_3d_cld_Np.19800101.nc4
	Loading DataFrame
	Loading 'RH'
	Standard Deviation (SD): 'RH    0.374293
dtype: float32'
	SD of float32 - float16: 'RH    0.000091
dtype: float32'
	Saving 'RH'

	Closing dataset


In [25]:
# Round-trip check: reload a saved Parquet file and compare it with `df`.
# NOTE(review): `df` is not defined at notebook level in the visible cells
# (it is only a local inside save_nc4_as_parquet, unless `util` exports one),
# and this path lacks the per-variable subdirectory ("dataframes/<var>/...")
# that save_nc4_as_parquet writes to. The execution count (25) also precedes
# the cells above (64, 65), so this cell looks stale — confirm it still runs
# after Restart & Run All.
pd.read_parquet("dataframes/MERRA2_100.tavg3_3d_cld_Np.19800101.parquet").equals(df)

True