In [1]:
import os
from calendar import monthrange

import numpy as np
import pandas as pd

from nc4 import *
from merra2 import *

In [2]:
def save_download_urls(dataset_esdt: str,
                       collection: str,
                       start_year, end_year,
                       start_month=1, end_month=12,
                       start_day=1, end_day=None) -> None:
    """Write one full-file download URL per day of a date range to ``urls.txt``.

    ``urls.txt`` is opened in write mode in the current working directory, so
    any previous content is replaced. Each line points at
    ``MERRA2_<stream>.<collection>.<YYYYMMDD>.nc4`` on the ``goldsmr5`` server,
    where ``<stream>`` is whatever ``get_merra_stream_from_year(year, month)``
    returns.

    Args:
        dataset_esdt: Dataset path component placed right after ``/data/MERRA2/``.
        collection: Collection name embedded in each file name.
        start_year, end_year: Inclusive range of years.
        start_month, end_month: First month of ``start_year`` and last month of
            ``end_year`` (inclusive); years in between cover months 1-12.
        start_day: First day of the first month; later months start at day 1.
        end_day: Last day of the last month. A falsy value (``None`` or ``0``)
            means "through the end of that month".
    """
    with open("urls.txt", "w") as url_file:
        for yyyy in range(start_year, end_year + 1):
            # Only the boundary years are clipped to the requested months.
            first_month = start_month if yyyy == start_year else 1
            last_month = end_month if yyyy == end_year else 12

            for mm in range(first_month, last_month + 1):
                stream = get_merra_stream_from_year(yyyy, mm)

                if yyyy == start_year and mm == first_month:
                    first_day = start_day
                else:
                    first_day = 1

                if end_day and yyyy == end_year and mm == last_month:
                    last_day = end_day
                else:
                    # monthrange(...)[1] is the number of days in the month.
                    last_day = monthrange(yyyy, mm)[1]

                for dd in range(first_day, last_day + 1):
                    url = (f"https://goldsmr5.gesdisc.eosdis.nasa.gov/data/MERRA2/{dataset_esdt}/"
                           f"{yyyy}/{mm:0>2}/MERRA2_{stream}.{collection}.{yyyy}{mm:0>2}{dd:0>2}.nc4\n")
                    url_file.write(url)


def save_download_subset_urls(dataset_esdt: str,
                              collection: str,
                              variables: str,
                              output_dir: str,
                              start_year: int,
                              end_year: int,
                              start_month: int = 1,
                              end_month: int = 12,
                              start_day: int = 1,
                              end_day: int | None = None,
                              levels=range(36, 72)) -> None:
    """Write one subsetting-service URL per day of a date range to ``urls.txt``.

    ``urls.txt`` is opened in write mode in the current working directory, so
    any previous content is replaced. Days whose
    ``<output_dir>/MERRA2_<stream>.<collection>.<YYYYMMDD>.SUB.nc`` file already
    exists are skipped, which lets an interrupted download be resumed.

    Args:
        dataset_esdt: Dataset identifier used in the ``FILENAME`` path and as
            ``SHORTNAME``.
        collection: Collection name embedded in each file name.
        variables: Passed through ``get_merra_variables``; the result is
            iterated and joined with ``%2C`` into the ``VARIABLES`` parameter.
            NOTE(review): annotated ``str`` but the call in this notebook
            passes a list -- confirm the intended type.
        output_dir: Directory checked for already-downloaded ``.SUB.nc`` files.
        start_year, end_year: Inclusive range of years.
        start_month, end_month: First month of ``start_year`` and last month of
            ``end_year`` (inclusive); years in between cover months 1-12.
        start_day: First day of the first month; later months start at day 1.
        end_day: Last day of the last month. A falsy value means "through the
            end of that month".
        levels: Iterable of integers; each is written as ``lev + 1`` in the
            ``LAYERS`` parameter.
    """
    variables = get_merra_variables(variables)

    with open("urls.txt", "w") as url_file:
        for yyyy in range(start_year, end_year + 1):
            # Only the boundary years are clipped to the requested months.
            first_month = start_month if yyyy == start_year else 1
            last_month = end_month if yyyy == end_year else 12

            for mm in range(first_month, last_month + 1):
                stream = get_merra_stream_from_year(yyyy, mm)

                if yyyy == start_year and mm == first_month:
                    first_day = start_day
                else:
                    first_day = 1

                if end_day and yyyy == end_year and mm == last_month:
                    last_day = end_day
                else:
                    # monthrange(...)[1] is the number of days in the month.
                    last_day = monthrange(yyyy, mm)[1]

                for dd in range(first_day, last_day + 1):
                    filename = f"MERRA2_{stream}.{collection}.{yyyy}{mm:0>2}{dd:0>2}"
                    already_downloaded = os.path.isfile(f"{output_dir}/{filename}.SUB.nc")

                    if not already_downloaded:
                        url_file.write("https://goldsmr5.gesdisc.eosdis.nasa.gov/daac-bin/OTF/HTTP_services.cgi?"
                                       f"FILENAME=%2Fdata%2FMERRA2%2F{dataset_esdt}%2F{yyyy}%2F{mm:0>2}%2F"
                                       f"MERRA2_{stream}.{collection}.{yyyy}{mm:0>2}{dd:0>2}.nc4"
                                       f"&SERVICE=L34RS_MERRA2"
                                       f"&DATASET_VERSION=5.12.4"
                                       f"&BBOX=-90%2C-180%2C90%2C180"
                                       f"&LABEL=MERRA2_{stream}.{collection}.{yyyy}{mm:0>2}{dd:0>2}.SUB.nc"
                                       f"&FORMAT=bmM0Lw"
                                       f"&SHORTNAME={dataset_esdt}"
                                       f"&VARIABLES={'%2C'.join(variable for variable in variables)}"
                                       f"&VERSION=1.02"
                                       f"&LAYERS=LAYER_{'%2C'.join(str(lev + 1) for lev in levels)}"
                                       "\n")

In [9]:
# Write subset-download URLs for U and V, 1981-07-20 through 1981-07-29, to urls.txt.
# NOTE(review): "OUTPUT/FOLDER" looks like a placeholder. It is only used to skip
# days whose .SUB.nc file already exists, so it should presumably point at the
# directory the wget cell downloads into ("raw/") -- confirm.
save_download_subset_urls(
    dataset_esdt="M2T3NVASM.5.12.4",
    collection="tavg3_3d_asm_Nv",
    variables=["U", "V"],
    output_dir="OUTPUT/FOLDER",
    start_year=1981,
    end_year=1981,
    start_month=7,
    end_month=7,
    start_day=20,
    end_day=29,
)

In [None]:
# Download every URL listed in urls.txt (-i) into the raw/ directory (-P).
# wget prompts for the password (--ask-password) and names each file from the
# server's Content-Disposition header (--content-disposition).
# NOTE(review): the account name is hard-coded; consider reading it from an
# environment variable so the notebook can be shared.
!wget --user=BhavyeMathur --ask-password --content-disposition -i urls.txt -P "raw/"


In [3]:
def save_df_as_parquet(df, output, compression_level=11, output_dir="dataframes"):
    """Write ``df`` to ``<output_dir>/<output>`` as a Brotli-compressed parquet file.

    Args:
        df: A ``pandas.DataFrame``, or anything ``pd.DataFrame(...)`` accepts;
            non-DataFrame input is converted first.
        output: File name of the parquet file inside ``output_dir``.
        compression_level: Brotli level handed to the ``fastparquet`` engine.
        output_dir: Destination directory; it is not created here.
    """
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)

    brotli_options = {"type": "BROTLI", "args": {"level": compression_level}}
    destination = f"{output_dir}/{output}"

    frame.to_parquet(destination,
                     engine="fastparquet",
                     compression={"_default": brotli_options})


def save_nc4_as_parquet(filename, variables, raw_dir="raw", output_dir="dataframes", **kwargs):
    """Save each requested variable of a NetCDF file as a float16 parquet file.

    For every variable the data is loaded into a DataFrame, cast to float16 and
    written to ``<output_dir>/<var>/<basename>.parquet``. The standard deviation
    of the data and of the float16 rounding error are printed per variable.

    Args:
        filename: Name of the NetCDF file inside ``raw_dir``.
        variables: Passed through ``get_merra_variables``; the result is
            iterated as variable names.
        raw_dir: Directory containing the input file.
        output_dir: Root output directory; one sub-directory per variable is
            created on demand.
        **kwargs: Forwarded to ``save_df_as_parquet`` (e.g. ``compression_level``).
    """
    # splitext gives the same name as the old ``filename[:-3] + "parquet"`` for
    # ".nc4" files and also handles other extensions such as ".nc" correctly.
    output = os.path.splitext(filename)[0] + ".parquet"

    variables = get_merra_variables(variables)
    with open_xarray_dataset(filename, folder=raw_dir) as dataset:
        for var in variables:
            print(f"\tLoading '{var}'")
            variable = dataset[var].to_dataframe().reset_index(drop=True)

            variable16 = variable.astype("float16")
            print(f"\tStandard Deviation (SD): '{variable[var].std()}'")
            print(f"\tSD of float32 - float16: '{(variable - variable16)[var].std()}'")

            print(f"\tSaving '{var}'\n")

            # Use a per-variable path instead of overwriting ``output_dir``:
            # reassigning it made every later variable nest inside the previous
            # one (dataframes/U/V, ...).
            var_dir = f"{output_dir}/{var}"
            os.makedirs(var_dir, exist_ok=True)

            save_df_as_parquet(variable16, output, output_dir=var_dir, **kwargs)


def compress_nc4(filename,
                 variables,
                 raw_dir="raw",
                 output_dir="compressed",
                 compression="zlib",
                 compression_level=9,
                 pack_as_float16=True):
    """Write a compressed copy of a raw NetCDF file keeping only the last 36 levels.

    Nothing is done when ``<output_dir>/<filename>`` already exists. The input
    is ``<raw_dir>/<filename>`` if present, otherwise the same name with
    ``.nc4`` replaced by ``.SUB.nc4``.

    Args:
        filename: Output file name; also the preferred input file name.
        variables: Passed through ``get_merra_variables``; the result is
            indexed and iterated as variable names. The first variable's shape
            defines the output dimensions.
            (assumes variables are laid out as (time, lev, lat, lon) -- TODO confirm)
        raw_dir: Directory containing the input file.
        output_dir: Directory receiving the compressed file.
        compression: Compression algorithm passed to ``createVariable``.
        compression_level: Compression level passed to ``createVariable``.
        pack_as_float16: When true, values are cast to float16 and pairs of
            them are reinterpreted as a single float32, halving the ``lon``
            dimension. Readers must reverse this with a float16 view.

    Raises:
        RuntimeError: If ``pack_as_float16`` is set and the last dimension has
            odd length, so values cannot be paired up.
    """
    num_levels = 36  # only the last 36 levels are stored in the output

    variables = get_merra_variables(variables)

    if os.path.isfile(f"{output_dir}/{filename}"):
        return

    if os.path.isfile(f"{raw_dir}/{filename}"):
        raw_file = filename
    else:
        # NOTE(review): the subset URLs in this notebook label files "*.SUB.nc",
        # not "*.SUB.nc4" -- confirm which suffix the raw files really carry.
        raw_file = filename.replace(".nc4", ".SUB.nc4")

    # Open the file that was actually located above (previously ``filename`` was
    # opened unconditionally, so the ".SUB" fallback never took effect).
    with open_nc4_dataset(raw_file, folder=raw_dir) as dataset:
        dimensions = dataset[variables[0]].shape

        # Validate before creating the output file: otherwise a failed run
        # leaves a stub behind and the early-return above skips it next time.
        if pack_as_float16 and dimensions[3] % 2 != 0:
            raise RuntimeError("Cannot pack float16 data as float32 because dimensions is an odd number")

        with open_nc4_dataset(filename, folder=output_dir, mode="w") as dst:
            # //2 because we pack float16s as float32s
            lon_size = dimensions[3] // 2 if pack_as_float16 else dimensions[3]

            dst.createDimension("lon", lon_size)
            dst.createDimension("lat", dimensions[2])
            dst.createDimension("lev", num_levels)
            dst.createDimension("time", dimensions[0])

            for variable in variables:
                data = dataset.variables[variable]

                dst.createVariable(variable, "f", data.dimensions,
                                   compression=compression,
                                   complevel=compression_level)
                dst[variable].setncatts(data.__dict__)

                # Both branches must match the 36-level "lev" dimension; the
                # unpacked branch used to write every level of the input.
                selected = data[:, -num_levels:]

                if pack_as_float16:
                    data_float16 = np.array(selected, dtype="float16")
                    dst[variable][:] = data_float16.view("float32")
                else:
                    dst[variable][:] = selected


def compress_all_nc4(collection: str,
                     variables,
                     start_year, end_year,
                     start_month=1, end_month=12,
                     start_day=1, end_day=None,
                     **kwargs):
    """Run ``compress_nc4`` on the daily file of every date in a range.

    File names are built as ``MERRA2_<stream>.<collection>.<YYYYMMDD>.nc4``.

    Args:
        collection: Collection name embedded in each file name.
        variables: Forwarded to ``compress_nc4``.
        start_year, end_year: Inclusive range of years.
        start_month, end_month: First month of ``start_year`` and last month of
            ``end_year`` (inclusive); years in between cover months 1-12.
        start_day: First day of the first month; later months start at day 1.
        end_day: Last day of the last month. A falsy value means "through the
            end of that month".
        **kwargs: Forwarded to ``compress_nc4`` (e.g. ``raw_dir``, ``output_dir``).
    """
    for yyyy in range(start_year, end_year + 1):

        mi = start_month if yyyy == start_year else 1
        mf = end_month if yyyy == end_year else 12

        for mm in range(mi, mf + 1):

            # Resolve the stream per (year, month), exactly as the URL builders
            # in this notebook do, so the names generated here match the
            # downloaded files. Presumably the stream can differ between months
            # of one year -- the helper accepts the month for that reason.
            stream = get_merra_stream_from_year(yyyy, mm)

            di = start_day if yyyy == start_year and mm == mi else 1
            df = end_day if end_day and yyyy == end_year and mm == mf else monthrange(yyyy, mm)[1]

            for dd in range(di, df + 1):
                filename = f"MERRA2_{stream}.{collection}.{yyyy}{mm:0>2}{dd:0>2}.nc4"
                compress_nc4(filename, variables, **kwargs)

In [7]:
# Inspect the structure of one daily file (dimensions and data variables).
# `print_nc4_metadata` is not defined in this notebook; presumably it comes from
# one of the star imports at the top.
print_nc4_metadata("MERRA2_300.tavg3_3d_asm_Nv.20010817.nc4")

<xarray.Dataset>
Dimensions:  (time: 8, lev: 72, lat: 361, lon: 288)
Dimensions without coordinates: time, lev, lat, lon
Data variables:
    U        (time, lev, lat, lon) float32 ...
    V        (time, lev, lat, lon) float32 ...


In [None]:
# Compress U and V for every day from 1980-01-01 through the end of 2022
# (end_month / end_day are left at their defaults).
# NOTE(review): the collection here is "tavg3_3d_cld_Np", but the files
# downloaded and inspected elsewhere in this notebook are "tavg3_3d_asm_Nv" --
# confirm this is the intended collection before running.
compress_all_nc4(
    collection="tavg3_3d_cld_Np",
    variables=["U", "V"],
    start_year=1980,
    end_year=2022,
    start_month=1,
    start_day=1,
)
