In [1]:
import sys

sys.path.append("../")

import pickle

from fft import *
from data_loading import *
from maths import linear_interpolate, rmse


In [3]:
def build_dataset(n, variable):
    """Draw ``n`` random grid points and store the matching ``variable`` values.

    Random (time, lev, lat, lon) index tuples are sampled, ordered by level,
    and used to index the array returned by ``load_variable``. The in-memory
    size of the sample table and the percentage of repeated index tuples are
    printed, and the table is written to ``subset/{variable}-{n}.ft``.

    Args:
        n: number of random samples to draw.
        variable: name passed to ``load_variable``; also the value column name.
    """
    # The draw order (lon, lat, time, lev) is kept fixed so that a seeded
    # global NumPy RNG yields the same sample as before.
    lon_idx = np.random.randint(0, 576, size=n, dtype="uint16")
    lat_idx = np.random.randint(0, 361, size=n, dtype="uint16")
    time_idx = np.random.randint(0, 365 * 8, size=n, dtype="uint16")
    lev_idx = np.random.randint(0, 36, size=n, dtype="uint8")

    # The value column starts as a float16 placeholder so the size report
    # below accounts for it; it is overwritten once the real data is loaded.
    frame = pd.DataFrame({
        "time": time_idx,
        "lev": lev_idx,
        "lat": lat_idx,
        "lon": lon_idx,
        variable: np.zeros(shape=n, dtype="float16"),
    })
    frame = frame.sort_values(by="lev", ignore_index=True)

    size_mb = frame.memory_usage().sum() / (1000 ** 2)
    print(f"Size: {size_mb} MB")
    repeated = frame.reset_index().duplicated(subset=['lat', 'lon', 'lev', 'time']).sum()
    print(f"Duplicates: {100 * repeated / n:.3f}%")

    # One index array per axis, in the order used to index the loaded array.
    lookup = tuple(frame[["time", "lev", "lat", "lon"]].values.T)

    field = load_variable("MERRA2.tavg3_3d_asm_Nv.YAVG{:0>2}{:0>2}.nc4", variable, cache=False, folder=".")
    frame[variable] = field[lookup]
    del field  # drop the reference to the full array before writing

    frame.to_feather(f"subset/{variable}-{n}.ft")


def build_3dft_dataset(n, quantile, variables=("U", "V"),
                       variable_levels={"U": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 16, 19, 23, 28, 32, 34, 35],
                                        "V": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 17, 20, 23, 28, 32, 34, 35]}):
    """Sample ``n`` random grid points with true and 3D-DFT-estimated values.

    For every name in ``variables`` the output table gets two columns:
    ``<var>``, looked up in the array returned by ``load_variable``, and
    ``<var>_est``, looked up in an estimate rebuilt from the pickled per-level
    transforms stored at ``../models/3D-dft/<var>/<quantile>/<lev>.bin``.
    Levels without a stored transform are filled by linear interpolation
    between the nearest stored levels below and above.

    Args:
        n: number of random (time, lev, lat, lon) samples.
        quantile: selects the model directory; also embedded in the output
            file name.
        variables: names of the variables to include.
        variable_levels: maps each variable to the levels that have a stored
            transform. Each list must be strictly ascending and run from 0 to
            35, because missing levels are interpolated from their positional
            neighbours. The mapping is only read, never modified.

    Raises:
        ValueError: if a level list is empty, unsorted, has duplicates, or
            does not start at 0 and end at 35.

    Side effects:
        Prints the table's memory footprint and duplicate-sample percentage,
        and writes ``subset/<vars>-3DFT-<quantile>-<n>.ft``.
    """
    n_levels = 36

    # Validate up front: a bad level list would otherwise extrapolate or wrap
    # around silently (or fail only after the expensive loads below).
    for var in variables:
        stored = variable_levels[var]
        if (not stored or stored[0] != 0 or stored[-1] != n_levels - 1
                or list(stored) != sorted(set(stored))):
            raise ValueError(
                f"variable_levels[{var!r}] must be strictly ascending and span 0..{n_levels - 1}"
            )

    longitudes = np.random.randint(0, 576, size=n, dtype="uint16")
    latitudes = np.random.randint(0, 361, size=n, dtype="uint16")
    times = np.random.randint(0, 365 * 8, size=n, dtype="uint16")
    levels = np.random.randint(0, n_levels, size=n, dtype="uint8")

    zeros = np.zeros(shape=n, dtype="float16")

    cols = {"time": times, "lev": levels, "lat": latitudes, "lon": longitudes}
    # dict.update() mutates in place and returns None, so its result must not
    # be assigned back. Placeholders are created for both the true-value and
    # the estimate columns so the size report below covers the final layout.
    cols.update({var: zeros for var in variables})
    cols.update({f"{var}_est": zeros for var in variables})

    data = pd.DataFrame(cols)
    data = data.sort_values(by="lev", ignore_index=True)

    print(f"Size: {data.memory_usage().sum() / (1000 ** 2)} MB")
    print(f"Duplicates: {100 * data.reset_index().duplicated(subset=['lat', 'lon', 'lev', 'time']).sum() / n:.3f}%")

    # One index array per axis; tuple(...) is equivalent to the starred
    # subscript form but also parses on Python versions before 3.11.
    indices = tuple(data[["time", "lev", "lat", "lon"]].values.T)

    for var in variables:
        targets = load_variable("MERRA2.tavg3_3d_asm_Nv.YAVG{:0>2}{:0>2}.nc4", var, cache=False, folder=".")

        data[var] = targets[indices]
        del targets  # drop the reference before loading the next variable

    for variable in variables:
        stored_levels = variable_levels[variable]

        # Full (time, lev, lat, lon) grid: about 44 GB as float16.
        estimate = np.zeros((8 * 365, n_levels, 361, 576), dtype="float16")

        for lev in tqdm(stored_levels):
            # SECURITY: pickle.load can execute arbitrary code from the file;
            # only load model files from a trusted source.
            with open(f"../models/3D-dft/{variable}/{quantile}/{lev}.bin", "rb") as file:
                fft = pickle.load(file)

            # NOTE(review): presumably element 0 of the result is the
            # reconstructed field for this level - confirm against idft3_at_level.
            estimate[:, lev] = idft3_at_level(*fft)[0].astype("float16")

        # `below` is the position in stored_levels of the nearest stored
        # level at or below the current one.
        below = -1
        for lev in tqdm(range(n_levels)):
            if lev in stored_levels:
                below += 1
                continue

            lower = stored_levels[below]
            upper = stored_levels[below + 1]
            t = (lev - lower) / (upper - lower)
            estimate[:, lev] = linear_interpolate((estimate[:, lower], estimate[:, upper]), 0, t)

        data[f"{variable}_est"] = estimate[indices]

    data.to_feather(f"subset/{''.join(variables)}-3DFT-{quantile}-{n}.ft")

In [4]:
# Draw 10,000,000 random samples of variable "V"; the result is written to subset/V-10000000.ft.
build_dataset(10000000, variable="V")

Size: 90.000128 MB
Duplicates: 0.023%


  0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
# Build the 10,000,000-sample table of true vs. 3D-DFT-estimated values for the
# default variables ("U", "V"), using the models stored under quantile 0.9975.
build_3dft_dataset(10000000, quantile=0.9975)