In [1]:
# Extend the import search path two directories up so the project-local modules
# imported below (`fft`, `data_loading`) can be found from this notebook's folder.
import sys
sys.path.append("../..")

import pickle

# NOTE(review): `np`, `pd`, `tqdm` and `monthrange` are used further down but are never
# imported explicitly; presumably they arrive through these wildcard imports — confirm,
# and consider importing them by name.
from fft import *
from data_loading import *


In [2]:
# Picks the sub-directory of the pickled U/V model files and is embedded in the
# name of the exported feather file.
estimate_quantile = 0.9935

# Number of random (time, lev, lat, lon) points to draw: one hundred million.
n = 100_000_000

In [3]:
# Draw `n` random grid coordinates from a seeded generator so that a fresh
# "Restart & Run All" reproduces exactly the same sample.
rng = np.random.default_rng(0)

# Upper bounds are exclusive. Each array uses the narrowest unsigned dtype that can hold
# its range, which keeps the 100M-row frame compact.
longitudes = rng.integers(0, 576, size=n, dtype="uint16")
latitudes = rng.integers(0, 361, size=n, dtype="uint16")
times = rng.integers(0, 365 * 8, size=n, dtype="uint16")  # 8 time steps per day, 365 days
levels = rng.integers(0, 36, size=n, dtype="uint8")

# Placeholder for the four float columns; they are overwritten per time step later on.
zeros = np.zeros(shape=n, dtype="float16")

data = pd.DataFrame({"time": times, "lev": levels, "lat": latitudes, "lon": longitudes,
                     "U_est": zeros, "V_est": zeros, "U": zeros, "V": zeros})
# Sort by time and index on it so that all points sharing a time step can be selected
# together with `data.loc[<time step>]`.
data = data.sort_values(by="time", ignore_index=True)
data = data.set_index("time")
data.head()

Unnamed: 0_level_0,lev,lat,lon,U_est,V_est,U,V
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,18,221,303,0.0,0.0,0.0,0.0
0,21,338,38,0.0,0.0,0.0,0.0
0,31,7,574,0.0,0.0,0.0,0.0
0,8,202,220,0.0,0.0,0.0,0.0
0,8,45,221,0.0,0.0,0.0,0.0


In [45]:
# Report the memory footprint of the frame and the share of sampled points that collide
# on the same (time, lev, lat, lon) coordinate.
print(f"Size: {data.memory_usage().sum() / (1000 ** 2)} MB")

# "time" is the index of `data`, not a column, so naming it in `subset=` raises KeyError.
# Build a coordinates-only frame with the index restored as a column and check that.
coords = data[["lev", "lat", "lon"]].reset_index()
print(f"Duplicates: {100 * coords.duplicated().sum() / n:.3f}%")


Size: 1500.0 MB
Duplicates: 0.228901%


In [4]:
# Fill the estimate columns (U_est, V_est) and the reference columns (U, V) one time
# step at a time.
#
# The loop walks the calendar of 2001 (not a leap year, so 365 days) with 8 steps per day;
# `time_idx` therefore runs over 0 .. 365 * 8 - 1, the same range the "time" index of
# `data` was sampled from.
#
# NOTE(review): the model files are read with pickle, which can execute arbitrary code on
# load — only point this at files you produced yourself.
time_idx = 0
for mm in tqdm(range(1, 13), desc="month"):
    for dd in tqdm(range(1, monthrange(2001, mm)[1] + 1), desc="day"):
        for t in range(8):
            # (lev, lat, lon) of every sampled point at this time step, as a tuple of index
            # arrays for NumPy-style fancy indexing. Selecting the three columns inside
            # the same .loc call avoids materialising the float columns first.
            indices = tuple(data.loc[time_idx, ["lev", "lat", "lon"]].values.T)

            model_file = f"YAVG{mm:0>2}{dd:0>2}-{t * 3 + 1:0>2}:30.bin"
            merra_file = f"MERRA2.tavg3_3d_asm_Nv.YAVG{mm:0>2}{dd:0>2}.nc4"

            for var in ("U", "V"):
                # Estimate: load the pickled arguments and evaluate the inverse transform.
                with open(f"../../models/3D-dft-daily/{var}/{estimate_quantile}/{model_file}", "rb") as file:
                    pickled_fft = pickle.load(file)
                estimate = idft3_at_time(*pickled_fft)
                data.loc[time_idx, f"{var}_est"] = estimate[indices].astype("float16")

                # Reference values for the same variable and time step.
                actual = load_variable_at_time(merra_file, var, t, cache=False, folder="..")
                # Cast explicitly, as for the estimates, so the column stays float16 whatever
                # dtype the loader returns (assumes an array with .astype — confirm).
                data.loc[time_idx, var] = actual[indices].astype("float16")

            time_idx += 1


  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

In [5]:
# Turn "time" back into an ordinary column and shuffle the rows so the exported frame is
# no longer ordered by time.
df = data.reset_index()
# Pin the compact dtype in case the index round-trip widened it.
df["time"] = df["time"].astype("uint16")
# Fixed seed: the row order (and hence the exported file) is identical on every run.
df = df.sample(frac=1, ignore_index=True, random_state=0)
df.head()


Unnamed: 0,time,lev,lat,lon,U_est,V_est,U,V
0,1131,24,156,542,-5.492188,-1.463867,-5.203125,-1.026367
1,132,16,121,383,0.241455,-0.376465,0.551758,-0.787109
2,1814,7,342,319,9.96875,4.695312,9.671875,5.15625
3,309,31,186,45,-8.609375,-0.665527,-8.6875,-0.316895
4,822,30,286,284,0.281006,0.276367,-0.011658,0.172607


In [6]:
# Write the shuffled frame to a feather file whose name records the quantile and the
# sample size used to build it.
output_path = f"UV-{estimate_quantile}-{n}.ft"
df.to_feather(output_path)
