In [2]:
import pickle

from fft import *
from data_loading import *


In [5]:
# Selects the model sub-directory under models/3D-dft-daily/<variable>/ and tags the output file name.
estimate_quantile = 0.9935
# Number of random sample points to draw; also recorded in the output file name.
n = 100_000_000

In [277]:
# Draw n random sample points as integer coordinates (time step, level, latitude, longitude).
# A seeded Generator keeps the sampled subset reproducible across kernel restarts;
# the previous unseeded np.random.randint calls produced a different dataset on every run.
rng = np.random.default_rng(0)
longitudes = rng.integers(0, 576, size=n, dtype="uint16")
latitudes = rng.integers(0, 361, size=n, dtype="uint16")
times = rng.integers(0, 365 * 8, size=n, dtype="uint16")  # 8 steps per day over 365 days
levels = rng.integers(0, 36, size=n, dtype="uint8")

# Placeholder for the estimate/actual value columns, which start out as all zeros.
zeros = np.zeros(shape=n, dtype="float16")

data = pd.DataFrame({"time": times, "lev": levels, "lat": latitudes, "lon": longitudes,
                     "u_est": zeros, "v_est": zeros, "u": zeros, "v": zeros})
# Sort by time step and index on it, so data.loc[step] selects every sample of one step.
data = data.sort_values(by="time", ignore_index=True)
data = data.set_index("time")
data.head()

Unnamed: 0_level_0,lev,lat,lon,u_est,v_est,u,v
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,17,48,359,0.0,0.0,0.0,0.0
0,29,59,54,0.0,0.0,0.0,0.0
0,15,50,392,0.0,0.0,0.0,0.0
0,14,190,116,0.0,0.0,0.0,0.0
0,8,324,317,0.0,0.0,0.0,0.0


In [45]:
# Report the in-memory footprint of the sample table (decimal megabytes).
print(f"Size: {data.memory_usage().sum() / (1000 ** 2)} MB")

# "time" is the index of `data`, and DataFrame.duplicated(subset=...) only accepts column
# labels, so the index has to be turned back into a column before counting duplicate
# (lat, lon, lev, time) points; otherwise the call raises KeyError.
duplicate_count = data.reset_index().duplicated(subset=['lat', 'lon', 'lev', 'time']).sum()
print(f"Duplicates: {100 * duplicate_count / n:.3f}%")


Size: 1500.0 MB
Duplicates: 0.228901%


In [None]:
i = 0
for mm in tqdm(range(1, 13)):
    for dd in tqdm(range(1, monthrange(2001, mm)[1] + 1)):
        for t in range(8):
            indices = data.loc[i][["lev", "lat", "lon"]].values.T

            filename = f"YAVG{mm:0>2}{dd:0>2}-{t * 3 + 1:0>2}:30.bin"

            with open(f"models/3D-dft-daily/U/{estimate_quantile}/{filename}", "rb") as file:
                fft = pickle.load(file)
                estimate = idft3_at_time(*fft)
                u_est = estimate[*indices]

            with open(f"models/3D-dft-daily/V/{estimate_quantile}/{filename}", "rb") as file:
                fft = pickle.load(file)
                estimate = idft3_at_time(*fft)
                v_est = estimate[*indices]

            data.loc[i, "u_est"] = u_est.astype("float16")
            data.loc[i, "v_est"] = v_est.astype("float16")

            filename = f"MERRA2.tavg3_3d_asm_Nv.YAVG{mm:0>2}{dd:0>2}.nc4"

            actual = load_variable_at_time(filename, "U", t, cache=False)
            data.loc[i, "u"] = actual[*indices]

            actual = load_variable_at_time(filename, "V", t, cache=False)
            data.loc[i, "v"] = actual[*indices]

            i += 1


In [321]:
# Turn the "time" index back into an ordinary column for export.
df = data.reset_index()
# Time steps run from 0 to 365 * 8 - 1 = 2919; casting to uint8 would silently wrap them
# modulo 256 and corrupt the column, so keep a 16-bit unsigned type.
df["time"] = df["time"].astype("uint16")
df["u_est"] = df["u_est"].astype("float16")
df["v_est"] = df["v_est"].astype("float16")
# Shuffle the rows; the fixed random_state makes the row order reproducible.
df = df.sample(frac=1, ignore_index=True, random_state=0)
df.head()


Unnamed: 0,time,lev,lat,lon,u_est,v_est,u,v
0,0,17,48,359,-5.46875,-1.162109,-5.003906,-0.922852
1,0,29,59,54,10.289062,-2.681641,10.375,-2.595703
2,0,15,50,392,0.776367,0.741211,0.597656,0.539062
3,0,14,190,116,-5.0,-1.376953,-4.503906,-1.520508
4,0,8,324,317,7.867188,0.561523,7.277344,0.813477


In [10]:
# Write the table to a Feather file whose name records the model quantile and sample count.
output_path = f"raw/subset/UV-{estimate_quantile}-{n}.ft"
df.to_feather(output_path)
