In [1]:
# Make the parent directory importable so the project-local `data_loading`
# module can be found when the notebook runs from its own folder.
import sys

sys.path.append("..")

# NOTE(review): the later cells use `pd`, `np` and
# `load_variable_at_time_and_level` without importing them, so they are
# presumably provided by this star import — confirm against data_loading.
from data_loading import *


In [2]:
# Both values are interpolated into the input and output feather file names.
n = 10_000_000
estimate_quantile = 0.9935

# Per-component statistics used to standardize U / V and their *_est columns:
# column -> (column - mean) / std.
means = dict(U=5.589, V=0.018)
stds = dict(U=9.832, V=3.232)


In [4]:
# Read the subset file; `data` is kept as the untouched reference frame while
# `df` is the working copy that the following cells transform.
subset_path = f"subset/UV-{estimate_quantile}-{n}.ft"
data = pd.read_feather(subset_path)
df = data.copy()

# Feature-set flags. Each transformation cell switches its own flag on, and the
# flags determine the tag used in the output file name.
NORMALIZED = GEOGRAPHICAL = CYCLIC_TIME = False

data.head()


Unnamed: 0,time,lev,lat,lon,U_est,V_est,U,V
0,2452,0,41,223,26.390625,3.857422,26.421875,3.845703
1,766,0,174,458,-2.804688,-2.136719,-2.679688,-1.933594
2,869,0,297,468,10.5,-2.021484,10.625,-1.998047
3,2272,0,26,264,7.726562,3.412109,7.730469,3.423828
4,2787,0,266,269,13.054688,-2.029297,13.171875,-2.205078


In [5]:
NORMALIZED = True

# Every column below is recomputed from the untouched `data` frame instead of
# being updated in place on `df`, so re-running this cell cannot scale a column
# twice (the first-run result is the same as with in-place updates).

# Grid indices -> degrees (0.5 per lat step starting at -90, 0.625 per lon step
# starting at -180), then divided by 90 / 180. The level index is divided by 36.
df["lev"] = (data["lev"] / 36).astype("float16")
df["lat"] = (((data["lat"] * 0.5) - 90) / 90).astype("float16")
df["lon"] = (((data["lon"] * 0.625) - 180) / 180).astype("float16")

# Standardize each wind component and its estimate with the same mean and std.
# NOTE(review): the arithmetic runs in whatever dtype `data` stores these
# columns in (the dtype listing later in the notebook shows float16) — confirm
# that precision is acceptable.
for component in ("U", "V"):
    for column in (component, f"{component}_est"):
        df[column] = (data[column] - means[component]) / stds[component]

# Position within the day (time index modulo 8) and within the year
# (time index over 2920 = 8 * 365 steps), both as fractions.
df["day %"] = ((data["time"] % 8) / 8).astype("float16")
df["year %"] = (data["time"] / 2920).astype("float16")

# The raw time index is replaced by the two fractions above. `errors="ignore"`
# tolerates the column already being gone when the cell is re-run.
df = df.drop(columns=["time"], errors="ignore")

df.head()


Unnamed: 0,lev,lat,lon,U_est,V_est,U,V,day %,year %
0,0.0,-0.772461,-0.225708,2.115234,1.1875,2.119141,1.18457,0.5,0.839726
1,0.0,-0.033325,0.590332,-0.853516,-0.666504,-0.84082,-0.603516,0.75,0.262329
2,0.0,0.649902,0.625,0.499512,-0.630859,0.512207,-0.623535,0.625,0.297603
3,0.0,-0.855469,-0.083313,0.217407,1.049805,0.217773,1.053711,0.0,0.778082
4,0.0,0.477783,-0.065979,0.759766,-0.633301,0.771484,-0.6875,0.375,0.954452


In [6]:
CYCLIC_TIME = True

# Encode each periodic fraction as a (sin, cos) pair so that the end of a
# period sits next to its start.
# NOTE(review): the phase is computed from the float16 "day %" / "year %"
# columns, so it carries their rounding — confirm that this is acceptable.
for period in ("day", "year"):
    phase = df[f"{period} %"] * np.pi * 2
    df[f"sin_{period}"] = np.sin(phase).astype("float16")
    df[f"cos_{period}"] = np.cos(phase).astype("float16")

df.head()


Unnamed: 0,lev,lat,lon,U_est,V_est,U,V,day %,year %,sin_day,cos_day,sin_year,cos_year
0,0.0,-0.772461,-0.225708,2.115234,1.1875,2.119141,1.18457,0.5,0.839726,0.0,-1.0,-0.845215,0.53418
1,0.0,-0.033325,0.590332,-0.853516,-0.666504,-0.84082,-0.603516,0.75,0.262329,-1.0,-0.0,0.99707,-0.077393
2,0.0,0.649902,0.625,0.499512,-0.630859,0.512207,-0.623535,0.625,0.297603,-0.707031,-0.707031,0.955566,-0.294678
3,0.0,-0.855469,-0.083313,0.217407,1.049805,0.217773,1.053711,0.0,0.778082,0.0,1.0,-0.984375,0.175537
4,0.0,0.477783,-0.065979,0.759766,-0.633301,0.771484,-0.6875,0.375,0.954452,0.707031,-0.707031,-0.282227,0.959473


In [11]:
GEOGRAPHICAL = True

# Single source file for every field read in this cell.
CONST_FILE = "MERRA2_101.const_2d_asm_Nx.00000000.nc4"


def _load_constant_field(variable):
    """Load `variable` from CONST_FILE at time 0, level 0, from folder "raw"."""
    return load_variable_at_time_and_level(CONST_FILE, variable=variable,
                                           time=0, level=0, folder="raw")


def _standardize_to_float16(field):
    """Return `field` shifted to zero mean and scaled to unit std, as float16."""
    field = field - field.mean()
    field = field / field.std()
    return field.astype("float16")


# The FR* fields are only downcast; PHIS and SGH are standardized first.
frland = _load_constant_field("FRLAND").astype("float16")
frocean = _load_constant_field("FROCEAN").astype("float16")
frlake = _load_constant_field("FRLAKE").astype("float16")
frlandice = _load_constant_field("FRLANDICE").astype("float16")
phis = _standardize_to_float16(_load_constant_field("PHIS"))
sgh = _standardize_to_float16(_load_constant_field("SGH"))

# `indices` has one row of lat indices and one row of lon indices. Indexing a
# field with the two arrays picks one value per row of `data`
# (assumes the loader returns NumPy-style arrays — TODO confirm).
# The explicit `field[lat_idx, lon_idx]` form is used because `field[*indices]`
# is a syntax error on Python versions before 3.11.
indices = data[["lat", "lon"]].values.T
lat_idx, lon_idx = indices

geo_fields = {
    "frland": frland,
    "frocean": frocean,
    "frlake": frlake,
    "frlandice": frlandice,
    "phis": phis,
    "sgh": sgh,
}
for column, field in geo_fields.items():
    df[column] = field[lat_idx, lon_idx]

df.head()


(2, 10000000)


Unnamed: 0,lev,lat,lon,U_est,V_est,U,V,day %,year %,sin_day,cos_day,sin_year,cos_year,frland,frocean,frlake,frlandice,phis,sgh
0,0.0,-0.772461,-0.225708,2.115234,1.1875,2.119141,1.18457,0.5,0.839726,0.0,-1.0,-0.845215,0.53418,0.0,1.0,0.0,0.0,-0.447754,-0.393799
1,0.0,-0.033325,0.590332,-0.853516,-0.666504,-0.84082,-0.603516,0.75,0.262329,-1.0,-0.0,0.99707,-0.077393,0.522949,0.477295,0.0,0.0,-0.435303,-0.129272
2,0.0,0.649902,0.625,0.499512,-0.630859,0.512207,-0.623535,0.625,0.297603,-0.707031,-0.707031,0.955566,-0.294678,0.996582,0.0,0.003208,0.0,0.310303,1.616211
3,0.0,-0.855469,-0.083313,0.217407,1.049805,0.217773,1.053711,0.0,0.778082,0.0,1.0,-0.984375,0.175537,0.0,0.0,0.0,1.0,1.767578,-0.242432
4,0.0,0.477783,-0.065979,0.759766,-0.633301,0.771484,-0.6875,0.375,0.954452,0.707031,-0.707031,-0.282227,0.959473,0.0,1.0,0.0,0.0,-0.447754,-0.393799


In [15]:
# Inspect the dtype of every column before the frame is written to disk.
df.dtypes

lev          float16
lat          float16
lon          float16
U_est        float16
V_est        float16
U            float16
V            float16
day %        float16
year %       float16
sin_day      float16
cos_day      float16
sin_year     float16
cos_year     float16
frland       float16
frocean      float16
frlake       float16
frlandice    float16
phis         float16
sgh          float16
dtype: object

In [16]:
# The file name carries a tag listing the feature sets that were applied:
#   N  - normalized
#   G  - geographical features
#   CT - cyclic time
flag_tags = (
    (NORMALIZED, "N"),
    (GEOGRAPHICAL, "G"),
    (CYCLIC_TIME, "CT"),
)
dataset = "".join(tag for enabled, tag in flag_tags if enabled)

df.to_feather(f"UV-{dataset}-{estimate_quantile}-{n}.ft")

In [17]:
# Summary statistics of the final frame, computed on a float32 copy.
# NOTE(review): presumably upcast because float16 reductions over this many
# rows would lose precision or overflow — confirm.
df.astype("float32").describe()


Unnamed: 0,lev,lat,lon,U_est,V_est,U,V,day %,year %,sin_day,cos_day,sin_year,cos_year,frland,frocean,frlake,frlandice,phis,sgh
count,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0
mean,0.4861209,8.733363e-05,-0.001608628,0.001357466,0.0003005289,-0.0001619807,0.0002906788,0.4375165,0.4997328,-6.621992e-05,-2.84582e-05,0.0001064984,0.000147646,0.2208213,0.6579908,0.004570988,0.1166172,-0.0001563366,0.0002166144
std,0.2885287,0.5788891,0.5772211,0.9992734,0.9932732,1.00036,0.9996375,0.2864048,0.2886822,0.707075,0.7070631,0.7072459,0.7069758,0.3983926,0.4615592,0.0294222,0.314903,0.9995689,1.000632
min,0.0,-1.0,-1.0,-5.261719,-8.234375,-5.25,-8.234375,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,-0.5087891,-0.3937988
25%,0.25,-0.5,-0.5,-0.6621094,-0.5620117,-0.6635742,-0.5683594,0.125,0.2496338,-0.7070312,-0.7070312,-0.7070312,-0.7070312,0.0,0.0,0.0,0.0,-0.4477539,-0.3937988
50%,0.5,0.0,0.0,-0.213623,-0.01167297,-0.2144775,-0.0114975,0.5,0.4997559,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.4477539,-0.3937988
75%,0.722168,0.5,0.5,0.4868164,0.546875,0.4868164,0.5532227,0.75,0.75,0.7070312,0.7070312,0.7070312,0.7070312,0.09796143,1.0,0.0,0.0,-0.1680908,-0.1483154
max,0.972168,1.0,0.996582,7.855469,8.5625,7.886719,8.4375,0.875,0.9995117,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.078125,12.28906
