# Test initial Dataloader Change

In [2]:
import pandas as pd
from pathlib import Path
import xarray as xr
import numpy as np
# colormaps = https://matplotlib.org/cmocean/
import cmocean
import matplotlib.pyplot as plt
import geopandas as gpd
from typing import List, Tuple, Dict, Union, Optional

import matplotlib as mpl
import seaborn as sns
mpl.rcParams['figure.dpi'] = 100

import os

# set the working directory
current_path = Path('.').resolve()
if current_path == Path('/home/tommy/ml_drought/notebooks/draft'):
    os.chdir(current_path.parents[1].as_posix())

current_path = Path('.').resolve()
assert current_path.name == "ml_drought"

%load_ext autoreload
%autoreload 2

In [3]:
from scripts.utils import get_data_path
from src.engineer import Engineer
from src.utils import get_ds_mask

In [4]:
data_dir = get_data_path()

# Load the preprocessed data

In [11]:
from src.engineer import Engineer

engineer = Engineer(
    get_data_path(), experiment="one_month_forecast", process_static=True
)
e = engineer.engineer_class

In [12]:
data = e._make_dataset(static=False)

Processing /cats/datastore/data/interim/VCI_preprocessed/data_india.nc
Processing /cats/datastore/data/interim/reanalysis-era5-land-monthly-means_preprocessed/volumetric_soil_water_layer_1_data_india.nc
Processing /cats/datastore/data/interim/reanalysis-era5-land-monthly-means_preprocessed/volumetric_soil_water_layer_2_data_india.nc
Processing /cats/datastore/data/interim/reanalysis-era5-land-monthly-means_preprocessed/volumetric_soil_water_layer_3_data_india.nc
Processing /cats/datastore/data/interim/reanalysis-era5-land-monthly-means_preprocessed/volumetric_soil_water_layer_4_data_india.nc
Processing /cats/datastore/data/interim/reanalysis-era5-land-monthly-means_preprocessed/potential_evaporation_data_india.nc
Processing /cats/datastore/data/interim/reanalysis-era5-land-monthly-means_preprocessed/total_precipitation_data_india.nc
Processing /cats/datastore/data/interim/reanalysis-era5-land-monthly-means_preprocessed/2m_temperature_data_india.nc
Processing /cats/datastore/data/interi

In [14]:
data

In [141]:
dynamic_data = data
static_data = e._make_dataset(static=True)

Processing /cats/datastore/data/interim/static/srtm_preprocessed/india.nc
Processing /cats/datastore/data/interim/static/reanalysis-era5-single-levels-monthly-means_preprocessed/data_india.nc
Processing /cats/datastore/data/interim/static/esa_cci_landcover_preprocessed/esa_cci_landcover_india_one_hot.nc


In [133]:
from collections import defaultdict
from typing import DefaultDict, Dict
from pathlib import Path
import xarray as xr


# class Normalizer:
def calculate_normalization_dict(ds: xr.Dataset, static: bool = False) -> DefaultDict[str, Dict[str, float]]:
    normalization_values: DefaultDict[str, Dict[str, float]] = defaultdict(dict)
    if static:
        reducing_dims = ["lat", "lon"]
    else:
        reducing_dims = ["lat", "lon", "time"]

    for var in ds.data_vars:
        if var.endswith("one_hot"):
            mean = 0.0
            std = 1.0
        else:
            mean = float(
                ds[var].mean(dim=reducing_dims, skipna=True).values
            )
            std = float(ds[var].std(
                dim=reducing_dims, skipna=True).values)

        normalization_values[var]["mean"] = mean
        normalization_values[var]["std"] = std
        
    return normalization_values


def normalize_xr(ds: xr.Dataset, static: bool = False) -> Tuple[xr.Dataset, DefaultDict[str, Dict[str, float]]]:
    """Normalize the xarray object

    Args:
        ds (xr.Dataset): [description]
        static (bool, optional): [description]. Defaults to False.

    Returns:
        xr.Dataset: Xarray Dataset with normalized values
        DefaultDict[str, Dict[str, float]]: The values
    """
    norm_dict = calculate_normalization_dict(ds, static)

    list_of_normed: List[xr.DataArray] = []
    for variable in ds.data_vars:
        list_of_normed.append((ds[variable] - norm_dict[variable]
                        ["mean"]) / norm_dict[variable]["std"])

    ds_norm = xr.merge(list_of_normed)

    return ds_norm, norm_dict


def unnormalize_xr(ds: xr.Dataset, normalization_dict: DefaultDict[str, Dict[str, float]]) -> xr.Dataset:
    list_of_unnormed: List[xr.DataArray] = []
    for variable in ds.data_vars:
        list_of_unnormed.append(
            (ds[variable] * normalization_dict[variable]["std"]) +
            normalization_dict[variable]["mean"]
        )

    ds_unnorm = xr.merge(list_of_unnormed)

    return ds_unnorm

In [139]:
cfg = dict(
    dynamic_inputs=["tp", "tprate_mean_1"],
    target_variable=["VCI"],
    static_inputs=[],
    seq_length=3,
    min_test_time="2010-01-01",
    max_test_time="2020-01-01",
    min_train_time="2000-01-01",
    max_train_time="2009-12-31",
)

test_times = pd.date_range(cfg["min_test_time"], cfg["max_test_time"], freq="M")
train_times = pd.date_range(cfg["min_train_time"], cfg["max_train_time"], freq="M")

In [143]:
dyn_scaler = calculate_normalization_dict(dynamic_data)
stat_scaler = calculate_normalization_dict(static_data, static=True)

# Convert into pixel based information

In [16]:
pixels = data.copy().stack(pixel=["lat", "lon"])

In [19]:
df = data.to_dataframe()

In [20]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,VCI,swvl1,swvl2,swvl3,swvl4,pev,tp,t2m,e,t2m_std_1,...,erate_std_2,erate_mean_2,erate_std_3,erate_mean_3,tprate_std_1,tprate_mean_1,tprate_std_2,tprate_mean_2,tprate_std_3,tprate_mean_3
lat,lon,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
36.0,67.300003,2000-02-29,63.625,0.414532,0.389866,0.294564,0.308208,-0.000927,0.000686,269.374756,-0.0002,1.683767,...,,,,,4.39246e-09,1.36465e-08,,,,
36.0,67.300003,2000-03-31,55.457497,0.402257,0.393125,0.327063,0.308119,-0.001773,0.000501,275.35495,-0.000781,1.541664,...,1.035139e-09,-3.085186e-09,,,3.034482e-09,1.347471e-08,4.559223e-09,1.636154e-08,,
36.0,67.300003,2000-04-30,47.074997,0.319747,0.325866,0.323359,0.307672,-0.00348,0.000563,285.641968,-0.001639,1.380861,...,3.013228e-09,-8.664283e-09,2.58497e-09,-8.76998e-09,4.76329e-09,1.997592e-08,9.146565e-09,2.593243e-08,9.685661e-09,2.447746e-08
36.0,67.300003,2000-05-31,11.894,0.217281,0.246682,0.29288,0.305913,-0.00465,5e-05,291.65683,-0.001123,1.355013,...,3.054498e-09,-2.387456e-08,4.536804e-09,-2.057241e-08,9.622732e-09,2.031139e-08,8.299525e-09,2.892003e-08,8.969445e-09,3.169543e-08
36.0,67.300003,2000-06-30,2.4275,0.20454,0.228852,0.263577,0.303094,-0.004817,8.3e-05,292.572998,-0.000602,0.817621,...,4.978621e-09,-3.039982e-08,2.819519e-09,-3.373821e-08,1.482762e-09,4.693968e-09,8.613428e-09,1.330562e-08,1.054342e-08,1.819778e-08


In [21]:
df.shape

(22438240, 27)

In [27]:
d = df.iloc[0:1000]

In [32]:
metadata = d.index.to_numpy()


In [53]:
def get_pixel_data(ds, pixel):
    pass


NameError: name 'input_variables' is not defined

In [None]:
# import h5py

# h5_file = data_dir / "test.h5"

# n_dyn_inputs = len(cfg["dynamic_inputs"])
# n_targets = len(cfg["target_variable"])
# # we only store user-defined additional static features provided in the additional_features table
# n_stat = len(cfg["static_inputs"])

# with h5py.File(h5_file, "w") as out_f:
#     dyn_input_data = out_f.create_dataset(
#         "dynamic_inputs",
#         shape=(0, cfg["seq_length"], n_dyn_inputs),
#         maxshape=(None, cfg["seq_length"], n_dyn_inputs),
#         chunks=True,
#         dtype=np.float32,
#         compression="gzip",
#     )
#     if n_stat > 0:
#         stat_input_data = out_f.create_dataset(
#             "static_inputs",
#             shape=(0, n_stat),
#             maxshape=(None, n_stat),
#             chunks=True,
#             dtype=np.float32,
#             compression="gzip",
#         )
#     target_data = out_f.create_dataset(
#         "target_data",
#         shape=(0, cfg["seq_length"], n_targets),
#         maxshape=(None, cfg["seq_length"], n_targets),
#         chunks=True,
#         dtype=np.float32,
#         compression="gzip",
#     )

#     sample_2_basin = out_f.create_dataset(
#         "sample_2_basin",
#         shape=(0,),
#         maxshape=(None,),
#         dtype="S11",
#         compression="gzip",
#         chunks=True,
#     )

In [62]:
input_variables = ["tp", "tprate_mean_1"]
target_variable = "VCI"

In [63]:
# sample_id
for spatial_unit in all_spatial_units:
    pass

In [64]:
all_spatial_units = data.stack(pixel=["lat", "lon"]).pixel.values[:5]

# TEST
spatial_unit = all_spatial_units[0]
# ------

su_data = data.sel(lat=spatial_unit[0], lon=spatial_unit[1])

In [146]:
total_samples = 0
spatial_units_without_train_data = []

dynamic_data = su_data[cfg["dynamic_inputs"] + cfg["target_variable"]]

# drop missing timesteps
missing_timesteps = np.any(np.isnan(dynamic_data).to_array().values, axis=0)
assert len(missing_timesteps.shape) == 1, "Expect to have reduced missing data shape to missing timesteps (in any variable) (1 Dimensional)"
dynamic_data = dynamic_data.sel(time=~missing_timesteps)

# update data in h5 file
num_samples = len(dynamic_data["time"])
total_samples = dyn_input_data.shape[0] + num_samples

# resize h5file
# dyn_input_data.resize((total_samples, cfg["seq_length"], n_dyn_inputs))




ValueError: Not a dataset (not a dataset)

NameError: name 'norm_dict' is not defined

In [125]:
# import h5py

# with h5py.File(h5_file, "w") as out_f:
#     dyn_input_data = out_f.create_dataset(
#         "dynamic_inputs",
#         shape=(0, cfg["seq_length"], n_dyn_inputs),
#         maxshape=(None, cfg["seq_length"], n_dyn_inputs),
#         chunks=True,
#         dtype=np.float32,
#         compression="gzip",
#     )
#     if n_stat > 0:
#         stat_input_data = out_f.create_dataset(
#             "static_inputs",
#             shape=(0, n_stat),
#             maxshape=(None, n_stat),
#             chunks=True,
#             dtype=np.float32,
#             compression="gzip",
#         )
#     target_data = out_f.create_dataset(
#         "target_data",
#         shape=(0, cfg["seq_length"], n_targets),
#         maxshape=(None, cfg["seq_length"], n_targets),
#         chunks=True,
#         dtype=np.float32,
#         compression="gzip",
#     )
#     q_stds = out_f.create_dataset(
#         "q_stds",
#         shape=(0, 1),
#         maxshape=(None, 1),
#         dtype=np.float32,
#         compression="gzip",
#         chunks=True,
#     )
#     sample_2_basin = out_f.create_dataset(
#         "sample_2_basin",
#         shape=(0,),
#         maxshape=(None,),
#         dtype="S11",
#         compression="gzip",
#         chunks=True,
#     )

#     scalers = {
#         "dyn_mean": np.zeros(n_dyn_inputs),
#         "dyn_std": np.zeros(n_dyn_inputs),
#         "target_mean": np.zeros(n_targets),
#         "target_std": np.zeros(n_targets),
#     }
#     total_samples = 0

#     basins_without_train_data = []


In [86]:
dynamic_data.where(np.isna)

TypeError: isnull() got an unexpected keyword argument 'dim'

In [99]:
np.any(np.isnan(dynamic_data).to_array().values, axis=-1)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

3

(92720,)

In [24]:
304*305*242

22438240