In [52]:
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

In [53]:
# Sizes of each dimension of the toy dataset.
samples = 10
timesteps = 5  # or window length
latitudes = 3
longitudes = 4
channels = 4

# Dimension order shared by both data variables.
dims = ["sample", "timestep", "lat", "lon", "channel"]

# Coordinate values for every dimension.
lat = np.linspace(-90, 90, latitudes)
lon = np.linspace(-180, 180, longitudes)
sample = np.arange(samples)
timestep = np.arange(timesteps)
channel = np.arange(channels)

# Seeded generator so that "Restart Kernel -> Run All" reproduces the same values.
rng = np.random.default_rng(42)
data_shape = (samples, timesteps, latitudes, longitudes, channels)
x_data = rng.random(data_shape)
y_data = rng.random(data_shape)

ds = xr.Dataset(
    {"x": (dims, x_data), "y": (dims, y_data)},
    coords={"sample": sample, "timestep": timestep, "lat": lat, "lon": lon, "channel": channel},
)

print("Created Dataset:")
# len() of a DataArray is the size of its first dimension, i.e. the number of samples.
print(len(ds.y))
print(len(ds.x))
print(ds.y.values.shape)
ds.y

Created Dataset:
10
10
(10, 5, 3, 4, 4)


Criando o dataset de forma dinâmica, como ocorre no `spatiotemporal_builder` (código abaixo):

In [54]:
# A 2x2 grid whose cells each wrap a single value has a trailing axis of size 1.
nested_grid = np.array([[[1], [2]], [[3], [4]]])
nested_grid.shape

(2, 2, 1)

In [55]:
CHANNEL = 1

# Three 2x2 (lat, lon) grids, one per timestep, stacked along a new leading axis.
a, b, c = (
    np.array(grid)
    for grid in ([[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]])
)
data = np.stack((a, b, c))

print(f"a.shape: {a.shape}")
print(f"b.shape: {b.shape}")
print(f"c.shape: {c.shape}")
print(f"""
    data.shape: {data.shape}
    window length or timesteps: {data.shape[0]}
    lat: {data.shape[1]}
    lon: {data.shape[2]}
    No channel dimension yet
""")

# Because there is a single channel, each grid must go from [[1, 2], [3, 4]]
# to [[[1], [2]], [[3], [4]]]. Either reshape or expand_dims does the job.
data_reshape = data.reshape(*data.shape, CHANNEL)
data_expand_dims = np.expand_dims(data, -1)
print(
    f"np.array_equal(data_reshape, data_expand_dims): {np.array_equal(data_reshape, data_expand_dims)}"
)
print(f"""
    data_reshape.shape: {data_reshape.shape}
    window length or timesteps: {data_reshape.shape[0]}
    lat: {data_reshape.shape[1]}
    lon: {data_reshape.shape[2]}
    channel: {data_reshape.shape[3]}, now we have the channel dimension
""")
# The first version of WebsirenesDataset.py ran the commented-out lines below:
# data = np.stack(timesteps, axis=0)
# data = data.reshape(data.shape[0], data.shape[1], data.shape[2], channel)
# return data

# Next step: place this sample (3 timesteps of 2x2x1) into an array of samples.
# Here there is exactly 1 sample.
data = np.array([data_reshape])
print(f"""
    data.shape: {data.shape}
    Samples: {data.shape[0]}
    window length or timesteps: {data.shape[1]}
    lat: {data.shape[2]}
    lon: {data.shape[3]}
    channel: {data.shape[4]}
""")

a.shape: (2, 2)
b.shape: (2, 2)
c.shape: (2, 2)

    data.shape: (3, 2, 2)
    window length or timesteps: 3
    lat: 2
    lon: 2
    No channel dimension yet

np.array_equal(data_reshape, data_expand_dims): True

    data_reshape.shape: (3, 2, 2, 1)
    window length or timesteps: 3
    lat: 2
    lon: 2
    channel: 1, now we have the channel dimension


    data.shape: (1, 3, 2, 2, 1)
    Samples: 1
    window length or timesteps: 3
    lat: 2
    lon: 2
    channel: 1



Agora vamos criar um exemplo com 2 amostras, uma grade de 9 latitudes por 11 longitudes e 3 canais (`channel`): temperatura no índice 0, precipitação no índice 1 e umidade no índice 2.

In [56]:
# Latitude values of the example grid (9 points, listed from -21.8 down to -23.8).
lats = np.array(
    [-21.8, -22.05, -22.3, -22.55, -22.8, -23.05, -23.3, -23.55, -23.8]
)
# Longitude values of the example grid (11 points, listed from -45.053 up to -42.552).
lons = np.array(
    [-45.053, -44.8029, -44.5528, -44.3027, -44.0526, -43.8025]
    + [-43.5524, -43.3023, -43.0522, -42.8021, -42.552]
)
for axis_values in (lats, lons):
    print(axis_values.shape)

(9,)
(11,)


In [57]:
# NOTE: np.sort returns ascending values and the flip reverses them, so despite its
# name `lat_sorted_ascending` holds the latitudes in DESCENDING order
# (-21.8 first, -23.8 last). The name is kept because later cells reference it.
lat_sorted_ascending = np.flip(np.sort(lats))
lon_sorted_ascending = np.sort(lons)

# Index coordinates: 2 samples, 5 timesteps per sample, 3 channels.
sample, timestep, channel = (np.arange(size) for size in (2, 5, 3))


def generate_sample(lat, lon, channels):
    """Create one random (len(lat), len(lon), channels) grid for a single timestep.

    Only the first three channel indices are filled, each with uniform random
    values: index 0 in [20, 30) (temperature), index 1 in [0, 60)
    (precipitation) and index 2 in [0, 100) (humidity). Any further channel
    stays at zero; asking for fewer than three channels raises IndexError.

    Args:
        lat: sized collection of latitudes (only its length is used).
        lon: sized collection of longitudes (only its length is used).
        channels: size of the last axis of the returned array.

    Returns:
        np.ndarray of shape (len(lat), len(lon), channels).
    """
    grid_shape = (len(lat), len(lon))
    # (low, high) bounds, in channel-index order; drawn in this order from np.random.
    value_ranges = ((20, 30), (0, 60), (0, 100))
    fields = np.zeros(grid_shape + (channels,))
    for index, (low, high) in enumerate(value_ranges):
        fields[..., index] = np.random.uniform(low, high, grid_shape)
    return fields


# Build one window per entry of `sample`. Each window stacks len(timestep) grids from
# generate_sample (each of shape len(lats) x len(lons) x len(channel)) along a new
# leading axis, replacing the previous copy-pasted timestep0..timestep4 calls.
windows = [
    np.stack([generate_sample(lats, lons, len(channel)) for _ in timestep], axis=0)
    for _ in sample
]

for number, window in enumerate(windows, start=1):
    print(f"sample{number}.shape: {window.shape}")

# Stack the windows along a new leading "sample" axis.
data = np.stack(windows, axis=0)
print(f"data after stacking samples: {data.shape}")

ds = xr.Dataset(
    {"data": (["sample", "timestep", "lat", "lon", "channel"], data)},
    coords={
        "sample": sample,
        "timestep": timestep,
        "lat": lat_sorted_ascending,
        "lon": lon_sorted_ascending,
        "channel": channel,
    },
)

print("Created Dataset:")
print(ds)


sample1.shape: (5, 9, 11, 3)
sample2.shape: (5, 9, 11, 3)
data after stacking samples: (2, 5, 9, 11, 3)
Created Dataset:
<xarray.Dataset> Size: 24kB
Dimensions:   (sample: 2, timestep: 5, lat: 9, lon: 11, channel: 3)
Coordinates:
  * sample    (sample) int64 16B 0 1
  * timestep  (timestep) int64 40B 0 1 2 3 4
  * lat       (lat) float64 72B -21.8 -22.05 -22.3 -22.55 ... -23.3 -23.55 -23.8
  * lon       (lon) float64 88B -45.05 -44.8 -44.55 ... -43.05 -42.8 -42.55
  * channel   (channel) int64 24B 0 1 2
Data variables:
    data      (sample, timestep, lat, lon, channel) float64 24kB 27.57 ... 64.43


Ainda precisamos criar os data_vars `x` e `y`. Vamos utilizar 2 amostras no mesmo formato: duas amostras em `ds.x` e duas amostras em `ds.y`. Como `y` é a janela de `x` deslocada em 1 passo de tempo, para ter duas amostras precisamos de pelo menos 7 passos de tempo:

T1 T2 T3 T4 T5 T6 T7

```python
# amostra 1:
x = [T1, T2, T3, T4, T5]
y = [T2, T3, T4, T5, T6]

# amostra 2:
x = [T2, T3, T4, T5, T6]
y = [T3, T4, T5, T6, T7]
```

```python
total_samples = validated_total_timestamps - self.TIMESTEPS
# neste exemplo: total_samples = 7 - 5 = 2
```

A célula abaixo cria 7 timesteps dentro da pasta `timesteps/`; os nomes dos arquivos seguem o formato `ano_mes_dia_hora.npy`.

In [58]:
# Hourly timestamps from 00:00 to 06:00 (both ends included).
timesteps_range = pd.date_range(start="2024-01-01T00:00:00", end="2024-01-01T06:00:00", freq="h")

# One synthetic (lat, lon, channel) grid per timestamp. These are the arrays that get
# saved below (previously this list was built and then ignored).
timesteps = [generate_sample(lats, lons, len(channel)) for _ in timesteps_range]

# Create the output folder once, outside the loop.
timestep_folder = Path("timesteps/")
timestep_folder.mkdir(parents=True, exist_ok=True)

# The loop variable is named `timestamp` so it does not overwrite the notebook-level
# `timestep` coordinate array.
for timestamp, features in zip(timesteps_range, timesteps):
    feature_file = (
        timestep_folder
        / f"{timestamp.year:04}_{timestamp.month:02}_{timestamp.day:02}_{timestamp.hour:02}.npy"
    )
    np.save(feature_file, features)


In [59]:
WINDOW_LENGTH = 5


def _has_timesteps(year: int, month: int, day: int, hour: int) -> bool:
    # essa função ela faz o seguinte:
    # dado um ano, mes dia e ano, ela verifica se existe os 5 timesteps anteriores a esse horario
    # se existe a gente pode montar a amostra, se não a gente não pode montar a amostra
    start_time = pd.Timestamp(year=year, month=month, day=day, hour=hour)
    for timestep in reversed(range(WINDOW_LENGTH)):
        current_time = start_time - pd.Timedelta(hours=timestep)
        file = (
            Path("timesteps/")
            / f"{current_time.year:04}_{current_time.month:02}_{current_time.day:02}_{current_time.hour:02}.npy"
        )
        if not Path(file).exists():
            return False
    return True


# For example, the window ending at 2024_01_01_01 is incomplete (False),
# while the window ending at 2024_01_01_04 is complete (True):
for probe_hour in (1, 4):
    print(_has_timesteps(2024, 1, 1, probe_hour))


False
True


In [60]:
def _get_dataset_with_timesteps(year: int, month: int, day: int, hour: int, verbose=False):
    """Load and stack the window of hourly grids that ends at the given hour.

    Reads WINDOW_LENGTH files named ``YYYY_MM_DD_HH.npy`` from ``timesteps/``,
    covering the given hour and the WINDOW_LENGTH - 1 hours before it. No
    existence check is done here; use ``_has_timesteps`` beforehand.

    Args:
        year, month, day, hour: the last (newest) hour of the window.
        verbose: when True, print each timestamp as it is loaded and the final shape.

    Returns:
        np.ndarray with the loaded grids stacked along a new first axis,
        ordered from oldest to newest.
    """
    window_end = pd.Timestamp(year=year, month=month, day=day, hour=hour)
    window = []
    # Count the offset down to 0 so files are read from past to future, e.g.
    # [2024_01_01_00, 2024_01_01_01, 2024_01_01_02, 2024_01_01_03, 2024_01_01_04]
    # and not the reverse.
    for hours_back in range(WINDOW_LENGTH - 1, -1, -1):
        current_time = window_end - pd.Timedelta(hours=hours_back)
        file_name = f"{current_time.year:04}_{current_time.month:02}_{current_time.day:02}_{current_time.hour:02}.npy"
        grid = np.load(Path("timesteps/") / file_name)
        if verbose:
            print(f"Timestep adicionado: {current_time}")
        window.append(grid)
    data = np.stack(window, axis=0)
    if verbose:
        print(f"""
            data.shape: {data.shape}
            window length or timesteps: {data.shape[0]}
            lat: {data.shape[1]}
            lon: {data.shape[2]}
        """)
    return data


# Example: load the window that ends at 2024-01-01 04:00 and show its shape.
result = _get_dataset_with_timesteps(year=2024, month=1, day=1, hour=4, verbose=True)
print(result.shape)


Timestep adicionado: 2024-01-01 00:00:00
Timestep adicionado: 2024-01-01 01:00:00
Timestep adicionado: 2024-01-01 02:00:00
Timestep adicionado: 2024-01-01 03:00:00
Timestep adicionado: 2024-01-01 04:00:00

            data.shape: (5, 9, 11, 3)
            window length or timesteps: 5
            lat: 9
            lon: 11
        
(5, 9, 11, 3)


In [None]:
# Next: a function that checks whether enough timesteps (T1, T2, ..., T7 in the example)
# exist for both x and y, and returns x and y when they do.


def _process_timestamp(timestamp: pd.Timestamp, verbose=False):
    """Build the (x, y) pair of windows for one timestamp, if both are available.

    The x window ends at ``timestamp``; the y window ends one hour later, so y
    is x shifted forward by one timestep.

    Args:
        timestamp: last hour of the x window.
        verbose: forwarded to ``_get_dataset_with_timesteps`` and enables progress prints.

    Returns:
        ``(data_x, data_y)`` as returned by ``_get_dataset_with_timesteps``, or
        ``(None, None)`` when either window is missing at least one file.
    """
    x_key = (timestamp.year, timestamp.month, timestamp.day, timestamp.hour)
    if _has_timesteps(*x_key):
        # Only look at the target window once the input window is known to be complete.
        following = timestamp + pd.Timedelta(hours=1)
        y_key = (following.year, following.month, following.day, following.hour)
        if _has_timesteps(*y_key):
            if verbose:
                print("Executando data X")
            data_x = _get_dataset_with_timesteps(*x_key, verbose)
            if verbose:
                print("Executando data Y")
            data_y = _get_dataset_with_timesteps(*y_key, verbose)
            return data_x, data_y
    return None, None


# Example: x window ending at 04:00 and y window ending at 05:00.
result_x, result_y = _process_timestamp(
    timestamp=pd.Timestamp("2024-01-01T04:00:00"),
    verbose=True,
)
for window_array in (result_x, result_y):
    print(window_array.shape)

Executando data X
Timestep adicionado: 2024-01-01 00:00:00
Timestep adicionado: 2024-01-01 01:00:00
Timestep adicionado: 2024-01-01 02:00:00
Timestep adicionado: 2024-01-01 03:00:00
Timestep adicionado: 2024-01-01 04:00:00

            data.shape: (5, 9, 11, 3)
            window length or timesteps: 5
            lat: 9
            lon: 11
        
Executando data Y
Timestep adicionado: 2024-01-01 01:00:00
Timestep adicionado: 2024-01-01 02:00:00
Timestep adicionado: 2024-01-01 03:00:00
Timestep adicionado: 2024-01-01 04:00:00
Timestep adicionado: 2024-01-01 05:00:00

            data.shape: (5, 9, 11, 3)
            window length or timesteps: 5
            lat: 9
            lon: 11
        
(5, 9, 11, 3)
(5, 9, 11, 3)


In [62]:
# Putting everything together and assembling the xr.Dataset.


def build_netcdf(
    min_timestamp: pd.Timestamp,
    max_timestamp: pd.Timestamp,
):
    """Assemble every available (x, y) window in a time range into an xarray Dataset.

    Every hourly timestamp between ``min_timestamp`` and ``max_timestamp``
    (inclusive) is passed to ``_process_timestamp``; timestamps without a
    complete x and y window are skipped. Despite the name, nothing is written
    to disk: the in-memory Dataset is returned.

    The ``lat`` / ``lon`` coordinates come from the notebook-level
    ``lat_sorted_ascending`` and ``lon_sorted_ascending`` arrays.

    Args:
        min_timestamp: first timestamp to consider.
        max_timestamp: last timestamp to consider.

    Returns:
        xr.Dataset with data variables ``x`` and ``y``, both with dimensions
        (sample, timestep, lat, lon, channel).

    Raises:
        ValueError: when no timestamp in the range yields a complete sample.
    """
    timestamps = pd.date_range(start=min_timestamp, end=max_timestamp, freq="h")

    data_x_list = []
    data_y_list = []
    for timestamp in timestamps:
        data_x, data_y = _process_timestamp(timestamp)
        # A sample is usable only when BOTH windows exist.
        if data_x is None or data_y is None:
            continue
        # x and y are always appended together, so both lists stay the same length.
        data_x_list.append(data_x)
        data_y_list.append(data_y)

    # Fail with an explicit message instead of np.stack's generic error on an empty list.
    if not data_x_list:
        raise ValueError(
            f"No complete (x, y) samples found between {min_timestamp} and {max_timestamp}"
        )

    data_x = np.stack(data_x_list, axis=0)
    data_y = np.stack(data_y_list, axis=0)
    print(f"data_x shape: {data_x.shape}")
    print(f"data_y shape: {data_y.shape}")

    assert data_x.shape == data_y.shape, f"x shape {data_x.shape} != y shape {data_y.shape}"

    # Index coordinates derived from the stacked array: axis 0 = sample,
    # axis 1 = timestep, axis 4 = channel.
    sample = np.arange(data_x.shape[0])
    timestep = np.arange(data_x.shape[1])
    channel = np.arange(data_x.shape[4])

    dims = ["sample", "timestep", "lat", "lon", "channel"]
    ds = xr.Dataset(
        {
            "x": (dims, data_x),
            "y": (dims, data_y),
        },
        coords={
            "sample": sample,
            "timestep": timestep,
            "lat": lat_sorted_ascending,
            "lon": lon_sorted_ascending,
            "channel": channel,
        },
    )
    return ds


# Build the dataset from all seven hourly files written earlier and display it.
ds = build_netcdf(
    min_timestamp=pd.Timestamp("2024-01-01T00:00:00"),
    max_timestamp=pd.Timestamp("2024-01-01T06:00:00"),
)
ds

data_x shape: (2, 5, 9, 11, 3)
data_y shape: (2, 5, 9, 11, 3)
