In [2]:
import sys

In [27]:
import argparse
import glob
import os
import re
from datetime import datetime

import numpy as np
import xarray as xr

def process_single_day(files):
    """
    Read the files of a single day and merge them into one time-ordered array.

    Every variable whose name contains ``CMI_YYYY_MM_DD_HH_MM`` is taken as one
    2-D snapshot. Its timestamp is parsed from the variable name, its two
    dimensions (``dim_0_<name>`` / ``dim_1_<name>``) are renamed to
    ``lat`` / ``lon``, and all snapshots are stacked along a new ``time``
    dimension.

    Parameters:
        files: iterable of str - paths of the netCDF files to read.

    Returns:
        xarray.DataArray - all snapshots of all files concatenated along
        ``time`` and sorted by timestamp, with integer index coordinates on
        ``lat`` and ``lon``.

    Raises:
        ValueError - if ``files`` is empty or a file contains no ``CMI_*``
        variable (previously this surfaced as a generic error from
        ``xr.concat``).
    """
    files = list(files)
    if not files:
        raise ValueError("process_single_day: nenhum arquivo informado.")

    daily_datasets = []

    for file_path in files:
        # The context manager guarantees the file handle is released, even if
        # one of the variables turns out to be malformed.
        with xr.open_dataset(file_path) as ds:
            time_stamps = []
            data_arrays = []

            for var_name in ds.variables:
                match = re.search(r'CMI_(\d{4})_(\d{2})_(\d{2})_(\d{2})_(\d{2})', var_name)
                if match is None:
                    continue

                year, month, day, hour, minute = match.groups()
                time_stamps.append(np.datetime64(f"{year}-{month}-{day}T{hour}:{minute}"))

                data_arrays.append(ds[var_name].rename({
                    f'dim_0_{var_name}': 'lat',
                    f'dim_1_{var_name}': 'lon'
                }))

            if not data_arrays:
                raise ValueError(f"Nenhum campo CMI_* encontrado em {file_path}")

            # Passing a DataArray as `dim` makes it both the new dimension and
            # its coordinate values.
            time_array = xr.DataArray(
                np.array(time_stamps, dtype='datetime64[ns]'),
                dims='time'
            )
            daily_data = xr.concat(data_arrays, dim=time_array)

            # The files carry no geographic coordinates here; plain integer
            # indices are attached so lat/lon are proper index coordinates.
            daily_data = daily_data.assign_coords(lat=np.arange(daily_data.sizes['lat']),
                                                  lon=np.arange(daily_data.sizes['lon']))

            # Make sure the values are in memory before the file is closed.
            daily_data = daily_data.load()

        daily_datasets.append(daily_data)

    # Variables inside a file are not stored chronologically, so sort after
    # stacking everything along 'time'.
    combined_day = xr.concat(daily_datasets, dim='time')
    combined_day = combined_day.sortby('time')

    return combined_day


def collect_samples(combined_data, TIMESTEP, max_gap):
    """
    Build (X, Y) training windows by sliding over the ``time`` dimension.

    For every start index ``i`` the X window covers ``time[i : i + TIMESTEP]``
    and the Y window is the same window shifted one step forward
    (``time[i + 1 : i + 1 + TIMESTEP]``). A pair is discarded (and reported on
    stdout) when either window contains two consecutive timestamps further
    apart than ``max_gap`` minutes.

    Parameters:
        combined_data: xarray object with a ``time`` dimension and coordinate
            (``main`` passes the DataArray produced for one day).
        TIMESTEP: int - number of timestamps per window.
        max_gap: int - largest allowed interval, in minutes, between
            consecutive timestamps inside a window.

    Returns:
        xarray.Dataset - variables ``x`` and ``y``, each stacked along a new
        ``sample`` dimension.

    Raises:
        ValueError - if no window pair passes the gap check (including the
        case where there are not more than ``TIMESTEP`` timestamps).

    NOTE(review): every window keeps its own absolute ``time`` coordinate, so
    ``xr.concat`` (outer join by default) appears to align all windows onto the
    union of their timestamps instead of producing a ``TIMESTEP``-long axis -
    the saved file shows a day-long ``time`` variable. Confirm that this
    padded layout is what downstream code expects.
    """
    total_time = combined_data.sizes['time']

    X_samples = []
    Y_samples = []

    for i in range(total_time - TIMESTEP):
        # isel already slices the 'time' coordinate together with the data,
        # so no extra assign_coords is needed.
        X_sample = combined_data.isel(time=slice(i, i + TIMESTEP))
        Y_sample = combined_data.isel(time=slice(i + 1, i + 1 + TIMESTEP))

        # Reject the pair if either window has a hole in its time series.
        if check_max_gap(X_sample) > max_gap:
            print(f'Timestamp faltando no X_sample: {X_sample.time.values}')
            continue
        if check_max_gap(Y_sample) > max_gap:
            print(f'Timestamp faltando no Y_sample: {Y_sample.time.values}')
            continue

        X_samples.append(X_sample)
        Y_samples.append(Y_sample)

    if not X_samples:
        # xr.concat would fail on an empty list with a message that says
        # nothing about the actual cause.
        raise ValueError(
            f"Nenhum sample valido: {total_time} timestamps, "
            f"TIMESTEP={TIMESTEP}, max_gap={max_gap} min."
        )

    # Stack the accepted windows along a new 'sample' dimension.
    X_samples = xr.concat(X_samples, dim='sample')
    Y_samples = xr.concat(Y_samples, dim='sample')

    return xr.Dataset({'x': X_samples, 'y': Y_samples})

def check_max_gap(sample):
    """
    Return the largest interval between consecutive timestamps of a sample.

    Parameters:
        sample: object exposing its timestamps as ``sample.time.values``
            (an xarray object with a ``time`` coordinate).

    Returns:
        The largest consecutive difference, expressed in whole minutes, or 0
        when the sample holds fewer than two timestamps.
    """
    stamps = sample.time.values
    # Consecutive differences, converted to minutes and then to plain integers.
    minute_steps = np.diff(stamps).astype('timedelta64[m]').astype(int)
    if len(minute_steps) == 0:
        return 0
    return np.max(minute_steps)

def main(path, max_gap, features, output, timestep=5):
    """
    Build one samples file per day from the feature files matched by ``path``.

    Files are grouped by the first ``YYYY_MM_DD`` found in their path. For each
    day, the files of every requested feature are loaded with
    ``process_single_day``, stacked along a ``channel`` dimension, cut into
    (X, Y) windows by ``collect_samples`` and written to
    ``<output>/<day>_samples.nc``.

    Parameters:
        path: str - glob pattern selecting the input netCDF files.
        max_gap: int - largest allowed interval (minutes) between consecutive
            timestamps inside a window.
        features: list of str - feature names; a file belongs to a feature
            when the name occurs anywhere in its path.
        output: str - directory receiving the per-day files (created when the
            first file is written).
        timestep: int - number of timestamps per window (default 5, the value
            that used to be hard-coded).

    Returns:
        None. Progress and skipped items are reported on stdout.
    """
    files = sorted(glob.glob(path))

    # Group the files by day.
    days = {}
    for file in files:
        match = re.search(r'(\d{4}_\d{2}_\d{2})', file)
        if match is None:
            # A stray file without a date must not abort the whole run.
            print(f"Arquivo ignorado (sem data no caminho): {file}")
            continue
        days.setdefault(match.group(1), []).append(file)

    # Process each day independently.
    for day, day_files in days.items():
        print(f"Processando o dia: {day}...")

        band_datasets = []
        loaded_features = []
        for feature in features:
            band_files = [file for file in day_files if f"{feature}" in file]
            if not band_files:
                print(f"Sem arquivos encontrados para a feature {feature} no dia {day}")
                continue

            band_datasets.append(process_single_day(band_files))
            # Remember which features were really loaded: the channel
            # coordinate must have one label per stacked array.
            loaded_features.append(feature)

        if band_datasets:
            combined_day = xr.concat(band_datasets, dim='channel')
            combined_day = combined_day.assign_coords(channel=('channel', loaded_features))

            try:
                daily_samples = collect_samples(combined_day, timestep, max_gap)
            except ValueError as exc:
                # A day without any usable window is reported and skipped so
                # the remaining days are still processed.
                print(f"Nenhum sample gerado para o dia {day}: {exc}")
            else:
                os.makedirs(output, exist_ok=True)
                daily_output_path = f"{output}/{day}_samples.nc"
                daily_samples.to_netcdf(daily_output_path)
                print(f"Samples do dia {day} salvos em {daily_output_path}")
        else:
            print(f"Nenhum dataset processado para o dia {day}.")

        print("")

In [28]:
# Run configuration for the sample extraction.
features = ['dF_dt']                # feature names looked up in the file paths
max_gap = 30                        # minutes allowed between consecutive timestamps
path = "../features/CMI/*/*/*.nc"   # glob pattern of the input files
output = "output"                   # directory for the per-day sample files

main(path=path, max_gap=max_gap, features=features, output=output)

Processando o dia: 2023_10_31...
Samples do dia 2023_10_31 salvos em output/2023_10_31_samples.nc

Processando o dia: 2024_01_13...
Samples do dia 2024_01_13 salvos em output/2024_01_13_samples.nc

Processando o dia: 2024_07_07...
Samples do dia 2024_07_07 salvos em output/2024_07_07_samples.nc



In [None]:
import netCDF4 as nc

# Open one of the per-day sample files written by main() and print the
# content of every variable it holds. The handle stays open because the
# following cell still reads from `dataset`.
dataset = nc.Dataset('output/2024_01_13_samples.nc')

for variable_name in dataset.variables:
    print(f'var name: {variable_name}')
    # [:] reads the full content of the variable.
    data = dataset.variables[variable_name][:]
    print(data)

var name: time
[   0   10   20   30   40   50   60   70   80   90  100  110  120  130
  140  150  160  170  180  190  200  210  220  230  240  250  260  270
  280  290  300  310  320  330  340  350  360  370  380  390  400  410
  420  430  440  450  460  470  480  490  500  510  520  530  540  550
  560  570  580  590  600  610  620  630  640  650  660  670  680  690
  700  710  720  730  740  750  760  770  780  790  800  810  820  830
  840  850  860  870  880  890  900  910  920  930  940  950  960  970
  980  990 1000 1010 1020 1030 1040 1050 1060 1070 1080 1090 1100 1110
 1120 1130 1140 1150 1160 1170 1180 1190 1200 1210 1220 1230 1240 1250
 1260 1270 1280 1290 1300 1310 1320 1330 1340 1350 1360 1370 1380 1390
 1400 1410 1420]
var name: lat
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 

In [42]:
# Show the variable names of the netCDF file currently bound to `dataset`.
dataset.variables.keys()

dict_keys(['time', 'lat', 'lon', 'channel', 'x', 'y'])

In [43]:
import netCDF4 as nc

# Open one of the input feature files (it matches the glob pattern given to
# main()) and print the content of every variable it holds.
# `dataset` is rebound here; the handle opened earlier is not closed.
dataset = nc.Dataset('../features/CMI/dF_dt/2024/FA_2024_01_13.nc')

for variable_name in dataset.variables:
    print(f'var name: {variable_name}')
    # [:] reads the full content of the variable.
    data = dataset.variables[variable_name][:]
    print(data)

var name: CMI_2024_01_13_03_00
[[-0.23966828 -0.2949768  -0.2949768  ... -0.55922544 -0.03072815
  -0.30726624]
 [-0.05530853 -0.29497528 -0.48548126 ... -0.15363464 -0.15363464
  -0.4486084 ]
 [ 0.16592255  0.11676178 -0.3748657  ... -0.08603515 -0.0921814
   0.16592407]
 ...
 [ 0.06759949  0.06759796 -0.03072662 ...  0.06145325  0.12290649
   0.17821503]
 [ 0.06145325  0.0737442   0.00614624 ...  0.06145325  0.10447083
   0.10447083]
 [ 0.12290649  0.06759796  0.0737442  ...  0.11061554  0.12905273
   0.12905273]]
var name: CMI_2024_01_13_03_30
[[ 0.00614624  0.00614471  0.00614471 ...  0.12905273  0.3748657
   0.28883058]
 [-0.00614471  0.14134216  0.09832611 ...  0.42402953  0.42402953
  -0.16592407]
 [ 0.13519745  0.21508637  0.11061706 ... -0.01843567 -0.06145325
  -0.03072815]
 ...
 [ 0.          0.20894012  0.18435974 ...  0.19050446  0.1966507
   0.17207031]
 [ 0.11676178  0.27653962  0.17821503 ...  0.20279694  0.15977784
   0.12905121]
 [ 0.17821503  0.27653962  0.17206879 .

In [44]:
# Show the variable names of the netCDF file currently bound to `dataset`.
dataset.variables.keys()

dict_keys(['CMI_2024_01_13_03_00', 'CMI_2024_01_13_03_30', 'CMI_2024_01_13_05_00', 'CMI_2024_01_13_01_00', 'CMI_2024_01_13_02_00', 'CMI_2024_01_13_00_00', 'CMI_2024_01_13_03_50', 'CMI_2024_01_13_08_00', 'CMI_2024_01_13_00_10', 'CMI_2024_01_13_03_20', 'CMI_2024_01_13_05_10', 'CMI_2024_01_13_02_10', 'CMI_2024_01_13_09_00', 'CMI_2024_01_13_03_40', 'CMI_2024_01_13_07_00', 'CMI_2024_01_13_01_30', 'CMI_2024_01_13_01_20', 'CMI_2024_01_13_00_20', 'CMI_2024_01_13_00_30', 'CMI_2024_01_13_08_10', 'CMI_2024_01_13_05_20', 'CMI_2024_01_13_09_10', 'CMI_2024_01_13_07_10', 'CMI_2024_01_13_06_10', 'CMI_2024_01_13_08_20', 'CMI_2024_01_13_05_30', 'CMI_2024_01_13_00_40', 'CMI_2024_01_13_01_10', 'CMI_2024_01_13_03_10', 'CMI_2024_01_13_09_20', 'CMI_2024_01_13_04_00', 'CMI_2024_01_13_06_00', 'CMI_2024_01_13_04_10', 'CMI_2024_01_13_01_40', 'CMI_2024_01_13_07_20', 'CMI_2024_01_13_02_20', 'CMI_2024_01_13_05_40', 'CMI_2024_01_13_01_50', 'CMI_2024_01_13_02_40', 'CMI_2024_01_13_02_30', 'CMI_2024_01_13_06_30', 'CMI_