# Loading data with TimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Show INFO-and-above log records, each tagged with timestamp, logger name and level.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

### Preparing dataset

In [3]:
# Create the time-based dataset for institution subnets at 1-hour aggregation.
# NOTE: "/some_directory/" looks like a placeholder; point `data_root` at your own data directory.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.TIME_BASED,
    display_details=True,
)

[2025-09-15 11:38:05,696][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch `DataLoader`.
- The last batch is never dropped (unless a sliding window is used).
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time points of each time series are in one batch (this differs when a sliding window is used).
- A batch consists of the following (only when a sliding window is not used):
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# 54 time series, a 50/30/20 train/val/test split over time, all features,
# time expressed as integer ids, and a separate batch size per set.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Workers: 0 everywhere.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:38:05,702][time_config][INFO] - Quick validation succeeded.
[2025-09-15 11:38:05,721][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:38:05,726][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 591.91it/s]
[2025-09-15 11:38:05,824][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [233 239 373 343 263 ... 282  33 141 196 412], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the general-purpose update method.
time_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Option 2: the dedicated setter, called with the same arguments.
time_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-09-15 11:38:05,831][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:38:05,831][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:38:05,833][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:38:05,834][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:38:05,834][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# NOTE(review): the dataloader outputs recorded in the following cells still report batch
# sizes 32/64 rather than the 33/65 set in the previous cell - confirm whether omitting the
# batch-size arguments in this update call resets them to the values from the config.

# Option 1: the general-purpose update method.
time_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Option 2: the dedicated setter, called with the same arguments.
time_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-09-15 11:38:05,839][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:38:05,839][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:38:05,842][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:38:05,842][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:38:05,842][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Run through the whole train loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:38:05,850][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 196.14it/s]


(54, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
dataloader = time_based_dataset.get_val_dataloader(workers="config")

# Run through the whole val loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:38:06,396][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 130.19it/s]


(54, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
dataloader = time_based_dataset.get_test_dataloader(workers="config")

# Run through the whole test loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:38:06,654][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 103.78it/s]


(54, 128, 20)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
dataloader = time_based_dataset.get_all_dataloader(workers="config")

# Run through the whole "all" loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:38:06,774][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 53/53 [00:00<00:00, 173.58it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but time is requested as datetimes.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Workers: 0 everywhere.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:38:07,086][time_config][INFO] - Quick validation succeeded.
[2025-09-15 11:38:07,109][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:38:07,114][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 708.96it/s]
[2025-09-15 11:38:07,191][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [458  64 289 412 228 ... 524  24  76 408 244], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [12]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Run through the whole train loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch has two parts: the values and the times.
first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-15 11:38:07,200][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 181.50it/s]


(54, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader returns all configured time series, as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the single time series with the specified id.

In [13]:
# Select four explicit time series ids instead of a count.
config = TimeBasedConfig(
    ts_ids=[177, 176, 319, 267],
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Workers: 0 everywhere.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:38:07,790][time_config][INFO] - Quick validation succeeded.
[2025-09-15 11:38:07,809][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:38:07,813][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 4/4 [00:00<00:00, 420.83it/s]
[2025-09-15 11:38:07,825][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [177 176 319 267], Length=4
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Slidin

In [14]:
# Request the train loader for one time series only (id 177).
dataloader = time_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Run through the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:38:07,834][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 1538.22it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many time points adjacent time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [15]:
# Explicit time ranges per set, a single feature, and a sliding window of
# 22 input steps / 2 predicted steps that advances 2 steps at a time.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Workers: 0 everywhere.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Sliding window settings; the shared size is given here as a fraction.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:38:07,913][time_config][INFO] - Quick validation succeeded.
[2025-09-15 11:38:07,986][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:38:07,990][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 2237.27it/s]
[2025-09-15 11:38:08,017][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [352 458  60 390 264 ... 296 524 186 196  21], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 335
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Anomaly handler
        Anomaly handler type: no_anomaly_handler        
    Batch sizes
        Train batch size: 

In [16]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Each item yielded by the loader is a (window, prediction) pair; keep them all.
batches = [
    (sliding_window, sliding_window_prediction)
    for sliding_window, sliding_window_prediction in tqdm(dataloader)
]

[2025-09-15 11:38:08,025][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4516.08it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [17]:
# Option 1: the general-purpose update method.
time_based_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)
# Option 2: the dedicated setter, called with the same arguments.
time_based_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-15 11:38:08,142][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:38:08,142][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-15 11:38:08,143][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:38:08,145][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:38:08,145][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:38:08,145][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [18]:
# Sliding-window setup with datetime time format; the shared size is given
# here as an absolute count (100).
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Workers: 0 everywhere.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Sliding window settings.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:38:08,151][time_config][INFO] - Quick validation succeeded.
[2025-09-15 11:38:08,171][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:38:08,175][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1817.38it/s]
[2025-09-15 11:38:08,206][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [492 260 253 286 507 ... 191 284 371 170 543], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 100
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Anomaly handler
        Anomaly handler type: no_anomaly_handler        
    Batch sizes
        Train batch size:

In [19]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME each item is a 4-tuple:
# (window, prediction, window times, prediction times). Keep them all.
batches = [
    (sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times)
    for sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times
    in tqdm(dataloader)
]

[2025-09-15 11:38:08,214][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 3760.80it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` over the set's specified time period.
- Data is returned as a pandas DataFrame.
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [20]:
# Config for DataFrame loading: no batch sizes are passed here.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Workers: 0 everywhere.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:38:08,352][time_config][INFO] - Quick validation succeeded.
[2025-09-15 11:38:08,372][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:38:08,377][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 530.90it/s]
[2025-09-15 11:38:08,481][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [330  48 451 448  88 ...  17 501 420 186 228], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [21]:
# Load the train set twice: once as a single DataFrame, once with as_single_dataframe=False.
df = time_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = time_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,330.0,0.0,99957.0,3307832.0,2092624000.0,1262.0,26.85,27.709999,667.0,14.19,10.91,9870.0,210.0,286.709991,0.75,0.720215,0.429932,0.379883,28.790001,132.339996
1,330.0,1.0,115315.0,3721341.0,2404447000.0,1481.0,30.219999,33.09,705.0,14.39,11.86,12047.0,245.860001,341.329987,0.77002,0.759766,0.449951,0.399902,30.959999,135.0
2,330.0,2.0,161090.0,9909474.0,7693969000.0,1876.0,40.779999,45.41,698.0,15.17,13.51,18622.0,404.829987,523.25,0.779785,0.779785,0.389893,0.310059,25.799999,121.32
3,330.0,3.0,216966.0,17219384.0,13905710000.0,2506.0,53.32,61.02,830.0,17.66,16.75,26878.0,571.869995,735.97998,0.759766,0.75,0.439941,0.340088,36.939999,129.350006
4,330.0,4.0,193328.0,17067339.0,14739440000.0,2673.0,60.75,66.629997,803.0,18.25,17.040001,28258.0,642.22998,778.200012,0.700195,0.669922,0.399902,0.300049,37.040001,123.949997
5,330.0,5.0,170822.0,11323888.0,9126596000.0,2560.0,58.18,64.790001,729.0,16.57,14.99,26883.0,610.97998,737.390015,0.709961,0.689941,0.419922,0.320068,41.41,124.639999
6,330.0,6.0,164171.0,13369965.0,11172520000.0,2713.0,63.09,69.459999,759.0,17.65,16.950001,27344.0,635.909973,755.309998,0.77002,0.72998,0.459961,0.360107,48.580002,119.230003
7,330.0,7.0,183204.0,14857054.0,13167690000.0,2667.0,63.5,65.0,744.0,17.709999,15.59,27388.0,652.099976,752.280029,0.709961,0.689941,0.370117,0.27002,48.990002,129.520004
8,330.0,8.0,178902.0,10934836.0,8524662000.0,2607.0,59.25,66.029999,734.0,16.68,15.44,26768.0,608.359985,737.130005,0.72998,0.709961,0.399902,0.320068,50.540001,130.410004
9,330.0,9.0,180545.0,13924548.0,11778290000.0,2557.0,63.919998,65.260002,727.0,18.18,16.66,26874.0,671.849976,749.950012,0.779785,0.740234,0.379883,0.27002,49.759998,129.619995


In [22]:
len(dfs) # with as_single_dataframe=False every time series gets its own dataframe, so this counts the time series

54

#### Val set

- Affected by `val_workers`.

In [23]:
# Load the val set twice: once as a single DataFrame, once with as_single_dataframe=False.
df = time_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = time_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,330.0,3359.0,69248.0,3747886.0,2870739000.0,1163.0,24.23,26.02,652.0,13.58,11.15,9024.0,188.0,235.830002,0.680176,0.660156,0.419922,0.350098,29.190001,132.25
1,330.0,3360.0,70573.0,5193969.0,5224641000.0,1079.0,22.959999,23.959999,646.0,13.74,11.3,8750.0,186.169998,230.580002,0.680176,0.660156,0.409912,0.350098,27.969999,131.460007
2,330.0,3361.0,70825.0,2999975.0,2277655000.0,1088.0,24.18,24.67,610.0,13.56,10.86,8791.0,195.360001,233.160004,0.680176,0.669922,0.439941,0.379883,28.17,117.43
3,330.0,3362.0,68013.0,3083066.0,2497420000.0,1131.0,24.59,25.469999,642.0,13.96,12.09,8560.0,186.089996,225.830002,0.660156,0.669922,0.429932,0.379883,29.25,114.160004
4,330.0,3363.0,70732.0,2459319.0,1715647000.0,1130.0,26.280001,26.74,611.0,14.21,11.29,9516.0,221.300003,260.130005,0.75,0.75,0.459961,0.409912,30.73,125.870003
5,330.0,3364.0,97903.0,6936177.0,5918118000.0,1469.0,31.93,35.93,615.0,13.37,11.57,14500.0,315.220001,396.890015,0.72998,0.720215,0.459961,0.409912,23.77,118.639999
6,330.0,3365.0,150967.0,11193465.0,9341011000.0,1898.0,44.139999,48.66,659.0,15.33,14.17,21748.0,505.769989,599.950012,0.700195,0.689941,0.409912,0.360107,25.17,119.959999
7,330.0,3366.0,159366.0,12888488.0,11175910000.0,2194.0,46.68,54.029999,744.0,15.83,15.73,24861.0,528.960022,668.309998,0.680176,0.660156,0.419922,0.340088,37.549999,120.690002
8,330.0,3367.0,154295.0,10409986.0,8375796000.0,2340.0,52.0,58.57,706.0,15.69,14.63,25090.0,557.559998,677.640015,0.740234,0.740234,0.409912,0.350098,35.189999,124.720001
9,330.0,3368.0,147132.0,7968592.0,6027222000.0,2273.0,47.349998,56.09,729.0,15.19,15.13,24917.0,519.099976,667.47998,0.669922,0.629883,0.439941,0.370117,44.25,121.169998


In [24]:
len(dfs) # with as_single_dataframe=False every time series gets its own dataframe, so this counts the time series

54

#### Test set

- Affected by `test_workers`.

In [25]:
# Load the test set twice: once as a single DataFrame, once with as_single_dataframe=False.
df = time_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = time_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,330.0,5374.0,120602.0,3712172.0,2362708000.0,1087.0,24.16,29.719999,595.0,13.22,11.5,9554.0,212.309998,330.670013,0.810059,0.810059,0.509766,0.439941,37.099998,112.720001
1,330.0,5375.0,111984.0,3255939.0,1968785000.0,1036.0,21.139999,26.940001,598.0,12.2,11.26,9049.0,184.669998,303.220001,0.799805,0.790039,0.52002,0.459961,36.5,115.949997
2,330.0,5376.0,110020.0,2914083.0,1515122000.0,1042.0,22.65,27.629999,622.0,13.52,12.17,9100.0,197.830002,311.380005,0.77002,0.759766,0.52002,0.47998,32.939999,124.389999
3,330.0,5377.0,116564.0,3705369.0,2108870000.0,1117.0,25.98,28.59,576.0,13.4,11.62,10294.0,239.399994,359.519989,0.75,0.75,0.509766,0.439941,36.189999,121.309998
4,330.0,5378.0,150543.0,6028615.0,4368485000.0,1356.0,30.129999,39.66,593.0,13.18,12.2,14545.0,323.220001,520.619995,0.819824,0.810059,0.419922,0.340088,36.029999,135.809998
5,330.0,5379.0,209364.0,11961034.0,9479166000.0,1805.0,41.98,56.950001,678.0,15.77,16.43,21173.0,492.399994,775.109985,0.819824,0.810059,0.439941,0.379883,36.419998,125.760002
6,330.0,5380.0,226467.0,13298959.0,10570640000.0,2190.0,54.75,70.550003,779.0,19.48,19.57,24076.0,601.900024,896.72998,0.77002,0.759766,0.449951,0.360107,42.860001,125.360001
7,330.0,5381.0,214585.0,11699241.0,9344198000.0,2291.0,46.759998,70.110001,853.0,17.41,22.92,24632.0,502.690002,854.820007,0.720215,0.709961,0.47998,0.399902,53.099998,123.599998
8,330.0,5382.0,210943.0,10856091.0,8053770000.0,2140.0,49.77,68.400002,842.0,19.58,22.08,23763.0,552.630005,862.590027,0.700195,0.689941,0.48999,0.429932,52.650002,125.160004
9,330.0,5383.0,203567.0,10227818.0,7700846000.0,2134.0,50.810001,68.089996,785.0,18.690001,18.639999,22905.0,545.359985,833.330017,0.75,0.72998,0.449951,0.370117,60.130001,129.770004


In [26]:
len(dfs) # with as_single_dataframe=False every time series gets its own dataframe, so this counts the time series

54

#### All set

- Affected by `all_workers`.

In [27]:
# Load the "all" set twice: once as a single DataFrame, once with as_single_dataframe=False.
df = time_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = time_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,330.0,0.0,99957.0,3307832.0,2092624000.0,1262.0,26.85,27.709999,667.0,14.19,10.91,9870.0,210.0,286.709991,0.75,0.720215,0.429932,0.379883,28.790001,132.339996
1,330.0,1.0,115315.0,3721341.0,2404447000.0,1481.0,30.219999,33.09,705.0,14.39,11.86,12047.0,245.860001,341.329987,0.77002,0.759766,0.449951,0.399902,30.959999,135.0
2,330.0,2.0,161090.0,9909474.0,7693969000.0,1876.0,40.779999,45.41,698.0,15.17,13.51,18622.0,404.829987,523.25,0.779785,0.779785,0.389893,0.310059,25.799999,121.32
3,330.0,3.0,216966.0,17219384.0,13905710000.0,2506.0,53.32,61.02,830.0,17.66,16.75,26878.0,571.869995,735.97998,0.759766,0.75,0.439941,0.340088,36.939999,129.350006
4,330.0,4.0,193328.0,17067339.0,14739440000.0,2673.0,60.75,66.629997,803.0,18.25,17.040001,28258.0,642.22998,778.200012,0.700195,0.669922,0.399902,0.300049,37.040001,123.949997
5,330.0,5.0,170822.0,11323888.0,9126596000.0,2560.0,58.18,64.790001,729.0,16.57,14.99,26883.0,610.97998,737.390015,0.709961,0.689941,0.419922,0.320068,41.41,124.639999
6,330.0,6.0,164171.0,13369965.0,11172520000.0,2713.0,63.09,69.459999,759.0,17.65,16.950001,27344.0,635.909973,755.309998,0.77002,0.72998,0.459961,0.360107,48.580002,119.230003
7,330.0,7.0,183204.0,14857054.0,13167690000.0,2667.0,63.5,65.0,744.0,17.709999,15.59,27388.0,652.099976,752.280029,0.709961,0.689941,0.370117,0.27002,48.990002,129.520004
8,330.0,8.0,178902.0,10934836.0,8524662000.0,2607.0,59.25,66.029999,734.0,16.68,15.44,26768.0,608.359985,737.130005,0.72998,0.709961,0.399902,0.320068,50.540001,130.410004
9,330.0,9.0,180545.0,13924548.0,11778290000.0,2557.0,63.919998,65.260002,727.0,18.18,16.66,26874.0,671.849976,749.950012,0.779785,0.740234,0.379883,0.27002,49.759998,129.619995


In [28]:
len(dfs) # with as_single_dataframe=False every time series gets its own dataframe, so this counts the time series

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` over the set's specified time period.
- Data is returned as one NumPy array.
- Follows similar rules to Dataloader batches, regarding shape (excluding sliding window parameters).
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [29]:
# Build a config that takes 54 time series and splits the time axis into
# train/val/test portions of 0.5 / 0.3 / 0.2, keeping every feature.
# NOTE(review): ts_ids=54 appears to pick the series at random (the printed IDs
# differ between runs of this notebook) — confirm and consider fixing a seed.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    # Time is reported as an integer time ID rather than a datetime.
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 keeps loading on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:38:09,153][time_config][INFO] - Quick validation succeeded.
[2025-09-15 11:38:09,171][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:38:09,175][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 678.70it/s]
[2025-09-15 11:38:09,257][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [511  50 333  79 229 ... 190 542 450 185 461], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [30]:
# Load the whole train set as a single array. workers="config" presumably
# defers to the `train_workers` value stored in the config — confirm in the docs.
numpy_array = time_based_dataset.get_train_numpy(workers="config")

# Shape reads as (time series, train time steps, columns); the 20 columns are
# presumably the 18 features plus the series-ID and time-ID columns.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [31]:
# Load the whole validation set as a single array. workers="config" presumably
# defers to the `val_workers` value stored in the config — confirm in the docs.
numpy_array = time_based_dataset.get_val_numpy(workers="config")

# Shape reads as (time series, val time steps, columns).
display(numpy_array.shape)

(54, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [32]:
# Load the whole test set as a single array. workers="config" presumably
# defers to the `test_workers` value stored in the config — confirm in the docs.
numpy_array = time_based_dataset.get_test_numpy(workers="config")

# Shape reads as (time series, test time steps, columns).
display(numpy_array.shape)

(54, 1343, 20)

#### All set

- Affected by `all_workers`.

In [33]:
# Load the "all" set (the full configured time range) as a single array.
# workers="config" presumably defers to the `all_workers` value stored in the
# config — confirm in the docs.
numpy_array = time_based_dataset.get_all_numpy(workers="config")

# Shape reads as (time series, all time steps, columns).
display(numpy_array.shape)

(54, 6717, 20)

#### Using time_format=TimeFormat.DATETIME

In [34]:
# Same setup as the ID_TIME example (54 series, 0.5 / 0.3 / 0.2 split, all
# features), but time is now requested as datetime objects.
# NOTE(review): ts_ids=54 appears to pick the series at random, so this config
# selects a different set of series than the previous one — confirm.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    # Time is reported as datetime objects instead of integer time IDs.
    time_format=TimeFormat.DATETIME,
    # Worker counts per set; 0 keeps loading on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:38:09,569][time_config][INFO] - Quick validation succeeded.
[2025-09-15 11:38:09,591][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:38:09,595][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1098.35it/s]
[2025-09-15 11:38:09,645][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [195 292 203 529 212 ... 310 206 515 343 223], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [35]:
# With time_format=TimeFormat.DATETIME the call is unpacked into two values:
# the data array and a separate array of datetimes for the train time steps.
numpy_array, times = time_based_dataset.get_train_numpy(workers="config")

# The last dimension is 19 here versus 20 with ID_TIME — presumably because the
# time column is no longer stored inside the data array.
display(numpy_array.shape)
# One datetime per train time step.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)