# Loading data with DisjointTimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import DisjointTimeBasedConfig # Disjoint dataset MUST use DisjointTimeBasedConfig

### Setting logger

In [2]:
# Route INFO-level (and above) log records to the notebook output, tagged
# with timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Create the dataset handle used by every following cell: institution-subnet
# series, aggregated per hour, in the disjoint time-based variant.
disjoint_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",  # placeholder path; replace with your data directory
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.DISJOINT_TIME_BASED,
    display_details=True,  # print the dataset summary
)

[2025-09-15 11:55:43,406][wrapper_dataset][INFO] - Dataset is disjoint_time_based. Use cesnet_tszoo.configs.DisjointTimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using the PyTorch `DataLoader`.
- The last batch is never dropped (unless a sliding window is used).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time points of each time series are in one batch (this differs when a sliding window is used).
- When a sliding window is not used, a batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# Fractional values select a share of the available time series / of the time
# axis for each set; all features are loaded and time is exposed as ID_TIME.
# NOTE(review): the selected series IDs differ between otherwise identical
# configs in this notebook, so the split looks randomly sampled per run —
# consider fixing a seed if the config supports one.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:55:43,411][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:55:43,432][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:55:43,437][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 1170.06it/s]
[2025-09-15 11:55:43,689][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 1266.67it/s]
[2025-09-15 11:55:43,787][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 1249.14it/s]
[2025-09-15 11:55:43,835][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [190 239 385 211 219 ... 121 441 511 247 432], Length=274
        Val time series IDs: [401 471  92  99 177 ... 494 108  45 386 535], Length=109
        Test time series IDs: [349 279  97 372  14 ... 415 516  59 390 461], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the generic config updater.
# NOTE(review): "config" presumably keeps the value already stored in the
# config — confirm against the library documentation.
disjoint_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)
# Option 2 (alternative to the call above): the dedicated setter.
disjoint_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)

[2025-09-15 11:55:43,840][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:55:43,841][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:55:43,841][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:55:43,842][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:55:43,842][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: the generic config updater.
disjoint_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
# Option 2 (alternative to the call above): the dedicated setter.
disjoint_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

[2025-09-15 11:55:43,850][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:55:43,850][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:55:43,850][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:55:43,851][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:55:43,851][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Drain the loader once (tqdm reports progress) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first train batch.
display(batches[0].shape)

[2025-09-15 11:55:43,860][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 43.83it/s]


(274, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
dataloader = disjoint_dataset.get_val_dataloader(workers="config")

# Drain the loader once (tqdm reports progress) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first validation batch.
display(batches[0].shape)

[2025-09-15 11:55:46,268][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 67.06it/s]


(109, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
dataloader = disjoint_dataset.get_test_dataloader(workers="config")

# Drain the loader once (tqdm reports progress) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first test batch.
display(batches[0].shape)

[2025-09-15 11:55:46,760][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 100.78it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [10]:
# Same split as the ID_TIME example, but time is requested as DATETIME, so
# loaders hand back the time values separately from the data array.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # 0 workers: loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:55:46,879][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:55:46,901][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:55:46,906][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2468.09it/s]
[2025-09-15 11:55:47,024][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2623.83it/s]
[2025-09-15 11:55:47,071][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3359.28it/s]
[2025-09-15 11:55:47,089][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [405 361 507 212 251 ... 371 198 436 519 546], Length=274
        Val time series IDs: [392 177 441 433 356 ...  47 410 539 149 202], Length=109
        Test time series IDs: [429 204 214  37 399 ... 250 205 207 328 383], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [11]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Drain the loader once; with DATETIME each batch is a (data, time) pair.
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-15 11:55:47,098][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 49.12it/s]


(274, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, then it returns as previous examples.
    - When `ts_id` is not None, then it returns only one time series of that specified id.

In [12]:
# Train set given as an explicit list of time series IDs; validation and test
# sets are disabled (None), so only the train set gets initialised.
config = DisjointTimeBasedConfig(
    train_ts=[177, 176, 319, 267],
    val_ts=None,
    test_ts=None,
    train_time_period=0.5,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:55:49,243][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:55:49,253][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:55:49,256][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 4/4 [00:00<00:00, 1999.19it/s]
[2025-09-15 11:55:49,260][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [177 176 319 267], Length=4
        Val time series IDs: None
        Test time series IDs: None
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: None
        Test time periods: None
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
      

In [13]:
# Restrict the loader to a single time series from the train set.
dataloader = disjoint_dataset.get_train_dataloader(ts_id=177, workers="config")

# Drain the loader once (tqdm reports progress) and keep every batch.
batches = list(tqdm(dataloader))

# The leading dimension is 1 because only one series was requested.
display(batches[0].shape)

[2025-09-15 11:55:49,269][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 2103.25it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many time points consecutive time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [14]:
# Sliding-window setup on explicit, back-to-back time ranges with a single
# feature. `set_shared_size` is given as a fraction here; the printed config
# shows the val/test periods being extended backwards as a result.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:55:49,331][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:55:49,350][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:55:49,354][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 7932.57it/s]
[2025-09-15 11:55:49,394][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 7121.17it/s]
[2025-09-15 11:55:49,414][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 7192.06it/s]
[2025-09-15 11:55:49,423][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [226 274 500 395  39 ... 106  25 387 357 365], Length=274
        Val time series IDs: [252 546   1  37 488 ... 116 399 502  47 173], Length=109
        Test time series IDs: [217  75  27 210 429 ... 402 222  98   5 362], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
   

In [15]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Each item is a (window, prediction) pair; collect them all as tuples.
batches = [(window, prediction) for window, prediction in tqdm(dataloader)]

[2025-09-15 11:55:49,431][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1228.57it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [16]:
# Option 1: the generic config updater.
# NOTE(review): "config" presumably keeps the value already stored in the
# config — confirm against the library documentation.
disjoint_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)
# Option 2 (alternative to the call above): the dedicated setter.
disjoint_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-15 11:55:49,836][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:55:49,836][disjoint_time_based_config][INFO] - all_batch_size adjusted to 25 as it should be greater than or equal to sliding_window_size + sliding_window_prediction_size.
[2025-09-15 11:55:49,837][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-15 11:55:49,837][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:55:49,837][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:55:49,838][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:55:49,838][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [17]:
# Sliding-window setup with DATETIME time format. The val/test ranges already
# overlap their predecessor slightly and `set_shared_size` is an absolute
# count here; the printed config shows the resulting effective periods.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # 0 workers: loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:55:49,843][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:55:49,863][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:55:49,866][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 8274.28it/s]
[2025-09-15 11:55:49,904][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 8072.09it/s]
[2025-09-15 11:55:49,923][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 7197.78it/s]
[2025-09-15 11:55:49,933][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [286 526  68 291 489 ... 266 311 304 181 142], Length=274
        Val time series IDs: [547 483  77 298 375 ... 259 464 224 309 303], Length=109
        Test time series IDs: [373 321 414 249 121 ... 211 148 317 430  54], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
  

In [18]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# With DATETIME each item carries four parts: window data, prediction data,
# window times and prediction times. Collect them all as 4-tuples.
batches = [
    (window, prediction, window_times, prediction_times)
    for window, prediction, window_times, prediction_times in tqdm(dataloader)
]

[2025-09-15 11:55:49,940][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1301.30it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` over the corresponding set's time period.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [19]:
# Plain fractional split without batch sizes or sliding window: neither has
# an effect when loading DataFrames.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:55:50,323][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:55:50,342][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:55:50,345][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2754.88it/s]
[2025-09-15 11:55:50,450][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3154.66it/s]
[2025-09-15 11:55:50,487][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3594.95it/s]
[2025-09-15 11:55:50,504][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [104 536 358 329 282 ... 303  40 470 176 101], Length=274
        Val time series IDs: [542 249 135 291 299 ... 522  67 544 325 453], Length=109
        Test time series IDs: [343 447 411 362 170 ... 177 423 432 244 263], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [20]:
# All train series in one DataFrame.
df = disjoint_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# The same data as a collection with one DataFrame per time series.
dfs = disjoint_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,104.0,0.0,981.0,4535.0,613573.0,499.0,8.6,3.8,555.0,9.57,4.19,853.0,14.71,7.34,0.779785,0.859863,0.5,0.540039,7.13,138.669998
1,104.0,1.0,1038.0,3970.0,528563.0,507.0,8.45,3.24,549.0,9.15,3.84,901.0,15.02,7.28,0.720215,0.799805,0.5,0.529785,5.69,143.279999
2,104.0,2.0,854.0,11724.0,13321828.0,482.0,8.03,3.62,485.0,8.08,4.39,743.0,12.38,6.5,0.700195,0.75,0.459961,0.47998,6.07,137.190002
3,104.0,3.0,753.0,3756.0,518802.0,458.0,7.63,3.65,476.0,7.93,4.67,658.0,10.97,5.56,0.72998,0.779785,0.5,0.529785,7.52,142.660004
4,104.0,4.0,595.0,2499.0,325401.0,380.0,6.33,2.43,367.0,6.12,2.62,536.0,8.93,3.86,0.700195,0.740234,0.48999,0.529785,5.3,148.25
5,104.0,5.0,405.0,1257.0,157788.0,265.0,4.91,2.31,236.0,4.37,2.06,381.0,7.06,4.17,0.689941,0.740234,0.459961,0.469971,6.67,135.710007
6,104.0,6.0,492.0,1891.0,262350.0,324.0,5.59,2.24,296.0,5.1,2.21,458.0,7.9,3.5,0.700195,0.75,0.5,0.529785,8.0,147.940002
7,104.0,7.0,546.0,1952.0,239497.0,345.0,5.95,3.1,314.0,5.41,2.55,493.0,8.5,4.57,0.680176,0.75,0.469971,0.459961,8.22,145.380005
8,104.0,8.0,624.0,2852.0,389423.0,366.0,6.1,2.98,375.0,6.25,3.6,551.0,9.18,4.58,0.709961,0.759766,0.509766,0.549805,6.64,136.669998
9,104.0,9.0,616.0,3505.0,523403.0,393.0,6.55,3.09,392.0,6.53,3.44,569.0,9.48,4.55,0.75,0.779785,0.5,0.549805,4.08,139.169998


In [21]:
len(dfs)  # number of per-series DataFrames: every train time series has its own DataFrame

274

#### Val set

- Affected by `val_workers`.

In [22]:
# All validation series in one DataFrame.
df = disjoint_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# The same data as a collection with one DataFrame per time series.
dfs = disjoint_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,542.0,3359.0,44.0,182.0,63658.0,33.0,3.3,1.25,34.0,3.4,1.35,41.0,4.1,1.73,0.850098,0.859863,0.629883,0.620117,9.29,170.979996
1,542.0,3360.0,78.0,251.0,70836.0,36.0,3.27,1.27,36.0,3.27,1.27,75.0,6.82,4.75,0.779785,0.850098,0.529785,0.509766,17.219999,205.919998
2,542.0,3361.0,82.0,297.0,103334.0,44.0,4.4,1.43,43.0,4.3,1.49,75.0,7.5,4.62,0.790039,0.839844,0.600098,0.609863,4.47,208.630005
3,542.0,3362.0,54.0,150.0,35600.0,33.0,3.3,1.57,34.0,3.4,1.26,53.0,5.3,3.89,0.77002,0.819824,0.700195,0.759766,2.48,191.399994
4,542.0,3363.0,55.0,16240.0,16693378.0,28.0,3.11,1.36,29.0,3.22,1.48,52.0,5.78,3.6,0.689941,0.689941,0.48999,0.389893,5.2,189.860001
5,542.0,3364.0,36.0,79.0,11799.0,24.0,2.0,1.13,24.0,2.0,1.13,33.0,2.75,1.48,0.549805,0.5,0.560059,0.540039,2.76,162.059998
6,542.0,3365.0,34.0,5790.0,375269.0,26.0,2.89,1.96,28.0,3.11,2.15,33.0,3.67,1.94,0.799805,0.799805,0.629883,0.629883,7.63,173.979996
7,542.0,3366.0,23.0,139.0,59035.0,19.0,2.11,0.6,18.0,2.0,0.71,19.0,2.11,0.6,0.77002,0.75,0.569824,0.52002,6.73,163.850006
8,542.0,3367.0,252.0,12361.0,11237845.0,67.0,7.44,4.61,27.0,3.0,1.12,161.0,17.889999,13.86,1.0,1.0,0.389893,0.23999,10.04,128.119995
9,542.0,3368.0,77.0,52027.0,53835932.0,31.0,2.82,1.89,24.0,2.18,1.17,67.0,6.09,5.03,0.879883,0.859863,0.669922,0.620117,19.33,172.380005


In [23]:
len(dfs)  # number of per-series DataFrames: every validation time series has its own DataFrame

109

#### Test set

- Affected by `test_workers`.

In [24]:
# All test series in one DataFrame.
df = disjoint_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# The same data as a collection with one DataFrame per time series.
dfs = disjoint_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,343.0,5374.0,4986.0,318865.0,263314404.0,311.0,34.560001,24.110001,125.0,13.89,8.52,1620.0,180.0,134.360001,0.879883,0.870117,0.439941,0.25,18.41,137.029999
1,343.0,5375.0,4740.0,190335.0,105505574.0,274.0,24.91,20.809999,134.0,12.18,8.4,1426.0,129.639999,121.260002,0.790039,0.720215,0.469971,0.290039,17.09,134.270004
2,343.0,5376.0,4881.0,264123.0,280953091.0,284.0,23.67,21.66,126.0,10.5,7.75,1497.0,124.75,127.580002,0.950195,0.950195,0.350098,0.23999,15.66,119.730003
3,343.0,5377.0,6790.0,531341.0,441832644.0,348.0,38.669998,26.629999,141.0,15.67,9.59,2237.0,248.559998,187.039993,0.939941,0.879883,0.429932,0.180054,17.09,138.979996
4,343.0,5378.0,10373.0,852483.0,816636969.0,440.0,44.0,36.5,156.0,15.6,12.02,3383.0,338.299988,294.209991,0.779785,0.759766,0.399902,0.27002,14.76,149.669998
5,343.0,5379.0,12172.0,1032194.0,980536223.0,493.0,54.779999,39.880001,153.0,17.0,11.94,4120.0,457.779999,344.480011,0.899902,0.899902,0.379883,0.22998,17.139999,136.509995
6,343.0,5380.0,13260.0,912025.0,834918904.0,584.0,64.889999,47.610001,177.0,19.67,13.72,4667.0,518.559998,387.929993,0.779785,0.75,0.389893,0.209961,19.15,124.809998
7,343.0,5381.0,12364.0,857274.0,726894546.0,561.0,56.099998,46.119999,154.0,15.4,10.9,4564.0,456.399994,390.75,0.859863,0.830078,0.360107,0.199951,18.389999,130.440002
8,343.0,5382.0,14066.0,1082922.0,958582463.0,599.0,54.450001,51.82,162.0,14.73,13.02,5014.0,455.820007,438.220001,0.759766,0.75,0.449951,0.320068,16.809999,146.149994
9,343.0,5383.0,12656.0,1059771.0,804990218.0,567.0,63.0,46.130001,155.0,17.219999,11.49,4562.0,506.890015,381.869995,0.680176,0.640137,0.419922,0.280029,20.870001,122.309998


In [25]:
len(dfs)  # number of per-series DataFrames: every test time series has its own DataFrame

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` over the corresponding set's time period.
- Data is returned as one NumPy array.
- Follows similar rules to DataLoader batches regarding shape (excluding sliding window parameters).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [26]:
# Plain fractional split without batch sizes or sliding window: neither has
# an effect when loading whole NumPy arrays.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:55:51,268][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:55:51,288][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:55:51,291][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2622.65it/s]
[2025-09-15 11:55:51,400][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3145.13it/s]
[2025-09-15 11:55:51,438][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3594.15it/s]
[2025-09-15 11:55:51,454][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [286 524 213 438 460 ... 410  48 374 104 201], Length=274
        Val time series IDs: [503  75 400 107  54 ... 276 254 388 139 350], Length=109
        Test time series IDs: [237 484 329 131 529 ... 165  46 291 182 343], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [27]:
# Whole train set as a single array.
numpy_array = disjoint_dataset.get_train_numpy(
    workers="config",
)

display(numpy_array.shape)

(274, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [28]:
# Whole validation set as a single array.
numpy_array = disjoint_dataset.get_val_numpy(
    workers="config",
)

display(numpy_array.shape)

(109, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [29]:
# Whole test set as a single array.
numpy_array = disjoint_dataset.get_test_numpy(
    workers="config",
)

display(numpy_array.shape)

(54, 1343, 20)

#### Using time_format=TimeFormat.DATETIME

In [30]:
# Same split as the ID_TIME NumPy example, but with DATETIME time format, so
# the time values are returned separately from the data array.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # 0 workers: loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:55:51,812][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:55:51,835][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:55:51,839][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2621.67it/s]
[2025-09-15 11:55:51,949][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2937.17it/s]
[2025-09-15 11:55:51,990][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3081.86it/s]
[2025-09-15 11:55:52,010][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 15  77  85 119 488 ...  79 375  51 269 174], Length=274
        Val time series IDs: [294  66 487 527  38 ...  16 497 531 150 336], Length=109
        Test time series IDs: [ 90 343  82 176  50 ... 391 241 400 169  61], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [31]:
# With DATETIME the call yields two values: the data array and the times.
numpy_array, times = disjoint_dataset.get_train_numpy(
    workers="config",
)

display(numpy_array.shape)
display(times)

(274, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)