# Loading data with DisjointTimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import DisjointTimeBasedConfig # Disjoint dataset MUST use DisjointTimeBasedConfig

### Setting logger

In [2]:
# Show INFO-level log records using a "[time][logger][level] - message" layout.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

### Preparing dataset

In [3]:
# Create the disjoint-time-based dataset on hourly institution-subnet data.
# NOTE: data_root is presumably a placeholder; point it at your own data directory.
disjoint_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.DISJOINT_TIME_BASED,
    display_details=True,
)

[2025-09-14 14:35:47,310][wrapper_dataset][INFO] - Dataset is disjoint_time_based. Use cesnet_tszoo.configs.DisjointTimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- Last batch is never dropped (unless sliding_window is used).
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time steps of every time series are in one batch (this differs when a sliding window is used).
- A batch consists of (only when a sliding window is not used):
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# Fractions select the share of time series (train/val/test_ts) and of the time
# range (train/val/test_time_period) assigned to each set.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:35:47,316][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 14:35:47,337][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:35:47,342][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 1245.48it/s]
[2025-09-14 14:35:47,581][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 1180.77it/s]
[2025-09-14 14:35:47,685][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 1147.73it/s]
[2025-09-14 14:35:47,738][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [  7  51  69 178  24 ...  93 365 387 243 393], Length=274
        Val time series IDs: [185 281 339 409 428 ... 398  47 208 325 391], Length=109
        Test time series IDs: [260  97 269 462  91 ... 469 251 295 172 484], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the general-purpose config update.
# NOTE: "config" presumably keeps the value already stored in the config (TODO confirm).
disjoint_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)
# Option 2: the dedicated batch-size setter.
disjoint_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)

[2025-09-14 14:35:47,746][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:35:47,746][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:35:47,750][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:35:47,751][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:35:47,751][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: the general-purpose config update.
disjoint_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
# Option 2: the dedicated workers setter.
disjoint_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

[2025-09-14 14:35:47,758][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:35:47,759][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:35:47,762][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:35:47,764][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:35:47,764][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Walk the train dataloader once (with a progress bar) and keep every batch.
dataloader = disjoint_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first train batch.
display(batches[0].shape)

[2025-09-14 14:35:47,775][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 43.26it/s]


(274, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Walk the val dataloader once (with a progress bar) and keep every batch.
dataloader = disjoint_dataset.get_val_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first val batch.
display(batches[0].shape)

[2025-09-14 14:35:50,215][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 56.46it/s]


(109, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Walk the test dataloader once (with a progress bar) and keep every batch.
dataloader = disjoint_dataset.get_test_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first test batch.
display(batches[0].shape)

[2025-09-14 14:35:50,802][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 70.65it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [10]:
# Same split as before, but with time_format=TimeFormat.DATETIME so that batches
# come back as (data, times) tuples.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:35:50,968][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 14:35:50,992][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:35:50,997][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2411.99it/s]
[2025-09-14 14:35:51,118][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2342.57it/s]
[2025-09-14 14:35:51,169][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3358.63it/s]
[2025-09-14 14:35:51,188][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [189  52 490  53  94 ... 354 293 274 504 335], Length=274
        Val time series IDs: [113 284 264 422 323 ... 128 303 336 208  22], Length=109
        Test time series IDs: [267 263 247 533 194 ... 243 500 343 222 509], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [11]:
# Collect every train batch; with DATETIME each batch is indexable as [data, times].
dataloader = disjoint_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-14 14:35:51,198][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 45.40it/s]


(274, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a `ts_id` parameter.
    - When `ts_id` is None, the dataloader returns all time series of the set, as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the time series with the specified id.

In [12]:
# Train set only: four explicitly chosen time series ids, no val/test sets.
config = DisjointTimeBasedConfig(
    train_ts=[177, 176, 319, 267],
    val_ts=None,
    test_ts=None,
    train_time_period=0.5,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:35:53,520][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 14:35:53,530][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:35:53,533][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 4/4 [00:00<00:00, 1999.67it/s]
[2025-09-14 14:35:53,537][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [177 176 319 267], Length=4
        Val time series IDs: None
        Test time series IDs: None
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: None
        Test time periods: None
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
      

In [13]:
# Request a dataloader restricted to the single train time series with id 177.
dataloader = disjoint_dataset.get_train_dataloader(ts_id=177, workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch for that one time series.
display(batches[0].shape)

[2025-09-14 14:35:53,545][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 2045.38it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify the sliding window step size with `sliding_window_step`.
- You can use `set_shared_size` to set how many time steps consecutive time periods should share.
    - `val_time_period` takes the shared time steps from `train_time_period`.
    - `test_time_period` takes the shared time steps from `val_time_period` or `train_time_period`.

In [14]:
# Sliding-window setup on a single feature with explicit time ranges.
# set_shared_size is given as a fraction here.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:35:53,612][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 14:35:53,632][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:35:53,637][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 7167.20it/s]
[2025-09-14 14:35:53,679][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 5885.80it/s]
[2025-09-14 14:35:53,703][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 5996.78it/s]
[2025-09-14 14:35:53,713][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [456 425 269 411 487 ... 199 166  73 136 540], Length=274
        Val time series IDs: [171 214 491 378 360 ...  37 399 474 170 325], Length=109
        Test time series IDs: [135 271  23 512 130 ... 402  20  58 137 287], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handle

In [15]:
# Each batch unpacks into (input window, prediction window); store them as tuples.
dataloader = disjoint_dataset.get_train_dataloader(workers="config")
batches = [
    (sliding_window, sliding_window_prediction)
    for sliding_window, sliding_window_prediction in tqdm(dataloader)
]

[2025-09-14 14:35:53,721][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1052.60it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [16]:
# Option 1: the general-purpose config update.
disjoint_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)
# Option 2: the dedicated sliding-window setter.
disjoint_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-14 14:35:54,195][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:35:54,196][disjoint_time_based_config][INFO] - all_batch_size adjusted to 25 as it should be greater than or equal to sliding_window_size + sliding_window_prediction_size.
[2025-09-14 14:35:54,196][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-14 14:35:54,197][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:35:54,200][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:35:54,200][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:35:54,201][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [17]:
# Sliding-window setup with DATETIME time format.
# set_shared_size is given as an absolute count here.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:35:54,205][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 14:35:54,225][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:35:54,230][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 6863.47it/s]
[2025-09-14 14:35:54,276][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 7405.99it/s]
[2025-09-14 14:35:54,296][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 5946.71it/s]
[2025-09-14 14:35:54,307][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [372 309 468  50 362 ...   6 475  86  67 257], Length=274
        Val time series IDs: [176 140 530 383 543 ... 483 186  93 119 501], Length=109
        Test time series IDs: [134 532 290 480 400 ...  83 353  66 495 137], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handl

In [18]:
# With DATETIME each batch unpacks into four parts: both windows plus their times.
dataloader = disjoint_dataset.get_train_dataloader(workers="config")
batches = [
    (sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times)
    for sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times in tqdm(dataloader)
]

[2025-09-14 14:35:54,316][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1155.09it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` with the set's specified time period.
- Data is returned as a pandas DataFrame.
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [19]:
# Fraction-based split again; no batch sizes are passed for this section.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:35:54,750][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 14:35:54,776][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:35:54,781][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2505.26it/s]
[2025-09-14 14:35:54,895][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3025.75it/s]
[2025-09-14 14:35:54,937][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3075.42it/s]
[2025-09-14 14:35:54,956][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 84 148  83 253 189 ... 350 473 509 328 331], Length=274
        Val time series IDs: [395 247 421 393 398 ...  88  74  67  46 480], Length=109
        Test time series IDs: [120 465 342 184 304 ... 485  36 183 532 224], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [20]:
# as_single_dataframe=True: the train set is returned as one DataFrame.
df = disjoint_dataset.get_train_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False: the train set is returned as a collection of DataFrames.
dfs = disjoint_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,84.0,0.0,64.0,113.0,12130.0,54.0,5.4,1.84,55.0,5.5,2.07,57.0,5.7,2.31,0.779785,0.680176,0.280029,0.280029,0.31,162.389999
1,84.0,1.0,56.0,90.0,12639.0,49.0,4.9,1.91,50.0,5.0,2.0,52.0,5.2,1.87,0.649902,0.620117,0.370117,0.350098,1.24,149.240005
2,84.0,2.0,46.0,71.0,3566.0,44.0,4.4,2.01,46.0,4.6,2.32,46.0,4.6,2.32,0.669922,0.569824,0.389893,0.439941,0.93,161.199997
3,84.0,3.0,38.0,54.0,2375.0,32.0,4.57,1.27,36.0,5.14,1.35,36.0,5.14,1.35,0.899902,0.870117,0.389893,0.399902,0.01,147.25
4,84.0,4.0,23.0,27.0,1257.0,19.0,3.8,1.48,20.0,4.0,0.71,22.0,4.4,1.52,0.830078,0.779785,0.409912,0.429932,0.0,141.740005
5,84.0,5.0,39.0,61.0,3977.0,28.0,3.5,2.2,28.0,3.5,2.2,37.0,4.62,3.29,0.569824,0.52002,0.640137,0.620117,2.37,100.139999
6,84.0,6.0,13.0,17.0,704.0,13.0,3.25,0.5,13.0,3.25,0.5,13.0,3.25,0.5,1.0,1.0,0.439941,0.419922,0.01,123.190002
7,84.0,7.0,20.0,28.0,1192.0,15.0,3.0,1.22,16.0,3.2,1.3,19.0,3.8,2.17,0.939941,0.910156,0.409912,0.409912,0.89,156.539993
8,84.0,8.0,14.0,25.0,1356.0,14.0,2.33,1.03,14.0,2.33,1.03,14.0,2.33,1.03,0.799805,0.75,0.52002,0.529785,1.91,132.5
9,84.0,9.0,35.0,45.0,2168.0,30.0,3.0,1.05,33.0,3.3,1.16,32.0,3.2,1.03,0.819824,0.77002,0.389893,0.419922,0.03,137.830002


In [21]:
len(dfs) # one DataFrame per train time series, so this matches the number of train time series

274

#### Val set

- Affected by `val_workers`.

In [22]:
# as_single_dataframe=True: the val set is returned as one DataFrame.
df = disjoint_dataset.get_val_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False: the val set is returned as a collection of DataFrames.
dfs = disjoint_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,395.0,3359.0,434.0,10226.0,4398907.0,201.0,8.38,5.41,237.0,9.88,7.99,290.0,12.08,9.56,0.899902,0.919922,0.589844,0.629883,15.27,122.449997
1,395.0,3360.0,411.0,9531.0,2168210.0,173.0,7.86,5.7,196.0,8.91,8.22,272.0,12.36,10.38,0.890137,0.910156,0.459961,0.48999,20.18,115.970001
2,395.0,3361.0,452.0,13456.0,4318335.0,191.0,8.3,7.25,235.0,10.22,10.12,304.0,13.22,12.48,0.899902,0.910156,0.439941,0.469971,15.2,117.889999
3,395.0,3362.0,453.0,8246.0,2032584.0,188.0,8.95,5.95,221.0,10.52,8.21,311.0,14.81,12.87,0.830078,0.850098,0.5,0.560059,16.24,105.900002
4,395.0,3363.0,596.0,12566.0,5266832.0,214.0,9.73,7.97,262.0,11.91,12.15,380.0,17.27,17.459999,0.859863,0.890137,0.509766,0.529785,9.84,111.550003
5,395.0,3364.0,2060.0,105013.0,94309470.0,360.0,16.360001,14.77,351.0,15.95,16.42,1043.0,47.41,54.16,0.950195,0.959961,0.47998,0.419922,6.98,115.540001
6,395.0,3365.0,11388.0,558074.0,541129600.0,587.0,25.52,23.48,529.0,23.0,24.860001,3751.0,163.089996,191.720001,0.939941,0.939941,0.379883,0.280029,5.41,110.330002
7,395.0,3366.0,19820.0,1172138.0,1072015000.0,763.0,38.150002,28.84,543.0,27.15,25.51,6198.0,309.899994,299.76001,0.919922,0.910156,0.419922,0.27002,8.02,89.540001
8,395.0,3367.0,23397.0,1254745.0,1155010000.0,805.0,36.59,31.52,532.0,24.18,20.440001,6742.0,306.450012,330.019989,0.930176,0.939941,0.449951,0.330078,7.51,97.050003
9,395.0,3368.0,22239.0,992079.0,880285900.0,797.0,34.650002,30.450001,505.0,21.959999,19.67,6429.0,279.519989,311.0,0.919922,0.950195,0.409912,0.320068,7.35,111.599998


In [23]:
len(dfs) # one DataFrame per val time series, so this matches the number of val time series

109

#### Test set

- Affected by `test_workers`.

In [24]:
# as_single_dataframe=True: the test set is returned as one DataFrame.
df = disjoint_dataset.get_test_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False: the test set is returned as a collection of DataFrames.
dfs = disjoint_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,120.0,5374.0,449.0,654.0,31882.0,302.0,2.7,1.24,333.0,2.97,1.31,428.0,3.82,2.2,0.649902,0.609863,0.469971,0.469971,4.31,135.039993
1,120.0,5375.0,532.0,771.0,36755.0,325.0,2.95,1.5,333.0,3.03,1.49,502.0,4.56,3.26,0.569824,0.549805,0.469971,0.469971,7.51,143.690002
2,120.0,5376.0,477.0,740.0,40428.0,316.0,2.87,1.33,332.0,3.02,1.53,458.0,4.16,2.65,0.629883,0.589844,0.459961,0.459961,4.71,147.309998
3,120.0,5377.0,500.0,868.0,52108.0,322.0,3.07,1.62,344.0,3.28,1.64,476.0,4.53,2.84,0.640137,0.620117,0.439941,0.439941,5.41,133.940002
4,120.0,5378.0,500.0,728.0,36803.0,334.0,2.9,1.41,352.0,3.06,1.4,474.0,4.12,2.61,0.640137,0.609863,0.47998,0.469971,2.48,130.869995
5,120.0,5379.0,406.0,610.0,30912.0,259.0,2.51,1.32,280.0,2.72,1.59,383.0,3.72,2.72,0.649902,0.620117,0.5,0.48999,2.52,132.179993
6,120.0,5380.0,472.0,781.0,40125.0,295.0,2.81,1.86,290.0,2.76,1.36,449.0,4.28,3.08,0.589844,0.580078,0.48999,0.47998,4.68,137.259995
7,120.0,5381.0,391.0,630.0,32277.0,252.0,2.45,1.24,249.0,2.42,1.28,380.0,3.69,2.66,0.549805,0.52002,0.509766,0.509766,3.64,135.330002
8,120.0,5382.0,354.0,594.0,31495.0,243.0,2.45,1.26,248.0,2.51,1.29,343.0,3.46,2.1,0.540039,0.52002,0.509766,0.509766,6.94,142.039993
9,120.0,5383.0,389.0,574.0,31292.0,255.0,2.58,1.42,251.0,2.54,1.42,379.0,3.83,2.81,0.540039,0.48999,0.529785,0.529785,2.49,125.779999


In [25]:
len(dfs) # one DataFrame per test time series, so this matches the number of test time series

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` with the set's specified time period.
- Data is returned as one NumPy array.
- Follows similar rules to DataLoader batches regarding shape (excluding sliding window parameters).
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [26]:
# Fraction-based split with ID_TIME; no batch sizes are passed for this section.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:35:55,842][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 14:35:55,862][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:35:55,866][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2520.88it/s]
[2025-09-14 14:35:55,981][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3067.70it/s]
[2025-09-14 14:35:56,021][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3270.41it/s]
[2025-09-14 14:35:56,040][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [343  26 260 237 322 ... 219 465 108 208 318], Length=274
        Val time series IDs: [151   4 222 276 395 ... 205 329 439 291 225], Length=109
        Test time series IDs: [368 319 106 545 256 ... 436  79 397 176 419], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [27]:
# Load the whole train set as a single array, using the worker count from the config.
numpy_array = disjoint_dataset.get_train_numpy(workers="config")

# Axes per the recorded output: (train time series, train time steps, features + ids).
display(numpy_array.shape)

(274, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [28]:
# Load the whole val set as a single array, using the worker count from the config.
numpy_array = disjoint_dataset.get_val_numpy(workers="config")

# Axes per the recorded output: (val time series, val time steps, features + ids).
display(numpy_array.shape)

(109, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [29]:
# Load the whole test set as a single array, using the worker count from the config.
numpy_array = disjoint_dataset.get_test_numpy(workers="config")

# Axes per the recorded output: (test time series, test time steps, features + ids).
display(numpy_array.shape)

(54, 1343, 20)

#### Using time_format=TimeFormat.DATETIME

In [30]:
# Same split with time_format=TimeFormat.DATETIME, so times are returned separately.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:35:56,464][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 14:35:56,491][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:35:56,496][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2422.67it/s]
[2025-09-14 14:35:56,615][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2865.93it/s]
[2025-09-14 14:35:56,659][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3275.95it/s]
[2025-09-14 14:35:56,678][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [357 497 360  88 388 ... 175 472 147 485 411], Length=274
        Val time series IDs: [345 421 152 151 315 ... 397 157  21 193 187], Length=109
        Test time series IDs: [  9  44 514 524 239 ... 479 358  54 263 150], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [31]:
# With TimeFormat.DATETIME the call returns two values: the data array and the times.
numpy_array, times = disjoint_dataset.get_train_numpy(workers="config")

# The data array has one column fewer than with ID_TIME because time is returned separately.
display(numpy_array.shape)
# One datetime per time step of the train time period.
display(times)

(274, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)