# Loading data with DisjointTimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import DisjointTimeBasedConfig # Disjoint dataset MUST use DisjointTimeBasedConfig

### Setting logger

In [2]:
# Configure the root logger at INFO level; each record is rendered as
# "[time][logger name][level] - message".
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

### Preparing dataset

In [3]:
# Create the disjoint-time-based dataset for institution subnets aggregated
# per hour. "/some_directory/" looks like a placeholder data root - point it
# at your own data directory.
disjoint_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.DISJOINT_TIME_BASED,
    display_details=True,
)

[2025-09-15 11:29:29,065][wrapper_dataset][INFO] - Dataset is disjoint_time_based. Use cesnet_tszoo.configs.DisjointTimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using Pytorch Dataloader.
- Last batch is never dropped (unless sliding_window is used).
- Workers affect how many processes will be used for loading data for specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured number of workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time points of each time series are in one batch (this differs when a sliding window is used).
- When a sliding window is not used, a batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# Float values select a fraction of the available time series / time range
# for each set (see the printed config details for the resolved values).
config = DisjointTimeBasedConfig(
    # Which time series go to each set
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Which time range each set covers
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs in the main process
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    # Per-set dataloader batch sizes
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:29:29,069][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:29:29,089][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:29:29,096][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 1373.50it/s]
[2025-09-15 11:29:29,315][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 1310.11it/s]
[2025-09-15 11:29:29,412][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 1254.74it/s]
[2025-09-15 11:29:29,459][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [218  73 545 211 327 ... 248 135 162 184   2], Length=274
        Val time series IDs: [324  72 505 439  51 ...  16 395 380 283  31], Length=109
        Test time series IDs: [516 336  39 348  17 ... 349  37  95 513 152], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Two alternative ways to change batch sizes after initialization.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm.
disjoint_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)
# Or
disjoint_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)

[2025-09-15 11:29:29,468][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:29:29,469][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:29:29,472][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:29:29,472][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:29:29,473][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Two alternative ways to change worker counts after initialization.
disjoint_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
# Or
disjoint_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

[2025-09-15 11:29:29,479][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:29:29,481][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:29:29,484][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:29:29,485][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:29:29,485][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Iterate the whole train dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:29:29,493][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 44.37it/s]


(274, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
dataloader = disjoint_dataset.get_val_dataloader(workers="config")

# Iterate the whole val dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:29:31,874][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 58.89it/s]


(109, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
dataloader = disjoint_dataset.get_test_dataloader(workers="config")

# Iterate the whole test dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:29:32,434][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 86.58it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [10]:
# Same setup as the ID_TIME dataloader example, but with
# time_format=TimeFormat.DATETIME.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:29:32,570][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:29:32,596][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:29:32,601][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2326.86it/s]
[2025-09-15 11:29:32,775][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3112.75it/s]
[2025-09-15 11:29:32,814][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3372.68it/s]
[2025-09-15 11:29:32,832][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 91 388 309 279  32 ... 507 163 284 131  97], Length=274
        Val time series IDs: [399 165 186 212 544 ... 441 530 244 384   5], Length=109
        Test time series IDs: [190 146 389 293 271 ...  20  49 138  60 141], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [11]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Iterate the whole train dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch has two parts: values and times.
display(batches[0][0].shape)  # data without time
display(batches[0][1].shape)  # time

[2025-09-15 11:29:32,842][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 46.50it/s]


(274, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader returns all time series of the set, as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the one time series with the specified ID.

In [12]:
# Train set is given as an explicit list of time series IDs; val and test
# sets are disabled by passing None.
config = DisjointTimeBasedConfig(
    train_ts=[177, 176, 319, 267],
    val_ts=None,
    test_ts=None,
    train_time_period=0.5,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:29:35,108][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:29:35,118][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:29:35,121][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 4/4 [00:00<00:00, 1999.67it/s]
[2025-09-15 11:29:35,124][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [177 176 319 267], Length=4
        Val time series IDs: None
        Test time series IDs: None
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: None
        Test time periods: None
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
      

In [13]:
# Restrict the train dataloader to a single time series (ID 177).
dataloader = disjoint_dataset.get_train_dataloader(ts_id=177, workers="config")

# Iterate the whole dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-09-15 11:29:35,133][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 2062.89it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many times (time points) the time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [14]:
# Sliding-window setup: explicit time ranges per set, a single feature, and
# sliding window parameters. set_shared_size is given as a float here; the
# printed config details show the resulting val/test periods.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:29:35,196][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:29:35,214][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:29:35,218][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 7821.41it/s]
[2025-09-15 11:29:35,258][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 7261.65it/s]
[2025-09-15 11:29:35,278][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3369.52it/s]
[2025-09-15 11:29:35,297][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [244 261  52 300 346 ... 347 356 441 523  94], Length=274
        Val time series IDs: [ 66 481 274 229 258 ...  44 212 447  80 150], Length=109
        Test time series IDs: [ 24 175 139 202 475 ... 371 275 311 220 439], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Anom

In [15]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Each item yielded by the dataloader unpacks into (window, prediction);
# collect them as tuples.
batches = [
    (sliding_window, sliding_window_prediction)
    for sliding_window, sliding_window_prediction in tqdm(dataloader)
]

[2025-09-15 11:29:35,308][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1033.07it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [16]:
# Two alternative ways to change sliding window parameters after initialization.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm.
disjoint_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)
# Or
disjoint_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-15 11:29:35,791][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:29:35,791][disjoint_time_based_config][INFO] - all_batch_size adjusted to 25 as it should be greater than or equal to sliding_window_size + sliding_window_prediction_size.
[2025-09-15 11:29:35,792][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-15 11:29:35,792][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:29:35,796][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:29:35,796][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:29:35,796][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [17]:
# Sliding-window setup with time_format=TimeFormat.DATETIME; here
# set_shared_size is given as an integer instead of a float.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:29:35,801][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:29:35,820][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:29:35,824][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 8172.60it/s]
[2025-09-15 11:29:35,863][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 7370.17it/s]
[2025-09-15 11:29:35,882][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 7197.55it/s]
[2025-09-15 11:29:35,891][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [157 192 182 236 222 ...  74 259  60  24 504], Length=274
        Val time series IDs: [104  41   7 356 469 ... 482  87 395 514 254], Length=109
        Test time series IDs: [538 367 378 484 436 ... 153 414 350 506  43], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Ano

In [18]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME each item unpacks into four parts:
# window, prediction, window times and prediction times.
batches = [
    (sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times)
    for sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times in tqdm(dataloader)
]

[2025-09-15 11:29:35,900][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1080.48it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` over the set's specified time period.
- Data is returned as Pandas Dataframe.
- Workers affect how many processes will be used for loading data for specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured number of workers can be overridden in `get_*_df` with the `workers` parameter.

In [19]:
# Config for DataFrame loading; batch sizes are omitted because they have
# no effect here.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:29:36,358][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:29:36,377][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:29:36,382][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2585.17it/s]
[2025-09-15 11:29:36,492][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3157.40it/s]
[2025-09-15 11:29:36,532][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3722.20it/s]
[2025-09-15 11:29:36,548][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [264 381 387 295 253 ... 258  64 316 223  44], Length=274
        Val time series IDs: [263 515 197 219 348 ... 337  65 495 325 183], Length=109
        Test time series IDs: [455 285 361 448 333 ... 545 354 265 465 503], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [20]:
# Train data as one combined DataFrame ...
df = disjoint_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as separate DataFrames, one per time series.
dfs = disjoint_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first 10 rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,264.0,0.0,10310.0,512654.0,251354900.0,1811.0,9.9,11.4,2058.0,11.25,14.95,4718.0,25.780001,64.459999,0.879883,0.870117,0.409912,0.429932,3.73,146.460007
1,264.0,1.0,11968.0,947350.0,453898900.0,2149.0,9.59,12.07,2324.0,10.38,13.58,5658.0,25.26,73.970001,0.850098,0.819824,0.379883,0.409912,3.12,142.419998
2,264.0,2.0,34052.0,6666384.0,3961873000.0,2328.0,11.09,19.120001,2246.0,10.7,14.98,9458.0,45.040001,185.130005,0.879883,0.859863,0.429932,0.439941,5.35,144.949997
3,264.0,3.0,61598.0,12421866.0,7313597000.0,2513.0,12.56,26.790001,2346.0,11.73,17.219999,13196.0,65.980003,290.559998,0.890137,0.879883,0.449951,0.449951,11.4,136.190002
4,264.0,4.0,60929.0,6752997.0,3819981000.0,2464.0,13.69,29.16,2252.0,12.51,20.59,13214.0,73.410004,309.869995,0.870117,0.859863,0.5,0.47998,10.24,126.970001
5,264.0,5.0,48781.0,6665884.0,3530089000.0,2533.0,13.26,28.709999,2012.0,10.53,15.48,12167.0,63.700001,263.76001,0.859863,0.859863,0.48999,0.48999,13.78,131.789993
6,264.0,6.0,47264.0,6822807.0,4415642000.0,2528.0,13.17,27.01,2050.0,10.68,16.110001,11916.0,62.060001,253.860001,0.830078,0.819824,0.509766,0.509766,15.11,127.050003
7,264.0,7.0,52247.0,4583420.0,2252856000.0,2688.0,13.05,26.530001,2243.0,10.89,15.76,12372.0,60.060001,252.470001,0.839844,0.830078,0.48999,0.48999,13.22,124.0
8,264.0,8.0,47781.0,5217293.0,2595123000.0,2624.0,11.93,25.68,2196.0,9.98,14.42,12032.0,54.689999,239.259995,0.839844,0.830078,0.48999,0.48999,11.8,126.910004
9,264.0,9.0,49405.0,5741806.0,3705850000.0,2612.0,12.32,25.83,2339.0,11.03,16.65,11833.0,55.82,235.149994,0.810059,0.799805,0.469971,0.459961,12.8,127.190002


In [21]:
# Number of separate train DataFrames; matches the number of train time series.
len(dfs) # every time series has its own dataframe

274

#### Val set

- Affected by `val_workers`.

In [22]:
# Val data as one combined DataFrame ...
df = disjoint_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as separate DataFrames, one per time series.
dfs = disjoint_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first 10 rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,263.0,3359.0,340925.0,16000882.0,15289340000.0,51358.0,4.67,8.85,217371.0,19.76,409.329987,127906.0,11.63,108.379997,0.709961,0.700195,0.509766,0.5,14.03,125.43
1,263.0,3360.0,357883.0,5768504.0,5646213000.0,50552.0,4.75,8.45,243856.0,22.91,474.399994,125098.0,11.75,114.099998,0.700195,0.689941,0.48999,0.47998,12.48,132.240005
2,263.0,3361.0,369431.0,6554477.0,6361014000.0,49192.0,4.75,9.52,250424.0,24.16,517.130005,122779.0,11.84,116.389999,0.700195,0.689941,0.509766,0.5,12.46,127.529999
3,263.0,3362.0,371764.0,9535527.0,10645310000.0,47121.0,4.63,8.01,261170.0,25.66,581.76001,116613.0,11.46,114.290001,0.689941,0.680176,0.5,0.5,12.16,130.589996
4,263.0,3363.0,377352.0,3849618.0,3513866000.0,45349.0,4.4,8.53,268625.0,26.059999,536.409973,116257.0,11.28,113.769997,0.689941,0.680176,0.509766,0.5,12.09,127.879997
5,263.0,3364.0,268254.0,6141949.0,6397246000.0,40429.0,4.04,9.29,185647.0,18.530001,392.48999,97734.0,9.76,91.099998,0.689941,0.680176,0.52002,0.509766,12.74,127.339996
6,263.0,3365.0,309632.0,5639107.0,5155055000.0,46984.0,4.03,8.88,213056.0,18.26,371.220001,104153.0,8.93,77.800003,0.740234,0.72998,0.52002,0.48999,13.84,125.75
7,263.0,3366.0,322451.0,12138919.0,12540280000.0,58794.0,3.91,9.33,202160.0,13.43,298.720001,119932.0,7.97,65.949997,0.779785,0.77002,0.529785,0.48999,16.41,118.419998
8,263.0,3367.0,326949.0,10171018.0,9336324000.0,68122.0,3.88,10.0,199981.0,11.38,247.119995,132178.0,7.52,64.949997,0.799805,0.790039,0.540039,0.5,17.74,120.379997
9,263.0,3368.0,336258.0,11338591.0,10337610000.0,71842.0,3.88,10.39,201013.0,10.87,238.080002,137854.0,7.45,63.5,0.799805,0.790039,0.540039,0.48999,17.940001,115.949997


In [23]:
# Number of separate val DataFrames; matches the number of val time series.
len(dfs) # every time series has its own dataframe

109

#### Test set

- Affected by `test_workers`.

In [24]:
# Test data as one combined DataFrame ...
df = disjoint_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as separate DataFrames, one per time series.
dfs = disjoint_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first 10 rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,455.0,5374.0,612.0,6714.0,2330530.0,113.0,9.42,7.12,115.0,9.58,7.62,211.0,17.58,9.56,0.459961,0.509766,0.439941,0.439941,22.360001,138.119995
1,455.0,5375.0,273.0,2674.0,748132.0,102.0,11.33,6.14,101.0,11.22,6.34,170.0,18.889999,9.36,0.48999,0.560059,0.47998,0.459961,35.689999,110.639999
2,455.0,5376.0,334.0,4157.0,1581494.0,112.0,10.18,7.7,105.0,9.55,6.9,202.0,18.360001,12.28,0.529785,0.620117,0.419922,0.340088,28.26,113.519997
3,455.0,5377.0,281.0,2199.0,584323.0,133.0,16.620001,8.55,98.0,12.25,4.98,199.0,24.879999,12.26,0.569824,0.629883,0.419922,0.330078,35.439999,107.470001
4,455.0,5378.0,2379.0,163763.0,155851400.0,174.0,15.82,12.74,94.0,8.55,6.31,684.0,62.18,67.230003,0.600098,0.629883,0.360107,0.199951,12.17,120.779999
5,455.0,5379.0,4426.0,415156.0,400847000.0,254.0,28.219999,19.860001,119.0,13.22,9.3,1532.0,170.220001,129.229996,0.629883,0.689941,0.330078,0.160034,13.16,134.039993
6,455.0,5380.0,9573.0,1019715.0,1040031000.0,393.0,43.669998,33.029999,143.0,15.89,12.32,3036.0,337.329987,256.170013,0.640137,0.660156,0.439941,0.290039,15.51,104.980003
7,455.0,5381.0,5967.0,453461.0,446408200.0,352.0,29.33,28.16,172.0,14.33,12.65,2447.0,203.919998,207.800003,0.48999,0.5,0.419922,0.300049,19.110001,122.040001
8,455.0,5382.0,6411.0,605779.0,606335300.0,363.0,33.0,29.99,117.0,10.64,9.59,2623.0,238.449997,230.690002,0.649902,0.680176,0.26001,0.150024,36.900002,123.410004
9,455.0,5383.0,4774.0,451653.0,483947500.0,296.0,26.91,24.09,365.0,33.18,72.18,1888.0,171.639999,161.309998,0.48999,0.509766,0.370117,0.27002,14.44,108.169998


In [25]:
# Number of separate test DataFrames; matches the number of test time series.
len(dfs) # every time series has its own dataframe

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` over the set's specified time period.
- Data is returned as one Numpy array.
- Follows similar rules to Dataloader batches, regarding shape (excluding sliding window parameters).
- Workers affect how many processes will be used for loading data for specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured number of workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [26]:
# Config for loading whole sets as Numpy arrays; batch sizes are omitted
# because they have no effect here.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:29:37,377][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:29:37,397][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:29:37,401][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2636.61it/s]
[2025-09-15 11:29:37,510][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3135.16it/s]
[2025-09-15 11:29:37,549][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3595.97it/s]
[2025-09-15 11:29:37,566][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 92  74 433 131 357 ... 358 468 296 378 530], Length=274
        Val time series IDs: [431   7 213 372 522 ... 291 283 329 311 423], Length=109
        Test time series IDs: [142  58 324 466 316 ... 350 481 274 298 525], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [27]:
# Load the whole train set as a single Numpy array.
numpy_array = disjoint_dataset.get_train_numpy(workers="config")

# Shape is (train time series, train time points, features + used ids).
display(numpy_array.shape)

(274, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [28]:
# Load the whole val set as a single Numpy array.
numpy_array = disjoint_dataset.get_val_numpy(workers="config")

# Shape is (val time series, val time points, features + used ids).
display(numpy_array.shape)

(109, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [29]:
# Load the whole test set as a single Numpy array.
numpy_array = disjoint_dataset.get_test_numpy(workers="config")

# Shape is (test time series, test time points, features + used ids).
display(numpy_array.shape)

(54, 1343, 20)

#### Using time_format=TimeFormat.DATETIME

In [30]:
# Same Numpy-loading setup as before, but with time_format=TimeFormat.DATETIME.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:29:37,955][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-15 11:29:37,979][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:29:37,983][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2558.60it/s]
[2025-09-15 11:29:38,095][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2739.60it/s]
[2025-09-15 11:29:38,140][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3479.95it/s]
[2025-09-15 11:29:38,158][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [547  54 381 342  76 ... 204 355 331 238 236], Length=274
        Val time series IDs: [447 425 245 231 488 ...  19 365 478 305 389], Length=109
        Test time series IDs: [142 267 183 400 273 ... 235  21 513  35  32], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [31]:
# With TimeFormat.DATETIME the call returns two values: the data array
# (without the time column) and a separate array of datetimes.
numpy_array, times = disjoint_dataset.get_train_numpy(workers="config")

# Data shape is (train time series, train time points, features + used ids without time).
display(numpy_array.shape)
# One datetime per time point of the train time period.
display(times)

(274, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)