# Loading data with DisjointTimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import DisjointTimeBasedConfig # Disjoint dataset MUST use DisjointTimeBasedConfig

### Setting logger

In [2]:
# Show INFO-level (and above) records, each tagged with timestamp, logger name and level.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

### Preparing dataset

In [3]:
# Open CESNET-TimeSeries24 (institution subnets, 1-hour aggregation) as a disjoint-time-based dataset.
# `display_details=True` prints the dataset summary shown below the cell.
disjoint_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.DISJOINT_TIME_BASED,
    display_details=True,
)

[2025-09-14 15:51:44,391][wrapper_dataset][INFO] - Dataset is disjoint_time_based. Use cesnet_tszoo.configs.DisjointTimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using the PyTorch DataLoader.
- The last batch is never dropped (unless sliding_window is used).
- Workers determine how many processes are used for loading data of a specific set.
    - Setting workers to 0 means loading runs on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size determines how many time points of every time series are in one batch (this differs when sliding window is used).
- When sliding window is not used, a batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
config = DisjointTimeBasedConfig(
    # Fractions of the available time series assigned to the train / val / test sets.
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Fractions of the time axis assigned to the train / val / test sets.
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
# Apply the config to the dataset and print the resulting "Config Details".
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:51:44,395][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 15:51:44,416][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:51:44,421][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 1318.28it/s]
[2025-09-14 15:51:44,647][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 1241.61it/s]
[2025-09-14 15:51:44,745][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 1219.67it/s]
[2025-09-14 15:51:44,795][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 48 490 109  12 495 ... 292 478 328 263 188], Length=274
        Val time series IDs: [491 520 152  49 162 ... 251 394 380 123 514], Length=109
        Test time series IDs: [250 344 204  73 236 ... 369 352 398 366  65], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Both calls below receive the same arguments; either one is enough on its own.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm in the library docs.
new_batch_sizes = dict(train_batch_size=33, val_batch_size=65, test_batch_size="config")

disjoint_dataset.update_dataset_config_and_initialize(**new_batch_sizes)
# Or
disjoint_dataset.set_batch_sizes(**new_batch_sizes)

[2025-09-14 15:51:44,804][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:51:44,805][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:51:44,808][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:51:44,809][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:51:44,809][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Both calls below receive the same arguments; either one is enough on its own.
new_workers = dict(train_workers=0, val_workers=0, test_workers=0, init_workers=0)

disjoint_dataset.update_dataset_config_and_initialize(**new_workers)
# Or
disjoint_dataset.set_workers(**new_workers)

[2025-09-14 15:51:44,817][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:51:44,817][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:51:44,822][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:51:44,822][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:51:44,823][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Train dataloader.
# NOTE(review): workers="config" presumably reuses `train_workers` from the config - confirm.
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Iterate the loader once behind a progress bar and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch; the output below shows (274, 32, 20) for this config.
display(batches[0].shape)

[2025-09-14 15:51:44,832][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 44.97it/s]


(274, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Validation dataloader.
# NOTE(review): workers="config" presumably reuses `val_workers` from the config - confirm.
dataloader = disjoint_dataset.get_val_dataloader(workers="config")

# Iterate the loader once behind a progress bar and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch; the output below shows (109, 64, 20) for this config.
display(batches[0].shape)

[2025-09-14 15:51:47,180][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 67.00it/s]


(109, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Test dataloader.
# NOTE(review): workers="config" presumably reuses `test_workers` from the config - confirm.
dataloader = disjoint_dataset.get_test_dataloader(workers="config")

# Iterate the loader once behind a progress bar and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch; the output below shows (54, 128, 20) for this config.
display(batches[0].shape)

[2025-09-14 15:51:47,675][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 99.21it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [10]:
# Same split as before, but with time returned as datetimes (TimeFormat.DATETIME).
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:51:47,795][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 15:51:47,819][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:51:47,823][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2457.24it/s]
[2025-09-14 15:51:47,940][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3030.69it/s]
[2025-09-14 15:51:47,980][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 2898.32it/s]
[2025-09-14 15:51:48,001][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 37  97 374 419 461 ... 125  72 395 158 491], Length=274
        Val time series IDs: [298  30 247  79 429 ... 328 218  70  64 192], Length=109
        Test time series IDs: [205   9  27 333 160 ... 542 101 392 170 502], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [11]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Iterate the loader once behind a progress bar and keep every batch.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch is a pair: element 0 is the data, element 1 the times.
first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-14 15:51:48,059][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 46.69it/s]


(274, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, all time series of the set are returned, as in the previous examples.
    - When `ts_id` is not None, then it returns only one time series of that specified id.

In [12]:
# Train set made of four explicitly chosen time series; val and test sets are left unset (None).
config = DisjointTimeBasedConfig(
    train_ts=[177, 176, 319, 267],
    val_ts=None,
    test_ts=None,
    train_time_period=0.5,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:51:50,316][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 15:51:50,326][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:51:50,329][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 4/4 [00:00<00:00, 2000.86it/s]
[2025-09-14 15:51:50,334][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [177 176 319 267], Length=4
        Val time series IDs: None
        Test time series IDs: None
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: None
        Test time periods: None
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
      

In [13]:
# Request only the time series with id 177 from the train set.
dataloader = disjoint_dataset.get_train_dataloader(ts_id=177, workers="config")

# Iterate the loader once behind a progress bar and keep every batch.
batches = list(tqdm(dataloader))

# The first axis is now 1: the output below shows (1, 32, 20).
display(batches[0].shape)

[2025-09-14 15:51:50,342][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 2049.62it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many time points adjacent time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [14]:
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Explicit index ranges instead of fractions of the time axis.
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    # Sliding window: 22 input steps, 2 predicted steps, window moved by 2 steps.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    # With this shared size the printed config shows val/test periods starting at 665 / 1165.
    set_shared_size=0.05,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:51:50,406][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 15:51:50,425][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:51:50,429][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 7191.11it/s]
[2025-09-14 15:51:50,472][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 6051.43it/s]
[2025-09-14 15:51:50,495][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 5748.68it/s]
[2025-09-14 15:51:50,506][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [268  51 272 175 535 ... 457 215 325   6   1], Length=274
        Val time series IDs: [257 351 397  50 209 ... 193 121 201 178  54], Length=109
        Test time series IDs: [146  24 446 424 224 ... 274 286 186 442 404], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handle

In [15]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Each item yielded by the loader unpacks into (sliding window, prediction window); keep them as tuples.
batches = [
    (window, prediction)
    for window, prediction in tqdm(dataloader)
]

[2025-09-14 15:51:50,516][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1114.26it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [16]:
# Both calls below receive the same arguments; either one is enough on its own.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm in the library docs.
new_sliding_window = dict(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

disjoint_dataset.update_dataset_config_and_initialize(**new_sliding_window)
# Or
disjoint_dataset.set_sliding_window(**new_sliding_window)

[2025-09-14 15:51:50,966][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:51:50,967][disjoint_time_based_config][INFO] - all_batch_size adjusted to 25 as it should be greater than or equal to sliding_window_size + sliding_window_prediction_size.
[2025-09-14 15:51:50,967][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-14 15:51:50,968][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:51:50,971][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:51:50,972][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:51:50,972][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [17]:
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    # Time is returned as datetimes, separately from the data arrays.
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    # With this shared size the printed config shows val/test periods starting at 900 / 1400.
    set_shared_size=100,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:51:50,977][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 15:51:50,998][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:51:51,003][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 7500.53it/s]
[2025-09-14 15:51:51,044][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 7258.54it/s]
[2025-09-14 15:51:51,065][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 6343.44it/s]
[2025-09-14 15:51:51,076][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [195 286 181 426 220 ... 353  86 304  80 285], Length=274
        Val time series IDs: [ 10 152 191 365 167 ... 399 521 299 514  45], Length=109
        Test time series IDs: [440 256 436 530 263 ... 122 133 318 329 209], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handl

In [18]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME each item unpacks into four parts:
# (sliding window, prediction window, sliding window times, prediction window times).
batches = [
    (window, prediction, window_times, prediction_times)
    for window, prediction, window_times, prediction_times in tqdm(dataloader)
]

[2025-09-14 15:51:51,085][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1147.58it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` over the set's specified time period.
- Data is returned as a Pandas DataFrame.
- Workers determine how many processes are used for loading data of a specific set.
    - Setting workers to 0 means loading runs on the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [19]:
# No batch sizes are passed here.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:51:51,519][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 15:51:51,539][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:51:51,544][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2569.86it/s]
[2025-09-14 15:51:51,655][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3052.77it/s]
[2025-09-14 15:51:51,696][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3595.97it/s]
[2025-09-14 15:51:51,713][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [410 140 364 293  46 ... 428 472 320 246 237], Length=274
        Val time series IDs: [329 262 429 115 415 ... 524 160 265 343  48], Length=109
        Test time series IDs: [368 295 438 200 475 ... 416 508 118 367   6], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [20]:
# All train time series stacked into one dataframe ...
df = disjoint_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... versus a collection holding one dataframe per time series.
dfs = disjoint_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined dataframe.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,410.0,0.0,17932.0,309496.0,244636300.0,4357.0,11.97,22.299999,8951.0,24.59,73.989998,11634.0,31.959999,80.099998,0.819824,0.810059,0.5,0.589844,19.370001,126.889999
1,410.0,1.0,19346.0,206997.0,114085900.0,4652.0,13.1,25.059999,8921.0,25.129999,77.120003,12190.0,34.34,87.699997,0.790039,0.77002,0.509766,0.589844,16.84,123.580002
2,410.0,2.0,18598.0,280303.0,219251700.0,4455.0,12.62,25.280001,8129.0,23.030001,72.589996,11576.0,32.790001,86.059998,0.799805,0.779785,0.52002,0.600098,18.68,121.330002
3,410.0,3.0,21700.0,260579.0,154454800.0,4779.0,12.54,27.15,7619.0,20.0,63.240002,12871.0,33.779999,100.279999,0.819824,0.799805,0.529785,0.580078,18.450001,114.690002
4,410.0,4.0,26898.0,634499.0,451920700.0,4980.0,13.17,32.689999,6558.0,17.35,53.560001,14187.0,37.529999,134.160004,0.810059,0.790039,0.52002,0.560059,22.809999,111.080002
5,410.0,5.0,33964.0,1723201.0,1950019000.0,5593.0,14.3,39.939999,5868.0,15.01,47.650002,17026.0,43.540001,177.419998,0.830078,0.810059,0.529785,0.549805,24.870001,113.57
6,410.0,6.0,37365.0,602967.0,414466400.0,6048.0,15.51,42.009998,6164.0,15.81,45.66,18425.0,47.240002,192.990005,0.810059,0.790039,0.52002,0.560059,26.41,116.209999
7,410.0,7.0,45452.0,1078724.0,964529300.0,6374.0,15.4,44.779999,6608.0,15.96,46.360001,20888.0,50.450001,222.699997,0.810059,0.799805,0.52002,0.549805,26.309999,114.980003
8,410.0,8.0,49490.0,912496.0,608659500.0,6643.0,16.860001,47.669998,6650.0,16.879999,49.369999,22070.0,56.02,243.809998,0.830078,0.810059,0.509766,0.529785,26.809999,113.089996
9,410.0,9.0,58177.0,1722207.0,1640971000.0,7190.0,17.709999,52.860001,6791.0,16.73,52.860001,25108.0,61.84,281.290009,0.819824,0.810059,0.5,0.52002,28.41,111.419998


In [21]:
# Every time series has its own dataframe, so this count matches the number of
# train time series (274 in the output below).
len(dfs)

274

#### Val set

- Affected by `val_workers`.

In [22]:
# All validation time series stacked into one dataframe ...
df = disjoint_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... versus a collection holding one dataframe per time series.
dfs = disjoint_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined dataframe.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,329.0,3359.0,8801.0,421698.0,446004982.0,1058.0,18.889999,20.52,1013.0,18.09,39.599998,4589.0,81.949997,89.300003,0.070007,0.080017,0.48999,0.47998,16.85,132.490005
1,329.0,3360.0,8160.0,27166.0,4713643.0,978.0,16.860001,17.66,975.0,16.809999,36.75,4298.0,74.099998,79.050003,0.130005,0.140015,0.5,0.469971,17.549999,137.029999
2,329.0,3361.0,8265.0,34283.0,13401347.0,955.0,17.360001,17.84,1120.0,20.360001,45.84,4435.0,80.639999,78.360001,0.080017,0.099976,0.47998,0.459961,19.719999,127.709999
3,329.0,3362.0,8185.0,81766.0,84602388.0,987.0,17.620001,17.85,1066.0,19.040001,43.490002,4445.0,79.379997,76.089996,0.119995,0.130005,0.5,0.469971,16.48,133.580002
4,329.0,3363.0,9643.0,27651.0,4389521.0,1061.0,19.65,23.02,1146.0,21.219999,49.459999,5092.0,94.300003,107.580002,0.109985,0.130005,0.48999,0.459961,17.74,132.550003
5,329.0,3364.0,10768.0,40380.0,11578536.0,1173.0,21.719999,29.08,977.0,18.09,39.849998,5256.0,97.330002,135.419998,0.170044,0.180054,0.5,0.469971,15.39,125.93
6,329.0,3365.0,11691.0,281858.0,280245160.0,1299.0,24.51,34.900002,911.0,17.190001,37.799999,5729.0,108.089996,163.679993,0.150024,0.160034,0.529785,0.5,21.67,123.190002
7,329.0,3366.0,11387.0,889783.0,993538285.0,1381.0,24.66,33.830002,1117.0,19.950001,40.619999,5673.0,101.300003,158.949997,0.199951,0.199951,0.5,0.469971,24.25,129.419998
8,329.0,3367.0,11617.0,104836.0,86513503.0,1439.0,26.65,35.599998,1229.0,22.76,42.200001,5936.0,109.93,158.960007,0.099976,0.109985,0.5,0.469971,19.049999,118.599998
9,329.0,3368.0,11424.0,204062.0,229446527.0,1487.0,27.040001,34.990002,1185.0,21.549999,40.360001,6016.0,109.379997,160.220001,0.160034,0.150024,0.48999,0.459961,18.68,126.690002


In [23]:
# Every time series has its own dataframe, so this count matches the number of
# validation time series (109 in the output below).
len(dfs)

109

#### Test set

- Affected by `test_workers`.

In [24]:
# All test time series stacked into one dataframe ...
df = disjoint_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... versus a collection holding one dataframe per time series.
dfs = disjoint_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined dataframe.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,368.0,5374.0,1458.0,17303.0,6953577.0,148.0,13.45,11.86,85.0,7.73,4.88,443.0,40.27,35.799999,0.879883,0.870117,0.449951,0.330078,3.3,134.429993
1,368.0,5375.0,1412.0,17138.0,5987169.0,110.0,10.0,7.75,82.0,7.45,5.28,396.0,36.0,32.200001,0.910156,0.930176,0.569824,0.449951,5.52,160.75
2,368.0,5376.0,1371.0,17381.0,5405039.0,111.0,12.33,8.05,88.0,9.78,5.52,384.0,42.669998,30.18,0.97998,0.990234,0.5,0.310059,3.86,131.539993
3,368.0,5377.0,1793.0,38166.0,26492680.0,195.0,21.67,15.57,108.0,12.0,7.43,690.0,76.669998,60.619999,0.97998,0.990234,0.5,0.320068,5.78,144.160004
4,368.0,5378.0,1972.0,55882.0,42757330.0,230.0,20.91,18.209999,128.0,11.64,9.53,793.0,72.089996,69.879997,0.830078,0.839844,0.52002,0.379883,4.47,166.720001
5,368.0,5379.0,7560.0,511275.0,521285800.0,396.0,36.0,35.950001,239.0,21.73,19.620001,2744.0,249.449997,315.079987,0.850098,0.839844,0.399902,0.300049,6.22,139.520004
6,368.0,5380.0,18508.0,1538049.0,1512350000.0,621.0,88.709999,38.459999,256.0,36.57,16.360001,5949.0,849.859985,386.390015,0.890137,0.890137,0.350098,0.290039,12.84,112.139999
7,368.0,5381.0,15613.0,2731280.0,3162274000.0,610.0,55.450001,51.650002,260.0,23.639999,20.040001,5474.0,497.640015,483.929993,0.819824,0.790039,0.320068,0.23999,9.04,133.740005
8,368.0,5382.0,19500.0,2558341.0,2639545000.0,690.0,62.73,59.400002,211.0,19.18,16.1,6570.0,597.27002,582.400024,0.899902,0.879883,0.409912,0.320068,10.28,145.240005
9,368.0,5383.0,18765.0,1726645.0,1714331000.0,683.0,75.889999,54.810001,187.0,20.780001,13.39,6866.0,762.890015,571.5,0.910156,0.870117,0.399902,0.290039,12.96,124.43


In [25]:
# Every time series has its own dataframe, so this count matches the number of
# test time series (54 in the output below).
len(dfs)

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` over the set's specified time period.
- Data is returned as one Numpy array.
- Follows similar rules to Dataloader batches, regarding shape (excluding sliding window parameters).
- Workers determine how many processes are used for loading data of a specific set.
    - Setting workers to 0 means loading runs on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [26]:
# No batch sizes are passed here.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:51:52,456][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 15:51:52,479][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:51:52,483][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2455.77it/s]
[2025-09-14 15:51:52,600][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2984.80it/s]
[2025-09-14 15:51:52,641][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3371.38it/s]
[2025-09-14 15:51:52,660][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 89 183 202  64 307 ... 310 486 393 443 207], Length=274
        Val time series IDs: [ 61  34 219 541 399 ...  11 413 181 468 330], Length=109
        Test time series IDs: [272 509 494 346  13 ... 410  76 446 373 129], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [27]:
# Load the whole train set as one Numpy array; workers="config" defers to the
# worker count set in the config (`train_workers`).
numpy_array = disjoint_dataset.get_train_numpy(workers="config")

# Per the displayed output, the axes are (time series, time steps, features).
display(numpy_array.shape)

(274, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [28]:
# Load the whole validation set as one Numpy array; workers="config" defers to
# the worker count set in the config (`val_workers`).
numpy_array = disjoint_dataset.get_val_numpy(workers="config")

# Per the displayed output, the axes are (time series, time steps, features).
display(numpy_array.shape)

(109, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [29]:
# Load the whole test set as one Numpy array; workers="config" defers to the
# worker count set in the config (`test_workers`).
numpy_array = disjoint_dataset.get_test_numpy(workers="config")

# Per the displayed output, the axes are (time series, time steps, features).
display(numpy_array.shape)

(54, 1343, 20)

#### Using time_format=TimeFormat.DATETIME

In [30]:
# Same split as the ID_TIME example, but with time expressed as datetimes.
config = DisjointTimeBasedConfig(
    # Fraction of the available time series assigned to each set.
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Fraction of the full time range assigned to each set.
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # 0 workers -> loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

# Apply the configuration to the dataset and print the resolved details.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:51:53,033][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-14 15:51:53,056][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:51:53,062][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2589.28it/s]
[2025-09-14 15:51:53,173][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2811.87it/s]
[2025-09-14 15:51:53,217][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3476.64it/s]
[2025-09-14 15:51:53,234][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [223 516 376 340 250 ... 414 293 140 380  43], Length=274
        Val time series IDs: [255 131 194 417 118 ... 355 436 264 184 312], Length=109
        Test time series IDs: [ 81 527 397  10 211 ... 289 458 162 395   3], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [31]:
# With time_format=TimeFormat.DATETIME the call yields two values: the data
# array and a separate array of datetimes (one entry per time step).
numpy_array, times = disjoint_dataset.get_train_numpy(workers="config")

# The feature axis is one smaller here (19) than in the ID_TIME example (20).
display(numpy_array.shape)
display(times)

(274, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)