# Loading data with DisjointTimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import DisjointTimeBasedConfig # Disjoint dataset MUST use DisjointTimeBasedConfig

### Setting logger

In [2]:
# Show INFO-and-above records, each tagged with timestamp, logger name and level,
# so the library's progress messages appear under the cells below.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Disjoint time-based dataset over institution subnets, aggregated per hour.
# `data_root` is a placeholder path - presumably the directory holding the dataset files (adjust to your setup).
# `display_details=True` prints the "Dataset details" summary shown below.
disjoint_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.DISJOINT_TIME_BASED,
    display_details=True,
)

[2025-09-06 17:04:55,787][wrapper_dataset][INFO] - Dataset is disjoint_time_based. Use cesnet_tszoo.configs.DisjointTimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using PyTorch DataLoader.
- Last batch is never dropped (unless sliding_window is used).
- Workers affect how many processes will be used for loading data for specific set.
    - Workers set to 0 means loading will be run on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size affects how many time points of every time series will be in one batch (differs when sliding window is used).
- Batch consists of (only when sliding window is not used):
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
config = DisjointTimeBasedConfig(
    # Fractions of the available time series per set
    # (the summary below reports 274 / 109 / 54 of the 548 series).
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Fractions of the time range per set
    # (reported below as range(0, 3359), range(3359, 5374), range(5374, 6717)).
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    # Per-set batch sizes used by the dataloaders.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
# `display_config_details=True` prints the "Config Details" summary shown below.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:04:55,793][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 17:04:55,814][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:04:55,818][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 1300.67it/s]
[2025-09-06 17:04:56,049][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 1222.35it/s]
[2025-09-06 17:04:56,151][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 1148.14it/s]
[2025-09-06 17:04:56,203][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [508 232 110 291 296 ... 251 212 249 347 219], Length=274
        Val time series IDs: [145  79 107 189  24 ... 376 464 412 393 210], Length=109
        Test time series IDs: [223 201 503  25 144 ... 306 333 397 362 339], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the general-purpose updater.
disjoint_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",  # presumably keeps the value already stored in the config - confirm in the library docs
)
# Option 2: the dedicated setter, called here with the same values.
disjoint_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)

[2025-09-06 17:04:56,209][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:04:56,210][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:04:56,210][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:04:56,210][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:04:56,211][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: the general-purpose updater.
disjoint_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
# Option 2: the dedicated setter, called here with the same values.
disjoint_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

[2025-09-06 17:04:56,217][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:04:56,217][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:04:56,218][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:04:56,219][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:04:56,219][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# workers="config" - presumably uses `train_workers` from the config (TODO confirm in the library docs).
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Iterate the loader once and keep every batch; tqdm renders the progress bar.
batches = list(tqdm(dataloader))

# First batch shape: (train time series, train_batch_size, features + ids).
display(batches[0].shape)

[2025-09-06 17:04:56,228][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 45.72it/s]


(274, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# workers="config" - presumably uses `val_workers` from the config (TODO confirm in the library docs).
dataloader = disjoint_dataset.get_val_dataloader(workers="config")

# Iterate the loader once and keep every batch; tqdm renders the progress bar.
batches = list(tqdm(dataloader))

# First batch shape: (val time series, val_batch_size, features + ids).
display(batches[0].shape)

[2025-09-06 17:04:58,536][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 69.43it/s]


(109, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# workers="config" - presumably uses `test_workers` from the config (TODO confirm in the library docs).
dataloader = disjoint_dataset.get_test_dataloader(workers="config")

# Iterate the loader once and keep every batch; tqdm renders the progress bar.
batches = list(tqdm(dataloader))

# First batch shape: (test time series, test_batch_size, features + ids).
display(batches[0].shape)

[2025-09-06 17:04:59,013][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 97.68it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [10]:
config = DisjointTimeBasedConfig(
    # Fractions of the available time series per set.
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Fractions of the time range per set.
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    # DATETIME: batches carry the time axis as a separate array (see the next cell).
    time_format=TimeFormat.DATETIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
# `display_config_details=True` prints the "Config Details" summary shown below.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:04:59,132][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 17:04:59,158][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:04:59,162][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2512.29it/s]
[2025-09-06 17:04:59,276][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2778.65it/s]
[2025-09-06 17:04:59,320][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3445.69it/s]
[2025-09-06 17:04:59,337][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [355 378  59 428 102 ... 366 342  95 506 356], Length=274
        Val time series IDs: [219 184 135  51 449 ... 108 151 475 365 146], Length=109
        Test time series IDs: [463 380 478 503  43 ... 456 282 538 138  85], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [11]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Iterate the loader once and keep every batch; tqdm renders the progress bar.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME every batch is a pair: (data without the time column, times).
display(batches[0][0].shape)  # data without time
display(batches[0][1].shape)  # time

[2025-09-06 17:04:59,345][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 48.34it/s]


(274, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, then it returns all time series of the set, as in the previous examples.
    - When `ts_id` is not None, then it returns only one time series of that specified id.

In [12]:
config = DisjointTimeBasedConfig(
    # Explicit time series IDs for train; val and test sets are not used.
    train_ts=[177, 176, 319, 267],
    val_ts=None,
    test_ts=None,
    train_time_period=0.5,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
# `display_config_details=True` prints the "Config Details" summary shown below.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:05:01,526][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 17:05:01,536][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:05:01,540][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 4/4 [00:00<00:00, 1995.15it/s]
[2025-09-06 17:05:01,545][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [177 176 319 267], Length=4
        Val time series IDs: None
        Test time series IDs: None
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: None
        Test time periods: None
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
      

In [13]:
# Restrict the loader to the single time series with ID 177 (one of the configured train IDs).
dataloader = disjoint_dataset.get_train_dataloader(ts_id=177, workers="config")

# Iterate the loader once and keep every batch; tqdm renders the progress bar.
batches = list(tqdm(dataloader))

# The first dimension is 1 because only one time series is loaded.
display(batches[0].shape)

[2025-09-06 17:05:01,553][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 1813.99it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many time points consecutive time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [14]:
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Explicit, adjacent time ranges for the three sets.
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    # Sliding window: 22 input steps, 2 predicted steps, window advances by 2.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    # With 0.05 the summary below reports val/test periods starting 335 steps
    # earlier than requested (665 instead of 1000, 1165 instead of 1500).
    set_shared_size=0.05,
)
# `display_config_details=True` prints the "Config Details" summary shown below.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:05:01,624][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 17:05:01,643][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:05:01,647][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 7605.22it/s]
[2025-09-06 17:05:01,688][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 6223.93it/s]
[2025-09-06 17:05:01,709][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 6350.55it/s]
[2025-09-06 17:05:01,718][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 92 180  29  62  38 ...  28 390 468 523  73], Length=274
        Val time series IDs: [  2  52 370 270 229 ... 324  13 471 145 380], Length=109
        Test time series IDs: [194 539  69 174 290 ... 228 536 276 100 490], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
   

In [15]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Every batch unpacks into (window, prediction); collect them as tuples.
batches = [
    (sliding_window, sliding_window_prediction)
    for sliding_window, sliding_window_prediction in tqdm(dataloader)
]

[2025-09-06 17:05:01,726][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1162.76it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [16]:
# Option 1: the general-purpose updater.
disjoint_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",  # presumably keeps the value already stored in the config - confirm in the library docs
    set_shared_size="config",
    workers=0,
)
# Option 2: the dedicated setter, called here with the same values.
disjoint_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-06 17:05:02,155][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:05:02,155][disjoint_time_based_config][INFO] - all_batch_size adjusted to 25 as it should be greater than or equal to sliding_window_size + sliding_window_prediction_size.
[2025-09-06 17:05:02,156][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-06 17:05:02,156][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:05:02,157][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:05:02,157][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:05:02,158][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [17]:
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Explicit time ranges; val and test each start 22 steps before the previous range ends.
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    # Sliding window: 22 input steps, 2 predicted steps, window advances by 2.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    # With 100 the summary below reports val/test periods starting at 900 and 1400,
    # i.e. 100 steps before the end of the preceding requested period.
    set_shared_size=100,
)
# `display_config_details=True` prints the "Config Details" summary shown below.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:05:02,163][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 17:05:02,183][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:05:02,186][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 7562.08it/s]
[2025-09-06 17:05:02,227][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 6898.53it/s]
[2025-09-06 17:05:02,247][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 6351.44it/s]
[2025-09-06 17:05:02,258][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [149 125 305 156 189 ... 373  84  20 454  60], Length=274
        Val time series IDs: [364 322 105   3  99 ...  16 141  40 407 146], Length=109
        Test time series IDs: [445 442 179 220 122 ... 210 472 328 298 142], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
  

In [18]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME every batch unpacks into four parts:
# (window, prediction, window times, prediction times); collect them as tuples.
batches = [
    (sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times)
    for sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times in tqdm(dataloader)
]

[2025-09-06 17:05:02,265][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1213.27it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` within the respective set's specified time period.
- Data is returned as Pandas Dataframe.
- Workers affect how many processes will be used for loading data for specific set.
    - Workers set to 0 means loading will be run on the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [19]:
config = DisjointTimeBasedConfig(
    # Fractions of the available time series per set.
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Fractions of the time range per set.
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
# `display_config_details=True` prints the "Config Details" summary shown below.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:05:02,674][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 17:05:02,696][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:05:02,700][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2496.40it/s]
[2025-09-06 17:05:02,815][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2983.73it/s]
[2025-09-06 17:05:02,856][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3470.40it/s]
[2025-09-06 17:05:02,874][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [437 344 480 545 181 ... 404 511 380 415 313], Length=274
        Val time series IDs: [319  34 393 376 487 ... 520  36 184 182  46], Length=109
        Test time series IDs: [ 70 137 316 482 317 ... 461 361 250 186 311], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [20]:
# All train time series in one DataFrame ...
df = disjoint_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... or as a collection with one DataFrame per time series.
dfs = disjoint_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,437.0,0.0,7363.0,293705.0,202098400.0,968.0,16.129999,25.190001,1029.0,17.15,24.719999,3315.0,55.25,117.519997,0.839844,0.859863,0.449951,0.459961,11.67,157.559998
1,437.0,1.0,8498.0,625108.0,413097900.0,1079.0,17.4,27.110001,1153.0,18.6,26.1,3789.0,61.110001,128.990005,0.779785,0.779785,0.5,0.5,16.280001,161.880005
2,437.0,2.0,14758.0,1775400.0,960635200.0,1323.0,20.67,40.16,1096.0,17.120001,23.280001,6153.0,96.139999,248.589996,0.779785,0.790039,0.5,0.5,18.58,154.899994
3,437.0,3.0,12768.0,3458994.0,3069756000.0,1219.0,19.66,39.720001,1024.0,16.52,23.799999,5476.0,88.32,229.139999,0.810059,0.819824,0.5,0.47998,17.290001,154.669998
4,437.0,4.0,10983.0,833532.0,645968000.0,1184.0,20.41,40.880001,944.0,16.280001,24.9,5077.0,87.529999,218.429993,0.850098,0.870117,0.47998,0.449951,12.55,146.820007
5,437.0,5.0,9451.0,1952500.0,1558092000.0,1092.0,19.16,37.810001,876.0,15.37,22.940001,4657.0,81.699997,198.779999,0.759766,0.77002,0.509766,0.47998,13.82,157.190002
6,437.0,6.0,9715.0,1777357.0,1141526000.0,1160.0,19.66,40.040001,882.0,14.95,21.790001,4839.0,82.019997,205.979996,0.799805,0.819824,0.48999,0.469971,16.540001,160.410004
7,437.0,7.0,10663.0,1319190.0,928429000.0,1142.0,19.360001,39.310001,2168.0,36.75,93.720001,4615.0,78.220001,192.669998,0.790039,0.799805,0.5,0.459961,16.51,158.610001
8,437.0,8.0,8987.0,1968387.0,1451183000.0,1084.0,18.07,36.400002,799.0,13.32,18.040001,4430.0,73.830002,186.440002,0.77002,0.77002,0.5,0.5,19.15,155.470001
9,437.0,9.0,9484.0,1523540.0,1213049000.0,1143.0,18.440001,37.209999,938.0,15.13,20.940001,4870.0,78.550003,199.740005,0.810059,0.830078,0.47998,0.469971,19.059999,157.580002


In [21]:
# `dfs` came from `as_single_dataframe=False`, so it holds one DataFrame per
# train time series; its length matches the number of train time series.
len(dfs) # every time series has its own dataframe

274

#### Val set

- Affected by `val_workers`.

In [22]:
# All val time series in one DataFrame ...
df = disjoint_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... or as a collection with one DataFrame per time series.
dfs = disjoint_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,319.0,3359.0,2513.0,21843.0,6066112.0,637.0,8.73,9.02,755.0,10.34,12.36,1850.0,25.34,38.369999,0.700195,0.740234,0.48999,0.540039,27.190001,107.470001
1,319.0,3360.0,2692.0,22687.0,5671834.0,603.0,8.74,8.73,789.0,11.43,15.92,1805.0,26.16,41.18,0.709961,0.740234,0.52002,0.560059,30.889999,100.419998
2,319.0,3361.0,2438.0,19217.0,4471561.0,566.0,8.98,9.06,696.0,11.05,16.84,1612.0,25.59,37.240002,0.640137,0.689941,0.52002,0.560059,30.700001,101.410004
3,319.0,3362.0,2001.0,17110.0,3881513.0,546.0,7.91,7.79,497.0,7.2,6.5,1525.0,22.1,34.299999,0.680176,0.709961,0.529785,0.620117,32.57,112.050003
4,319.0,3363.0,2064.0,17908.0,4640596.0,580.0,8.41,8.09,551.0,7.99,6.38,1622.0,23.51,36.889999,0.640137,0.660156,0.52002,0.600098,32.860001,108.739998
5,319.0,3364.0,1780.0,32680.0,21223516.0,564.0,8.06,9.97,463.0,6.61,5.4,1462.0,20.889999,35.080002,0.790039,0.810059,0.540039,0.589844,26.18,108.75
6,319.0,3365.0,2390.0,83203.0,61611336.0,615.0,9.92,13.68,475.0,7.66,7.47,1762.0,28.42,54.310001,0.759766,0.77002,0.52002,0.569824,26.41,95.949997
7,319.0,3366.0,2652.0,232023.0,235383730.0,715.0,9.79,14.1,601.0,8.23,9.01,1969.0,26.969999,55.939999,0.720215,0.720215,0.549805,0.589844,24.309999,90.389999
8,319.0,3367.0,2617.0,73454.0,55534519.0,670.0,10.81,12.42,633.0,10.21,12.98,1925.0,31.049999,53.240002,0.740234,0.759766,0.569824,0.609863,28.9,89.18
9,319.0,3368.0,2338.0,76602.0,59310406.0,678.0,9.83,12.58,664.0,9.62,13.37,1890.0,27.389999,50.27,0.77002,0.77002,0.540039,0.580078,31.200001,103.830002


In [23]:
# `dfs` came from `as_single_dataframe=False`, so it holds one DataFrame per
# val time series; its length matches the number of val time series.
len(dfs) # every time series has its own dataframe

109

#### Test set

- Affected by `test_workers`.

In [24]:
# All test time series in one DataFrame ...
df = disjoint_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... or as a collection with one DataFrame per time series.
dfs = disjoint_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,70.0,5374.0,23363.0,88104.0,29372510.0,11328.0,4.98,3.89,12727.0,5.6,3.65,20107.0,8.85,12.37,0.620117,0.600098,0.449951,0.459961,8.54,119.510002
1,70.0,5375.0,22926.0,103614.0,40980930.0,11299.0,5.02,4.08,11913.0,5.29,3.52,19943.0,8.85,12.21,0.600098,0.580078,0.459961,0.47998,8.33,119.660004
2,70.0,5376.0,23772.0,118174.0,52400040.0,11018.0,4.97,4.36,11599.0,5.23,3.49,20609.0,9.29,14.58,0.589844,0.580078,0.469971,0.47998,10.06,114.120003
3,70.0,5377.0,23259.0,77200.0,7116808.0,11021.0,4.93,4.26,11557.0,5.17,3.21,20310.0,9.08,14.42,0.589844,0.569824,0.459961,0.459961,8.91,117.589996
4,70.0,5378.0,20283.0,85519.0,19418640.0,9798.0,4.57,4.18,9878.0,4.61,3.12,17773.0,8.29,13.33,0.580078,0.560059,0.459961,0.469971,9.29,117.790001
5,70.0,5379.0,16925.0,63427.0,6448770.0,8545.0,4.1,3.62,8540.0,4.1,2.54,15107.0,7.25,11.06,0.629883,0.620117,0.459961,0.459961,9.21,115.290001
6,70.0,5380.0,17754.0,68388.0,17111520.0,9010.0,4.36,3.68,9370.0,4.54,3.35,15353.0,7.43,11.24,0.640137,0.620117,0.419922,0.429932,8.67,120.269997
7,70.0,5381.0,16157.0,71715.0,27632160.0,8316.0,4.06,3.48,8336.0,4.07,2.53,14481.0,7.07,10.42,0.620117,0.609863,0.459961,0.459961,9.42,116.599998
8,70.0,5382.0,16315.0,959472.0,1339319000.0,8486.0,4.08,3.56,8483.0,4.08,2.51,14574.0,7.01,10.87,0.640137,0.629883,0.459961,0.459961,9.67,117.940002
9,70.0,5383.0,16625.0,91188.0,41657890.0,8450.0,4.08,3.55,8415.0,4.07,2.41,14817.0,7.16,11.15,0.600098,0.589844,0.469971,0.469971,10.12,116.160004


In [25]:
# `dfs` came from `as_single_dataframe=False`, so it holds one DataFrame per
# test time series; its length matches the number of test time series.
len(dfs) # every time series has its own dataframe

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` within the respective set's specified time period.
- Data is returned as one Numpy array.
- Follows similar rules to Dataloader batches, regarding shape (excluding sliding window parameters).
- Workers affect how many processes will be used for loading data for specific set.
    - Workers set to 0 means loading will be run on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [26]:
config = DisjointTimeBasedConfig(
    # Fractions of the available time series per set.
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Fractions of the time range per set.
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
# `display_config_details=True` prints the "Config Details" summary shown below.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:05:03,705][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 17:05:03,725][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:05:03,728][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2423.24it/s]
[2025-09-06 17:05:03,848][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3044.69it/s]
[2025-09-06 17:05:03,890][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3174.74it/s]
[2025-09-06 17:05:03,910][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [259 461  79  65 146 ... 234 362 189  97 334], Length=274
        Val time series IDs: [527 337 306 216 231 ...  42 187 492 479  33], Length=109
        Test time series IDs: [ 88  10 169 340  61 ... 531 512 355 379 109], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [27]:
# Load the whole train set as a single array.
numpy_array = disjoint_dataset.get_train_numpy(workers="config")

# Displayed shape is (274, 3359, 20): train time series x train time points x columns
# (presumably the 18 taken features plus the time series id and time id).
display(numpy_array.shape)

(274, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [28]:
# Load the whole val set as a single array.
numpy_array = disjoint_dataset.get_val_numpy(workers="config")

# Displayed shape is (109, 2015, 20): val time series x val time points x columns
# (presumably the 18 taken features plus the time series id and time id).
display(numpy_array.shape)

(109, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [29]:
# Load the whole test set as a single array.
numpy_array = disjoint_dataset.get_test_numpy(workers="config")

# Displayed shape is (54, 1343, 20): test time series x test time points x columns
# (presumably the 18 taken features plus the time series id and time id).
display(numpy_array.shape)

(54, 1343, 20)

#### Using time_format=TimeFormat.DATETIME

In [30]:
config = DisjointTimeBasedConfig(
    # Fractions of the available time series per set.
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    # Fractions of the time range per set.
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    # DATETIME: the time axis is returned as a separate array (see the next cell).
    time_format=TimeFormat.DATETIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
# `display_config_details=True` prints the "Config Details" summary shown below.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:05:04,313][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 17:05:04,337][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:05:04,341][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2644.87it/s]
[2025-09-06 17:05:04,450][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3067.99it/s]
[2025-09-06 17:05:04,489][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3722.33it/s]
[2025-09-06 17:05:04,506][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [435 246   1  24 114 ... 413 365 511  27  62], Length=274
        Val time series IDs: [427 202 377 378 479 ... 451 358 151 326 445], Length=109
        Test time series IDs: [352 112 167 129 540 ...  51  98 222 389 526], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [31]:
# With TimeFormat.DATETIME the call returns two values: the data array and the time axis.
numpy_array, times = disjoint_dataset.get_train_numpy(workers="config")

# Data shape is (274, 3359, 19): one column fewer than with ID_TIME, since time is returned separately.
display(numpy_array.shape)
# `times` is an object array of 3359 timezone-aware (UTC) datetimes, one per train time point.
display(times)

(274, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)