# Loading data with DisjointTimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import DisjointTimeBasedConfig # Disjoint dataset MUST use DisjointTimeBasedConfig

### Setting logger

In [2]:
# Emit INFO-level records with timestamp, logger name and level in the prefix.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Open the hourly institution-subnet data as a disjoint time-based dataset
# and print its details.
disjoint_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.DISJOINT_TIME_BASED,
    display_details=True,
)

[2025-08-19 12:02:27,240][wrapper_dataset][INFO] - Dataset is disjoint_time_based. Use cesnet_tszoo.configs.DisjointTimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using Pytorch Dataloader.
- Last batch is never dropped (unless sliding_window is used).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size affects how many time points of every time series will be in one batch (differs when sliding window is used).
- Batch consists of (only when sliding window is not used):
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# Config with ID_TIME time format, all features, per-set batch sizes and no worker processes.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)

# Apply the config to the dataset and print the resulting config details.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:02:27,245][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-08-19 12:02:27,266][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:02:27,270][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 1309.00it/s]
[2025-08-19 12:02:27,498][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 1205.64it/s]
[2025-08-19 12:02:27,601][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 1183.63it/s]
[2025-08-19 12:02:27,653][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [151 138  51 433 280 ... 420  90 472 189 511], Length=274
        Val time series IDs: [504 266 499 235 415 ... 407 354 308 231 286], Length=109
        Test time series IDs: [533 457 520 226 245 ... 257 259 547  91  15], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Variant 1: change batch sizes through the general config-update entry point.
disjoint_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)

# Variant 2: the dedicated setter, called with the same values.
disjoint_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)

[2025-08-19 12:02:27,659][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:02:27,659][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:02:27,660][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:02:27,660][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:02:27,660][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Variant 1: change worker counts through the general config-update entry point.
disjoint_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

# Variant 2: the dedicated setter, called with the same values.
disjoint_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

[2025-08-19 12:02:27,667][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:02:27,668][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:02:27,668][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:02:27,669][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:02:27,670][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Walk the whole train dataloader once (with a progress bar), keeping every batch.
dataloader = disjoint_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first collected batch.
display(batches[0].shape)

[2025-08-19 12:02:27,679][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 44.31it/s]


(274, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Walk the whole val dataloader once (with a progress bar), keeping every batch.
dataloader = disjoint_dataset.get_val_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first collected batch.
display(batches[0].shape)

[2025-08-19 12:02:30,060][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 68.34it/s]


(109, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Walk the whole test dataloader once (with a progress bar), keeping every batch.
dataloader = disjoint_dataset.get_test_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first collected batch.
display(batches[0].shape)

[2025-08-19 12:02:30,542][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 98.65it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [10]:
# Same setup as the ID_TIME example, but with time_format=TimeFormat.DATETIME.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)

# Apply the config to the dataset and print the resulting config details.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:02:30,662][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-08-19 12:02:30,685][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:02:30,688][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2667.71it/s]
[2025-08-19 12:02:30,796][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3301.58it/s]
[2025-08-19 12:02:30,834][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3482.94it/s]
[2025-08-19 12:02:30,851][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [205 461 243 161  28 ... 133 287 478 395  46], Length=274
        Val time series IDs: [406  77  27 437 506 ...  83 225 102  17 495], Length=109
        Test time series IDs: [336 316 503 487 107 ... 499 305 118 204 190], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [11]:
# Walk the whole train dataloader once (with a progress bar), keeping every batch.
dataloader = disjoint_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch is indexable: element 0 is the data, element 1 the time.
display(batches[0][0].shape)  # data without time
display(batches[0][1].shape)  # time

[2025-08-19 12:02:30,861][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 51.74it/s]


(274, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the one time series with the specified id.

In [12]:
# Train set given as an explicit list of time series ids; val and test sets are disabled.
config = DisjointTimeBasedConfig(
    train_ts=[177, 176, 319, 267],
    val_ts=None,
    test_ts=None,
    train_time_period=0.5,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)

# Apply the config to the dataset and print the resulting config details.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:02:32,901][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-08-19 12:02:32,911][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:02:32,914][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 4/4 [00:00<00:00, 1334.60it/s]
[2025-08-19 12:02:32,919][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [177 176 319 267], Length=4
        Val time series IDs: None
        Test time series IDs: None
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: None
        Test time periods: None
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
      

In [13]:
# Request a train dataloader restricted to the time series with id 177.
dataloader = disjoint_dataset.get_train_dataloader(ts_id=177, workers="config")

# Collect every batch while showing a progress bar.
batches = list(tqdm(dataloader))

# Shape of the first collected batch.
display(batches[0].shape)

[2025-08-19 12:02:32,927][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 2121.63it/s]


(1, 32, 20)

#### Sliding window

- Both `sliding_window_size` and `sliding_window_prediction_size` must be set if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many time points the time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [14]:
# Sliding-window config: explicit time ranges, one feature, and a fractional shared size.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)

# Apply the config to the dataset and print the resulting config details.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:02:32,988][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-08-19 12:02:33,007][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:02:33,010][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 8255.79it/s]
[2025-08-19 12:02:33,048][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 6808.02it/s]
[2025-08-19 12:02:33,068][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 7193.66it/s]
[2025-08-19 12:02:33,078][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [163 293 524 238 429 ... 323 464 356 376 103], Length=274
        Val time series IDs: [314 158 269 472  18 ...  15 340 243 160 317], Length=109
        Test time series IDs: [262 364 152 214 187 ...  49 468 143 181 334], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
       

In [15]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Each item unpacks into (window, prediction); keep every pair as a tuple.
batches = [
    (window, prediction)
    for window, prediction in tqdm(dataloader)
]

[2025-08-19 12:02:33,086][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1281.26it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [16]:
# Variant 1: change sliding window parameters through the general config-update entry point.
disjoint_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

# Variant 2: the dedicated setter, called with the same values.
disjoint_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-08-19 12:02:33,472][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:02:33,473][disjoint_time_based_config][INFO] - all_batch_size adjusted to 25 as it should be greater than or equal to sliding_window_size + sliding_window_prediction_size.
[2025-08-19 12:02:33,473][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-08-19 12:02:33,474][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:02:33,474][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:02:33,475][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:02:33,475][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [17]:
# Sliding-window config with time_format=TimeFormat.DATETIME and an absolute shared size.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)

# Apply the config to the dataset and print the resulting config details.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:02:33,480][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-08-19 12:02:33,500][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:02:33,504][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 8958.97it/s]
[2025-08-19 12:02:33,539][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 8379.38it/s]
[2025-08-19 12:02:33,557][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 7585.65it/s]
[2025-08-19 12:02:33,566][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [507  54 115 234 427 ... 308 211 276 418 178], Length=274
        Val time series IDs: [106 425   2 290  94 ... 314  51 286 200 468], Length=109
        Test time series IDs: [265  18 414 281 145 ... 320 374 171 319 310], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
      

In [18]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME each item unpacks into four parts:
# window, prediction, window times, prediction times. Keep them as a tuple.
batches = [
    (window, prediction, window_times, prediction_times)
    for window, prediction, window_times, prediction_times in tqdm(dataloader)
]

[2025-08-19 12:02:33,574][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1280.88it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` over the set's specified time period.
- Data is returned as Pandas Dataframe.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [19]:
# Config for the DataFrame examples; no batch sizes are passed here.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

# Apply the config to the dataset and print the resulting config details.
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:02:33,961][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-08-19 12:02:33,979][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:02:33,984][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2885.26it/s]
[2025-08-19 12:02:34,083][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2654.59it/s]
[2025-08-19 12:02:34,129][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3853.88it/s]
[2025-08-19 12:02:34,144][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 43 345 130 428 406 ...  19 227 372  16 174], Length=274
        Val time series IDs: [254 375 112 466 303 ... 323 396 499  60 106], Length=109
        Test time series IDs: [378 176 165 241 483 ...  95 133 224 486 440], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [20]:
# Load the train set twice: once with as_single_dataframe=True (one DataFrame)
# and once with as_single_dataframe=False (displayed in the next cell).
df = disjoint_dataset.get_train_df(as_single_dataframe=True, workers="config")
dfs = disjoint_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame result.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,43.0,0.0,1920.0,4620.0,479213.0,455.0,10.58,3.35,378.0,8.79,5.2,1692.0,39.349998,12.05,0.409912,0.580078,0.5,0.529785,5.92,159.660004
1,43.0,1.0,1853.0,4762.0,519159.0,427.0,10.17,3.12,346.0,8.24,3.47,1663.0,39.599998,16.309999,0.439941,0.620117,0.5,0.540039,5.52,156.440002
2,43.0,2.0,1675.0,6291.0,3576567.0,410.0,9.53,2.81,333.0,7.74,4.43,1441.0,33.509998,11.17,0.439941,0.609863,0.47998,0.48999,6.2,153.279999
3,43.0,3.0,1564.0,6464.0,4600882.0,390.0,9.29,3.81,287.0,6.83,4.7,1359.0,32.360001,10.89,0.370117,0.52002,0.459961,0.5,5.35,162.5
4,43.0,4.0,1198.0,2806.0,269909.0,291.0,6.93,2.44,210.0,5.0,3.08,1083.0,25.790001,8.45,0.320068,0.469971,0.48999,0.5,6.08,157.509995
5,43.0,5.0,968.0,2894.0,430395.0,303.0,7.21,2.34,239.0,5.69,2.97,890.0,21.190001,6.63,0.449951,0.589844,0.52002,0.569824,5.97,153.179993
6,43.0,6.0,1032.0,2940.0,325876.0,347.0,7.89,2.87,247.0,5.61,2.91,956.0,21.73,7.57,0.48999,0.620117,0.449951,0.459961,6.51,149.399994
7,43.0,7.0,933.0,2543.0,335667.0,268.0,6.38,2.47,197.0,4.69,2.23,849.0,20.209999,7.71,0.429932,0.569824,0.48999,0.5,6.89,155.429993
8,43.0,8.0,1092.0,2762.0,273003.0,344.0,8.19,3.95,241.0,5.74,3.71,995.0,23.690001,9.42,0.389893,0.52002,0.509766,0.549805,6.35,149.5
9,43.0,9.0,1141.0,29686.0,4743319.0,368.0,8.36,3.51,285.0,6.48,3.89,1043.0,23.700001,8.32,0.47998,0.600098,0.459961,0.469971,5.76,150.75


In [21]:
# Result of get_train_df(as_single_dataframe=False); its repr shows a list of DataFrames.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets    n_bytes  \
 0                      43.0      0.0   1920.0     4620.0   479213.0   
 1                      43.0      1.0   1853.0     4762.0   519159.0   
 2                      43.0      2.0   1675.0     6291.0  3576567.0   
 3                      43.0      3.0   1564.0     6464.0  4600882.0   
 4                      43.0      4.0   1198.0     2806.0   269909.0   
 ...                     ...      ...      ...        ...        ...   
 3354                   43.0   3354.0   1716.0     6741.0   966350.0   
 3355                   43.0   3355.0   1800.0     5815.0   740944.0   
 3356                   43.0   3356.0   1622.0     5168.0   627765.0   
 3357                   43.0   3357.0   1992.0     6461.0   864905.0   
 3358                   43.0   3358.0   2156.0     7161.0   871972.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0              455.0           10.58            3.35      

#### Val set

- Affected by `val_workers`.

In [22]:
# Load the val set twice: once with as_single_dataframe=True (one DataFrame)
# and once with as_single_dataframe=False (displayed in the next cell).
df = disjoint_dataset.get_val_df(as_single_dataframe=True, workers="config")
dfs = disjoint_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame result.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,254.0,3359.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
1,254.0,3360.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
2,254.0,3361.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
3,254.0,3362.0,7.0,30.0,6613.0,6.0,3.0,1.41,7.0,3.5,0.71,6.0,3.0,1.41,0.910156,0.930176,0.290039,0.409912,3.3,114.709999
4,254.0,3363.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
5,254.0,3364.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
6,254.0,3365.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
7,254.0,3366.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
8,254.0,3367.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
9,254.0,3368.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0


In [23]:
# Result of get_val_df(as_single_dataframe=False); its repr shows a list of DataFrames.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets    n_bytes  \
 0                     254.0   3359.0      0.0        0.0        0.0   
 1                     254.0   3360.0      0.0        0.0        0.0   
 2                     254.0   3361.0      0.0        0.0        0.0   
 3                     254.0   3362.0      7.0       30.0     6613.0   
 4                     254.0   3363.0      0.0        0.0        0.0   
 ...                     ...      ...      ...        ...        ...   
 2010                  254.0   5369.0     95.0     3021.0   874624.0   
 2011                  254.0   5370.0     86.0     2086.0   710403.0   
 2012                  254.0   5371.0    108.0     3050.0  1782206.0   
 2013                  254.0   5372.0    128.0     2641.0  1186062.0   
 2014                  254.0   5373.0     78.0     1604.0   516885.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0                0.0            0.00            0.00      

#### Test set

- Affected by `test_workers`.

In [24]:
# Load the test set twice: once with as_single_dataframe=True (one DataFrame)
# and once with as_single_dataframe=False (displayed in the next cell).
df = disjoint_dataset.get_test_df(as_single_dataframe=True, workers="config")
dfs = disjoint_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame result.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,378.0,5374.0,3336.0,13767.0,4600890.0,1074.0,7.36,6.11,594.0,4.07,2.11,2535.0,17.360001,21.110001,0.709961,0.700195,0.469971,0.419922,14.68,109.470001
1,378.0,5375.0,3470.0,39051.0,35904330.0,1118.0,7.55,6.68,632.0,4.27,2.25,2604.0,17.59,22.18,0.709961,0.709961,0.469971,0.429932,16.23,115.550003
2,378.0,5376.0,3424.0,16205.0,5675345.0,1063.0,6.95,6.05,628.0,4.1,2.34,2579.0,16.860001,20.940001,0.669922,0.649902,0.469971,0.429932,17.870001,111.830002
3,378.0,5377.0,3589.0,33523.0,13862560.0,1145.0,7.48,6.11,679.0,4.44,2.25,2605.0,17.030001,20.58,0.669922,0.660156,0.5,0.48999,17.690001,112.43
4,378.0,5378.0,6213.0,306028.0,209445600.0,1284.0,8.68,10.31,603.0,4.07,2.53,3936.0,26.59,52.279999,0.660156,0.660156,0.47998,0.429932,16.42,103.489998
5,378.0,5379.0,12511.0,842119.0,708537800.0,2042.0,12.3,13.92,723.0,4.36,2.25,7238.0,43.599998,76.489998,0.75,0.75,0.429932,0.340088,13.64,104.480003
6,378.0,5380.0,14408.0,1061292.0,854037200.0,2457.0,12.66,15.73,829.0,4.27,3.31,8427.0,43.439999,80.709999,0.740234,0.75,0.469971,0.389893,18.15,106.660004
7,378.0,5381.0,15381.0,1316856.0,1000647000.0,2429.0,12.85,16.309999,778.0,4.12,2.09,8639.0,45.709999,90.379997,0.740234,0.75,0.469971,0.370117,23.0,105.57
8,378.0,5382.0,14667.0,1313077.0,833578100.0,2286.0,12.03,15.08,782.0,4.12,2.35,8226.0,43.290001,87.540001,0.779785,0.790039,0.5,0.409912,22.200001,106.989998
9,378.0,5383.0,12447.0,1088938.0,738849500.0,2133.0,11.59,14.68,778.0,4.23,2.59,7037.0,38.240002,80.519997,0.779785,0.790039,0.47998,0.399902,21.93,105.709999


In [25]:
# Result of get_test_df(as_single_dataframe=False); its repr shows a list of DataFrames.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets      n_bytes  \
 0                     378.0   5374.0   3336.0    13767.0    4600890.0   
 1                     378.0   5375.0   3470.0    39051.0   35904334.0   
 2                     378.0   5376.0   3424.0    16205.0    5675345.0   
 3                     378.0   5377.0   3589.0    33523.0   13862555.0   
 4                     378.0   5378.0   6213.0   306028.0  209445569.0   
 ...                     ...      ...      ...        ...          ...   
 1338                  378.0   6712.0   2439.0     6560.0    1256891.0   
 1339                  378.0   6713.0   2270.0     6966.0    1370390.0   
 1340                  378.0   6714.0   2732.0     8244.0    2873758.0   
 1341                  378.0   6715.0   2563.0     6681.0    1635115.0   
 1342                  378.0   6716.0   2623.0     7478.0    2061599.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0             1074.0            7.

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` with sets specified time period.
- Data is returned as one Numpy array.
- The array shape follows rules similar to those of DataLoader batches (excluding sliding window parameters).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [26]:
# Fractions of the time series (train/val/test = 0.5/0.2/0.1) and of the time
# range (0.5/0.3/0.2) assigned to each set; time is kept as ID_TIME and every
# worker count is 0.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:02:36,606][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-08-19 12:02:36,626][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:02:36,630][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2774.19it/s]
[2025-08-19 12:02:36,734][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3394.08it/s]
[2025-08-19 12:02:36,770][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3423.40it/s]
[2025-08-19 12:02:36,788][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [183 103 279  42  49 ... 252 254 223 213 191], Length=274
        Val time series IDs: [294  33 388 517 240 ...  28 195 298 456  79], Length=109
        Test time series IDs: [241 131 225 398 473 ... 396 543  52 167 537], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [27]:
# Load the train set as one NumPy array. workers="config" presumably defers to
# `train_workers` from the config — TODO confirm against the library docs.
numpy_array = disjoint_dataset.get_train_numpy(workers="config")

# Output below is (274, 3359, 20): 274 train series x 3359 train time steps;
# the last axis presumably holds the 18 features plus id columns — verify.
display(numpy_array.shape)

(274, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [28]:
# Load the val set as one NumPy array. workers="config" presumably defers to
# `val_workers` from the config — TODO confirm against the library docs.
numpy_array = disjoint_dataset.get_val_numpy(workers="config")

# Output below is (109, 2015, 20): 109 val series x 2015 val time steps.
display(numpy_array.shape)

(109, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [29]:
# Load the test set as one NumPy array. workers="config" presumably defers to
# `test_workers` from the config — TODO confirm against the library docs.
numpy_array = disjoint_dataset.get_test_numpy(workers="config")

# Output below is (54, 1343, 20): 54 test series x 1343 test time steps.
display(numpy_array.shape)

(54, 1343, 20)

#### Using time_format=TimeFormat.DATETIME

In [30]:
# Same split fractions and worker counts as the ID_TIME example; only the time
# format is switched to DATETIME.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:02:37,161][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-08-19 12:02:37,184][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:02:37,188][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2820.07it/s]
[2025-08-19 12:02:37,289][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3349.05it/s]
[2025-08-19 12:02:37,327][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3851.65it/s]
[2025-08-19 12:02:37,343][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [212  86 114  49 433 ...  29  27 217 130 349], Length=274
        Val time series IDs: [298 539 243 315 199 ... 469 411  38 105 465], Length=109
        Test time series IDs: [325 348  69 460 124 ... 190 293   2 342   7], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [31]:
# With TimeFormat.DATETIME the result is unpacked into two values: the data
# array and a separate array of datetimes (one per time step in the output below).
numpy_array, times = disjoint_dataset.get_train_numpy(workers="config")

# The data array in the output below has 19 columns instead of 20 — presumably
# because time is returned separately in `times`; verify against the library docs.
display(numpy_array.shape)
display(times)

(274, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)