# Loading data with DisjointTimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import DisjointTimeBasedConfig # Disjoint dataset MUST use DisjointTimeBasedConfig

### Setting logger

In [2]:
# Surface INFO-level records so the dataset/config log messages appear in the cell outputs.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Create the dataset handle. DatasetType.DISJOINT_TIME_BASED must later be
# configured with a DisjointTimeBasedConfig.
disjoint_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.DISJOINT_TIME_BASED,
    display_details=True,
)

[2025-09-06 19:23:39,244][wrapper_dataset][INFO] - Dataset is disjoint_time_based. Use cesnet_tszoo.configs.DisjointTimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using the PyTorch DataLoader.
- Last batch is never dropped (unless sliding_window is used).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size affects how many time steps of every time series will be in one batch (differs when sliding window is used).
- Batch consists of (only when sliding window is not used):
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(train_ts/val_ts/test_ts, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# Float values for *_ts / *_time_period select a fraction of the available
# time series / time range for each set.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:23:39,249][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 19:23:39,269][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:23:39,274][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 1287.64it/s]
[2025-09-06 19:23:39,505][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 1273.90it/s]
[2025-09-06 19:23:39,602][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 1160.62it/s]
[2025-09-06 19:23:39,655][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [348 144 472  59 521 ... 543 148 295 287   0], Length=274
        Val time series IDs: [107 487 393 149 344 ... 420 481 488 368 204], Length=109
        Test time series IDs: [425  68 449  88 413 ... 100 454 355 195 496], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: generic config update.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm in the API docs.
disjoint_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)
# Or
# Option 2: dedicated setter for batch sizes.
disjoint_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
)

[2025-09-06 19:23:39,660][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:23:39,661][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:23:39,662][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:23:39,662][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:23:39,662][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: generic config update.
disjoint_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
# Or
# Option 2: dedicated setter for worker counts.
disjoint_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)

[2025-09-06 19:23:39,668][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:23:39,668][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:23:39,669][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:23:39,669][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:23:39,670][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Iterate the loader once (with a progress bar) and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch.
display(batches[0].shape)

[2025-09-06 19:23:39,678][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 45.92it/s]


(274, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
dataloader = disjoint_dataset.get_val_dataloader(workers="config")

# Iterate the loader once (with a progress bar) and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch.
display(batches[0].shape)

[2025-09-06 19:23:41,975][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 65.67it/s]


(109, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
dataloader = disjoint_dataset.get_test_dataloader(workers="config")

# Iterate the loader once (with a progress bar) and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch.
display(batches[0].shape)

[2025-09-06 19:23:42,481][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 98.81it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [10]:
# Same split as before, but with time_format=TimeFormat.DATETIME.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:23:42,600][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 19:23:42,625][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:23:42,628][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2420.99it/s]
[2025-09-06 19:23:42,745][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2954.23it/s]
[2025-09-06 19:23:42,786][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3482.41it/s]
[2025-09-06 19:23:42,804][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [330 150 213 176  10 ... 282 286 248 309 250], Length=274
        Val time series IDs: [260 137 445 205  14 ... 489 295  40  48  41], Length=109
        Test time series IDs: [113 535 338 341  51 ... 300 226 246 317 131], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [11]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME every batch is a (data, times) pair.
batches = [batch for batch in tqdm(dataloader)]

display(batches[0][0].shape)  # data without time
display(batches[0][1].shape)  # time

[2025-09-06 19:23:42,813][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:02<00:00, 47.97it/s]


(274, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, then it returns as previous examples.
    - When `ts_id` is not None, then it returns only one time series of that specified id.

In [12]:
# Train set is given as an explicit list of time series IDs; val and test sets are left out (None).
config = DisjointTimeBasedConfig(
    train_ts=[177, 176, 319, 267],
    val_ts=None,
    test_ts=None,
    train_time_period=0.5,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:23:45,010][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 19:23:45,020][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:23:45,023][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 4/4 [00:00<00:00, 1332.58it/s]
[2025-09-06 19:23:45,027][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [177 176 319 267], Length=4
        Val time series IDs: None
        Test time series IDs: None
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: None
        Test time periods: None
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
      

In [13]:
# Restrict the loader to a single time series from the train set.
dataloader = disjoint_dataset.get_train_dataloader(ts_id=177, workers="config")

batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch.
display(batches[0].shape)

[2025-09-06 19:23:45,036][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 2023.00it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(train_ts/val_ts/test_ts, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many time steps consecutive time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [14]:
# Time periods are given as explicit index ranges here; the sliding-window
# parameters switch the dataloaders to (window, prediction) batches.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:23:45,099][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 19:23:45,118][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:23:45,121][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 7714.83it/s]
[2025-09-06 19:23:45,162][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 6047.90it/s]
[2025-09-06 19:23:45,183][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 7198.46it/s]
[2025-09-06 19:23:45,194][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 22 222  55 101 433 ... 176  15 170 227 482], Length=274
        Val time series IDs: [ 47 490 251 416  23 ...  86 192 155 410 397], Length=109
        Test time series IDs: [487 363  93 120  88 ... 310 469  53 472 496], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
   

In [15]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# Each item yielded by the loader is a (sliding_window, sliding_window_prediction) pair.
batches = [
    (window, prediction)
    for window, prediction in tqdm(dataloader)
]

[2025-09-06 19:23:45,202][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1205.32it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [16]:
# Option 1: generic config update.
disjoint_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)
# Or
# Option 2: dedicated setter for sliding-window parameters.
disjoint_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-06 19:23:45,613][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:23:45,614][disjoint_time_based_config][INFO] - all_batch_size adjusted to 25 as it should be greater than or equal to sliding_window_size + sliding_window_prediction_size.
[2025-09-06 19:23:45,614][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-06 19:23:45,615][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:23:45,616][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:23:45,616][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:23:45,617][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [17]:
# Sliding-window setup with time_format=TimeFormat.DATETIME.
# NOTE(review): the printed config reports val/test periods starting at 900/1400
# rather than the 978/1478 passed here - presumably adjusted by set_shared_size=100; confirm.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:23:45,623][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 19:23:45,642][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:23:45,646][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 8089.02it/s]
[2025-09-06 19:23:45,685][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 7364.59it/s]
[2025-09-06 19:23:45,704][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 7191.38it/s]
[2025-09-06 19:23:45,714][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [233 534 187  91 475 ... 446 417  65  96 505], Length=274
        Val time series IDs: [283 340 302 361 422 ... 456 243 447 354 109], Length=109
        Test time series IDs: [236 126  49 345 234 ... 435 261 387 305 210], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
  

In [18]:
dataloader = disjoint_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME each item is a 4-tuple:
# (window, prediction, window times, prediction times).
batches = [
    (window, prediction, window_times, prediction_times)
    for window, prediction, window_times, prediction_times in tqdm(dataloader)
]

[2025-09-06 19:23:45,721][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 1255.87it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` with the set's specified time period.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [19]:
# No batch sizes or sliding-window parameters are passed for this config.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:23:46,117][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 19:23:46,137][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:23:46,140][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2522.04it/s]
[2025-09-06 19:23:46,254][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 3273.77it/s]
[2025-09-06 19:23:46,292][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3236.95it/s]
[2025-09-06 19:23:46,310][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [237 532 313  70 370 ...  32  17 303 254 194], Length=274
        Val time series IDs: [439  76 404 213 147 ... 190 415 311 283 545], Length=109
        Test time series IDs: [223 197 472 149 346 ... 319 238 105 186 214], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [20]:
# Load the train set twice: once as a single DataFrame, once as separate per-series DataFrames.
df = disjoint_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = disjoint_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,237.0,0.0,996.0,12705.0,5458358.0,369.0,16.77,21.610001,383.0,17.41,13.87,497.0,22.59,28.540001,0.950195,0.950195,0.549805,0.549805,9.95,109.760002
1,237.0,1.0,1136.0,20655.0,12402040.0,398.0,15.31,18.309999,379.0,14.58,10.83,576.0,22.15,27.469999,0.950195,0.959961,0.48999,0.560059,8.34,122.370003
2,237.0,2.0,2135.0,202218.0,211375800.0,455.0,18.959999,25.16,367.0,15.29,14.36,880.0,36.669998,61.110001,0.879883,0.879883,0.439941,0.469971,10.66,114.050003
3,237.0,3.0,13188.0,712840.0,488859300.0,766.0,33.299999,50.740002,391.0,17.0,18.02,3772.0,164.0,307.179993,0.850098,0.839844,0.560059,0.580078,10.0,107.559998
4,237.0,4.0,15451.0,1596526.0,1433519000.0,1038.0,49.43,73.57,673.0,32.049999,41.139999,4572.0,217.710007,352.809998,0.910156,0.890137,0.469971,0.52002,12.73,106.230003
5,237.0,5.0,15547.0,712091.0,564377300.0,978.0,44.450001,69.68,516.0,23.450001,32.689999,4720.0,214.550003,362.690002,0.859863,0.839844,0.48999,0.529785,13.81,98.43
6,237.0,6.0,15046.0,1244992.0,1011957000.0,886.0,40.27,60.849998,436.0,19.82,26.889999,4537.0,206.229996,346.0,0.879883,0.870117,0.560059,0.609863,9.93,107.720001
7,237.0,7.0,15774.0,965740.0,802609300.0,872.0,36.330002,58.080002,451.0,18.790001,23.379999,4616.0,192.330002,342.320007,0.939941,0.939941,0.469971,0.609863,13.23,98.769997
8,237.0,8.0,14642.0,932450.0,755016700.0,836.0,33.439999,53.049999,413.0,16.52,17.33,4401.0,176.039993,314.829987,0.839844,0.819824,0.529785,0.589844,12.52,102.110001
9,237.0,9.0,13597.0,912194.0,797955500.0,732.0,33.27,47.240002,319.0,14.5,13.9,3987.0,181.229996,295.799988,0.899902,0.870117,0.47998,0.589844,11.64,98.870003


In [21]:
len(dfs) # every time series has its own dataframe, so this equals the number of train time series

274

#### Val set

- Affected by `val_workers`.

In [22]:
# Load the val set twice: once as a single DataFrame, once as separate per-series DataFrames.
df = disjoint_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = disjoint_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,439.0,3359.0,1166.0,15447.0,6354528.0,469.0,7.69,4.94,458.0,7.51,7.41,809.0,13.26,9.88,0.620117,0.589844,0.560059,0.569824,35.189999,102.019997
1,439.0,3360.0,1210.0,12404.0,3551471.0,472.0,7.74,6.01,447.0,7.33,8.09,845.0,13.85,11.34,0.680176,0.649902,0.52002,0.540039,37.720001,104.510002
2,439.0,3361.0,1155.0,11399.0,2715158.0,464.0,7.86,5.59,448.0,7.59,6.96,791.0,13.41,10.17,0.709961,0.680176,0.540039,0.549805,32.549999,104.309998
3,439.0,3362.0,1163.0,13210.0,7286743.0,444.0,7.16,5.34,402.0,6.48,5.69,806.0,13.0,9.26,0.620117,0.620117,0.549805,0.540039,23.98,98.599998
4,439.0,3363.0,1489.0,21122.0,12318248.0,490.0,7.66,6.42,390.0,6.09,4.6,932.0,14.56,18.290001,0.640137,0.609863,0.529785,0.540039,30.549999,102.160004
5,439.0,3364.0,1581.0,15307.0,9283869.0,491.0,7.79,6.63,417.0,6.62,5.23,1007.0,15.98,20.26,0.660156,0.649902,0.560059,0.569824,20.290001,101.739998
6,439.0,3365.0,2069.0,78195.0,75633260.0,515.0,8.31,6.36,435.0,7.02,5.9,1232.0,19.870001,25.139999,0.629883,0.640137,0.540039,0.540039,26.32,98.620003
7,439.0,3366.0,3433.0,129660.0,148771616.0,593.0,9.41,9.26,421.0,6.68,5.93,1864.0,29.59,43.080002,0.629883,0.640137,0.549805,0.540039,23.4,91.910004
8,439.0,3367.0,5274.0,146250.0,147069028.0,642.0,10.03,9.96,450.0,7.03,6.06,2427.0,37.919998,53.5,0.640137,0.640137,0.5,0.48999,20.73,97.18
9,439.0,3368.0,7132.0,122859.0,91646557.0,760.0,12.46,12.7,347.0,5.69,4.3,2859.0,46.869999,65.839996,0.629883,0.620117,0.48999,0.48999,23.59,95.379997


In [23]:
len(dfs) # every time series has its own dataframe, so this equals the number of val time series

109

#### Test set

- Affected by `test_workers`.

In [24]:
# Load the test set twice: once as a single DataFrame, once as separate per-series DataFrames.
df = disjoint_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = disjoint_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,223.0,5374.0,90.0,130.0,4952.0,23.0,3.83,0.75,24.0,4.0,1.26,82.0,13.67,6.35,0.25,0.27002,0.469971,0.459961,4.95,229.059998
1,223.0,5375.0,14.0,17.0,1152.0,12.0,3.0,0.82,14.0,3.5,1.0,14.0,3.5,1.0,0.959961,0.850098,0.459961,0.360107,0.0,173.220001
2,223.0,5376.0,18.0,22.0,937.0,13.0,3.25,1.26,17.0,4.25,2.22,18.0,4.5,2.38,0.859863,0.879883,0.429932,0.399902,2.51,187.559998
3,223.0,5377.0,15.0,22.0,1227.0,14.0,3.5,0.58,13.0,3.25,0.5,15.0,3.75,0.5,0.569824,0.580078,0.399902,0.360107,3.95,196.330002
4,223.0,5378.0,31.0,33.0,1464.0,17.0,2.83,2.64,18.0,3.0,2.53,30.0,5.0,2.9,0.449951,0.429932,0.47998,0.429932,2.86,228.369995
5,223.0,5379.0,24.0,33.0,1292.0,12.0,4.0,1.73,13.0,4.33,2.08,24.0,8.0,3.0,0.379883,0.409912,0.419922,0.409912,5.95,233.059998
6,223.0,5380.0,19.0,21.0,960.0,11.0,2.75,1.5,10.0,2.5,1.0,18.0,4.5,2.38,0.330078,0.320068,0.439941,0.399902,0.2,216.940002
7,223.0,5381.0,15.0,24.0,1028.0,14.0,2.8,1.1,14.0,2.8,0.84,15.0,3.0,1.0,0.77002,0.740234,0.379883,0.379883,0.05,235.979996
8,223.0,5382.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
9,223.0,5383.0,10.0,12.0,558.0,9.0,2.25,0.5,9.0,2.25,0.5,10.0,2.5,1.0,0.340088,0.310059,0.429932,0.360107,3.76,188.059998


In [25]:
len(dfs) # every time series has its own dataframe, so this equals the number of test time series

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `train_ts/val_ts/test_ts` with the set's specified time period.
- Data is returned as one Numpy array.
- Follows similar rules to Dataloader batches regarding shape (excluding sliding window parameters).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [26]:
# No batch sizes or sliding-window parameters are passed for this config.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:23:47,126][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 19:23:47,147][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:23:47,150][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2418.83it/s]
[2025-09-06 19:23:47,269][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2986.97it/s]
[2025-09-06 19:23:47,310][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3481.18it/s]
[2025-09-06 19:23:47,326][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [135 285 376 171 252 ...  76 418 286 426 425], Length=274
        Val time series IDs: [276 185 139 205 498 ...  79  23 190 282 247], Length=109
        Test time series IDs: [368 266 150 175 102 ... 117 479 232   1 262], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

#### Train set

- Affected by `train_workers`.

In [27]:
# Whole train set as one array; workers="config" uses the worker count from the config.
numpy_array = disjoint_dataset.get_train_numpy(workers="config")

# Shape is (train time series, time steps in the train period, features + ids).
display(numpy_array.shape)

(274, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [28]:
# Whole val set as one array; workers="config" uses the worker count from the config.
numpy_array = disjoint_dataset.get_val_numpy(workers="config")

# Shape is (val time series, time steps in the val period, features + ids).
display(numpy_array.shape)

(109, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [29]:
# Whole test set as one array; workers="config" uses the worker count from the config.
numpy_array = disjoint_dataset.get_test_numpy(workers="config")

# Shape is (test time series, time steps in the test period, features + ids).
display(numpy_array.shape)

(54, 1343, 20)

#### Using time_format=TimeFormat.DATETIME

In [30]:
# Same split as the previous numpy example, but with time_format=TimeFormat.DATETIME.
config = DisjointTimeBasedConfig(
    train_ts=0.5,
    val_ts=0.2,
    test_ts=0.1,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    init_workers=0,
)
disjoint_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:23:47,692][disjoint_time_based_config][INFO] - Quick validation succeeded.
[2025-09-06 19:23:47,716][disjoint_time_based_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:23:47,721][cesnet_dataset][INFO] - Updating config for train set.
100%|██████████| 274/274 [00:00<00:00, 2624.32it/s]
[2025-09-06 19:23:47,830][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 109/109 [00:00<00:00, 2850.49it/s]
[2025-09-06 19:23:47,872][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 54/54 [00:00<00:00, 3173.36it/s]
[2025-09-06 19:23:47,892][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDs: [ 61 413  31 194 234 ...  25 241 227 249 153], Length=274
        Val time series IDs: [297 364 257 237 443 ... 138  38 401 183 323], Length=109
        Test time series IDs: [ 30 313 410 521 225 ... 490 390 195 358 218], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. 

In [31]:
# With TimeFormat.DATETIME the call returns a (data, times) pair instead of a single array.
numpy_array, times = disjoint_dataset.get_train_numpy(workers="config")

# The data array has one column fewer than with ID_TIME; time is returned separately.
display(numpy_array.shape)
# Array of timezone-aware datetime objects, one per time step of the train period.
display(times)

(274, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)