# Loading data with TimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Configure the root logger so INFO-level messages are shown with
# timestamp, logger name and level in front of the message.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Time-based view of CESNET-TimeSeries24: institution subnets, 1-hour aggregation.
# `display_details=True` prints the dataset summary shown below.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.TIME_BASED,
    display_details=True,
)

[2025-09-15 12:02:36,035][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch `DataLoader`.
- The last batch is never dropped (unless a sliding window is used).
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time steps of every time series are in one batch (this differs when a sliding window is used).
- When a sliding window is not used, a batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# 54 time series, split 50/30/20 into train/val/test along the time axis,
# all features, time expressed as integer time ids.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 12:02:36,041][time_config][INFO] - Quick validation succeeded.
[2025-09-15 12:02:36,064][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 12:02:36,070][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 535.38it/s]
[2025-09-15 12:02:36,178][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [480 406 251 319  38 ... 546 134 510 515 544], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: generic config update.
# Passing "config" presumably keeps the value already stored in the config - TODO confirm.
# NOTE(review): the recorded outputs of the following train/val cells still show
# batch sizes 32/64 rather than 33/65 - confirm whether those outputs are stale.
time_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

# Option 2: dedicated setter taking the same batch size arguments.
time_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-09-15 12:02:36,184][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 12:02:36,184][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 12:02:36,186][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 12:02:36,186][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 12:02:36,187][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: generic config update of the per-set worker counts.
time_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Option 2: dedicated setter taking the same worker arguments.
time_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-09-15 12:02:36,191][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 12:02:36,192][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 12:02:36,193][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 12:02:36,193][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 12:02:36,194][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Dataloader over the train set.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Consume the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
display(batches[0].shape)

[2025-09-15 12:02:36,202][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 194.99it/s]


(54, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Dataloader over the val set.
dataloader = time_based_dataset.get_val_dataloader(workers="config")

# Consume the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
display(batches[0].shape)

[2025-09-15 12:02:36,751][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 125.84it/s]


(54, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Dataloader over the test set.
dataloader = time_based_dataset.get_test_dataloader(workers="config")

# Consume the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
display(batches[0].shape)

[2025-09-15 12:02:37,018][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 112.37it/s]


(54, 128, 20)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Dataloader over the "all" set (the whole configured time range).
dataloader = time_based_dataset.get_all_dataloader(workers="config")

# Consume the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
display(batches[0].shape)

[2025-09-15 12:02:37,127][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 53/53 [00:00<00:00, 191.90it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but time is returned as datetimes,
# so batches become a (data, times) tuple.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 12:02:37,412][time_config][INFO] - Quick validation succeeded.
[2025-09-15 12:02:37,435][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 12:02:37,438][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 841.38it/s]
[2025-09-15 12:02:37,504][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [459 102  31  12 155 ... 529 196  94 174 415], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [12]:
# Dataloader over the train set; with TimeFormat.DATETIME each batch is a tuple.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Consume the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-15 12:02:37,511][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 221.65it/s]


(54, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a `ts_id` parameter.
    - When `ts_id` is None, all configured time series are returned, as in the previous examples.
    - When `ts_id` is not None, only the time series with the specified id is returned.

In [13]:
# Explicit list of time series ids instead of a count.
config = TimeBasedConfig(
    ts_ids=[177, 176, 319, 267],
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 12:02:37,997][time_config][INFO] - Quick validation succeeded.
[2025-09-15 12:02:38,068][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 12:02:38,072][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 4/4 [00:00<00:00, 666.61it/s]
[2025-09-15 12:02:38,081][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [177 176 319 267], Length=4
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Slidin

In [14]:
# Restrict the train dataloader to the single time series with id 177.
dataloader = time_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Consume the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# The first dimension is 1 because only one time series is loaded.
display(batches[0].shape)

[2025-09-15 12:02:38,090][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 1828.74it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set, `sliding_window_size` must also be set for the sliding window to be used.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify the sliding window step size with `sliding_window_step`.
- You can use `set_shared_size` to set how many time steps adjacent time periods should share (overlap).
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [15]:
# Sliding-window setup on explicit time ranges with a single feature.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Window of 22 time steps, 2 predicted steps, moved by 2 steps each time.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    # Given as a fraction here; the printed config shows the resolved size.
    set_shared_size=0.05,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 12:02:38,157][time_config][INFO] - Quick validation succeeded.
[2025-09-15 12:02:38,176][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 12:02:38,180][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1728.46it/s]
[2025-09-15 12:02:38,213][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [236  20  15 167 503 ... 473   9 338 469 179], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 335
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch size: 64

In [16]:
# Dataloader over the train set; each item is a (window, prediction) pair.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Collect every (sliding window, prediction window) pair as a tuple.
batches = [(window, prediction) for window, prediction in tqdm(dataloader)]

[2025-09-15 12:02:38,221][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4695.54it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [17]:
# Option 1: generic config update of the sliding window parameters.
time_based_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

# Option 2: dedicated setter taking the same sliding window arguments.
time_based_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-15 12:02:38,333][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 12:02:38,334][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-15 12:02:38,334][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 12:02:38,335][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 12:02:38,335][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 12:02:38,336][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [18]:
# Sliding-window setup with datetime time format and an absolute shared size.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Window of 22 time steps, 2 predicted steps, moved by 2 steps each time.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    # Given as an absolute number of time steps here.
    set_shared_size=100,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 12:02:38,341][time_config][INFO] - Quick validation succeeded.
[2025-09-15 12:02:38,360][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 12:02:38,365][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 2203.19it/s]
[2025-09-15 12:02:38,392][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [418 246 271 509 102 ... 419 249 242  68 411], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 100
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch size: 6

In [19]:
# Dataloader over the train set; with TimeFormat.DATETIME each item also
# carries the times of the window and of the prediction.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Collect every (window, prediction, window times, prediction times) tuple.
batches = [
    (window, prediction, window_times, prediction_times)
    for window, prediction, window_times, prediction_times in tqdm(dataloader)
]

[2025-09-15 12:02:38,400][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4710.93it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` over the specified time period of the given set.
- Data is returned as a pandas DataFrame.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [20]:
# Config for the DataFrame examples; batch sizes are not passed here.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 12:02:38,510][time_config][INFO] - Quick validation succeeded.
[2025-09-15 12:02:38,529][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 12:02:38,533][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 688.59it/s]
[2025-09-15 12:02:38,614][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [429 425 267 402 125 ... 524 302 372  83 477], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [21]:
# Train set as one combined DataFrame ...
df = time_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,429.0,0.0,175.0,232.0,8876.0,66.0,5.5,1.45,63.0,5.25,1.42,169.0,14.08,6.01,0.340088,0.350098,0.379883,0.360107,2.89,224.639999
1,429.0,1.0,142.0,186.0,7117.0,54.0,4.5,1.62,53.0,4.42,1.62,129.0,10.75,4.94,0.389893,0.409912,0.469971,0.459961,4.41,223.949997
2,429.0,2.0,150.0,189.0,7615.0,60.0,5.0,1.48,59.0,4.92,1.78,144.0,12.0,5.83,0.310059,0.310059,0.48999,0.459961,2.88,231.619995
3,429.0,3.0,91.0,119.0,5018.0,40.0,3.33,1.5,40.0,3.33,1.67,89.0,7.42,3.92,0.360107,0.370117,0.560059,0.569824,2.17,227.029999
4,429.0,4.0,86.0,115.0,4618.0,45.0,3.75,2.05,46.0,3.83,2.41,82.0,6.83,3.61,0.340088,0.340088,0.509766,0.48999,1.72,235.160004
5,429.0,5.0,62.0,88.0,3892.0,24.0,2.4,0.97,24.0,2.4,0.97,61.0,6.1,3.21,0.290039,0.280029,0.52002,0.48999,8.44,215.779999
6,429.0,6.0,86.0,111.0,5110.0,48.0,4.36,1.8,46.0,4.18,1.6,85.0,7.73,3.35,0.389893,0.370117,0.47998,0.439941,3.01,219.860001
7,429.0,7.0,64.0,85.0,4324.0,33.0,2.75,1.42,33.0,2.75,1.36,63.0,5.25,3.22,0.439941,0.409912,0.47998,0.439941,1.64,218.940002
8,429.0,8.0,86.0,123.0,5307.0,48.0,4.36,1.5,43.0,3.91,1.14,84.0,7.64,3.41,0.379883,0.379883,0.439941,0.439941,0.95,221.960007
9,429.0,9.0,87.0,127.0,4915.0,45.0,3.75,1.54,44.0,3.67,1.61,84.0,7.0,2.66,0.379883,0.379883,0.409912,0.409912,5.73,228.389999


In [22]:
# With as_single_dataframe=False the train data comes back as one DataFrame
# per time series, so the length matches the number of configured time series.
len(dfs) # every time series has its own dataframe

54

#### Val set

- Affected by `val_workers`.

In [23]:
# Val set as one combined DataFrame ...
df = time_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,429.0,3359.0,119.0,158.0,6284.0,36.0,3.27,1.62,36.0,3.27,1.62,108.0,9.82,5.64,0.340088,0.350098,0.529785,0.540039,5.4,246.550003
1,429.0,3360.0,125.0,174.0,7018.0,39.0,3.55,1.29,41.0,3.73,1.62,120.0,10.91,7.3,0.509766,0.459961,0.549805,0.509766,5.81,226.050003
2,429.0,3361.0,102.0,149.0,5652.0,35.0,3.5,1.58,34.0,3.4,1.35,96.0,9.6,8.1,0.48999,0.5,0.449951,0.439941,3.77,199.880005
3,429.0,3362.0,91.0,124.0,4844.0,49.0,4.08,1.31,48.0,4.0,1.28,87.0,7.25,5.5,0.600098,0.629883,0.399902,0.379883,2.29,200.899994
4,429.0,3363.0,127.0,202.0,8816.0,45.0,4.09,2.17,39.0,3.55,1.63,120.0,10.91,6.61,0.310059,0.320068,0.529785,0.529785,5.95,219.600006
5,429.0,3364.0,109.0,170.0,6727.0,39.0,3.55,1.44,34.0,3.09,1.58,106.0,9.64,4.23,0.150024,0.160034,0.47998,0.469971,8.88,231.229996
6,429.0,3365.0,67.0,102.0,3856.0,35.0,3.89,1.27,32.0,3.56,1.24,67.0,7.44,3.5,0.399902,0.429932,0.549805,0.540039,10.48,235.910004
7,429.0,3366.0,57.0,88.0,4606.0,31.0,2.82,1.4,33.0,3.0,1.55,56.0,5.09,2.7,0.419922,0.370117,0.540039,0.48999,3.4,225.080002
8,429.0,3367.0,58.0,82.0,3433.0,28.0,2.8,1.75,23.0,2.3,1.16,57.0,5.7,4.92,0.439941,0.439941,0.640137,0.649902,1.24,235.820007
9,429.0,3368.0,70.0,104.0,4802.0,37.0,3.36,1.36,28.0,2.55,0.93,70.0,6.36,4.3,0.48999,0.47998,0.48999,0.469971,3.76,158.539993


In [24]:
# With as_single_dataframe=False the val data comes back as one DataFrame
# per time series, so the length matches the number of configured time series.
len(dfs) # every time series has its own dataframe

54

#### Test set

- Affected by `test_workers`.

In [25]:
# Test set as one combined DataFrame ...
df = time_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,429.0,5374.0,191.0,283.0,11873.0,44.0,3.67,1.87,42.0,3.5,2.24,179.0,14.92,7.14,0.150024,0.160034,0.429932,0.399902,10.87,220.0
1,429.0,5375.0,114.0,154.0,6477.0,51.0,4.25,1.36,49.0,4.08,1.24,107.0,8.92,3.2,0.350098,0.360107,0.469971,0.459961,7.4,220.080002
2,429.0,5376.0,88.0,195.0,8396.0,37.0,3.08,1.38,37.0,3.08,1.51,80.0,6.67,2.71,0.330078,0.330078,0.419922,0.399902,10.45,212.490005
3,429.0,5377.0,69.0,126.0,6417.0,31.0,2.82,1.4,34.0,3.09,1.7,68.0,6.18,4.02,0.48999,0.469971,0.399902,0.360107,5.9,191.110001
4,429.0,5378.0,91.0,116.0,4913.0,31.0,3.1,1.29,33.0,3.3,1.16,83.0,8.3,3.33,0.290039,0.310059,0.399902,0.379883,4.33,228.220001
5,429.0,5379.0,79.0,120.0,7413.0,34.0,3.09,1.45,33.0,3.0,1.26,75.0,6.82,4.21,0.370117,0.340088,0.560059,0.549805,1.86,183.070007
6,429.0,5380.0,77.0,132.0,6750.0,33.0,3.0,1.9,29.0,2.64,1.63,75.0,6.82,3.92,0.300049,0.300049,0.5,0.5,5.0,214.850006
7,429.0,5381.0,95.0,144.0,6045.0,36.0,3.0,1.65,33.0,2.75,1.54,90.0,7.5,3.85,0.209961,0.219971,0.509766,0.509766,7.72,230.919998
8,429.0,5382.0,110.0,175.0,7719.0,38.0,3.8,1.03,39.0,3.9,1.37,104.0,10.4,4.9,0.27002,0.25,0.459961,0.459961,9.36,221.350006
9,429.0,5383.0,92.0,141.0,6328.0,36.0,3.27,1.19,35.0,3.18,1.72,90.0,8.18,3.6,0.360107,0.370117,0.429932,0.419922,7.76,209.660004


In [26]:
# With as_single_dataframe=False the test data comes back as one DataFrame
# per time series, so the length matches the number of configured time series.
len(dfs) # every time series has its own dataframe

54

#### All set

- Affected by `all_workers`.

In [27]:
# "All" set as one combined DataFrame ...
df = time_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,429.0,0.0,175.0,232.0,8876.0,66.0,5.5,1.45,63.0,5.25,1.42,169.0,14.08,6.01,0.340088,0.350098,0.379883,0.360107,2.89,224.639999
1,429.0,1.0,142.0,186.0,7117.0,54.0,4.5,1.62,53.0,4.42,1.62,129.0,10.75,4.94,0.389893,0.409912,0.469971,0.459961,4.41,223.949997
2,429.0,2.0,150.0,189.0,7615.0,60.0,5.0,1.48,59.0,4.92,1.78,144.0,12.0,5.83,0.310059,0.310059,0.48999,0.459961,2.88,231.619995
3,429.0,3.0,91.0,119.0,5018.0,40.0,3.33,1.5,40.0,3.33,1.67,89.0,7.42,3.92,0.360107,0.370117,0.560059,0.569824,2.17,227.029999
4,429.0,4.0,86.0,115.0,4618.0,45.0,3.75,2.05,46.0,3.83,2.41,82.0,6.83,3.61,0.340088,0.340088,0.509766,0.48999,1.72,235.160004
5,429.0,5.0,62.0,88.0,3892.0,24.0,2.4,0.97,24.0,2.4,0.97,61.0,6.1,3.21,0.290039,0.280029,0.52002,0.48999,8.44,215.779999
6,429.0,6.0,86.0,111.0,5110.0,48.0,4.36,1.8,46.0,4.18,1.6,85.0,7.73,3.35,0.389893,0.370117,0.47998,0.439941,3.01,219.860001
7,429.0,7.0,64.0,85.0,4324.0,33.0,2.75,1.42,33.0,2.75,1.36,63.0,5.25,3.22,0.439941,0.409912,0.47998,0.439941,1.64,218.940002
8,429.0,8.0,86.0,123.0,5307.0,48.0,4.36,1.5,43.0,3.91,1.14,84.0,7.64,3.41,0.379883,0.379883,0.439941,0.439941,0.95,221.960007
9,429.0,9.0,87.0,127.0,4915.0,45.0,3.75,1.54,44.0,3.67,1.61,84.0,7.0,2.66,0.379883,0.379883,0.409912,0.409912,5.73,228.389999


In [28]:
# With as_single_dataframe=False the "all" data comes back as one DataFrame
# per time series, so the length matches the number of configured time series.
len(dfs) # every time series has its own dataframe

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` over the specified time period of the given set.
- Data is returned as one NumPy array.
- Follows similar rules to DataLoader batches regarding shape (excluding sliding window parameters).
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [29]:
# Config for the NumPy examples; batch sizes are not passed here.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 12:02:39,140][time_config][INFO] - Quick validation succeeded.
[2025-09-15 12:02:39,159][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 12:02:39,162][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 756.45it/s]
[2025-09-15 12:02:39,235][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 13 363  21 380 207 ...  46 447 261 521 491], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [30]:
# Whole train set as a single array.
numpy_array = time_based_dataset.get_train_numpy(
    workers="config",
)

# Shape: (ts_ids, time steps in the train period, features_to_take + used ids).
train_shape = numpy_array.shape
display(train_shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [31]:
# Whole val set as a single array.
numpy_array = time_based_dataset.get_val_numpy(
    workers="config",
)

# Shape: (ts_ids, time steps in the val period, features_to_take + used ids).
val_shape = numpy_array.shape
display(val_shape)

(54, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [32]:
# Whole test set as a single array.
numpy_array = time_based_dataset.get_test_numpy(
    workers="config",
)

# Shape: (ts_ids, time steps in the test period, features_to_take + used ids).
test_shape = numpy_array.shape
display(test_shape)

(54, 1343, 20)

#### All set

- Affected by `all_workers`.

In [33]:
# Whole "all" set as a single array.
numpy_array = time_based_dataset.get_all_numpy(
    workers="config",
)

# Shape: (ts_ids, time steps in the whole period, features_to_take + used ids).
all_shape = numpy_array.shape
display(all_shape)

(54, 6717, 20)

#### Using time_format=TimeFormat.DATETIME

In [34]:
# Same as the previous NumPy config, but with datetime time format,
# so the getters return the times separately from the data.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config to the dataset and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 12:02:39,495][time_config][INFO] - Quick validation succeeded.
[2025-09-15 12:02:39,518][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 12:02:39,522][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1112.50it/s]
[2025-09-15 12:02:39,573][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [459 441 331  21 249 ... 291 519 178 320 237], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [35]:
# With TimeFormat.DATETIME the getter returns the data and the times separately.
numpy_array, times = time_based_dataset.get_train_numpy(
    workers="config",
)

# Data without the time column, followed by the datetime of every time step.
display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)