# Loading data with TimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Show INFO-level records (and above), each tagged with timestamp, logger name and level.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

### Preparing dataset

In [3]:
# NOTE: "/some_directory/" looks like a placeholder; point data_root at your own dataset directory.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.TIME_BASED,
    display_details=True,
)

[2025-09-14 14:46:45,804][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch `DataLoader`.
- The last batch is never dropped (unless a sliding window is used).
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time steps of every time series are in one batch (this differs when a sliding window is used).
- When a sliding window is not used, a batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# 54 time series; the time range is split 50/30/20 into train/val/test.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:46:45,808][time_config][INFO] - Quick validation succeeded.
[2025-09-14 14:46:45,829][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:46:45,833][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 477.97it/s]
[2025-09-14 14:46:45,952][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [504 337  52 325 428 ...  35 425 403 164 211], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the general-purpose config update.
# NOTE(review): "config" presumably keeps the value already stored in the config; confirm.
time_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Option 2: the dedicated batch-size setter, called with the same values.
time_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-09-14 14:46:45,958][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:46:45,958][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:46:45,960][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:46:45,960][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:46:45,961][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: the general-purpose config update.
time_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Option 2: the dedicated workers setter, called with the same values.
time_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-09-14 14:46:45,967][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:46:45,968][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:46:45,969][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:46:45,970][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:46:45,970][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# NOTE(review): workers="config" presumably falls back to the configured `train_workers`; confirm.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Walk the whole train set once, keeping every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch.shape)

[2025-09-14 14:46:45,980][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 199.82it/s]


(54, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# NOTE(review): workers="config" presumably falls back to the configured `val_workers`; confirm.
dataloader = time_based_dataset.get_val_dataloader(workers="config")

# Walk the whole val set once, keeping every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch.shape)

[2025-09-14 14:46:46,518][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 128.25it/s]


(54, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# NOTE(review): workers="config" presumably falls back to the configured `test_workers`; confirm.
dataloader = time_based_dataset.get_test_dataloader(workers="config")

# Walk the whole test set once, keeping every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch.shape)

[2025-09-14 14:46:46,780][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 100.19it/s]


(54, 128, 20)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# NOTE(review): workers="config" presumably falls back to the configured `all_workers`; confirm.
dataloader = time_based_dataset.get_all_dataloader(workers="config")

# Walk the whole "all" set once, keeping every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch.shape)

[2025-09-14 14:46:46,902][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 53/53 [00:00<00:00, 160.46it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but time is requested as datetimes.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:46:47,240][time_config][INFO] - Quick validation succeeded.
[2025-09-14 14:46:47,317][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:46:47,321][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 708.27it/s]
[2025-09-14 14:46:47,398][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [146 182 143  47 473 ... 506 420 168 294 195], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [12]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Walk the whole train set once, keeping every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch holds the data first and the times second.
first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-14 14:46:47,409][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 198.66it/s]


(54, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader returns all configured time series, as in the previous examples.
    - When `ts_id` is not None, it returns only the time series with the specified id.

In [13]:
# Select four specific time series by id instead of requesting a count.
config = TimeBasedConfig(
    ts_ids=[177, 176, 319, 267],
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:46:47,949][time_config][INFO] - Quick validation succeeded.
[2025-09-14 14:46:47,968][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:46:47,972][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 4/4 [00:00<00:00, 470.08it/s]
[2025-09-14 14:46:47,981][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [177 176 319 267], Length=4
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Slidin

In [14]:
# Restrict the dataloader to the single time series with id 177.
dataloader = time_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Walk the whole train set once, keeping every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch.shape)

[2025-09-14 14:46:47,990][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 1935.95it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many time steps consecutive time periods should share.
    - `val_time_period` takes the shared time steps from `train_time_period`.
    - `test_time_period` takes the shared time steps from `val_time_period` or `train_time_period`.

In [15]:
# Explicit time ranges, one feature, and a sliding window of 22 steps predicting 2 steps.
# In the recorded output, set_shared_size=0.05 is resolved to 335 shared time steps.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:46:48,053][time_config][INFO] - Quick validation succeeded.
[2025-09-14 14:46:48,072][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:46:48,077][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1716.35it/s]
[2025-09-14 14:46:48,109][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [270 135 281 107  20 ... 172 347 104 150 179], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 335
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch siz

In [16]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Each yielded item unpacks into (window, prediction); keep every pair as a tuple.
batches = [
    (sliding_window, sliding_window_prediction)
    for sliding_window, sliding_window_prediction in tqdm(dataloader)
]

[2025-09-14 14:46:48,118][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4568.45it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [17]:
# Option 1: the general-purpose config update.
# NOTE(review): "config" presumably keeps the value already stored in the config; confirm.
time_based_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)
# Option 2: the dedicated sliding-window setter, called with the same values.
time_based_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-14 14:46:48,232][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:46:48,232][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-14 14:46:48,233][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:46:48,234][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:46:48,235][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:46:48,235][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [18]:
# Sliding-window setup with datetimes; set_shared_size is given here as an absolute count (100).
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:46:48,240][time_config][INFO] - Quick validation succeeded.
[2025-09-14 14:46:48,258][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:46:48,262][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1801.72it/s]
[2025-09-14 14:46:48,293][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [395  34   9  72 176 ... 127 530 387  45 432], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 100
    Fillers
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch si

In [19]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# With DATETIME each item unpacks into window, prediction and their two time arrays.
batches = [
    (sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times)
    for sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times in tqdm(dataloader)
]

[2025-09-14 14:46:48,302][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4785.86it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` over the time period of the requested set.
- Data is returned as a Pandas DataFrame.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [20]:
# No batch sizes are passed: they have no effect when loading dataframes.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:46:48,412][time_config][INFO] - Quick validation succeeded.
[2025-09-14 14:46:48,433][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:46:48,437][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 587.71it/s]
[2025-09-14 14:46:48,531][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [301 179 412 283 299 ... 155  62 280 477 448], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [21]:
# as_single_dataframe=True: all selected time series of the train set in one dataframe.
df = time_based_dataset.get_train_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False: a collection with one dataframe per time series.
dfs = time_based_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first rows of the combined dataframe.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,301.0,0.0,849.0,22557.0,20259498.0,465.0,6.84,2.34,403.0,5.93,2.2,638.0,9.38,4.15,0.870117,0.870117,0.439941,0.360107,31.5,143.759995
1,301.0,1.0,933.0,36392.0,14933057.0,433.0,6.37,2.85,367.0,5.4,2.6,636.0,9.35,5.1,0.850098,0.859863,0.509766,0.419922,40.240002,137.119995
2,301.0,2.0,819.0,24788.0,8279240.0,449.0,6.41,2.53,356.0,5.09,2.15,633.0,9.04,4.6,0.830078,0.850098,0.48999,0.370117,37.02,137.820007
3,301.0,3.0,853.0,39664.0,28337708.0,430.0,6.42,3.02,290.0,4.33,1.71,652.0,9.73,5.82,0.899902,0.910156,0.469971,0.360107,31.190001,127.949997
4,301.0,4.0,815.0,34624.0,24736442.0,444.0,6.25,3.46,294.0,4.14,1.66,654.0,9.21,6.21,0.879883,0.870117,0.47998,0.379883,36.689999,133.399994
5,301.0,5.0,716.0,115814.0,109071744.0,417.0,5.71,3.15,273.0,3.74,1.64,606.0,8.3,5.81,0.850098,0.850098,0.459961,0.360107,33.200001,126.660004
6,301.0,6.0,731.0,176901.0,135041418.0,392.0,5.85,2.34,277.0,4.13,1.84,625.0,9.33,5.95,0.870117,0.859863,0.529785,0.449951,35.470001,121.019997
7,301.0,7.0,842.0,104596.0,61918794.0,481.0,6.09,3.51,310.0,3.92,1.99,695.0,8.8,6.31,0.799805,0.810059,0.509766,0.419922,46.34,129.220001
8,301.0,8.0,670.0,126978.0,37321551.0,398.0,5.04,2.46,282.0,3.57,1.7,568.0,7.19,4.71,0.790039,0.799805,0.509766,0.419922,48.52,129.899994
9,301.0,9.0,1232.0,142679.0,131926173.0,671.0,5.94,3.74,438.0,3.88,1.8,1040.0,9.2,9.26,0.799805,0.799805,0.459961,0.360107,39.939999,128.910004


In [22]:
len(dfs) # one dataframe per time series, so this equals the number of selected time series

54

#### Val set

- Affected by `val_workers`.

In [23]:
# as_single_dataframe=True: all selected time series of the val set in one dataframe.
df = time_based_dataset.get_val_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False: a collection with one dataframe per time series.
dfs = time_based_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first rows of the combined dataframe.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,301.0,3359.0,783.0,142044.0,143581065.0,448.0,4.87,2.4,318.0,3.46,1.49,597.0,6.49,4.37,0.819824,0.799805,0.52002,0.459961,40.619999,118.839996
1,301.0,3360.0,767.0,733561.0,978351200.0,420.0,5.19,3.46,288.0,3.56,1.76,542.0,6.69,5.95,0.850098,0.830078,0.5,0.429932,36.68,115.849998
2,301.0,3361.0,533.0,89149.0,47238594.0,314.0,4.55,2.15,209.0,3.03,1.26,410.0,5.94,3.35,0.899902,0.879883,0.449951,0.399902,31.370001,118.199997
3,301.0,3362.0,539.0,52952.0,5440717.0,312.0,4.73,2.4,213.0,3.23,1.37,402.0,6.09,3.53,0.859863,0.830078,0.509766,0.439941,39.290001,114.440002
4,301.0,3363.0,523.0,35582.0,39424596.0,306.0,4.64,2.38,212.0,3.21,1.78,393.0,5.95,3.48,0.879883,0.870117,0.469971,0.389893,34.580002,118.279999
5,301.0,3364.0,468.0,27147.0,21625123.0,241.0,4.38,2.58,168.0,3.05,1.43,371.0,6.75,5.6,0.879883,0.870117,0.48999,0.409912,31.190001,119.360001
6,301.0,3365.0,515.0,182039.0,239548274.0,276.0,4.93,3.6,155.0,2.77,1.67,415.0,7.41,6.96,0.839844,0.839844,0.399902,0.25,34.119999,101.290001
7,301.0,3366.0,361.0,39353.0,25673024.0,227.0,4.45,2.07,137.0,2.69,1.17,300.0,5.88,3.55,0.850098,0.819824,0.52002,0.399902,44.23,117.730003
8,301.0,3367.0,354.0,26854.0,5762759.0,208.0,3.71,1.89,143.0,2.55,1.17,289.0,5.16,3.86,0.839844,0.819824,0.47998,0.399902,38.169998,116.650002
9,301.0,3368.0,266.0,198001.0,248867257.0,167.0,3.41,1.85,128.0,2.61,1.35,208.0,4.24,2.97,0.850098,0.830078,0.48999,0.449951,40.330002,130.880005


In [24]:
len(dfs) # one dataframe per time series, so this equals the number of selected time series

54

#### Test set

- Affected by `test_workers`.

In [25]:
# as_single_dataframe=True: all selected time series of the test set in one dataframe.
df = time_based_dataset.get_test_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False: a collection with one dataframe per time series.
dfs = time_based_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first rows of the combined dataframe.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,301.0,5374.0,612.0,30880.0,17037630.0,351.0,4.62,1.83,268.0,3.53,1.7,494.0,6.5,3.21,0.830078,0.819824,0.469971,0.379883,29.790001,110.360001
1,301.0,5375.0,558.0,115118.0,144855547.0,295.0,4.28,2.28,216.0,3.13,1.47,443.0,6.42,4.83,0.879883,0.870117,0.47998,0.409912,38.939999,106.540001
2,301.0,5376.0,519.0,11368.0,6676822.0,315.0,4.14,1.76,257.0,3.38,1.68,442.0,5.82,3.14,0.910156,0.890137,0.509766,0.419922,41.740002,112.709999
3,301.0,5377.0,586.0,12762.0,7058542.0,337.0,4.55,2.88,259.0,3.5,1.53,483.0,6.53,6.07,0.879883,0.859863,0.509766,0.449951,40.09,119.459999
4,301.0,5378.0,809.0,92993.0,113377515.0,419.0,5.51,3.38,257.0,3.38,1.67,640.0,8.42,6.19,0.850098,0.819824,0.459961,0.399902,49.509998,107.730003
5,301.0,5379.0,993.0,49224.0,44117954.0,473.0,5.26,2.98,297.0,3.3,1.5,749.0,8.32,5.59,0.830078,0.799805,0.449951,0.379883,38.779999,104.959999
6,301.0,5380.0,776.0,59940.0,28333553.0,424.0,4.46,2.7,270.0,2.84,1.4,651.0,6.85,5.75,0.810059,0.790039,0.47998,0.419922,36.459999,114.129997
7,301.0,5381.0,591.0,37712.0,32908539.0,322.0,3.88,2.76,211.0,2.54,1.26,485.0,5.84,6.01,0.850098,0.830078,0.509766,0.429932,62.209999,110.410004
8,301.0,5382.0,768.0,89801.0,76691469.0,435.0,4.31,2.53,288.0,2.85,1.38,615.0,6.09,4.11,0.890137,0.870117,0.5,0.429932,54.110001,114.800003
9,301.0,5383.0,1451.0,618003.0,661180565.0,700.0,5.34,3.13,474.0,3.62,2.47,1134.0,8.66,6.98,0.830078,0.810059,0.47998,0.409912,47.040001,111.349998


In [26]:
len(dfs) # one dataframe per time series, so this equals the number of selected time series

54

#### All set

- Affected by `all_workers`.

In [27]:
# as_single_dataframe=True: all selected time series of the "all" set in one dataframe.
df = time_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False: a collection with one dataframe per time series.
dfs = time_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview the first rows of the combined dataframe.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,301.0,0.0,849.0,22557.0,20259498.0,465.0,6.84,2.34,403.0,5.93,2.2,638.0,9.38,4.15,0.870117,0.870117,0.439941,0.360107,31.5,143.759995
1,301.0,1.0,933.0,36392.0,14933057.0,433.0,6.37,2.85,367.0,5.4,2.6,636.0,9.35,5.1,0.850098,0.859863,0.509766,0.419922,40.240002,137.119995
2,301.0,2.0,819.0,24788.0,8279240.0,449.0,6.41,2.53,356.0,5.09,2.15,633.0,9.04,4.6,0.830078,0.850098,0.48999,0.370117,37.02,137.820007
3,301.0,3.0,853.0,39664.0,28337708.0,430.0,6.42,3.02,290.0,4.33,1.71,652.0,9.73,5.82,0.899902,0.910156,0.469971,0.360107,31.190001,127.949997
4,301.0,4.0,815.0,34624.0,24736442.0,444.0,6.25,3.46,294.0,4.14,1.66,654.0,9.21,6.21,0.879883,0.870117,0.47998,0.379883,36.689999,133.399994
5,301.0,5.0,716.0,115814.0,109071744.0,417.0,5.71,3.15,273.0,3.74,1.64,606.0,8.3,5.81,0.850098,0.850098,0.459961,0.360107,33.200001,126.660004
6,301.0,6.0,731.0,176901.0,135041418.0,392.0,5.85,2.34,277.0,4.13,1.84,625.0,9.33,5.95,0.870117,0.859863,0.529785,0.449951,35.470001,121.019997
7,301.0,7.0,842.0,104596.0,61918794.0,481.0,6.09,3.51,310.0,3.92,1.99,695.0,8.8,6.31,0.799805,0.810059,0.509766,0.419922,46.34,129.220001
8,301.0,8.0,670.0,126978.0,37321551.0,398.0,5.04,2.46,282.0,3.57,1.7,568.0,7.19,4.71,0.790039,0.799805,0.509766,0.419922,48.52,129.899994
9,301.0,9.0,1232.0,142679.0,131926173.0,671.0,5.94,3.74,438.0,3.88,1.8,1040.0,9.2,9.26,0.799805,0.799805,0.459961,0.360107,39.939999,128.910004


In [28]:
len(dfs) # one dataframe per time series, so this equals the number of selected time series

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` over the time period of the requested set.
- Data is returned as one NumPy array.
- Follows rules similar to those for DataLoader batches regarding shape (excluding sliding window parameters).
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [29]:
# No batch sizes are passed: they have no effect when loading a single array.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:46:49,156][time_config][INFO] - Quick validation succeeded.
[2025-09-14 14:46:49,175][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:46:49,180][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 638.79it/s]
[2025-09-14 14:46:49,266][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [252  92 294 320 438 ... 148 182  51 304 513], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [30]:
# Load the whole train set as one array.
# NOTE(review): workers="config" presumably uses the configured `train_workers`; confirm.
numpy_array = time_based_dataset.get_train_numpy(workers="config")

display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [31]:
# Load the whole val set as one array.
# NOTE(review): workers="config" presumably uses the configured `val_workers`; confirm.
numpy_array = time_based_dataset.get_val_numpy(workers="config")

display(numpy_array.shape)

(54, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [32]:
# Load the whole test set as one array.
# NOTE(review): workers="config" presumably uses the configured `test_workers`; confirm.
numpy_array = time_based_dataset.get_test_numpy(workers="config")

display(numpy_array.shape)

(54, 1343, 20)

#### All set

- Affected by `all_workers`.

In [33]:
# Load the whole "all" set as one array.
# NOTE(review): workers="config" presumably uses the configured `all_workers`; confirm.
numpy_array = time_based_dataset.get_all_numpy(workers="config")

display(numpy_array.shape)

(54, 6717, 20)

#### Using time_format=TimeFormat.DATETIME

In [34]:
# Same setup as the previous numpy example, but time is requested as datetimes.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:46:49,558][time_config][INFO] - Quick validation succeeded.
[2025-09-14 14:46:49,581][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:46:49,586][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1005.02it/s]
[2025-09-14 14:46:49,643][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [274 363 280 296 483 ...   4 134 295 378 529], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [35]:
# With TimeFormat.DATETIME the call returns the data array plus a separate array of datetimes.
numpy_array, times = time_based_dataset.get_train_numpy(workers="config")

display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)