# Loading data with TimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Show INFO-level (and above) records, each prefixed with timestamp, logger name and level.
logging.basicConfig(format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s", level=logging.INFO)

### Preparing dataset

In [3]:
# Open the hourly-aggregated institution-subnet data as a time-based dataset.
# "/some_directory/" is a placeholder; point `data_root` at your own data directory.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.TIME_BASED,
    display_details=True,  # prints the "Dataset details" summary
)

[2025-09-06 19:28:33,349][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch `DataLoader`.
- Last batch is never dropped (unless sliding_window is used).
- Workers determine how many processes are used for loading the data of a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time steps of every time series are in one batch (this differs when sliding window is used).
- A batch consists of (only when sliding window is not used):
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# Fractional time periods split the whole time range into train/val/test (0.5 / 0.3 / 0.2).
# An integer `ts_ids` selects that many time series (54 here; see the printed IDs).
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:28:33,355][time_config][INFO] - Quick validation succeeded.
[2025-09-06 19:28:33,375][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:28:33,380][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 515.83it/s]
[2025-09-06 19:28:33,492][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [393 154 293 301  94 ... 228 217 256 439 157], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Two equivalent ways to change batch sizes after initialization.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm.
time_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Or
time_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-09-06 19:28:33,497][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:28:33,497][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:28:33,498][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:28:33,498][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:28:33,498][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Two equivalent ways to change worker counts after initialization.
time_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Or
time_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-09-06 19:28:33,503][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:28:33,504][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:28:33,504][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:28:33,505][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:28:33,505][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Train dataloader; the worker count is passed as "config".
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, keeping every batch.
batches = list(tqdm(dataloader))

# First batch: (time series, batch size, features + id columns).
display(batches[0].shape)

[2025-09-06 19:28:33,514][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 216.78it/s]


(54, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Validation dataloader; the worker count is passed as "config".
dataloader = time_based_dataset.get_val_dataloader(workers="config")

# Drain the dataloader once, keeping every batch.
batches = list(tqdm(dataloader))

# First batch: (time series, batch size, features + id columns).
display(batches[0].shape)

[2025-09-06 19:28:34,009][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 139.20it/s]


(54, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Test dataloader; the worker count is passed as "config".
dataloader = time_based_dataset.get_test_dataloader(workers="config")

# Drain the dataloader once, keeping every batch.
batches = list(tqdm(dataloader))

# First batch: (time series, batch size, features + id columns).
display(batches[0].shape)

[2025-09-06 19:28:34,252][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 109.58it/s]


(54, 128, 20)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Dataloader over the whole ("all") time period; the worker count is passed as "config".
dataloader = time_based_dataset.get_all_dataloader(workers="config")

# Drain the dataloader once, keeping every batch.
batches = list(tqdm(dataloader))

# First batch: (time series, batch size, features + id columns).
display(batches[0].shape)

[2025-09-06 19:28:34,363][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 53/53 [00:00<00:00, 182.17it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as before, but time is requested as datetimes instead of integer time ids.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:28:34,662][time_config][INFO] - Quick validation succeeded.
[2025-09-06 19:28:34,686][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:28:34,690][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 855.43it/s]
[2025-09-06 19:28:34,755][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [329 202 157  61 143 ... 334 413 298 337  47], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [12]:
# Train dataloader under TimeFormat.DATETIME: every batch is a (data, time) pair.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, keeping every batch.
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch[0].shape) # data without time
display(first_batch[1].shape) # time

[2025-09-06 19:28:34,764][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 233.21it/s]


(54, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader returns all configured time series, as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the one time series with the specified id.

In [13]:
# A list of `ts_ids` selects exactly these time series (the printed config lists the same four).
config = TimeBasedConfig(
    ts_ids=[177, 176, 319, 267],
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:28:35,225][time_config][INFO] - Quick validation succeeded.
[2025-09-06 19:28:35,244][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:28:35,247][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 4/4 [00:00<00:00, 666.71it/s]
[2025-09-06 19:28:35,256][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [177 176 319 267], Length=4
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Slidin

In [14]:
# Restrict the train dataloader to the single time series with id 177.
dataloader = time_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Drain the dataloader once, keeping every batch.
batches = list(tqdm(dataloader))

# The first axis has length 1 because only one time series is loaded.
display(batches[0].shape)

[2025-09-06 19:28:35,265][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 996.96it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many time steps consecutive time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [15]:
# Explicit ranges for the periods plus sliding-window settings.
# In the printed config, set_shared_size=0.05 resolves to 335 time steps, and the
# val/test periods start that many steps before the end of the preceding period.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Sliding window: 22 input steps, 2 predicted steps, window moved by 2 steps.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:28:35,377][time_config][INFO] - Quick validation succeeded.
[2025-09-06 19:28:35,395][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:28:35,400][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1828.66it/s]
[2025-09-06 19:28:35,430][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 54 466 442 289  97 ... 297 236 201 456 524], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 335
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch size: 64

In [16]:
# With a sliding window configured, every batch is a (window, prediction) pair.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, storing each pair as a tuple.
batches = [(window, prediction) for window, prediction in tqdm(dataloader)]

[2025-09-06 19:28:35,439][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4698.30it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [17]:
# Two equivalent ways to change sliding-window parameters after initialization.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm.
time_based_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)
# Or
time_based_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-09-06 19:28:35,551][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:28:35,552][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-06 19:28:35,552][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:28:35,553][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:28:35,553][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:28:35,554][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [18]:
# Sliding-window setup with datetimes as the time format.
# In the printed config, set_shared_size=100 makes the val/test periods start
# 100 steps before the end of the preceding period (900 and 1400).
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Sliding window: 22 input steps, 2 predicted steps, window moved by 2 steps.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:28:35,559][time_config][INFO] - Quick validation succeeded.
[2025-09-06 19:28:35,578][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:28:35,582][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 2202.23it/s]
[2025-09-06 19:28:35,610][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [333 236 540 499 350 ...  36 249 230 119 361], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 100
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch size: 6

In [19]:
# With TimeFormat.DATETIME and a sliding window, every batch has four parts:
# (window, prediction, window times, prediction times).
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, storing each four-part batch as a tuple.
batches = [
    (window, prediction, window_times, prediction_times)
    for window, prediction, window_times, prediction_times in tqdm(dataloader)
]

[2025-09-06 19:28:35,618][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4804.42it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids`, restricted to the time period of the requested set.
- Data is returned as a Pandas DataFrame.
- Workers determine how many processes are used for loading the data of a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [20]:
# Config for the DataFrame examples; batch sizes are left out because they have no effect here.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:28:35,726][time_config][INFO] - Quick validation succeeded.
[2025-09-06 19:28:35,745][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:28:35,748][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 662.95it/s]
[2025-09-06 19:28:35,831][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [371 531 278 490 306 ... 304 393 268 398 448], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [21]:
# Train period returned as one DataFrame.
df = time_based_dataset.get_train_df(as_single_dataframe=True, workers="config")
# With as_single_dataframe=False, each time series gets its own DataFrame.
dfs = time_based_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,371.0,0.0,1223.0,18004.0,7659450.0,453.0,10.3,10.82,561.0,12.75,11.59,911.0,20.700001,31.700001,0.930176,0.930176,0.469971,0.589844,25.440001,135.429993
1,371.0,1.0,1410.0,27539.0,15854348.0,521.0,11.09,13.09,627.0,13.34,12.96,1006.0,21.4,34.619999,0.899902,0.879883,0.5,0.580078,17.629999,136.199997
2,371.0,2.0,1180.0,93782.0,77180635.0,468.0,10.88,11.6,554.0,12.88,10.82,875.0,20.35,29.91,0.939941,0.939941,0.469971,0.560059,29.02,134.789993
3,371.0,3.0,1452.0,73411.0,74129180.0,473.0,10.51,12.39,533.0,11.84,11.55,1097.0,24.379999,45.889999,0.899902,0.890137,0.47998,0.560059,23.75,124.5
4,371.0,4.0,2471.0,207805.0,194309200.0,510.0,11.09,16.530001,491.0,10.67,10.58,1656.0,36.0,78.360001,0.899902,0.890137,0.449951,0.52002,28.139999,132.270004
5,371.0,5.0,3369.0,436535.0,366342188.0,561.0,13.36,21.01,462.0,11.0,10.89,2161.0,51.450001,112.959999,0.899902,0.910156,0.580078,0.660156,37.41,124.410004
6,371.0,6.0,3886.0,656513.0,427075059.0,605.0,14.4,21.52,512.0,12.19,11.46,2316.0,55.139999,118.480003,0.910156,0.919922,0.540039,0.640137,30.299999,134.440002
7,371.0,7.0,3593.0,623053.0,524873460.0,653.0,14.51,21.32,581.0,12.91,12.03,2316.0,51.470001,112.809998,0.870117,0.879883,0.5,0.600098,30.370001,131.720001
8,371.0,8.0,3065.0,185534.0,115916182.0,641.0,14.57,22.9,500.0,11.36,11.37,2155.0,48.98,106.43,0.890137,0.890137,0.529785,0.609863,35.189999,133.990005
9,371.0,9.0,3115.0,437305.0,344896064.0,584.0,13.9,19.700001,575.0,13.69,17.93,2087.0,49.689999,104.07,0.910156,0.899902,0.529785,0.609863,45.34,129.229996


In [22]:
len(dfs) # one DataFrame per time series, so this equals the number of selected time series

54

#### Val set

- Affected by `val_workers`.

In [23]:
# Validation period returned as one DataFrame.
df = time_based_dataset.get_val_df(as_single_dataframe=True, workers="config")
# With as_single_dataframe=False, each time series gets its own DataFrame.
dfs = time_based_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,371.0,3359.0,917.0,15862.0,6663042.0,289.0,7.05,3.25,351.0,8.56,5.45,641.0,15.63,15.49,0.819824,0.830078,0.48999,0.569824,45.200001,126.129997
1,371.0,3360.0,970.0,30351.0,22225741.0,266.0,5.91,3.33,312.0,6.93,4.16,638.0,14.18,16.879999,0.839844,0.859863,0.5,0.560059,37.630001,121.589996
2,371.0,3361.0,1035.0,25953.0,17967459.0,274.0,5.59,3.63,337.0,6.88,4.8,694.0,14.16,18.790001,0.830078,0.859863,0.509766,0.569824,30.9,131.059998
3,371.0,3362.0,986.0,16080.0,6900281.0,278.0,6.62,3.62,327.0,7.79,5.09,665.0,15.83,19.16,0.850098,0.890137,0.5,0.609863,28.76,131.330002
4,371.0,3363.0,923.0,23287.0,15174408.0,264.0,6.0,4.1,293.0,6.66,4.56,622.0,14.14,18.440001,0.819824,0.859863,0.529785,0.620117,37.150002,126.669998
5,371.0,3364.0,706.0,13141.0,5186896.0,246.0,5.59,3.9,280.0,6.36,4.75,509.0,11.57,13.86,0.839844,0.859863,0.549805,0.660156,35.389999,120.75
6,371.0,3365.0,1023.0,70010.0,52160913.0,324.0,7.53,7.33,286.0,6.65,3.68,773.0,17.98,30.76,0.870117,0.870117,0.600098,0.649902,38.369999,129.820007
7,371.0,3366.0,1592.0,88362.0,79514204.0,312.0,7.43,8.52,278.0,6.62,4.79,1084.0,25.809999,49.959999,0.879883,0.910156,0.569824,0.620117,35.490002,116.900002
8,371.0,3367.0,2580.0,298586.0,297816710.0,441.0,10.76,11.94,364.0,8.88,5.22,1670.0,40.73,80.559998,0.890137,0.899902,0.52002,0.580078,36.779999,120.580002
9,371.0,3368.0,2296.0,216745.0,194491117.0,440.0,9.17,12.49,346.0,7.21,4.94,1616.0,33.669998,73.269997,0.899902,0.910156,0.5,0.569824,34.599998,129.529999


In [24]:
len(dfs) # one DataFrame per time series, so this equals the number of selected time series

54

#### Test set

- Affected by `test_workers`.

In [25]:
# Test period returned as one DataFrame.
df = time_based_dataset.get_test_df(as_single_dataframe=True, workers="config")
# With as_single_dataframe=False, each time series gets its own DataFrame.
dfs = time_based_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,371.0,5374.0,2703.0,35541.0,12061697.0,322.0,6.85,5.35,356.0,7.57,4.51,986.0,20.98,33.040001,0.859863,0.859863,0.449951,0.540039,11.76,117.129997
1,371.0,5375.0,2709.0,41531.0,16380177.0,295.0,6.86,5.97,306.0,7.12,4.51,970.0,22.559999,37.200001,0.839844,0.850098,0.48999,0.549805,13.23,102.459999
2,371.0,5376.0,2906.0,41064.0,16355035.0,343.0,8.17,6.83,356.0,8.48,5.71,1069.0,25.450001,39.369999,0.819824,0.819824,0.509766,0.589844,10.23,107.379997
3,371.0,5377.0,3017.0,46546.0,24416858.0,324.0,7.53,6.62,360.0,8.37,5.68,1085.0,25.23,41.860001,0.899902,0.910156,0.48999,0.52002,13.66,115.860001
4,371.0,5378.0,2810.0,166971.0,159832583.0,316.0,8.54,7.87,287.0,7.76,4.55,1052.0,28.43,48.509998,0.890137,0.890137,0.47998,0.529785,14.35,102.360001
5,371.0,5379.0,3309.0,66185.0,36134433.0,397.0,8.82,11.32,339.0,7.53,4.78,1434.0,31.870001,63.34,0.890137,0.879883,0.48999,0.560059,12.86,95.959999
6,371.0,5380.0,4845.0,195093.0,179268385.0,488.0,11.35,15.37,361.0,8.4,5.49,2152.0,50.049999,106.900002,0.890137,0.879883,0.48999,0.540039,15.07,100.839996
7,371.0,5381.0,7166.0,228439.0,171184035.0,588.0,13.07,21.01,362.0,8.04,4.78,3221.0,71.580002,167.600006,0.850098,0.859863,0.509766,0.580078,15.81,106.559998
8,371.0,5382.0,7744.0,479259.0,397200002.0,639.0,15.21,25.870001,344.0,8.19,5.17,3608.0,85.900002,193.630005,0.799805,0.810059,0.540039,0.600098,17.610001,108.529999
9,371.0,5383.0,6733.0,342208.0,267686091.0,567.0,13.83,21.790001,349.0,8.51,5.06,3149.0,76.800003,168.509995,0.879883,0.890137,0.529785,0.600098,16.700001,112.849998


In [26]:
len(dfs) # one DataFrame per time series, so this equals the number of selected time series

54

#### All set

- Affected by `all_workers`.

In [27]:
# Whole ("all") time period returned as one DataFrame.
df = time_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
# With as_single_dataframe=False, each time series gets its own DataFrame.
dfs = time_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,371.0,0.0,1223.0,18004.0,7659450.0,453.0,10.3,10.82,561.0,12.75,11.59,911.0,20.700001,31.700001,0.930176,0.930176,0.469971,0.589844,25.440001,135.429993
1,371.0,1.0,1410.0,27539.0,15854348.0,521.0,11.09,13.09,627.0,13.34,12.96,1006.0,21.4,34.619999,0.899902,0.879883,0.5,0.580078,17.629999,136.199997
2,371.0,2.0,1180.0,93782.0,77180635.0,468.0,10.88,11.6,554.0,12.88,10.82,875.0,20.35,29.91,0.939941,0.939941,0.469971,0.560059,29.02,134.789993
3,371.0,3.0,1452.0,73411.0,74129180.0,473.0,10.51,12.39,533.0,11.84,11.55,1097.0,24.379999,45.889999,0.899902,0.890137,0.47998,0.560059,23.75,124.5
4,371.0,4.0,2471.0,207805.0,194309200.0,510.0,11.09,16.530001,491.0,10.67,10.58,1656.0,36.0,78.360001,0.899902,0.890137,0.449951,0.52002,28.139999,132.270004
5,371.0,5.0,3369.0,436535.0,366342188.0,561.0,13.36,21.01,462.0,11.0,10.89,2161.0,51.450001,112.959999,0.899902,0.910156,0.580078,0.660156,37.41,124.410004
6,371.0,6.0,3886.0,656513.0,427075059.0,605.0,14.4,21.52,512.0,12.19,11.46,2316.0,55.139999,118.480003,0.910156,0.919922,0.540039,0.640137,30.299999,134.440002
7,371.0,7.0,3593.0,623053.0,524873460.0,653.0,14.51,21.32,581.0,12.91,12.03,2316.0,51.470001,112.809998,0.870117,0.879883,0.5,0.600098,30.370001,131.720001
8,371.0,8.0,3065.0,185534.0,115916182.0,641.0,14.57,22.9,500.0,11.36,11.37,2155.0,48.98,106.43,0.890137,0.890137,0.529785,0.609863,35.189999,133.990005
9,371.0,9.0,3115.0,437305.0,344896064.0,584.0,13.9,19.700001,575.0,13.69,17.93,2087.0,49.689999,104.07,0.910156,0.899902,0.529785,0.609863,45.34,129.229996


In [28]:
len(dfs) # one DataFrame per time series, so this equals the number of selected time series

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids`, restricted to the time period of the requested set.
- Data is returned as one Numpy array.
- Follows rules similar to those for Dataloader batches regarding shape (excluding sliding window parameters).
- Workers determine how many processes are used for loading the data of a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [29]:
# Config for the Numpy examples; batch sizes are left out because they have no effect here.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 loads on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:28:36,376][time_config][INFO] - Quick validation succeeded.
[2025-09-06 19:28:36,395][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:28:36,399][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 776.14it/s]
[2025-09-06 19:28:36,471][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [350  76 233 157 246 ... 195 372 379 183 427], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [30]:
# workers="config" presumably defers to the config's `train_workers` (0 here) — confirm in the library docs.
numpy_array = time_based_dataset.get_train_numpy(workers="config")

# Shape reads as (time series, train time steps, columns). With ID_TIME the columns
# appear to be the 18 features plus the time series ID and the time ID.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [31]:
# workers="config" presumably defers to the config's `val_workers` (0 here) — confirm in the library docs.
numpy_array = time_based_dataset.get_val_numpy(workers="config")

# Shape reads as (time series, val time steps, columns).
display(numpy_array.shape)

(54, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [32]:
# workers="config" presumably defers to the config's `test_workers` (0 here) — confirm in the library docs.
numpy_array = time_based_dataset.get_test_numpy(workers="config")

# Shape reads as (time series, test time steps, columns).
display(numpy_array.shape)

(54, 1343, 20)

#### All set

- Affected by `all_workers`.

In [33]:
# workers="config" presumably defers to the config's `all_workers` (0 here) — confirm in the library docs.
numpy_array = time_based_dataset.get_all_numpy(workers="config")

# Shape reads as (time series, all time steps, columns); the "all" set spans the full time range.
display(numpy_array.shape)

(54, 6717, 20)

#### Using time_format=TimeFormat.DATETIME

In [34]:
# Same setup as the previous config (54 time series, 50 % / 30 % / 20 % split),
# but with time reported as datetimes instead of integer time IDs.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    # With DATETIME the numpy getters return the times as a separate array (see the next cell).
    time_format=TimeFormat.DATETIME,
    # 0 workers everywhere: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Re-initialize the dataset with the new config and print the resolved settings.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:28:36,753][time_config][INFO] - Quick validation succeeded.
[2025-09-06 19:28:36,776][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:28:36,781][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 987.71it/s]
[2025-09-06 19:28:36,836][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [107  67 421 401  45 ... 156  49 474 531 303], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [35]:
# With TimeFormat.DATETIME the getter returns a pair: the data array and a separate array of datetimes.
numpy_array, times = time_based_dataset.get_train_numpy(workers="config")

# The data array has one column fewer than with ID_TIME (19 instead of 20), since time is no longer a column.
display(numpy_array.shape)
# One datetime per train time step, shared by all time series.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)