# Loading data with TimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Configure the root logger so INFO-level records are shown in the notebook,
# each prefixed with timestamp, logger name and level.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

### Preparing dataset

In [3]:
# Build the time-based view of CESNET-TimeSeries24 (institution subnets, 1-hour aggregation).
# NOTE: "/some_directory/" looks like a placeholder — point `data_root` at your own data directory.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.TIME_BASED,
    display_details=True,
)

[2025-08-17 14:22:37,994][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch `DataLoader`.
- The last batch is never dropped (unless sliding_window is used).
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size determines how many times (time steps) of every time series are in one batch (this differs when a sliding window is used).
- When a sliding window is not used, a batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# Config for plain (non-sliding-window) batching with ID-based time.
# The fractional periods 0.5 / 0.3 / 0.2 split the available time range into train / val / test.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 14:22:38,000][time_config][INFO] - Quick validation succeeded.
[2025-08-17 14:22:38,019][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 14:22:38,022][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 625.27it/s]
[2025-08-17 14:22:38,115][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [487 390 253 164 232 ... 319 540 142 366 456], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# New batch sizes, shared by both calls below.
# "config" presumably keeps the value already stored in the config — confirm against the library docs.
new_batch_sizes = dict(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

time_based_dataset.update_dataset_config_and_initialize(**new_batch_sizes)
# Or, using the dedicated setter
time_based_dataset.set_batch_sizes(**new_batch_sizes)

[2025-08-17 14:22:38,120][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 14:22:38,120][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 14:22:38,121][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 14:22:38,122][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 14:22:38,122][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Worker counts, shared by both calls below (0 = load in the main process).
worker_settings = dict(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

time_based_dataset.update_dataset_config_and_initialize(**worker_settings)
# Or, using the dedicated setter
time_based_dataset.set_workers(**worker_settings)

[2025-08-17 14:22:38,127][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 14:22:38,128][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 14:22:38,128][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 14:22:38,129][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 14:22:38,129][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Train dataloader; workers="config" takes the worker count from the config (train_workers).
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Iterate the whole loader once and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
display(batches[0].shape)

[2025-08-17 14:22:38,136][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 229.41it/s]


(54, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Validation dataloader; workers="config" takes the worker count from the config (val_workers).
dataloader = time_based_dataset.get_val_dataloader(workers="config")

# Iterate the whole loader once and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
display(batches[0].shape)

[2025-08-17 14:22:38,605][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 138.83it/s]


(54, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Test dataloader; workers="config" takes the worker count from the config (test_workers).
dataloader = time_based_dataset.get_test_dataloader(workers="config")

# Iterate the whole loader once and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
display(batches[0].shape)

[2025-08-17 14:22:38,849][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 108.38it/s]


(54, 128, 20)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Dataloader over the whole ("all") time period; workers="config" uses all_workers from the config.
dataloader = time_based_dataset.get_all_dataloader(workers="config")

# Iterate the whole loader once and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
display(batches[0].shape)

[2025-08-17 14:22:38,962][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 53/53 [00:00<00:00, 200.05it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as before, but with time_format=TimeFormat.DATETIME:
# batches then become a (data, time) tuple instead of a single array.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 14:22:39,235][time_config][INFO] - Quick validation succeeded.
[2025-08-17 14:22:39,257][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 14:22:39,260][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 822.59it/s]
[2025-08-17 14:22:39,328][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [238 220  57 116 488 ...  76 245 101 392 477], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [12]:
# Train dataloader under TimeFormat.DATETIME: every batch is a (data, time) tuple.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Iterate the whole loader once and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-08-17 14:22:39,335][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 225.33it/s]


(54, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a parameter `ts_id`.
    - When `ts_id` is None, the dataloader returns all configured time series, as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the one time series with the specified id.

In [13]:
# Explicit list of time series IDs, so a single one can be requested via `ts_id` afterwards.
config = TimeBasedConfig(
    ts_ids=[177, 176, 319, 267],
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 14:22:39,812][time_config][INFO] - Quick validation succeeded.
[2025-08-17 14:22:39,831][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 14:22:39,834][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 4/4 [00:00<00:00, 800.02it/s]
[2025-08-17 14:22:39,842][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [177 176 319 267], Length=4
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Slidin

In [14]:
# Request only time series 177 (one of the IDs listed in `ts_ids` of the config).
dataloader = time_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Iterate the whole loader once and keep every batch.
batches = [batch for batch in tqdm(dataloader)]

# The first axis now has length 1, because a single time series was requested.
display(batches[0].shape)

[2025-08-17 14:22:39,852][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 1017.41it/s]


(1, 32, 20)

#### Sliding window

- Both `sliding_window_size` and `sliding_window_prediction_size` must be set if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify sliding window step size with `sliding_window_step`
- You can use `set_shared_size` to set how many times (time steps) consecutive time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [15]:
# Sliding-window config: windows of 22 times, predictions of 2 times, moved by 2 times per step.
# set_shared_size is given as a fraction here; the config details report the resolved value.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 14:22:39,963][time_config][INFO] - Quick validation succeeded.
[2025-08-17 14:22:39,981][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 14:22:39,985][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 2162.84it/s]
[2025-08-17 14:22:40,012][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [272 315 110 199 172 ... 231  68 200 190  77], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 335
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    D

In [16]:
# With a sliding window (and non-DATETIME time format) every batch is a
# (sliding_window, sliding_window_prediction) pair.
dataloader = time_based_dataset.get_train_dataloader(workers="config")

batches = [
    (sliding_window, sliding_window_prediction)
    for sliding_window, sliding_window_prediction in tqdm(dataloader)
]

[2025-08-17 14:22:40,020][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4895.85it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [17]:
# Sliding-window parameters, shared by both calls below.
# "config" presumably keeps the value already stored in the config — confirm against the library docs.
sliding_window_settings = dict(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
)

time_based_dataset.update_dataset_config_and_initialize(**sliding_window_settings, workers=0)
# Or, using the dedicated setter
time_based_dataset.set_sliding_window(**sliding_window_settings, workers=0)

[2025-08-17 14:22:40,126][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 14:22:40,126][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-08-17 14:22:40,127][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 14:22:40,128][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 14:22:40,128][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 14:22:40,129][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [18]:
# Sliding-window config with TimeFormat.DATETIME.
# The periods are given back-to-back, without a hand-made overlap: `set_shared_size=100`
# is what makes val/test reach back into the preceding period. This matches the recorded
# config details for this cell (Val: range(900, 1500), Test: range(1400, 2000)) and the
# ID_TIME sliding-window example above, which also passes contiguous ranges.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,  # absolute number of shared times (the earlier example used a fraction)
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 14:22:40,132][time_config][INFO] - Quick validation succeeded.
[2025-08-17 14:22:40,151][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 14:22:40,155][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 2563.44it/s]
[2025-08-17 14:22:40,178][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 47  19  51 252 258 ... 232 219 155  16  40], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 100
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    

In [19]:
# With a sliding window and TimeFormat.DATETIME every batch is a 4-tuple:
# (window data, prediction data, window times, prediction times).
dataloader = time_based_dataset.get_train_dataloader(workers="config")

batches = [
    (sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times)
    for sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times in tqdm(dataloader)
]

[2025-08-17 14:22:40,185][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 5284.98it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` for the set's specified time period.
- Data is returned as Pandas Dataframe.
- Workers affect how many processes will be used for loading data for specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [20]:
# Config for the DataFrame examples; no batch sizes are passed here.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 14:22:40,284][time_config][INFO] - Quick validation succeeded.
[2025-08-17 14:22:40,302][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 14:22:40,306][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 708.46it/s]
[2025-08-17 14:22:40,384][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [136 452 392 407 340 ... 383 116 270  23 224], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [21]:
# Load the train set twice: once as one combined DataFrame, once as a list of DataFrames.
df = time_based_dataset.get_train_df(workers="config", as_single_dataframe=True)
dfs = time_based_dataset.get_train_df(workers="config", as_single_dataframe=False)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,136.0,0.0,18115.0,1039871.0,871946400.0,8754.0,5.52,4.9,8027.0,5.06,4.69,14326.0,9.03,18.93,0.890137,0.879883,0.47998,0.449951,13.23,139.440002
1,136.0,1.0,19581.0,755494.0,670795000.0,9325.0,5.58,3.89,8439.0,5.05,2.92,15338.0,9.17,18.690001,0.890137,0.870117,0.47998,0.449951,12.44,139.100006
2,136.0,2.0,30589.0,1731321.0,1672778000.0,12382.0,5.6,4.88,10723.0,4.85,5.05,19462.0,8.79,16.469999,0.879883,0.870117,0.5,0.449951,14.05,128.710007
3,136.0,3.0,48431.0,3841176.0,3856568000.0,23208.0,5.49,4.77,18064.0,4.28,3.17,34972.0,8.28,13.46,0.879883,0.870117,0.5,0.429932,14.57,121.089996
4,136.0,4.0,61131.0,4096592.0,3921116000.0,31670.0,5.09,5.68,23465.0,3.77,4.51,45759.0,7.36,12.08,0.910156,0.899902,0.5,0.429932,17.67,120.559998
5,136.0,5.0,63892.0,7111885.0,6990914000.0,34007.0,4.78,4.67,24987.0,3.51,6.29,49121.0,6.9,11.41,0.899902,0.890137,0.509766,0.429932,19.41,115.360001
6,136.0,6.0,70101.0,5455217.0,5712918000.0,35085.0,4.72,4.68,25649.0,3.45,3.09,49527.0,6.66,11.29,0.899902,0.890137,0.52002,0.439941,19.99,116.959999
7,136.0,7.0,75680.0,5494699.0,5291503000.0,39718.0,5.01,5.49,28578.0,3.61,5.0,57238.0,7.22,12.1,0.890137,0.879883,0.509766,0.439941,21.370001,113.660004
8,136.0,8.0,71895.0,9279422.0,9112694000.0,38941.0,5.11,5.78,28580.0,3.75,4.68,54995.0,7.21,12.28,0.899902,0.890137,0.509766,0.439941,19.950001,115.879997
9,136.0,9.0,76955.0,8369774.0,8555864000.0,39351.0,5.29,6.28,27951.0,3.76,5.31,56748.0,7.62,13.23,0.890137,0.879883,0.5,0.429932,22.25,116.410004


In [22]:
# Show the list returned with as_single_dataframe=False (presumably one DataFrame per time series).
dfs

[      id_institution_subnet  id_time  n_flows   n_packets       n_bytes  \
 0                     136.0      0.0  18115.0   1039871.0  8.719464e+08   
 1                     136.0      1.0  19581.0    755494.0  6.707950e+08   
 2                     136.0      2.0  30589.0   1731321.0  1.672778e+09   
 3                     136.0      3.0  48431.0   3841176.0  3.856568e+09   
 4                     136.0      4.0  61131.0   4096592.0  3.921116e+09   
 ...                     ...      ...      ...         ...           ...   
 3354                  136.0   3354.0  52248.0  13575721.0  1.397975e+10   
 3355                  136.0   3355.0  53299.0   8085738.0  7.527550e+09   
 3356                  136.0   3356.0  50760.0   7382566.0  7.055192e+09   
 3357                  136.0   3357.0  58594.0   8873660.0  8.962845e+09   
 3358                  136.0   3358.0  45032.0   3801262.0  3.653777e+09   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0         

#### Val set

- Affected by `val_workers`.

In [23]:
# Load the validation set twice: once as one combined DataFrame, once as a list of DataFrames.
df = time_based_dataset.get_val_df(workers="config", as_single_dataframe=True)
dfs = time_based_dataset.get_val_df(workers="config", as_single_dataframe=False)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,136.0,3359.0,34359.0,3602450.0,3406874000.0,12352.0,5.95,13.65,10370.0,5.0,20.17,25342.0,12.21,49.150002,0.830078,0.819824,0.52002,0.459961,24.07,115.900002
1,136.0,3360.0,31490.0,1541365.0,1388038000.0,10652.0,5.84,16.280001,10435.0,5.72,31.07,23540.0,12.91,57.0,0.859863,0.859863,0.509766,0.469971,20.440001,120.949997
2,136.0,3361.0,31131.0,942388.0,743617200.0,9216.0,5.83,17.790001,9562.0,6.05,34.77,23352.0,14.78,80.040001,0.859863,0.859863,0.52002,0.47998,20.879999,115.900002
3,136.0,3362.0,24590.0,1259906.0,1150805000.0,8126.0,5.46,18.16,8314.0,5.59,36.0,19057.0,12.81,60.220001,0.850098,0.850098,0.549805,0.509766,19.26,115.440002
4,136.0,3363.0,24690.0,1169571.0,1165922000.0,8255.0,5.41,19.719999,8824.0,5.78,42.259998,19931.0,13.05,64.18,0.839844,0.839844,0.540039,0.48999,19.91,115.120003
5,136.0,3364.0,20700.0,555369.0,467004600.0,7853.0,4.67,13.73,7062.0,4.2,24.26,16570.0,9.85,44.080002,0.859863,0.859863,0.549805,0.5,17.049999,110.730003
6,136.0,3365.0,30213.0,1820409.0,1868138000.0,13186.0,4.49,11.29,10675.0,3.63,18.4,23371.0,7.95,32.98,0.859863,0.859863,0.540039,0.469971,18.290001,107.480003
7,136.0,3366.0,44250.0,3564076.0,3891919000.0,20536.0,4.31,9.86,15702.0,3.29,13.98,33554.0,7.04,25.629999,0.879883,0.879883,0.549805,0.469971,20.65,106.389999
8,136.0,3367.0,53791.0,3236498.0,3145520000.0,26437.0,4.38,8.5,19659.0,3.26,10.0,41550.0,6.88,22.85,0.890137,0.890137,0.540039,0.469971,20.67,107.970001
9,136.0,3368.0,60225.0,5519323.0,5607972000.0,28634.0,4.4,8.37,19908.0,3.06,8.26,44750.0,6.88,22.290001,0.890137,0.879883,0.540039,0.469971,21.540001,105.470001


In [24]:
# Show the list returned with as_single_dataframe=False (presumably one DataFrame per time series).
dfs

[      id_institution_subnet  id_time  n_flows  n_packets       n_bytes  \
 0                     136.0   3359.0  34359.0  3602450.0  3.406874e+09   
 1                     136.0   3360.0  31490.0  1541365.0  1.388038e+09   
 2                     136.0   3361.0  31131.0   942388.0  7.436172e+08   
 3                     136.0   3362.0  24590.0  1259906.0  1.150805e+09   
 4                     136.0   3363.0  24690.0  1169571.0  1.165922e+09   
 ...                     ...      ...      ...        ...           ...   
 2010                  136.0   5369.0  45075.0  6932298.0  6.481045e+09   
 2011                  136.0   5370.0  44529.0  7477192.0  7.015394e+09   
 2012                  136.0   5371.0  37086.0  5828244.0  5.226256e+09   
 2013                  136.0   5372.0  30274.0  4261275.0  3.891864e+09   
 2014                  136.0   5373.0  26195.0  2832513.0  2.480008e+09   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0            12352.0  

#### Test set

- Affected by `test_workers`.

In [25]:
# Load the test set twice: once as one combined DataFrame, once as a list of DataFrames.
df = time_based_dataset.get_test_df(workers="config", as_single_dataframe=True)
dfs = time_based_dataset.get_test_df(workers="config", as_single_dataframe=False)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,136.0,5374.0,18180.0,2190461.0,2274519000.0,6689.0,4.9,5.13,5051.0,3.7,3.87,12946.0,9.49,21.190001,0.830078,0.830078,0.48999,0.439941,24.15,115.260002
1,136.0,5375.0,15292.0,2049119.0,1671553000.0,5619.0,4.67,4.32,4400.0,3.66,2.28,11061.0,9.2,21.09,0.839844,0.839844,0.509766,0.459961,21.48,113.639999
2,136.0,5376.0,13543.0,912803.0,844748900.0,4965.0,4.6,5.25,3793.0,3.51,4.13,10140.0,9.39,22.27,0.830078,0.830078,0.509766,0.469971,21.23,111.300003
3,136.0,5377.0,14513.0,1155230.0,1112940000.0,5446.0,4.85,6.95,4446.0,3.96,7.59,10784.0,9.6,22.4,0.819824,0.810059,0.52002,0.469971,19.530001,112.889999
4,136.0,5378.0,15925.0,777385.0,524930800.0,6167.0,4.7,6.71,4832.0,3.68,7.23,11804.0,9.0,20.16,0.819824,0.819824,0.52002,0.469971,20.629999,110.110001
5,136.0,5379.0,21219.0,1521785.0,1479333000.0,8791.0,4.25,4.97,6437.0,3.11,4.77,15311.0,7.4,15.47,0.839844,0.830078,0.52002,0.459961,19.620001,105.349998
6,136.0,5380.0,29161.0,2005974.0,1878354000.0,13178.0,4.41,4.57,10230.0,3.42,3.34,21382.0,7.15,13.3,0.859863,0.839844,0.48999,0.429932,19.1,109.690002
7,136.0,5381.0,33220.0,3744959.0,3978854000.0,15492.0,4.23,4.73,11449.0,3.12,4.3,24796.0,6.77,13.48,0.859863,0.850098,0.509766,0.449951,20.82,108.199997
8,136.0,5382.0,37895.0,2875072.0,2637837000.0,16517.0,4.4,5.76,12471.0,3.32,5.26,26242.0,6.99,13.88,0.859863,0.850098,0.5,0.439941,23.879999,111.57
9,136.0,5383.0,41943.0,3483939.0,3446049000.0,18634.0,4.72,7.04,13332.0,3.38,6.1,29172.0,7.4,14.7,0.870117,0.859863,0.5,0.429932,23.91,110.470001


In [26]:
# Show the list returned with as_single_dataframe=False (presumably one DataFrame per time series).
dfs

[      id_institution_subnet  id_time  n_flows  n_packets       n_bytes  \
 0                     136.0   5374.0  18180.0  2190461.0  2.274519e+09   
 1                     136.0   5375.0  15292.0  2049119.0  1.671553e+09   
 2                     136.0   5376.0  13543.0   912803.0  8.447489e+08   
 3                     136.0   5377.0  14513.0  1155230.0  1.112940e+09   
 4                     136.0   5378.0  15925.0   777385.0  5.249308e+08   
 ...                     ...      ...      ...        ...           ...   
 1338                  136.0   6712.0  13001.0  1892008.0  1.905533e+09   
 1339                  136.0   6713.0  14739.0  3457979.0  3.360777e+09   
 1340                  136.0   6714.0  14174.0  3029459.0  3.242951e+09   
 1341                  136.0   6715.0  19937.0  4552814.0  3.718897e+09   
 1342                  136.0   6716.0  15376.0  3908894.0  4.241853e+09   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0             6689.0  

In [27]:
# NOTE(review): this cell repeats the previous `dfs` display of the test set unchanged;
# it looks like a leftover duplicate and is a candidate for removal.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets       n_bytes  \
 0                     136.0   5374.0  18180.0  2190461.0  2.274519e+09   
 1                     136.0   5375.0  15292.0  2049119.0  1.671553e+09   
 2                     136.0   5376.0  13543.0   912803.0  8.447489e+08   
 3                     136.0   5377.0  14513.0  1155230.0  1.112940e+09   
 4                     136.0   5378.0  15925.0   777385.0  5.249308e+08   
 ...                     ...      ...      ...        ...           ...   
 1338                  136.0   6712.0  13001.0  1892008.0  1.905533e+09   
 1339                  136.0   6713.0  14739.0  3457979.0  3.360777e+09   
 1340                  136.0   6714.0  14174.0  3029459.0  3.242951e+09   
 1341                  136.0   6715.0  19937.0  4552814.0  3.718897e+09   
 1342                  136.0   6716.0  15376.0  3908894.0  4.241853e+09   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0             6689.0  

#### All set

- Affected by `all_workers`.

In [28]:
# Load the "all" set twice: once merged into a single DataFrame (`df`) and once
# as a list of DataFrames (`dfs`, shown in the next cell).
# workers="config" presumably defers to `all_workers` from the active config — confirm in the cesnet_tszoo docs.
df = time_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
dfs = time_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview only the first ten rows of the merged frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,136.0,0.0,18115.0,1039871.0,871946400.0,8754.0,5.52,4.9,8027.0,5.06,4.69,14326.0,9.03,18.93,0.890137,0.879883,0.47998,0.449951,13.23,139.440002
1,136.0,1.0,19581.0,755494.0,670795000.0,9325.0,5.58,3.89,8439.0,5.05,2.92,15338.0,9.17,18.690001,0.890137,0.870117,0.47998,0.449951,12.44,139.100006
2,136.0,2.0,30589.0,1731321.0,1672778000.0,12382.0,5.6,4.88,10723.0,4.85,5.05,19462.0,8.79,16.469999,0.879883,0.870117,0.5,0.449951,14.05,128.710007
3,136.0,3.0,48431.0,3841176.0,3856568000.0,23208.0,5.49,4.77,18064.0,4.28,3.17,34972.0,8.28,13.46,0.879883,0.870117,0.5,0.429932,14.57,121.089996
4,136.0,4.0,61131.0,4096592.0,3921116000.0,31670.0,5.09,5.68,23465.0,3.77,4.51,45759.0,7.36,12.08,0.910156,0.899902,0.5,0.429932,17.67,120.559998
5,136.0,5.0,63892.0,7111885.0,6990914000.0,34007.0,4.78,4.67,24987.0,3.51,6.29,49121.0,6.9,11.41,0.899902,0.890137,0.509766,0.429932,19.41,115.360001
6,136.0,6.0,70101.0,5455217.0,5712918000.0,35085.0,4.72,4.68,25649.0,3.45,3.09,49527.0,6.66,11.29,0.899902,0.890137,0.52002,0.439941,19.99,116.959999
7,136.0,7.0,75680.0,5494699.0,5291503000.0,39718.0,5.01,5.49,28578.0,3.61,5.0,57238.0,7.22,12.1,0.890137,0.879883,0.509766,0.439941,21.370001,113.660004
8,136.0,8.0,71895.0,9279422.0,9112694000.0,38941.0,5.11,5.78,28580.0,3.75,4.68,54995.0,7.21,12.28,0.899902,0.890137,0.509766,0.439941,19.950001,115.879997
9,136.0,9.0,76955.0,8369774.0,8555864000.0,39351.0,5.29,6.28,27951.0,3.76,5.31,56748.0,7.62,13.23,0.890137,0.879883,0.5,0.429932,22.25,116.410004


In [29]:
# Inspect the list form of the "all" set (as_single_dataframe=False); pandas truncates the long frame with "...".
dfs

[      id_institution_subnet  id_time  n_flows  n_packets       n_bytes  \
 0                     136.0      0.0  18115.0  1039871.0  8.719464e+08   
 1                     136.0      1.0  19581.0   755494.0  6.707950e+08   
 2                     136.0      2.0  30589.0  1731321.0  1.672778e+09   
 3                     136.0      3.0  48431.0  3841176.0  3.856568e+09   
 4                     136.0      4.0  61131.0  4096592.0  3.921116e+09   
 ...                     ...      ...      ...        ...           ...   
 6712                  136.0   6712.0  13001.0  1892008.0  1.905533e+09   
 6713                  136.0   6713.0  14739.0  3457979.0  3.360777e+09   
 6714                  136.0   6714.0  14174.0  3029459.0  3.242951e+09   
 6715                  136.0   6715.0  19937.0  4552814.0  3.718897e+09   
 6716                  136.0   6716.0  15376.0  3908894.0  4.241853e+09   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0             8754.0  

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids`, restricted to the time period specified for the given set.
- Data is returned as one Numpy array.
- Follows similar rules to Dataloader batches, regarding shape (excluding sliding window parameters).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_numpy` via the `workers` parameter.

In [30]:
# Config used by the NumPy-loading examples below.
# The period fractions 0.5 / 0.3 / 0.2 show up in the printed config details as
# the train / val / test time ranges; ts_ids=54 yields 54 time series IDs there.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Every worker count is 0, so loading stays on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config to the dataset and print the resolved details.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 14:22:41,939][time_config][INFO] - Quick validation succeeded.
[2025-08-17 14:22:41,959][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 14:22:41,962][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 800.41it/s]
[2025-08-17 14:22:42,031][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [161 512 148 430 248 ... 352 434 313 264 432], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [31]:
# Load the whole train set as one NumPy array.
# workers="config" presumably defers to `train_workers` from the active config — confirm in the cesnet_tszoo docs.
numpy_array = time_based_dataset.get_train_numpy(workers="config")

# Only the shape is shown; axes are presumably (time series, time steps, columns) — TODO confirm.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [32]:
# Load the whole validation set as one NumPy array.
# workers="config" presumably defers to `val_workers` from the active config — confirm in the cesnet_tszoo docs.
numpy_array = time_based_dataset.get_val_numpy(workers="config")

# Only the shape is shown; the middle axis matches the length of the val time range printed in the config details.
display(numpy_array.shape)

(54, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [33]:
# Load the whole test set as one NumPy array.
# workers="config" presumably defers to `test_workers` from the active config — confirm in the cesnet_tszoo docs.
numpy_array = time_based_dataset.get_test_numpy(workers="config")

# Only the shape is shown; the middle axis matches the length of the test time range printed in the config details.
display(numpy_array.shape)

(54, 1343, 20)

#### All set

- Affected by `all_workers`.

In [34]:
# Load the full time range ("all" set) as one NumPy array.
# workers="config" presumably defers to `all_workers` from the active config — confirm in the cesnet_tszoo docs.
numpy_array = time_based_dataset.get_all_numpy(workers="config")

# Only the shape is shown; the middle axis matches the length of the "all" time range printed in the config details.
display(numpy_array.shape)

(54, 6717, 20)

#### Using time_format=TimeFormat.DATETIME

In [35]:
# Same setup as the previous NumPy config; only time_format differs
# (TimeFormat.DATETIME instead of TimeFormat.ID_TIME).
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Every worker count is 0, so loading stays on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Re-initialize the dataset with the new config and print the resolved details.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 14:22:42,303][time_config][INFO] - Quick validation succeeded.
[2025-08-17 14:22:42,326][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 14:22:42,330][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1101.16it/s]
[2025-08-17 14:22:42,380][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 57 313  27 290   0 ... 292 362 419 380  45], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [36]:
# With time_format=TimeFormat.DATETIME the call is unpacked into two values:
# the data array and a separate array of datetimes.
numpy_array, times = time_based_dataset.get_train_numpy(workers="config")

# The last axis is 19 here versus 20 with TimeFormat.ID_TIME — presumably because
# time is no longer stored as a column in the data array (TODO confirm).
display(numpy_array.shape)
# `times` holds one timezone-aware (UTC) datetime per train time step, as the output below shows.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)