# Loading data with TimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Configure the root logger: INFO and above, each record prefixed with
# timestamp, logger name and severity.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

### Preparing dataset

In [3]:
# Open the hourly-aggregated institution-subnet data as a time-based dataset.
# NOTE: "/some_directory/" is a placeholder; replace it with your own data directory.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.TIME_BASED,
    display_details=True,  # show the "Dataset details" summary
)

[2025-08-31 12:07:09,373][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped (unless a sliding window is used).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size affects how many time steps of every time series will be in one batch (differs when a sliding window is used).
- When a sliding window is not used, a batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# 54 time series, with the time range split 50/30/20 into train/val/test.
# Time is returned as an integer id (TimeFormat.ID_TIME).
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs on the main process
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # per-set batch sizes used by the dataloaders
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:07:09,378][time_config][INFO] - Quick validation succeeded.
[2025-08-31 12:07:09,396][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:07:09,401][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 533.45it/s]
[2025-08-31 12:07:09,508][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [437  41   3 355 225 ... 176 252 129 415   6], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# New batch sizes; "config" presumably keeps the value already stored in the
# config (TODO confirm against the library documentation).
batch_sizes = dict(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

time_based_dataset.update_dataset_config_and_initialize(**batch_sizes)
# Or
time_based_dataset.set_batch_sizes(**batch_sizes)

[2025-08-31 12:07:09,514][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:07:09,514][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:07:09,515][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:07:09,515][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:07:09,515][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Worker counts for every set; 0 means loading runs on the main process.
worker_counts = dict(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

time_based_dataset.update_dataset_config_and_initialize(**worker_counts)
# Or
time_based_dataset.set_workers(**worker_counts)

[2025-08-31 12:07:09,521][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:07:09,522][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:07:09,522][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:07:09,523][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:07:09,523][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# workers="config" presumably falls back to `train_workers` from the config (TODO confirm).
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, keeping every batch; tqdm shows the progress bar.
batches = list(tqdm(dataloader))

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids)
display(batches[0].shape)

[2025-08-31 12:07:09,532][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 216.05it/s]


(54, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# workers="config" presumably falls back to `val_workers` from the config (TODO confirm).
dataloader = time_based_dataset.get_val_dataloader(workers="config")

# Drain the dataloader once, keeping every batch; tqdm shows the progress bar.
batches = list(tqdm(dataloader))

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids)
display(batches[0].shape)

[2025-08-31 12:07:10,028][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 133.99it/s]


(54, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# workers="config" presumably falls back to `test_workers` from the config (TODO confirm).
dataloader = time_based_dataset.get_test_dataloader(workers="config")

# Drain the dataloader once, keeping every batch; tqdm shows the progress bar.
batches = list(tqdm(dataloader))

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids)
display(batches[0].shape)

[2025-08-31 12:07:10,280][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 107.59it/s]


(54, 128, 20)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# workers="config" presumably falls back to `all_workers` from the config (TODO confirm).
dataloader = time_based_dataset.get_all_dataloader(workers="config")

# Drain the dataloader once, keeping every batch; tqdm shows the progress bar.
batches = list(tqdm(dataloader))

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids)
display(batches[0].shape)

[2025-08-31 12:07:10,394][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 53/53 [00:00<00:00, 187.14it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the first config, but time is requested as datetimes
# (TimeFormat.DATETIME), so batches come back as a (data, time) tuple.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # 0 workers -> loading runs on the main process
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # per-set batch sizes used by the dataloaders
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:07:10,685][time_config][INFO] - Quick validation succeeded.
[2025-08-31 12:07:10,706][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:07:10,710][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 835.85it/s]
[2025-08-31 12:07:10,778][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 33 311  80 240 528 ...  65 229  91 133 147], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [12]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, keeping every batch; tqdm shows the progress bar.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch is a pair: (values, timestamps).
first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-08-31 12:07:10,786][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 232.71it/s]


(54, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a parameter `ts_id`.
    - When `ts_id` is None, the dataloader returns all configured time series, as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the time series with the specified id.

In [13]:
# Pick four time series explicitly by id instead of passing a count.
config = TimeBasedConfig(
    ts_ids=[177, 176, 319, 267],
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs on the main process
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # per-set batch sizes used by the dataloaders
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:07:11,251][time_config][INFO] - Quick validation succeeded.
[2025-08-31 12:07:11,269][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:07:11,272][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 4/4 [00:00<00:00, 560.23it/s]
[2025-08-31 12:07:11,281][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [177 176 319 267], Length=4
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Slidin

In [14]:
# Restrict the dataloader to a single time series (id 177 is one of the configured ts_ids).
dataloader = time_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Drain the dataloader once, keeping every batch; tqdm shows the progress bar.
batches = list(tqdm(dataloader))

# The first axis now has length 1, because only one time series is loaded.
display(batches[0].shape)

[2025-08-31 12:07:11,289][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 2192.29it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify the sliding window step size with `sliding_window_step`.
- You can use `set_shared_size` to set how many time steps adjacent time periods should share.
    - `val_time_period` takes them from `train_time_period`.
    - `test_time_period` takes them from `val_time_period` or `train_time_period`.

In [15]:
# Explicit time ranges for each set, a single feature, and a sliding window:
# 22 input steps, 2 predicted steps, window advanced by 2 steps at a time.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs on the main process
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # with a sliding window, batch sizes are used for background caching
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,  # given as a fraction; see "Set shared size" in the printed config
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:07:11,346][time_config][INFO] - Quick validation succeeded.
[2025-08-31 12:07:11,364][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:07:11,368][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1896.48it/s]
[2025-08-31 12:07:11,396][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [540 531 254 425 388 ... 512 291 189 387 122], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 335
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch size: 64

In [16]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Each item is a pair (input window, prediction window); store them as tuples.
batches = [(window, prediction) for window, prediction in tqdm(dataloader)]

[2025-08-31 12:07:11,404][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 5404.45it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [17]:
# New sliding-window settings; "config" presumably keeps the value already
# stored in the config (TODO confirm against the library documentation).
sliding_window_settings = dict(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

time_based_dataset.update_dataset_config_and_initialize(**sliding_window_settings)
# Or
time_based_dataset.set_sliding_window(**sliding_window_settings)

[2025-08-31 12:07:11,502][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:07:11,502][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-08-31 12:07:11,502][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:07:11,503][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:07:11,503][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:07:11,504][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [18]:
# Sliding-window setup with datetimes: batches additionally carry the
# timestamps of the input window and of the prediction window.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # 0 workers -> loading runs on the main process
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # with a sliding window, batch sizes are used for background caching
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,  # given here as an absolute number of time steps
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:07:11,509][time_config][INFO] - Quick validation succeeded.
[2025-08-31 12:07:11,527][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:07:11,531][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 2346.97it/s]
[2025-08-31 12:07:11,556][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [376 268 347 519 469 ... 511  11 166 251 218], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 100
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch size: 6

In [19]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Each item has four parts: input window, prediction window, and the
# timestamps belonging to each of them; store them as 4-tuples.
batches = [
    (window, prediction, window_times, prediction_times)
    for window, prediction, window_times, prediction_times in tqdm(dataloader)
]

[2025-08-31 12:07:11,564][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4909.66it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` for the set's specified time period.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [20]:
# Config for the DataFrame examples; batch sizes are left out because they
# have no effect on `get_*_df`.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs on the main process
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:07:11,672][time_config][INFO] - Quick validation succeeded.
[2025-08-31 12:07:11,693][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:07:11,697][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 619.65it/s]
[2025-08-31 12:07:11,786][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [276   6 220  74 375 ... 498 305 287 404 341], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [21]:
# Train set as one combined DataFrame covering all selected time series ...
df = time_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,276.0,0.0,28781.0,2021649.0,2186308000.0,3853.0,11.27,20.35,7524.0,22.0,65.699997,11806.0,34.52,90.57,0.620117,0.620117,0.409912,0.439941,6.36,155.720001
1,276.0,1.0,40797.0,3747918.0,4064858000.0,4553.0,13.31,28.73,8057.0,23.559999,64.07,13967.0,40.84,113.059998,0.629883,0.620117,0.419922,0.459961,4.92,152.020004
2,276.0,2.0,36025.0,4813768.0,5537465000.0,5071.0,14.45,31.23,7813.0,22.26,56.68,15193.0,43.279999,127.629997,0.620117,0.609863,0.399902,0.439941,5.43,152.820007
3,276.0,3.0,29967.0,4957078.0,5261905000.0,4497.0,14.1,29.719999,6737.0,21.120001,47.509998,15665.0,49.110001,170.259995,0.600098,0.589844,0.449951,0.47998,7.76,151.039993
4,276.0,4.0,36506.0,5822564.0,5956184000.0,4526.0,14.89,32.880001,7175.0,23.6,59.259998,16540.0,54.41,207.389999,0.629883,0.629883,0.469971,0.509766,7.47,145.789993
5,276.0,5.0,33925.0,3561996.0,3465601000.0,4722.0,14.99,36.200001,7495.0,23.790001,67.699997,17332.0,55.02,220.830002,0.600098,0.589844,0.5,0.529785,8.71,149.699997
6,276.0,6.0,34048.0,3492589.0,3293400000.0,4858.0,14.72,35.16,8122.0,24.610001,72.18,18001.0,54.549999,219.160004,0.580078,0.569824,0.469971,0.509766,8.61,147.679993
7,276.0,7.0,36395.0,3906942.0,3573457000.0,4975.0,15.69,37.470001,9547.0,30.120001,88.059998,18654.0,58.849998,219.779999,0.600098,0.589844,0.47998,0.529785,7.12,142.649994
8,276.0,8.0,48565.0,5109799.0,5575270000.0,5398.0,16.66,38.939999,9750.0,30.09,88.959999,21958.0,67.769997,229.449997,0.580078,0.569824,0.449951,0.47998,7.76,140.910004
9,276.0,9.0,43844.0,5907165.0,5972146000.0,5364.0,17.08,40.009998,10225.0,32.560001,100.800003,20913.0,66.599998,240.160004,0.589844,0.580078,0.47998,0.509766,7.78,145.759995


In [22]:
# Every time series has its own DataFrame, so this count matches the number of selected time series.
len(dfs)

54

#### Val set

- Affected by `val_workers`.

In [23]:
# Validation set as one combined DataFrame covering all selected time series ...
df = time_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,276.0,3359.0,12018.0,6190226.0,5722170000.0,2997.0,7.72,12.86,6200.0,15.98,47.75,7212.0,18.59,38.049999,0.5,0.48999,0.48999,0.540039,13.18,139.130005
1,276.0,3360.0,11651.0,526005.0,524689200.0,3047.0,8.35,13.82,5683.0,15.57,41.93,7445.0,20.4,40.310001,0.509766,0.509766,0.469971,0.509766,12.74,134.979996
2,276.0,3361.0,11050.0,212721.0,154181100.0,3037.0,8.39,14.3,5118.0,14.14,36.299999,7491.0,20.690001,40.990002,0.47998,0.47998,0.47998,0.52002,13.62,135.610001
3,276.0,3362.0,41435.0,10310055.0,13637070000.0,3847.0,10.87,25.32,4785.0,13.52,34.509998,12272.0,34.669998,121.089996,0.48999,0.48999,0.47998,0.52002,12.94,139.029999
4,276.0,3363.0,36494.0,6217936.0,7861777000.0,4121.0,11.42,25.870001,5087.0,14.09,36.139999,12063.0,33.419998,99.360001,0.5,0.509766,0.469971,0.509766,11.59,140.550003
5,276.0,3364.0,21153.0,4828395.0,4984075000.0,3105.0,9.67,20.48,3900.0,12.15,27.200001,9015.0,28.08,86.25,0.5,0.5,0.5,0.529785,12.5,135.910004
6,276.0,3365.0,30206.0,5576627.0,5751360000.0,3409.0,10.89,25.59,4631.0,14.8,38.34,11555.0,36.919998,135.830002,0.47998,0.47998,0.47998,0.529785,13.59,136.639999
7,276.0,3366.0,53537.0,7719354.0,7681700000.0,4248.0,13.53,32.98,6120.0,19.49,63.720001,16149.0,51.43,201.419998,0.540039,0.540039,0.5,0.540039,13.33,134.649994
8,276.0,3367.0,50809.0,6132168.0,5767879000.0,4603.0,14.61,35.880001,7129.0,22.629999,71.75,17260.0,54.790001,218.350006,0.5,0.5,0.52002,0.549805,13.02,130.440002
9,276.0,3368.0,43609.0,4434510.0,4264395000.0,4722.0,15.18,37.919998,9482.0,30.49,122.279999,18062.0,58.080002,224.279999,0.509766,0.509766,0.52002,0.560059,14.15,137.470001


In [24]:
# Every time series has its own DataFrame, so this count matches the number of selected time series.
len(dfs)

54

#### Test set

- Affected by `test_workers`.

In [25]:
# Test set as one combined DataFrame covering all selected time series ...
df = time_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,276.0,5374.0,14849.0,595503.0,558578500.0,2748.0,8.64,12.44,8072.0,25.379999,69.720001,8002.0,25.16,46.48,0.580078,0.589844,0.449951,0.48999,10.35,134.039993
1,276.0,5375.0,13385.0,426950.0,390102600.0,2675.0,8.66,13.46,6706.0,21.700001,51.880001,8291.0,26.83,50.09,0.580078,0.589844,0.469971,0.52002,11.46,135.470001
2,276.0,5376.0,15818.0,397194.0,313189400.0,2661.0,8.45,12.84,8651.0,27.459999,74.830002,8041.0,25.530001,46.900002,0.600098,0.609863,0.459961,0.509766,10.45,136.229996
3,276.0,5377.0,16121.0,767519.0,622340600.0,2689.0,8.59,13.11,7913.0,25.280001,72.720001,8377.0,26.76,51.0,0.529785,0.540039,0.459961,0.509766,11.39,140.149994
4,276.0,5378.0,18814.0,1588849.0,1416302000.0,2827.0,9.85,18.049999,5459.0,19.02,41.23,9527.0,33.200001,91.279999,0.560059,0.569824,0.459961,0.5,10.02,136.179993
5,276.0,5379.0,29167.0,3954089.0,4060213000.0,3466.0,13.23,27.639999,5933.0,22.65,49.82,13598.0,51.900002,169.699997,0.569824,0.569824,0.459961,0.509766,9.45,128.649994
6,276.0,5380.0,41343.0,4736274.0,4759984000.0,4328.0,16.09,38.32,8019.0,29.809999,84.480003,17745.0,65.970001,227.610001,0.560059,0.569824,0.469971,0.509766,11.22,134.350006
7,276.0,5381.0,45564.0,4611370.0,4781336000.0,4611.0,15.96,35.57,10402.0,35.990002,119.410004,19961.0,69.07,244.389999,0.560059,0.560059,0.47998,0.529785,11.78,133.940002
8,276.0,5382.0,47512.0,3560149.0,3394246000.0,4815.0,16.780001,37.459999,10762.0,37.5,123.120003,21323.0,74.300003,250.710007,0.529785,0.540039,0.48999,0.52002,11.71,128.330002
9,276.0,5383.0,49884.0,5219957.0,5588411000.0,4684.0,16.209999,36.630001,11945.0,41.330002,147.630005,21238.0,73.489998,252.740005,0.540039,0.549805,0.47998,0.509766,11.88,132.570007


In [26]:
# Every time series has its own DataFrame, so this count matches the number of selected time series.
len(dfs)

54

#### All set

- Affected by `all_workers`.

In [27]:
# Full ("all") set as one combined DataFrame covering all selected time series ...
df = time_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,276.0,0.0,28781.0,2021649.0,2186308000.0,3853.0,11.27,20.35,7524.0,22.0,65.699997,11806.0,34.52,90.57,0.620117,0.620117,0.409912,0.439941,6.36,155.720001
1,276.0,1.0,40797.0,3747918.0,4064858000.0,4553.0,13.31,28.73,8057.0,23.559999,64.07,13967.0,40.84,113.059998,0.629883,0.620117,0.419922,0.459961,4.92,152.020004
2,276.0,2.0,36025.0,4813768.0,5537465000.0,5071.0,14.45,31.23,7813.0,22.26,56.68,15193.0,43.279999,127.629997,0.620117,0.609863,0.399902,0.439941,5.43,152.820007
3,276.0,3.0,29967.0,4957078.0,5261905000.0,4497.0,14.1,29.719999,6737.0,21.120001,47.509998,15665.0,49.110001,170.259995,0.600098,0.589844,0.449951,0.47998,7.76,151.039993
4,276.0,4.0,36506.0,5822564.0,5956184000.0,4526.0,14.89,32.880001,7175.0,23.6,59.259998,16540.0,54.41,207.389999,0.629883,0.629883,0.469971,0.509766,7.47,145.789993
5,276.0,5.0,33925.0,3561996.0,3465601000.0,4722.0,14.99,36.200001,7495.0,23.790001,67.699997,17332.0,55.02,220.830002,0.600098,0.589844,0.5,0.529785,8.71,149.699997
6,276.0,6.0,34048.0,3492589.0,3293400000.0,4858.0,14.72,35.16,8122.0,24.610001,72.18,18001.0,54.549999,219.160004,0.580078,0.569824,0.469971,0.509766,8.61,147.679993
7,276.0,7.0,36395.0,3906942.0,3573457000.0,4975.0,15.69,37.470001,9547.0,30.120001,88.059998,18654.0,58.849998,219.779999,0.600098,0.589844,0.47998,0.529785,7.12,142.649994
8,276.0,8.0,48565.0,5109799.0,5575270000.0,5398.0,16.66,38.939999,9750.0,30.09,88.959999,21958.0,67.769997,229.449997,0.580078,0.569824,0.449951,0.47998,7.76,140.910004
9,276.0,9.0,43844.0,5907165.0,5972146000.0,5364.0,17.08,40.009998,10225.0,32.560001,100.800003,20913.0,66.599998,240.160004,0.589844,0.580078,0.47998,0.509766,7.78,145.759995


In [28]:
# Every time series has its own DataFrame, so this count matches the number of selected time series.
len(dfs)

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` for the set's specified time period.
- Data is returned as one NumPy array.
- Follows similar rules to DataLoader batches regarding shape (excluding sliding window parameters).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [29]:
# Config for the NumPy examples; batch sizes are left out because they
# have no effect on `get_*_numpy`.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs on the main process
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:07:12,422][time_config][INFO] - Quick validation succeeded.
[2025-08-31 12:07:12,441][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:07:12,445][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 603.41it/s]
[2025-08-31 12:07:12,537][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [106 172 505 532  67 ...  10 476 355  44 206], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [30]:
# Load the whole train set as a single array.
# workers="config" presumably means "use `train_workers` from the config" - confirm in the docs.
numpy_array = time_based_dataset.get_train_numpy(workers="config")

# Shape reads as (time series, train time steps, columns); the 20 columns look like
# the 18 features plus the time series ID and the time ID - TODO confirm.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [31]:
# Load the whole validation set as a single array.
# workers="config" presumably means "use `val_workers` from the config" - confirm in the docs.
numpy_array = time_based_dataset.get_val_numpy(workers="config")

# Shape reads as (time series, val time steps, columns).
display(numpy_array.shape)

(54, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [32]:
# Load the whole test set as a single array.
# workers="config" presumably means "use `test_workers` from the config" - confirm in the docs.
numpy_array = time_based_dataset.get_test_numpy(workers="config")

# Shape reads as (time series, test time steps, columns).
display(numpy_array.shape)

(54, 1343, 20)

#### All set

- Affected by `all_workers`.

In [33]:
# Load the "all" set (the full configured time range) as a single array.
# workers="config" presumably means "use `all_workers` from the config" - confirm in the docs.
numpy_array = time_based_dataset.get_all_numpy(workers="config")

# Shape reads as (time series, all time steps, columns).
display(numpy_array.shape)

(54, 6717, 20)

#### Using time_format=TimeFormat.DATETIME

In [34]:
# Same split as the ID_TIME example (54 series, 50 % / 30 % / 20 %),
# but time is exposed as datetime objects via TimeFormat.DATETIME.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker counts of 0 keep all loading on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Re-initialize the dataset with the new config and print the resolved configuration.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:07:12,820][time_config][INFO] - Quick validation succeeded.
[2025-08-31 12:07:12,842][time_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:07:12,846][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1104.79it/s]
[2025-08-31 12:07:12,897][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [362 478 324 333 530 ... 149 494 269 527 252], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [35]:
# With TimeFormat.DATETIME the call is unpacked into two values: the data array
# and a separate array of datetime objects for the time axis.
numpy_array, times = time_based_dataset.get_train_numpy(workers="config")

# The data array has one column fewer than in the ID_TIME example (19 vs. 20),
# presumably because time is returned separately in `times` - TODO confirm.
display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)