# Loading data with TimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Configure the root logger: INFO and above, each record prefixed with
# timestamp, logger name and level.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

### Preparing dataset

In [3]:
# Open CESNET-TimeSeries24 as a time-based dataset: institution-subnet series
# at 1-hour aggregation. `display_details=True` prints the summary shown below.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.TIME_BASED,
    display_details=True,
)

[2025-09-06 17:10:01,212][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- Last batch is never dropped (unless sliding_window is used).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size affects how many time points of every time series will be in one batch (differs when sliding window is used).
- Batch consists of (only when sliding window is not used):
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`)

In [4]:
# 54 time series, time axis split 0.5 / 0.3 / 0.2 into train / val / test,
# all features, time reported as an integer id (TimeFormat.ID_TIME).
# Each set gets its own batch size; all loading runs on the main process.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:10:01,217][time_config][INFO] - Quick validation succeeded.
[2025-09-06 17:10:01,237][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:10:01,241][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 555.31it/s]
[2025-09-06 17:10:01,346][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [455 228 387 105 547 ... 361 304 211 369 366], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# The same overrides are passed to both entry points.
# NOTE(review): "config" presumably means "keep the value already stored in
# the config" - confirm against the library documentation.
batch_size_overrides = dict(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

time_based_dataset.update_dataset_config_and_initialize(**batch_size_overrides)
# Or
time_based_dataset.set_batch_sizes(**batch_size_overrides)

[2025-09-06 17:10:01,351][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:10:01,351][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:10:01,352][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:10:01,352][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:10:01,352][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# The same worker counts are passed to both entry points
# (0 = load on the main process).
worker_overrides = dict(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

time_based_dataset.update_dataset_config_and_initialize(**worker_overrides)
# Or
time_based_dataset.set_workers(**worker_overrides)

[2025-09-06 17:10:01,356][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:10:01,357][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:10:01,357][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:10:01,358][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:10:01,358][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Walk the train dataloader once (with a progress bar) and keep every batch.
dataloader = time_based_dataset.get_train_dataloader(workers="config")
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
first_batch = batches[0]
display(first_batch.shape)

[2025-09-06 17:10:01,367][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 213.25it/s]


(54, 32, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Walk the val dataloader once (with a progress bar) and keep every batch.
dataloader = time_based_dataset.get_val_dataloader(workers="config")
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
first_batch = batches[0]
display(first_batch.shape)

[2025-09-06 17:10:01,871][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 32/32 [00:00<00:00, 141.88it/s]


(54, 64, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Walk the test dataloader once (with a progress bar) and keep every batch.
dataloader = time_based_dataset.get_test_dataloader(workers="config")
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
first_batch = batches[0]
display(first_batch.shape)

[2025-09-06 17:10:02,110][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 99.77it/s]


(54, 128, 20)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Walk the "all" dataloader once (with a progress bar) and keep every batch.
dataloader = time_based_dataset.get_all_dataloader(workers="config")
batches = [batch for batch in tqdm(dataloader)]

# Shape of the first batch: (ts_ids, batch_size, features_to_take + used ids).
first_batch = batches[0]
display(first_batch.shape)

[2025-09-06 17:10:02,232][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 53/53 [00:00<00:00, 192.64it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but time is requested as datetimes
# (TimeFormat.DATETIME), so batches become (data, times) tuples.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:10:02,515][time_config][INFO] - Quick validation succeeded.
[2025-09-06 17:10:02,538][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:10:02,541][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 783.67it/s]
[2025-09-06 17:10:02,612][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [180 358 297  51 525 ... 382 254 292 164  22], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [12]:
# With TimeFormat.DATETIME every batch is a (data, times) pair.
dataloader = time_based_dataset.get_train_dataloader(workers="config")
batches = [batch for batch in tqdm(dataloader)]

first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-06 17:10:02,620][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 227.24it/s]


(54, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, all configured time series are returned, as in the previous examples.
    - When `ts_id` is not None, only the single time series with that id is returned.

In [13]:
# Select four explicit time series ids instead of a count, so a single one
# can be requested by id from the dataloader afterwards.
config = TimeBasedConfig(
    ts_ids=[177, 176, 319, 267],
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:10:03,093][time_config][INFO] - Quick validation succeeded.
[2025-09-06 17:10:03,113][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:10:03,116][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 4/4 [00:00<00:00, 469.98it/s]
[2025-09-06 17:10:03,128][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [177 176 319 267], Length=4
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Slidin

In [14]:
# Restrict the train dataloader to the single time series with id 177.
dataloader = time_based_dataset.get_train_dataloader(ts_id=177, workers="config")
batches = [batch for batch in tqdm(dataloader)]

# The leading (time series) axis of each batch now has length 1.
first_batch = batches[0]
display(first_batch.shape)

[2025-09-06 17:10:03,137][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 890.62it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify the sliding window step size with `sliding_window_step`.
- You can use `set_shared_size` to set how many time points adjacent time periods should share.
    - `val_time_period` takes from `train_time_period`
    - `test_time_period` takes from `val_time_period` or `train_time_period`

In [15]:
# Sliding-window setup on a single feature: windows of 22 time points,
# 2 points to predict, window advanced by 2. Explicit ranges define the
# train/val/test periods; `set_shared_size` is given as a fraction here.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=0.05,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:10:03,264][time_config][INFO] - Quick validation succeeded.
[2025-09-06 17:10:03,283][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:10:03,287][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1683.26it/s]
[2025-09-06 17:10:03,322][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [203 349  52  36 368 ... 419  53 195 332 457], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 335
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch size: 64

In [16]:
# Each item yielded by the dataloader unpacks into (window, prediction target).
dataloader = time_based_dataset.get_train_dataloader(workers="config")
batches = [
    (window, prediction)
    for window, prediction in tqdm(dataloader)
]

[2025-09-06 17:10:03,331][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4271.21it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [17]:
# The same sliding-window overrides are passed to both entry points.
# NOTE(review): "config" presumably means "keep the value already stored in
# the config" - confirm against the library documentation.
sliding_window_overrides = dict(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

time_based_dataset.update_dataset_config_and_initialize(**sliding_window_overrides)
# Or
time_based_dataset.set_sliding_window(**sliding_window_overrides)

[2025-09-06 17:10:03,453][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:10:03,453][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-09-06 17:10:03,454][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:10:03,454][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:10:03,454][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:10:03,455][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [18]:
# Sliding-window setup with datetimes: same window parameters as before,
# time requested as TimeFormat.DATETIME, and `set_shared_size` given as an
# absolute number of time points.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:10:03,460][time_config][INFO] - Quick validation succeeded.
[2025-09-06 17:10:03,480][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:10:03,484][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1989.64it/s]
[2025-09-06 17:10:03,513][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [385 123  64 337 531 ... 534 204 116   3 402], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 100
    Fillers
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type: None        
    Batch sizes
        Train batch size: 32
        Val batch size: 6

In [19]:
# With DATETIME each item unpacks into four parts: window data, prediction
# data, window timestamps and prediction timestamps.
dataloader = time_based_dataset.get_train_dataloader(workers="config")
batches = [
    (window, prediction, window_times, prediction_times)
    for window, prediction, window_times, prediction_times in tqdm(dataloader)
]

[2025-09-06 17:10:03,520][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4617.63it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` for the set's specified time period.
- Data is returned as a Pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [20]:
# Config for the DataFrame examples: no batch sizes or sliding window are
# passed, since neither affects `get_*_df`.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:10:03,634][time_config][INFO] - Quick validation succeeded.
[2025-09-06 17:10:03,654][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:10:03,657][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 551.79it/s]
[2025-09-06 17:10:03,760][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [296 536 152 240 294 ... 266 125 329 342 463], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [21]:
# Load the train set twice: once as a single DataFrame (`df`) and once with
# `as_single_dataframe=False` (`dfs`, a collection with one entry per time series).
df = time_based_dataset.get_train_df(as_single_dataframe=True, workers="config")
dfs = time_based_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,296.0,0.0,114.0,346.0,46193.0,68.0,5.67,2.39,77.0,6.42,2.84,105.0,8.75,4.69,0.709961,0.740234,0.52002,0.569824,3.81,221.100006
1,296.0,1.0,88.0,296.0,44292.0,54.0,5.4,1.43,67.0,6.7,2.36,77.0,7.7,2.67,0.930176,0.959961,0.389893,0.370117,0.45,195.020004
2,296.0,2.0,209.0,198277.0,202956643.0,88.0,8.0,4.98,67.0,6.09,3.36,161.0,14.64,11.9,0.810059,0.810059,0.459961,0.330078,11.65,165.199997
3,296.0,3.0,233.0,105690.0,108660925.0,112.0,9.33,6.08,77.0,6.42,3.18,190.0,15.83,13.51,0.959961,0.950195,0.5,0.399902,21.450001,168.339996
4,296.0,4.0,328.0,123855.0,73151784.0,117.0,9.75,8.31,53.0,4.42,2.54,286.0,23.83,25.33,0.879883,0.850098,0.529785,0.469971,30.459999,165.199997
5,296.0,5.0,188.0,6194.0,2185993.0,102.0,8.5,9.13,52.0,4.33,2.19,160.0,13.33,15.13,0.870117,0.870117,0.47998,0.449951,33.240002,168.929993
6,296.0,6.0,217.0,5212.0,2787605.0,106.0,9.64,6.52,64.0,5.82,2.44,188.0,17.09,13.37,0.879883,0.870117,0.52002,0.379883,27.360001,168.770004
7,296.0,7.0,247.0,63500.0,66028114.0,114.0,11.4,9.65,63.0,6.3,2.95,204.0,20.4,20.34,0.959961,0.959961,0.47998,0.379883,38.880001,163.399994
8,296.0,8.0,205.0,10951.0,9133721.0,96.0,8.73,4.96,54.0,4.91,1.7,164.0,14.91,9.83,0.859863,0.859863,0.449951,0.340088,42.48,157.410004
9,296.0,9.0,164.0,5662.0,3360434.0,80.0,6.67,4.64,53.0,4.42,2.02,141.0,11.75,10.0,0.930176,0.910156,0.409912,0.27002,30.93,151.229996


In [22]:
len(dfs)  # one DataFrame per selected time series

54

#### Val set

- Affected by `val_workers`.

In [23]:
# Load the val set twice: once as a single DataFrame (`df`) and once with
# `as_single_dataframe=False` (`dfs`, a collection with one entry per time series).
df = time_based_dataset.get_val_df(as_single_dataframe=True, workers="config")
dfs = time_based_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,296.0,3359.0,48.0,188.0,41576.0,32.0,3.56,1.13,37.0,4.11,1.76,43.0,4.78,2.99,0.870117,0.910156,0.340088,0.330078,1.04,198.880005
1,296.0,3360.0,73.0,278.0,65302.0,38.0,3.8,1.23,47.0,4.7,0.95,56.0,5.6,1.65,0.779785,0.740234,0.429932,0.449951,0.95,193.559998
2,296.0,3361.0,79.0,259.0,62315.0,39.0,3.55,1.57,38.0,3.45,1.69,74.0,6.73,3.52,0.409912,0.459961,0.469971,0.439941,6.74,215.669998
3,296.0,3362.0,55.0,247.0,67560.0,29.0,3.62,0.74,31.0,3.88,1.89,46.0,5.75,5.42,0.640137,0.720215,0.399902,0.370117,6.93,194.929993
4,296.0,3363.0,166.0,16271.0,18533414.0,42.0,4.2,2.49,36.0,3.6,0.97,102.0,10.2,7.8,0.459961,0.48999,0.48999,0.449951,13.25,217.389999
5,296.0,3364.0,233.0,20739.0,21627230.0,92.0,7.67,5.52,51.0,4.25,1.86,177.0,14.75,12.39,0.680176,0.689941,0.509766,0.389893,18.719999,184.589996
6,296.0,3365.0,241.0,15523.0,15108917.0,81.0,6.75,7.46,45.0,3.75,2.6,202.0,16.83,17.77,0.5,0.52002,0.439941,0.389893,9.21,212.020004
7,296.0,3366.0,58.0,115.0,23522.0,26.0,2.89,1.45,28.0,3.11,1.62,54.0,6.0,2.69,0.439941,0.549805,0.529785,0.469971,7.48,211.220001
8,296.0,3367.0,78.0,165.0,23173.0,43.0,3.58,1.62,38.0,3.17,1.47,76.0,6.33,3.55,0.439941,0.48999,0.540039,0.509766,6.08,215.220001
9,296.0,3368.0,101.0,235.0,32459.0,38.0,3.45,1.44,35.0,3.18,1.4,93.0,8.45,5.11,0.449951,0.560059,0.52002,0.589844,5.45,225.199997


In [24]:
len(dfs)  # one DataFrame per selected time series

54

#### Test set

- Affected by `test_workers`.

In [25]:
# Load the test set twice: once as a single DataFrame (`df`) and once with
# `as_single_dataframe=False` (`dfs`, a collection with one entry per time series).
df = time_based_dataset.get_test_df(as_single_dataframe=True, workers="config")
dfs = time_based_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,296.0,5374.0,72.0,391.0,78040.0,47.0,4.7,3.43,56.0,5.6,4.03,61.0,6.1,3.6,0.689941,0.689941,0.449951,0.48999,16.35,180.820007
1,296.0,5375.0,48.0,283537.0,14822622.0,34.0,3.09,1.45,44.0,4.0,2.45,41.0,3.73,1.74,0.77002,0.77002,0.449951,0.5,28.43,177.759995
2,296.0,5376.0,58.0,301.0,63798.0,47.0,4.27,1.9,51.0,4.64,2.2,56.0,5.09,2.02,0.839844,0.830078,0.399902,0.5,4.8,159.279999
3,296.0,5377.0,42.0,229.0,59835.0,35.0,4.38,1.85,38.0,4.75,2.12,38.0,4.75,1.83,0.870117,0.839844,0.48999,0.52002,14.68,168.610001
4,296.0,5378.0,172.0,97041.0,105969541.0,73.0,8.11,3.95,53.0,5.89,2.03,125.0,13.89,11.92,0.970215,0.970215,0.330078,0.25,23.26,148.119995
5,296.0,5379.0,127.0,5331.0,5106090.0,80.0,7.27,4.5,60.0,5.45,1.37,116.0,10.55,7.12,0.950195,0.950195,0.350098,0.320068,22.76,155.149994
6,296.0,5380.0,248.0,19999.0,19527959.0,80.0,7.27,5.41,51.0,4.64,2.69,199.0,18.09,18.68,0.810059,0.819824,0.370117,0.27002,17.030001,163.789993
7,296.0,5381.0,337.0,38334.0,35503114.0,111.0,10.09,7.05,77.0,7.0,5.85,274.0,24.91,21.77,0.75,0.790039,0.47998,0.360107,29.74,151.729996
8,296.0,5382.0,272.0,15220.0,8109328.0,103.0,9.36,7.76,60.0,5.45,3.59,222.0,20.18,19.559999,0.910156,0.919922,0.509766,0.459961,27.51,157.740005
9,296.0,5383.0,244.0,121925.0,66358228.0,102.0,11.33,6.98,49.0,5.44,2.55,215.0,23.889999,17.57,0.859863,0.850098,0.5,0.419922,55.599998,143.470001


In [26]:
len(dfs)  # one DataFrame per selected time series

54

#### All set

- Affected by `all_workers`.

In [27]:
# Load the "all" set twice: once as a single DataFrame (`df`) and once with
# `as_single_dataframe=False` (`dfs`, a collection with one entry per time series).
df = time_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
dfs = time_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,296.0,0.0,114.0,346.0,46193.0,68.0,5.67,2.39,77.0,6.42,2.84,105.0,8.75,4.69,0.709961,0.740234,0.52002,0.569824,3.81,221.100006
1,296.0,1.0,88.0,296.0,44292.0,54.0,5.4,1.43,67.0,6.7,2.36,77.0,7.7,2.67,0.930176,0.959961,0.389893,0.370117,0.45,195.020004
2,296.0,2.0,209.0,198277.0,202956643.0,88.0,8.0,4.98,67.0,6.09,3.36,161.0,14.64,11.9,0.810059,0.810059,0.459961,0.330078,11.65,165.199997
3,296.0,3.0,233.0,105690.0,108660925.0,112.0,9.33,6.08,77.0,6.42,3.18,190.0,15.83,13.51,0.959961,0.950195,0.5,0.399902,21.450001,168.339996
4,296.0,4.0,328.0,123855.0,73151784.0,117.0,9.75,8.31,53.0,4.42,2.54,286.0,23.83,25.33,0.879883,0.850098,0.529785,0.469971,30.459999,165.199997
5,296.0,5.0,188.0,6194.0,2185993.0,102.0,8.5,9.13,52.0,4.33,2.19,160.0,13.33,15.13,0.870117,0.870117,0.47998,0.449951,33.240002,168.929993
6,296.0,6.0,217.0,5212.0,2787605.0,106.0,9.64,6.52,64.0,5.82,2.44,188.0,17.09,13.37,0.879883,0.870117,0.52002,0.379883,27.360001,168.770004
7,296.0,7.0,247.0,63500.0,66028114.0,114.0,11.4,9.65,63.0,6.3,2.95,204.0,20.4,20.34,0.959961,0.959961,0.47998,0.379883,38.880001,163.399994
8,296.0,8.0,205.0,10951.0,9133721.0,96.0,8.73,4.96,54.0,4.91,1.7,164.0,14.91,9.83,0.859863,0.859863,0.449951,0.340088,42.48,157.410004
9,296.0,9.0,164.0,5662.0,3360434.0,80.0,6.67,4.64,53.0,4.42,2.02,141.0,11.75,10.0,0.930176,0.910156,0.409912,0.27002,30.93,151.229996


In [28]:
len(dfs)  # one DataFrame per selected time series

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids` for the set's specified time period.
- Data is returned as one Numpy array.
- Follows similar rules to Dataloader batches, regarding shape (excluding sliding window parameters).
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [29]:
# Config for the Numpy examples: no batch sizes or sliding window are
# passed, since neither affects `get_*_numpy`.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:10:04,401][time_config][INFO] - Quick validation succeeded.
[2025-09-06 17:10:04,421][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:10:04,425][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 622.04it/s]
[2025-09-06 17:10:04,514][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [341 186 144 271 177 ... 140 520 329 330  51], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [30]:
# Load the whole train set as one array, using the worker count from the config.
numpy_array = time_based_dataset.get_train_numpy(workers="config")

# Axes follow the dataloader batch layout: (ts_ids, time points, features + used ids).
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [31]:
# Load the whole val set as one array, using the worker count from the config.
numpy_array = time_based_dataset.get_val_numpy(workers="config")

# Axes follow the dataloader batch layout: (ts_ids, time points, features + used ids).
display(numpy_array.shape)

(54, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [32]:
# Load the whole test set as one array, using the worker count from the config.
numpy_array = time_based_dataset.get_test_numpy(workers="config")

# Axes follow the dataloader batch layout: (ts_ids, time points, features + used ids).
display(numpy_array.shape)

(54, 1343, 20)

#### All set

- Affected by `all_workers`.

In [33]:
# Load the whole "all" set as one array, using the worker count from the config.
numpy_array = time_based_dataset.get_all_numpy(workers="config")

# Axes follow the dataloader batch layout: (ts_ids, time points, features + used ids).
display(numpy_array.shape)

(54, 6717, 20)

#### Using time_format=TimeFormat.DATETIME

In [34]:
# Same Numpy setup, but with TimeFormat.DATETIME so that time is returned
# separately from the data array.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:10:04,810][time_config][INFO] - Quick validation succeeded.
[2025-09-06 17:10:04,833][time_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:10:04,837][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 54/54 [00:00<00:00, 1089.83it/s]
[2025-09-06 17:10:04,889][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [125 160 111 286 375 ... 366 374 246 156  22], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [35]:
# With TimeFormat.DATETIME the call returns a pair: the data array (without
# the time column) and a separate array of datetimes.
numpy_array, times = time_based_dataset.get_train_numpy(workers="config")

display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)