# Loading data with TimeBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Root logger: emit INFO and above, prefixed with timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Time-based view of CESNET-TimeSeries24: institution subnets, aggregated per hour.
# "/some_directory/" is a placeholder; replace it with your own data directory.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.TIME_BASED,
    display_details=True,
)

[2025-11-14 18:43:56,418][cesnet_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped (unless a sliding window is used).
- Workers affect how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size affects how many time steps of every time series are in one batch (this differs when a sliding window is used).
- A batch consists of the following (only when a sliding window is not used):
    - When `time_format` is not TimeFormat.DATETIME, the batch is one Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, the batch is a tuple: (Numpy array of shape `(ts_ids, batch_size, features_to_take + used ids (without time))`, Numpy array of shape `(batch_size)`).

In [4]:
# An integer `ts_ids` selects that many time series (the printed config lists 54 ids).
# Float time periods split the available time range into train/val/test fractions.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    random_state=111,
    # One batch size per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details="text",
)

[2025-11-14 18:43:56,425][time_config][INFO] - Quick validation succeeded.
[2025-11-14 18:43:56,442][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
[2025-11-14 18:43:56,443][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 479.76it/s]
[2025-11-14 18:43:56,566][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the general updater. Passing "config" leaves that value as currently configured.
time_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Option 2: the dedicated setter, called with the same values.
time_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-11-14 18:43:56,575][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:43:56,576][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:43:56,577][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: the general updater.
time_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Option 2: the dedicated setter, called with the same values.
time_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-11-14 18:43:56,583][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:43:56,585][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:43:56,585][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Walk the loader once and keep every batch so the first one can be inspected.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-11-14 18:43:56,597][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 102/102 [00:00<00:00, 192.68it/s]


(54, 33, 20)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
dataloader = time_based_dataset.get_val_dataloader(workers="config")

# Walk the loader once and keep every batch so the first one can be inspected.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-11-14 18:43:57,141][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 31/31 [00:00<00:00, 135.71it/s]


(54, 65, 20)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
dataloader = time_based_dataset.get_test_dataloader(workers="config")

# Walk the loader once and keep every batch so the first one can be inspected.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-11-14 18:43:57,382][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 11/11 [00:00<00:00, 108.69it/s]


(54, 128, 20)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
dataloader = time_based_dataset.get_all_dataloader(workers="config")

# Walk the loader once and keep every batch so the first one can be inspected.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-11-14 18:43:57,496][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 53/53 [00:00<00:00, 173.72it/s]


(54, 128, 20)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, except that time is requested as datetimes.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    random_state=111,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details="text",
)

[2025-11-14 18:43:57,809][time_config][INFO] - Quick validation succeeded.
[2025-11-14 18:43:57,830][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
[2025-11-14 18:43:57,830][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 1534.71it/s]
[2025-11-14 18:43:57,935][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [12]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME every batch is a pair: (data, times).
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch[0].shape) # data without time
display(first_batch[1].shape) # time

[2025-11-14 18:43:57,958][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 223.33it/s]


(54, 32, 19)

(32,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader returns all time series, as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the single time series with the specified id.

In [13]:
# An explicit list of ids selects exactly those time series.
config = TimeBasedConfig(
    ts_ids=[177, 176, 319, 267],
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    random_state=111,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details="text",
)

[2025-11-14 18:43:58,441][time_config][INFO] - Quick validation succeeded.
[2025-11-14 18:43:58,457][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
[2025-11-14 18:43:58,458][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 4/4 [00:00<00:00, 666.74it/s]
[2025-11-14 18:43:58,470][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [177 176 319 267], Length=4
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Slidin

In [14]:
# Restrict the loader to one of the configured series; the first axis of each batch becomes 1.
dataloader = time_based_dataset.get_train_dataloader(ts_id=177, workers="config")

batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-11-14 18:43:58,479][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 105/105 [00:00<00:00, 1975.69it/s]


(1, 32, 20)

#### Sliding window

- When `sliding_window_prediction_size` is set then `sliding_window_size` must be set too if you want to use sliding window.
- Batch sizes are used for background caching.
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids)`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids)` <br>
    ).
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: <br>
    ( <br>
        Numpy array of shape `(ts_ids, sliding_window_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(ts_ids, sliding_window_prediction_size, features_to_take + used ids (without time))`, <br>
        Numpy array of shape `(sliding_window_size)` of time, <br>
        Numpy array of shape `(sliding_window_prediction_size)` of time, <br>
    ).
- You can modify the sliding window step size with `sliding_window_step`.
- You can use `set_shared_size` to set how many time steps adjacent time periods should share.
    - `val_time_period` takes the shared time steps from `train_time_period`.
    - `test_time_period` takes the shared time steps from `val_time_period` or `train_time_period`.

In [15]:
# Explicit index ranges for the time periods and a single feature.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(1000, 1500),
    test_time_period=range(1500, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    random_state=111,
    # Sliding window: 22 input steps, 2 predicted steps, window moved by 2 steps.
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    # Given as a float here; the printed config reports it as 335 time steps.
    set_shared_size=0.05,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details="text",
)

[2025-11-14 18:43:58,540][time_config][INFO] - Quick validation succeeded.
[2025-11-14 18:43:58,556][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
[2025-11-14 18:43:58,557][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 4692.98it/s]
[2025-11-14 18:43:58,572][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(665, 1500)
        Test time periods: range(1165, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 335
    Fillers
        Filler type: NoFiller
    Transformers
        Transformer type: NoTransformer
    Anomaly handler
        Anomaly handler type: NoAnomalyHandler        
    Batch sizes
        Train batch size: 32
 

In [16]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# Each item is a pair: the input window and the window to predict.
batches = [
    (sliding_window, sliding_window_prediction)
    for sliding_window, sliding_window_prediction in tqdm(dataloader)
]

[2025-11-14 18:43:58,582][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 3497.72it/s]


You can also change sliding window parameters later with `update_dataset_config_and_initialize` or `set_sliding_window`.

In [17]:
# Option 1: the general updater. Passing "config" leaves that value as currently configured.
time_based_dataset.update_dataset_config_and_initialize(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)
# Option 2: the dedicated setter, called with the same values.
time_based_dataset.set_sliding_window(
    sliding_window_size=22,
    sliding_window_prediction_size=3,
    sliding_window_step="config",
    set_shared_size="config",
    workers=0,
)

[2025-11-14 18:43:58,733][cesnet_dataset][INFO] - Destroyed cached train_dataloader.
[2025-11-14 18:43:58,733][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:43:58,736][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:43:58,737][cesnet_dataset][INFO] - Sliding window values has been changed successfuly.


##### Using time_format=TimeFormat.DATETIME

In [18]:
# Sliding-window setup with datetimes as the time format and an integer shared size.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=range(0, 1000),
    val_time_period=range(978, 1500),
    test_time_period=range(1478, 2000),
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    random_state=111,
    sliding_window_size=22,
    sliding_window_prediction_size=2,
    sliding_window_step=2,
    set_shared_size=100,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details="text",
)

[2025-11-14 18:43:58,743][time_config][INFO] - Quick validation succeeded.
[2025-11-14 18:43:58,760][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
[2025-11-14 18:43:58,760][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 5363.43it/s]
[2025-11-14 18:43:58,779][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
    Time periods
        Train time periods: range(0, 1000)
        Val time periods: range(900, 1500)
        Test time periods: range(1400, 2000)
        All time periods: range(0, 2000)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Sliding window
        Sliding window size: 22
        Sliding window prediction size: 2
        Sliding window step size: 2
        Set shared size: 100
    Fillers
        Filler type: NoFiller
    Transformers
        Transformer type: NoTransformer
    Anomaly handler
        Anomaly handler type: NoAnomalyHandler        
    Batch sizes
        Train batch size: 32


In [19]:
dataloader = time_based_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME each item has four parts: both windows, then the times of each.
batches = [
    (sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times)
    for sliding_window, sliding_window_prediction, sliding_window_times, sliding_window_prediction_times
    in tqdm(dataloader)
]

[2025-11-14 18:43:58,794][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 489/489 [00:00<00:00, 4734.24it/s]


### Loading data as Dataframe

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids`, restricted to the time period of the given set.
- Data is returned as a Pandas DataFrame.
- Workers affect how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs on the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [20]:
# No batch sizes are passed: they have no effect when loading DataFrames.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    random_state=111,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details="text",
)

[2025-11-14 18:43:58,905][time_config][INFO] - Quick validation succeeded.
[2025-11-14 18:43:58,925][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
[2025-11-14 18:43:58,926][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 811.31it/s]
[2025-11-14 18:43:58,996][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [21]:
# Train period of all selected series in one DataFrame...
df = time_based_dataset.get_train_df(workers="config", as_single_dataframe=True)
# ...and the same data as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_train_df(workers="config", as_single_dataframe=False)

df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,54.0,0.0,217.0,455.0,43446.0,157.0,13.08,6.53,83.0,6.92,3.18,209.0,17.42,9.84,0.469971,0.389893,0.700195,0.72998,5.31,92.480003
1,54.0,1.0,226.0,609.0,56118.0,167.0,13.92,7.35,72.0,6.0,2.86,215.0,17.92,11.74,0.360107,0.310059,0.700195,0.720215,19.93,80.010002
2,54.0,2.0,230.0,827.0,94466.0,163.0,13.58,5.87,89.0,7.42,4.81,209.0,17.42,8.52,0.52002,0.509766,0.720215,0.759766,15.01,78.870003
3,54.0,3.0,216.0,684.0,75534.0,159.0,13.25,5.26,89.0,7.42,5.57,202.0,16.83,7.0,0.429932,0.409912,0.740234,0.77002,7.61,83.330002
4,54.0,4.0,184.0,601.0,66754.0,144.0,12.0,4.75,68.0,5.67,4.23,177.0,14.75,6.03,0.5,0.48999,0.660156,0.669922,9.73,70.360001
5,54.0,5.0,160.0,566.0,61906.0,127.0,10.58,3.09,69.0,5.75,4.09,152.0,12.67,3.55,0.389893,0.389893,0.759766,0.77002,11.68,89.139999
6,54.0,6.0,111.0,141.0,9620.0,91.0,7.58,4.36,46.0,3.83,1.64,108.0,9.0,6.08,0.48999,0.429932,0.689941,0.700195,5.24,92.489998
7,54.0,7.0,131.0,369.0,40961.0,106.0,8.83,5.04,46.0,3.83,2.44,122.0,10.17,6.41,0.429932,0.439941,0.72998,0.740234,9.02,85.360001
8,54.0,8.0,176.0,550.0,57364.0,123.0,10.25,4.27,78.0,6.5,4.52,165.0,13.75,6.06,0.409912,0.379883,0.689941,0.75,10.89,88.709999
9,54.0,9.0,157.0,582.0,65721.0,117.0,9.75,3.79,67.0,5.58,4.94,147.0,12.25,5.67,0.429932,0.449951,0.839844,0.850098,18.540001,73.389999


In [22]:
len(dfs) # one DataFrame per time series (as_single_dataframe=False), so this counts the selected series

54

#### Val set

- Affected by `val_workers`.

In [23]:
# Validation period of all selected series in one DataFrame...
df = time_based_dataset.get_val_df(workers="config", as_single_dataframe=True)
# ...and the same data as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_val_df(workers="config", as_single_dataframe=False)

df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,54.0,3359.0,240.0,2320.0,368262.0,109.0,4.54,2.62,227.0,9.46,10.7,118.0,4.92,3.06,0.930176,0.910156,0.540039,0.549805,3.76,148.820007
1,54.0,3360.0,283.0,2445.0,388451.0,131.0,4.68,3.06,264.0,9.43,11.86,143.0,5.11,3.44,0.890137,0.879883,0.5,0.509766,1.5,148.910004
2,54.0,3361.0,237.0,2001.0,314172.0,105.0,4.38,1.58,227.0,9.46,9.35,120.0,5.0,1.98,0.950195,0.950195,0.459961,0.469971,1.62,139.240005
3,54.0,3362.0,242.0,2175.0,331634.0,109.0,4.19,2.3,220.0,8.46,8.31,128.0,4.92,3.05,0.879883,0.870117,0.5,0.509766,5.89,138.419998
4,54.0,3363.0,297.0,2510.0,377529.0,125.0,5.0,2.33,272.0,10.88,9.39,169.0,6.76,4.42,0.930176,0.919922,0.509766,0.509766,5.1,142.559998
5,54.0,3364.0,229.0,1947.0,297658.0,92.0,4.18,2.22,196.0,8.91,8.23,124.0,5.64,3.67,0.930176,0.939941,0.509766,0.509766,5.19,140.229996
6,54.0,3365.0,175.0,1789.0,292869.0,69.0,3.0,1.57,170.0,7.39,8.95,90.0,3.91,2.61,0.890137,0.870117,0.669922,0.680176,3.83,140.520004
7,54.0,3366.0,176.0,1871.0,294211.0,79.0,3.76,2.59,173.0,8.24,9.5,92.0,4.38,3.28,0.950195,0.919922,0.469971,0.449951,3.54,143.270004
8,54.0,3367.0,189.0,1761.0,275636.0,95.0,3.96,2.39,180.0,7.5,8.11,108.0,4.5,3.19,0.959961,0.959961,0.569824,0.580078,2.57,145.389999
9,54.0,3368.0,164.0,1456.0,224391.0,87.0,3.48,1.73,160.0,6.4,6.79,96.0,3.84,2.19,0.890137,0.859863,0.560059,0.560059,6.08,152.690002


In [24]:
len(dfs) # one DataFrame per time series (as_single_dataframe=False), so this counts the selected series

54

#### Test set

- Affected by `test_workers`.

In [25]:
# Test period of all selected series in one DataFrame...
df = time_based_dataset.get_test_df(workers="config", as_single_dataframe=True)
# ...and the same data as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_test_df(workers="config", as_single_dataframe=False)

df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,54.0,5374.0,132.0,497.0,54132.0,93.0,4.23,2.94,124.0,5.64,4.71,118.0,5.36,4.17,0.879883,0.850098,0.48999,0.439941,4.21,156.740005
1,54.0,5375.0,74.0,166.0,19600.0,64.0,2.78,1.0,73.0,3.17,1.3,72.0,3.13,1.39,0.899902,0.879883,0.540039,0.529785,1.86,160.729996
2,54.0,5376.0,144.0,722.0,87527.0,90.0,3.91,2.23,123.0,5.35,4.13,124.0,5.39,3.96,0.870117,0.850098,0.409912,0.399902,8.23,137.860001
3,54.0,5377.0,100.0,516.0,69846.0,80.0,4.71,3.51,93.0,5.47,4.29,92.0,5.41,4.2,0.939941,0.950195,0.549805,0.569824,5.57,160.270004
4,54.0,5378.0,74.0,230.0,26801.0,57.0,2.85,1.95,67.0,3.35,2.48,67.0,3.35,2.46,0.890137,0.870117,0.52002,0.509766,0.85,152.160004
5,54.0,5379.0,86.0,204.0,17061.0,55.0,2.62,1.32,74.0,3.52,2.32,74.0,3.52,1.86,0.740234,0.72998,0.379883,0.350098,1.8,136.820007
6,54.0,5380.0,71.0,141.0,17661.0,55.0,3.06,1.35,62.0,3.44,1.54,66.0,3.67,1.68,0.810059,0.790039,0.5,0.5,0.82,141.940002
7,54.0,5381.0,55.0,113.0,9001.0,46.0,2.88,1.67,49.0,3.06,1.73,51.0,3.19,1.68,0.77002,0.740234,0.549805,0.529785,0.74,155.699997
8,54.0,5382.0,91.0,450.0,55370.0,75.0,3.57,2.09,87.0,4.14,3.21,82.0,3.9,2.61,0.839844,0.799805,0.48999,0.47998,6.08,131.360001
9,54.0,5383.0,61.0,203.0,20522.0,49.0,2.58,1.61,58.0,3.05,1.84,58.0,3.05,2.04,0.910156,0.879883,0.48999,0.469971,3.21,158.960007


In [26]:
len(dfs) # one DataFrame per time series (as_single_dataframe=False), so this counts the selected series

54

#### All set

- Affected by `all_workers`.

In [27]:
# Full ("all") period of all selected series in one DataFrame...
df = time_based_dataset.get_all_df(workers="config", as_single_dataframe=True)
# ...and the same data as a collection with one DataFrame per time series.
dfs = time_based_dataset.get_all_df(workers="config", as_single_dataframe=False)

df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,54.0,0.0,217.0,455.0,43446.0,157.0,13.08,6.53,83.0,6.92,3.18,209.0,17.42,9.84,0.469971,0.389893,0.700195,0.72998,5.31,92.480003
1,54.0,1.0,226.0,609.0,56118.0,167.0,13.92,7.35,72.0,6.0,2.86,215.0,17.92,11.74,0.360107,0.310059,0.700195,0.720215,19.93,80.010002
2,54.0,2.0,230.0,827.0,94466.0,163.0,13.58,5.87,89.0,7.42,4.81,209.0,17.42,8.52,0.52002,0.509766,0.720215,0.759766,15.01,78.870003
3,54.0,3.0,216.0,684.0,75534.0,159.0,13.25,5.26,89.0,7.42,5.57,202.0,16.83,7.0,0.429932,0.409912,0.740234,0.77002,7.61,83.330002
4,54.0,4.0,184.0,601.0,66754.0,144.0,12.0,4.75,68.0,5.67,4.23,177.0,14.75,6.03,0.5,0.48999,0.660156,0.669922,9.73,70.360001
5,54.0,5.0,160.0,566.0,61906.0,127.0,10.58,3.09,69.0,5.75,4.09,152.0,12.67,3.55,0.389893,0.389893,0.759766,0.77002,11.68,89.139999
6,54.0,6.0,111.0,141.0,9620.0,91.0,7.58,4.36,46.0,3.83,1.64,108.0,9.0,6.08,0.48999,0.429932,0.689941,0.700195,5.24,92.489998
7,54.0,7.0,131.0,369.0,40961.0,106.0,8.83,5.04,46.0,3.83,2.44,122.0,10.17,6.41,0.429932,0.439941,0.72998,0.740234,9.02,85.360001
8,54.0,8.0,176.0,550.0,57364.0,123.0,10.25,4.27,78.0,6.5,4.52,165.0,13.75,6.06,0.409912,0.379883,0.689941,0.75,10.89,88.709999
9,54.0,9.0,157.0,582.0,65721.0,117.0,9.75,3.79,67.0,5.58,4.94,147.0,12.25,5.67,0.429932,0.449951,0.839844,0.850098,18.540001,73.389999


In [28]:
len(dfs) # one DataFrame per time series (as_single_dataframe=False), so this counts the selected series

54

### Loading data as singular Numpy array 

- Batch size has no effect.
- Sliding window has no effect.
- Returns every time series in `ts_ids`, restricted to the time period of the given set.
- Data is returned as one Numpy array.
- Follows similar rules to DataLoader batches regarding shape (excluding sliding window parameters).
- Workers affect how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [29]:
# No batch sizes are passed: they have no effect when loading whole Numpy arrays.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    random_state=111,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details="text",
)

[2025-11-14 18:43:59,673][time_config][INFO] - Quick validation succeeded.
[2025-11-14 18:43:59,690][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
[2025-11-14 18:43:59,690][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 863.31it/s]
[2025-11-14 18:43:59,757][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    

#### Train set

- Affected by `train_workers`.

In [30]:
# Load the whole train set at once as a single array (uses the configured train workers).
numpy_array = time_based_dataset.get_train_numpy(workers="config")

# Axes: (time series, time steps of the train period, features + id columns).
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [31]:
# Load the whole validation set at once as a single array (uses the configured val workers).
numpy_array = time_based_dataset.get_val_numpy(workers="config")

# Axes: (time series, time steps of the val period, features + id columns).
display(numpy_array.shape)

(54, 2015, 20)

#### Test set

- Affected by `test_workers`.

In [32]:
# Load the whole test set at once as a single array (uses the configured test workers).
numpy_array = time_based_dataset.get_test_numpy(workers="config")

# Axes: (time series, time steps of the test period, features + id columns).
display(numpy_array.shape)

(54, 1343, 20)

#### All set

- Affected by `all_workers`.

In [33]:
# Load the whole "all" set at once as a single array (uses the configured all workers).
numpy_array = time_based_dataset.get_all_numpy(workers="config")

# Axes: (time series, time steps of the all period, features + id columns).
display(numpy_array.shape)

(54, 6717, 20)

#### Using time_format=TimeFormat.DATETIME

In [34]:
# Same Numpy setup as the ID_TIME example, except that time is requested as datetimes.
config = TimeBasedConfig(
    ts_ids=54,
    train_time_period=0.5,
    val_time_period=0.3,
    test_time_period=0.2,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    random_state=111,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details="text",
)

[2025-11-14 18:44:00,086][time_config][INFO] - Quick validation succeeded.
[2025-11-14 18:44:00,104][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
[2025-11-14 18:44:00,105][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 1156.12it/s]
[2025-11-14 18:44:00,168][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
    Time periods
        Train time periods: range(0, 3359)
        Val time periods: range(3359, 5374)
        Test time periods: range(5374, 6717)
        All time periods: range(0, 6717)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
   

In [35]:
# With TimeFormat.DATETIME the call returns a pair: the data array and a separate array of datetimes.
numpy_array, times = time_based_dataset.get_train_numpy(workers="config")

# Time is no longer a column of the data array, so the last axis is one smaller than with ID_TIME.
display(numpy_array.shape)
# One datetime per time step of the train period.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)