# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Emit INFO-and-above records, each prefixed with timestamp, logger name and level.
log_format = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(format=log_format, level=logging.INFO)

### Preparing dataset

In [3]:
# Open CESNET-TimeSeries24 (institution subnets, 1-hour aggregation) as a series-based dataset.
# NOTE(review): "/some_directory/" looks like a placeholder — point it at your own data root.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,  # prints the "Dataset details" summary shown below
)

[2025-08-26 20:06:57,059][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will be run on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size affects how many time series will be in one batch (no effect when loading a specific time series).
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, Numpy array of shape `(time_period)` of time)

In [4]:
config = SeriesBasedConfig(
    # Which part of the data to use: time period and number of time series per set.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    # Which columns to load and how time is represented.
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Default batch sizes per set (number of time series in one batch).
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,  # prints the "Config Details" summary
    workers=0,
)

[2025-08-26 20:06:57,064][series_config][INFO] - Quick validation succeeded.
[2025-08-26 20:06:57,075][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 20:06:57,079][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1124.18it/s]
[2025-08-26 20:06:57,166][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 20:06:57,166][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [110  88 544 193 403 ... 290 511 472 471 364], Length=54
        Val time series IDS: [128  77 134 419 188 ...  37 431 337 311 389], Length=25
        Test time series IDS [184 306 502 412 143 137 150 111 310  27], Length=10
        All time series IDS [110  88 544 193 403 ... 137 150 111 310  27], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the general-purpose config update.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Option 2: the dedicated setter, called with the same values.
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-08-26 20:06:57,172][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-26 20:06:57,173][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-26 20:06:57,174][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-26 20:06:57,175][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-26 20:06:57,175][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: the general-purpose config update.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Option 2: the dedicated setter, called with the same values.
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-08-26 20:06:57,180][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-26 20:06:57,181][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-26 20:06:57,182][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-26 20:06:57,182][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-26 20:06:57,182][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# workers="config" presumably means "use the train worker count from the config" — TODO confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, collecting every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first train batch.
display(batches[0].shape)

[2025-08-26 20:06:57,191][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 40.89it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# workers="config" presumably means "use the val worker count from the config" — TODO confirm.
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Drain the dataloader once, collecting every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first val batch.
display(batches[0].shape)

[2025-08-26 20:06:57,252][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 49.93it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# workers="config" presumably means "use the test worker count from the config" — TODO confirm.
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Drain the dataloader once, collecting every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first test batch.
display(batches[0].shape)

[2025-08-26 20:06:57,282][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 117.54it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# workers="config" presumably means "use the all-set worker count from the config" — TODO confirm.
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Drain the dataloader once, collecting every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first batch of the "all" set.
display(batches[0].shape)

[2025-08-26 20:06:57,299][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 33.69it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    # With DATETIME, each batch is a (data, times) tuple and the data array has no time column.
    time_format=TimeFormat.DATETIME,
    # Default worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Default batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 20:06:57,336][series_config][INFO] - Quick validation succeeded.
[2025-08-26 20:06:57,348][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 20:06:57,351][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1664.35it/s]
[2025-08-26 20:06:57,408][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 20:06:57,408][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [117 547 307 246 354 ... 370 153 184  45 120], Length=54
        Val time series IDS: [502  55  78 151 294 ... 383  17 463 290  34], Length=25
        Test time series IDS [431 541 127 242 531 126 454 444 229 203], Length=10
        All time series IDS [117 547 307 246 354 ... 126 454 444 229 203], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test bat

In [12]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, collecting every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

# Under TimeFormat.DATETIME each batch is a pair: index 0 holds the data, index 1 the times.
display(batches[0][0].shape) # data without time
display(batches[0][1].shape) # time

[2025-08-26 20:06:57,416][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 108.25it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, it returns only the single time series with the specified id.

In [13]:
config = SeriesBasedConfig(
    time_period=0.5,
    # Explicit list of time series ids for the train set; no val/test sets are configured.
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Default batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 20:06:57,443][series_config][INFO] - Quick validation succeeded.
[2025-08-26 20:06:57,452][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 20:06:57,456][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 949.04it/s]
[2025-08-26 20:06:57,462][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 20:06:57,462][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All

In [14]:
# Request a dataloader that yields only time series 177 from the train set.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Drain the dataloader once, collecting every batch (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first (and, per the progress bar, only) batch.
display(batches[0].shape)

[2025-08-26 20:06:57,471][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 999.83it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in set with specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will be run on the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",  # every available feature (see "Taken features" in the output)
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 20:06:57,480][series_config][INFO] - Quick validation succeeded.
[2025-08-26 20:06:57,491][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 20:06:57,494][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1444.75it/s]
[2025-08-26 20:06:57,559][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 20:06:57,560][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [216 169 383 147 201 ... 330 299 148 257 516], Length=54
        Val time series IDS: [513  28 452 400 528 ... 207 234  10 323 369], Length=25
        Test time series IDS [ 99 440  26 294  69 508  93 390 165  62], Length=10
        All time series IDS [216 169 383 147 201 ... 508  93 390 165  62], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# as_single_dataframe=True -> one DataFrame covering the whole train set.
df = series_based_dataset.get_train_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False -> a list of DataFrames (presumably one per time series — TODO confirm).
dfs = series_based_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,216.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
1,216.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
2,216.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
3,216.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
4,216.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
5,216.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
6,216.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
7,216.0,7.0,11.0,12.0,436.0,3.0,1.5,0.71,3.0,1.5,0.71,11.0,5.5,2.12,0.119995,0.140015,0.620117,0.620117,0.0,248.25
8,216.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
9,216.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0


In [17]:
# Display the list of train DataFrames obtained with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets  n_bytes  \
 0                     216.0      0.0      0.0        0.0      0.0   
 1                     216.0      1.0      0.0        0.0      0.0   
 2                     216.0      2.0      0.0        0.0      0.0   
 3                     216.0      3.0      0.0        0.0      0.0   
 4                     216.0      4.0      0.0        0.0      0.0   
 ...                     ...      ...      ...        ...      ...   
 3354                  216.0   3354.0      6.0        8.0    536.0   
 3355                  216.0   3355.0      0.0        0.0      0.0   
 3356                  216.0   3356.0      0.0        0.0      0.0   
 3357                  216.0   3357.0      0.0        0.0      0.0   
 3358                  216.0   3358.0     23.0       37.0   1352.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0                0.0             0.0            0.00               0.0   
 1      

#### Val set

- Affected by `val_workers`.

In [18]:
# as_single_dataframe=True -> one DataFrame covering the whole val set.
df = series_based_dataset.get_val_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False -> a list of DataFrames (presumably one per time series — TODO confirm).
dfs = series_based_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,513.0,0.0,11.0,14.0,568.0,10.0,5.0,0.0,10.0,5.0,0.0,10.0,5.0,0.0,1.0,1.0,0.290039,0.280029,0.04,183.080002
1,513.0,1.0,9.0,9.0,364.0,7.0,3.5,0.71,8.0,4.0,1.41,8.0,4.0,1.41,1.0,1.0,0.25,0.23999,0.0,217.169998
2,513.0,2.0,11.0,11.0,468.0,10.0,3.33,1.53,11.0,3.67,1.15,11.0,3.67,1.15,1.0,1.0,0.290039,0.27002,0.0,170.669998
3,513.0,3.0,12.0,12.0,509.0,10.0,5.0,2.83,10.0,5.0,2.83,10.0,5.0,2.83,0.939941,0.959961,0.180054,0.180054,0.0,114.559998
4,513.0,4.0,9.0,10.0,449.0,8.0,2.0,0.82,8.0,2.0,0.82,8.0,2.0,0.82,1.0,1.0,0.419922,0.370117,0.0,235.919998
5,513.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
6,513.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
7,513.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
8,513.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
9,513.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0


In [19]:
# Display the list of val DataFrames obtained with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets  n_bytes  \
 0                     513.0      0.0     11.0       14.0    568.0   
 1                     513.0      1.0      9.0        9.0    364.0   
 2                     513.0      2.0     11.0       11.0    468.0   
 3                     513.0      3.0     12.0       12.0    509.0   
 4                     513.0      4.0      9.0       10.0    449.0   
 ...                     ...      ...      ...        ...      ...   
 3354                  513.0   3354.0      0.0        0.0      0.0   
 3355                  513.0   3355.0      5.0        7.0    327.0   
 3356                  513.0   3356.0      0.0        0.0      0.0   
 3357                  513.0   3357.0      5.0        5.0    229.0   
 3358                  513.0   3358.0      0.0        0.0      0.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0               10.0            5.00            0.00              10.0   
 1      

#### Test set

- Affected by `test_workers`.

In [20]:
# as_single_dataframe=True -> one DataFrame covering the whole test set.
df = series_based_dataset.get_test_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False -> a list of DataFrames (presumably one per time series — TODO confirm).
dfs = series_based_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,99.0,0.0,17.0,39.0,2228.0,16.0,5.33,4.04,17.0,5.67,4.16,17.0,5.67,4.16,0.970215,0.970215,0.709961,0.72998,1.0,225.279999
1,99.0,1.0,44.0,231.0,15600.0,30.0,5.0,2.83,40.0,6.67,4.59,31.0,5.17,3.06,0.910156,0.899902,0.310059,0.300049,4.42,186.220001
2,99.0,2.0,25.0,225.0,12588.0,19.0,4.75,2.06,25.0,6.25,2.99,19.0,4.75,2.06,1.0,1.0,0.419922,0.47998,0.94,143.300003
3,99.0,3.0,28.0,76.0,3548.0,28.0,4.67,1.51,28.0,4.67,1.51,28.0,4.67,1.51,0.930176,0.890137,0.459961,0.469971,1.18,192.880005
4,99.0,4.0,10.0,16.0,695.0,10.0,3.33,1.53,10.0,3.33,1.53,10.0,3.33,1.53,0.890137,0.899902,0.320068,0.310059,0.61,188.320007
5,99.0,5.0,30.0,146.0,9491.0,24.0,4.8,0.84,29.0,5.8,1.79,25.0,5.0,1.22,0.939941,0.910156,0.280029,0.27002,5.95,154.889999
6,99.0,6.0,19.0,65.0,7729.0,15.0,3.75,1.89,18.0,4.5,2.08,18.0,4.5,2.08,0.990234,0.990234,0.640137,0.529785,1.69,203.020004
7,99.0,7.0,17.0,73.0,5990.0,17.0,4.25,0.96,17.0,4.25,0.96,17.0,4.25,0.96,0.950195,0.959961,0.449951,0.419922,6.99,190.919998
8,99.0,8.0,11.0,40.0,6199.0,9.0,4.5,2.12,10.0,5.0,1.41,10.0,5.0,1.41,0.970215,1.0,0.160034,0.109985,1.95,152.699997
9,99.0,9.0,24.0,196.0,9439.0,19.0,3.8,1.92,24.0,4.8,3.56,20.0,4.0,2.35,0.959961,0.959961,0.509766,0.540039,1.59,174.199997


#### All set

- Affected by `all_workers`.

In [21]:
# as_single_dataframe=True -> one DataFrame covering the whole "all" set.
df = series_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
# as_single_dataframe=False -> a list of DataFrames (presumably one per time series — TODO confirm).
dfs = series_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,216.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
1,216.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
2,216.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
3,216.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
4,216.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
5,216.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
6,216.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
7,216.0,7.0,11.0,12.0,436.0,3.0,1.5,0.71,3.0,1.5,0.71,11.0,5.5,2.12,0.119995,0.140015,0.620117,0.620117,0.0,248.25
8,216.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
9,216.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0


In [22]:
# Display the list of "all"-set DataFrames obtained with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets  n_bytes  \
 0                     216.0      0.0      0.0        0.0      0.0   
 1                     216.0      1.0      0.0        0.0      0.0   
 2                     216.0      2.0      0.0        0.0      0.0   
 3                     216.0      3.0      0.0        0.0      0.0   
 4                     216.0      4.0      0.0        0.0      0.0   
 ...                     ...      ...      ...        ...      ...   
 3354                  216.0   3354.0      6.0        8.0    536.0   
 3355                  216.0   3355.0      0.0        0.0      0.0   
 3356                  216.0   3356.0      0.0        0.0      0.0   
 3357                  216.0   3357.0      0.0        0.0      0.0   
 3358                  216.0   3358.0     23.0       37.0   1352.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0                0.0             0.0            0.00               0.0   
 1      

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in set with specified `time_period`.
- Data is returned as one Numpy array.
- Follows similar rules to Dataloader batches, regarding shape.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will be run on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [23]:
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",  # every available feature (see "Taken features" in the output)
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 20:06:58,664][series_config][INFO] - Quick validation succeeded.
[2025-08-26 20:06:58,673][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 20:06:58,677][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1911.80it/s]
[2025-08-26 20:06:58,726][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 20:06:58,726][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [197 219 483  59 281 ... 112 444  56 456  42], Length=54
        Val time series IDS: [519 147  51  29 399 ... 161 295  84 317 220], Length=25
        Test time series IDS [302 249 114 391 329  72 132 320 412 300], Length=10
        All time series IDS [197 219 483  59 281 ...  72 132 320 412 300], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [24]:
# Load the entire train set as a single NumPy array (batch size plays no role here).
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# Axes are presumably (time series, time steps, ids + features) — TODO confirm.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [25]:
# Load the entire val set as a single NumPy array (batch size plays no role here).
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# Axes are presumably (time series, time steps, ids + features) — TODO confirm.
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [26]:
# Load the entire test set as a single NumPy array (batch size plays no role here).
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# Axes are presumably (time series, time steps, ids + features) — TODO confirm.
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [27]:
# Load the entire "all" set as a single NumPy array (batch size plays no role here).
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# Axes are presumably (time series, time steps, ids + features) — TODO confirm.
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [28]:
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",  # every available feature (see "Taken features" in the output)
    # With DATETIME, time is returned separately from the data array.
    time_format=TimeFormat.DATETIME,
    # Default worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 20:06:58,892][series_config][INFO] - Quick validation succeeded.
[2025-08-26 20:06:58,952][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 20:06:58,956][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1978.57it/s]
[2025-08-26 20:06:59,003][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 20:06:59,004][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [310 132 151  21 411 ... 486 224 200 528 420], Length=54
        Val time series IDS: [362 293 253 401  19 ... 540 510 517 158 400], Length=25
        Test time series IDS [514 296  44 363 547 323 267 471 326 157], Length=10
        All time series IDS [310 132 151  21 411 ... 323 267 471 326 157], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [29]:
# With TimeFormat.DATETIME the call returns a pair: the data array (without a time column)
# and a separate array holding the datetimes of the selected time period.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns data in the same order as they are set in config.

In [30]:
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Default batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # SEQUENTIAL (the default): train data comes back in the order set in the config.
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 20:06:59,057][series_config][INFO] - Quick validation succeeded.
[2025-08-26 20:06:59,067][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 20:06:59,070][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2868.02it/s]
[2025-08-26 20:06:59,103][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 20:06:59,103][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [100 156 219 428  52 ... 332 533 237 311 334], Length=54
        Val time series IDS: [202 440 449 436  21 ... 145 507 492 102  20], Length=25
        Test time series IDS [473 166 266  68 341  54 530  35 168  58], Length=10
        All time series IDS [100 156 219 428  52 ...  54 530  35 168  58], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [31]:
# workers="config" presumably defers to the worker count set in the config - TODO confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Consume the whole dataloader behind a progress bar, keeping every batch.
batches = list(tqdm(dataloader))

# Display the first batch as the cell output.
batches[0]

[2025-08-26 20:06:59,111][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 113.88it/s]


array([[[1.0000e+02, 0.0000e+00, 7.6760e+03],
        [1.0000e+02, 1.0000e+00, 8.2920e+03],
        [1.0000e+02, 2.0000e+00, 1.1067e+04],
        ...,
        [1.0000e+02, 3.3560e+03, 6.7130e+03],
        [1.0000e+02, 3.3570e+03, 6.7490e+03],
        [1.0000e+02, 3.3580e+03, 6.0140e+03]],

       [[1.5600e+02, 0.0000e+00, 4.4200e+02],
        [1.5600e+02, 1.0000e+00, 4.7000e+02],
        [1.5600e+02, 2.0000e+00, 1.1770e+03],
        ...,
        [1.5600e+02, 3.3560e+03, 6.6500e+02],
        [1.5600e+02, 3.3570e+03, 8.3500e+02],
        [1.5600e+02, 3.3580e+03, 7.4700e+02]],

       [[2.1900e+02, 0.0000e+00, 3.6000e+01],
        [2.1900e+02, 1.0000e+00, 3.7000e+01],
        [2.1900e+02, 2.0000e+00, 4.6000e+01],
        ...,
        [2.1900e+02, 3.3560e+03, 4.1000e+01],
        [2.1900e+02, 3.3570e+03, 5.3000e+01],
        [2.1900e+02, 3.3580e+03, 5.0000e+01]],

       ...,

       [[4.8300e+02, 0.0000e+00, 8.7800e+02],
        [4.8300e+02, 1.0000e+00, 9.5600e+02],
        [4.8300e+02, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with the time series shuffled.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same each time.

In [32]:
# Build a config whose train dataloader returns the time series shuffled.
config = SeriesBasedConfig(
    # Data selection
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts (0 means loading runs on the main process)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Shuffled train ordering; random_state is left unset (None) here
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 20:06:59,136][series_config][INFO] - Quick validation succeeded.
[2025-08-26 20:06:59,144][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 20:06:59,148][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3014.03it/s]
[2025-08-26 20:06:59,180][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 20:06:59,180][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 98 411 207 501 436 ... 243 179 135 125 407], Length=54
        Val time series IDS: [460 289 163 490 208 ...  54 366 226 516 391], Length=25
        Test time series IDS [531 197 329  68  12 479 530 371 468 336], Length=10
        All time series IDS [ 98 411 207 501 436 ... 479 530 371 468 336], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [33]:
# workers="config" presumably defers to the worker count set in the config - TODO confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Consume the whole dataloader behind a progress bar, keeping every batch.
batches = list(tqdm(dataloader))

# Display the first batch as the cell output.
batches[0]

[2025-08-26 20:06:59,189][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 88.73it/s]


array([[[5.4400e+02, 0.0000e+00, 8.6000e+01],
        [5.4400e+02, 1.0000e+00, 5.3000e+01],
        [5.4400e+02, 2.0000e+00, 6.0000e+01],
        ...,
        [5.4400e+02, 3.3560e+03, 5.1000e+01],
        [5.4400e+02, 3.3570e+03, 3.3000e+01],
        [5.4400e+02, 3.3580e+03, 6.8000e+01]],

       [[4.1100e+02, 0.0000e+00, 1.5310e+03],
        [4.1100e+02, 1.0000e+00, 1.9940e+03],
        [4.1100e+02, 2.0000e+00, 4.2270e+03],
        ...,
        [4.1100e+02, 3.3560e+03, 2.3940e+03],
        [4.1100e+02, 3.3570e+03, 1.7420e+03],
        [4.1100e+02, 3.3580e+03, 1.6350e+03]],

       [[1.5200e+02, 0.0000e+00, 3.3970e+03],
        [1.5200e+02, 1.0000e+00, 3.9830e+03],
        [1.5200e+02, 2.0000e+00, 9.5390e+03],
        ...,
        [1.5200e+02, 3.3560e+03, 1.9980e+03],
        [1.5200e+02, 3.3570e+03, 2.5410e+03],
        [1.5200e+02, 3.3580e+03, 3.1950e+03]],

       ...,

       [[2.1000e+01, 0.0000e+00, 7.8100e+02],
        [2.1000e+01, 1.0000e+00, 8.6900e+02],
        [2.1000e+01, 2