# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Configure the root logger so that INFO-level messages are printed under each cell,
# prefixed with timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Create the series-based dataset for institution subnets aggregated per hour.
# display_details=True prints the dataset summary shown below the cell.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",  # example path - replace with your own data directory
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,
)

[2025-09-06 17:06:07,117][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size affects how many time series will be in one batch (it has no effect when loading a specific time series).
- A batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, the batch is one NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period)` containing the times).

In [4]:
# Series-based config loading only the "n_flows" feature for 54/25/10 train/val/test
# time series. time_period=0.5 resolves to range(0, 3359) in the printed config details.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:06:07,122][series_config][INFO] - Quick validation succeeded.
[2025-09-06 17:06:07,133][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:06:07,137][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1288.77it/s]
[2025-09-06 17:06:07,213][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 17:06:07,213][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [199 115 289 321 383 ... 251 163 542 344 354], Length=54
        Val time series IDS: [197 278 543  27 355 ...  25 325 226 169 191], Length=25
        Test time series IDS [422 260 104 375 342  31 202 248 394  46], Length=10
        All time series IDS [199 115 289 321 383 ...  31 202 248 394  46], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the general config-update entry point.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Or
# Option 2: the dedicated setter, called with the same values.
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-09-06 17:06:07,218][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:06:07,218][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:06:07,218][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:06:07,220][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:06:07,220][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: the general config-update entry point.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Or
# Option 2: the dedicated setter, called with the same values.
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-09-06 17:06:07,224][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:06:07,224][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:06:07,225][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 17:06:07,226][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 17:06:07,226][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Iterate the train dataloader once (tqdm shows the progress) and keep every batch.
dataloader = series_based_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-06 17:06:07,235][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 47.25it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Iterate the val dataloader once (tqdm shows the progress) and keep every batch.
dataloader = series_based_dataset.get_val_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-06 17:06:07,288][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 49.92it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Iterate the test dataloader once (tqdm shows the progress) and keep every batch.
dataloader = series_based_dataset.get_test_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-06 17:06:07,318][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 121.34it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Iterate the dataloader over all configured time series once and keep every batch.
dataloader = series_based_dataset.get_all_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-06 17:06:07,338][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 35.00it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but with time_format=TimeFormat.DATETIME so that
# each batch comes back as (data, times) instead of a single array.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:06:07,373][series_config][INFO] - Quick validation succeeded.
[2025-09-06 17:06:07,384][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:06:07,388][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1725.17it/s]
[2025-09-06 17:06:07,440][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 17:06:07,441][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [432 457 480  84 233 ... 140 527 195 295 394], Length=54
        Val time series IDS: [272 227 205 251 211 ... 189  80 239 202 464], Length=25
        Test time series IDS [ 57 385  65  55  96  22 471 111 213 142], Length=10
        All time series IDS [432 457 480  84 233 ...  22 471 111 213 142], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test bat

In [12]:
# Iterate the train dataloader once (tqdm shows the progress) and keep every batch.
dataloader = series_based_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch has two parts: the data and the time axis.
first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-06 17:06:07,449][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 113.04it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a `ts_id` parameter.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, it returns only the time series with the specified id.

In [13]:
# Train set given as an explicit list of time series ids; val_ts/test_ts are not passed
# and show up as None in the printed config details.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:06:07,473][series_config][INFO] - Quick validation succeeded.
[2025-09-06 17:06:07,482][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:06:07,486][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 1598.74it/s]
[2025-09-06 17:06:07,490][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 17:06:07,491][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All

In [14]:
# Request a dataloader for a single time series (id 177) from the train set.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-06 17:06:07,498][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 949.37it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
# Config for the DataFrame examples: all features are loaded and no batch sizes are
# passed (batch size has no effect when loading as DataFrame).
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:06:07,506][series_config][INFO] - Quick validation succeeded.
[2025-09-06 17:06:07,516][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:06:07,519][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1345.58it/s]
[2025-09-06 17:06:07,586][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 17:06:07,587][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [406 197 540 141  14 ... 236 147 221  79 513], Length=54
        Val time series IDS: [ 63 499  17 275 318 ... 248 169 354 111 210], Length=25
        Test time series IDS [353  71 530 480 436   6 463 218 364  62], Length=10
        All time series IDS [406 197 540 141  14 ...   6 463 218 364  62], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Load the train set in both available layouts:
# as_single_dataframe=True returns everything in one DataFrame ...
df = series_based_dataset.get_train_df(as_single_dataframe=True, workers="config")
# ... while as_single_dataframe=False returns a collection with one DataFrame per time series.
dfs = series_based_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,406.0,0.0,3288.0,61708.0,30904120.0,434.0,9.43,9.33,356.0,7.74,4.02,1315.0,28.59,54.330002,0.97998,0.97998,0.419922,0.47998,11.87,134.360001
1,406.0,1.0,4012.0,101131.0,79169100.0,506.0,10.12,9.38,426.0,8.52,4.71,1463.0,29.26,56.77,0.97998,0.990234,0.429932,0.459961,12.24,127.809998
2,406.0,2.0,5480.0,401476.0,397452600.0,646.0,12.67,14.28,425.0,8.33,4.43,2314.0,45.369999,89.279999,0.970215,0.959961,0.459961,0.5,9.89,122.779999
3,406.0,3.0,9882.0,1205609.0,1295497000.0,856.0,17.120001,23.09,366.0,7.32,3.81,4059.0,81.18,157.490005,0.990234,0.990234,0.409912,0.429932,14.2,128.080002
4,406.0,4.0,7842.0,1418313.0,1351303000.0,830.0,21.280001,26.370001,274.0,7.03,3.91,3931.0,100.790001,163.429993,0.970215,0.970215,0.399902,0.389893,15.65,137.880005
5,406.0,5.0,7200.0,713334.0,713578700.0,845.0,19.200001,26.08,278.0,6.32,3.76,3887.0,88.339996,154.169998,0.959961,0.959961,0.509766,0.509766,18.530001,123.589996
6,406.0,6.0,7022.0,964035.0,846849900.0,793.0,17.24,23.85,274.0,5.96,4.06,3705.0,80.540001,146.449997,0.959961,0.939941,0.48999,0.47998,18.209999,129.919998
7,406.0,7.0,7504.0,557089.0,471703200.0,871.0,18.530001,25.5,305.0,6.49,3.46,4082.0,86.849998,159.080002,0.950195,0.950195,0.419922,0.419922,14.45,134.059998
8,406.0,8.0,7443.0,957048.0,800463800.0,849.0,18.059999,25.57,288.0,6.13,3.79,4008.0,85.279999,157.020004,0.910156,0.919922,0.48999,0.5,24.040001,116.800003
9,406.0,9.0,7433.0,660902.0,574574000.0,814.0,16.959999,23.969999,300.0,6.25,3.58,3913.0,81.519997,156.779999,0.950195,0.959961,0.419922,0.399902,18.629999,127.769997


In [17]:
len(dfs) # one DataFrame per time series, so this equals the number of train time series

54

#### Val set

- Affected by `val_workers`.

In [18]:
# Load the val set in both available layouts:
# as_single_dataframe=True returns everything in one DataFrame ...
df = series_based_dataset.get_val_df(as_single_dataframe=True, workers="config")
# ... while as_single_dataframe=False returns a collection with one DataFrame per time series.
dfs = series_based_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,63.0,0.0,198.0,5543.0,4117638.0,130.0,5.65,3.39,114.0,4.96,2.4,170.0,7.39,5.76,0.950195,0.890137,0.389893,0.290039,9.28,196.779999
1,63.0,1.0,222.0,8547.0,6643669.0,144.0,6.26,3.7,115.0,5.0,1.68,180.0,7.83,5.97,0.970215,0.939941,0.419922,0.320068,16.299999,177.869995
2,63.0,2.0,181.0,7476.0,5720065.0,120.0,5.71,4.03,100.0,4.76,1.97,155.0,7.38,6.63,0.950195,0.939941,0.459961,0.389893,19.059999,172.910004
3,63.0,3.0,174.0,4558.0,3301450.0,127.0,5.52,2.92,111.0,4.83,1.87,158.0,6.87,4.71,0.959961,0.930176,0.429932,0.340088,13.24,182.860001
4,63.0,4.0,148.0,4129.0,2578827.0,108.0,4.91,3.44,76.0,3.45,1.3,123.0,5.59,4.23,0.959961,0.959961,0.389893,0.320068,16.5,169.460007
5,63.0,5.0,100.0,2234.0,1428128.0,81.0,4.26,2.62,65.0,3.42,1.46,93.0,4.89,3.38,0.990234,0.97998,0.439941,0.340088,14.37,158.0
6,63.0,6.0,122.0,2647.0,1994489.0,92.0,4.38,2.48,71.0,3.38,1.28,111.0,5.29,4.34,0.950195,0.930176,0.48999,0.389893,10.97,185.029999
7,63.0,7.0,141.0,4824.0,2814347.0,107.0,5.35,2.87,79.0,3.95,1.43,127.0,6.35,4.52,0.970215,0.959961,0.439941,0.350098,8.52,169.289993
8,63.0,8.0,110.0,3785.0,2904156.0,79.0,4.16,2.27,64.0,3.37,1.46,98.0,5.16,3.52,0.959961,0.959961,0.419922,0.330078,14.34,169.149994
9,63.0,9.0,129.0,4824.0,3991556.0,98.0,5.16,3.27,78.0,4.11,1.45,119.0,6.26,4.49,0.950195,0.959961,0.429932,0.330078,13.09,165.270004


In [19]:
len(dfs) # one DataFrame per time series, so this equals the number of val time series

25

#### Test set

- Affected by `test_workers`.

In [20]:
# Load the test set in both available layouts:
# as_single_dataframe=True returns everything in one DataFrame ...
df = series_based_dataset.get_test_df(as_single_dataframe=True, workers="config")
# ... while as_single_dataframe=False returns a collection with one DataFrame per time series.
dfs = series_based_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,353.0,0.0,6950.0,1223729.0,699492500.0,436.0,20.76,26.450001,145.0,6.9,4.15,865.0,41.189999,59.34,0.640137,0.629883,0.409912,0.360107,27.25,165.990005
1,353.0,1.0,8501.0,689835.0,758851700.0,506.0,18.07,27.889999,158.0,5.64,3.64,1135.0,40.540001,72.610001,0.560059,0.529785,0.48999,0.469971,32.73,175.5
2,353.0,2.0,9016.0,688751.0,293073300.0,599.0,27.23,38.610001,150.0,6.82,4.25,1642.0,74.639999,120.43,0.629883,0.620117,0.509766,0.429932,33.150002,171.240005
3,353.0,3.0,10956.0,3039193.0,2260469000.0,629.0,31.450001,44.049999,168.0,8.4,9.54,2454.0,122.699997,191.229996,0.640137,0.620117,0.350098,0.27002,20.700001,156.440002
4,353.0,4.0,14252.0,2720319.0,1719613000.0,784.0,37.330002,54.98,965.0,45.950001,90.919998,4020.0,191.429993,305.140015,0.540039,0.540039,0.549805,0.449951,50.889999,171.580002
5,353.0,5.0,12061.0,2427727.0,2130638000.0,776.0,48.5,60.990002,710.0,44.380001,85.019997,3702.0,231.380005,306.359985,0.620117,0.580078,0.469971,0.389893,52.860001,160.619995
6,353.0,6.0,11887.0,1375318.0,1052175000.0,721.0,42.41,54.380001,1211.0,71.239998,112.110001,3253.0,191.350006,261.76001,0.709961,0.649902,0.47998,0.370117,25.4,135.070007
7,353.0,7.0,12853.0,1473446.0,915148900.0,727.0,38.259998,52.57,1211.0,63.740002,96.449997,3399.0,178.889999,265.660004,0.640137,0.600098,0.529785,0.449951,29.799999,173.149994
8,353.0,8.0,12620.0,2869786.0,1923652000.0,713.0,37.529999,52.599998,820.0,43.16,90.309998,3551.0,186.889999,279.130005,0.5,0.459961,0.540039,0.469971,35.380001,154.589996
9,353.0,9.0,12281.0,2625958.0,2397869000.0,779.0,45.82,58.490002,775.0,45.59,95.470001,3590.0,211.179993,289.609985,0.680176,0.640137,0.419922,0.340088,35.07,157.300003


In [21]:
len(dfs) # one DataFrame per time series, so this equals the number of test time series

10

#### All set

- Affected by `all_workers`.

In [22]:
# Load all configured time series in both available layouts:
# as_single_dataframe=True returns everything in one DataFrame ...
df = series_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
# ... while as_single_dataframe=False returns a collection with one DataFrame per time series.
dfs = series_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,406.0,0.0,3288.0,61708.0,30904120.0,434.0,9.43,9.33,356.0,7.74,4.02,1315.0,28.59,54.330002,0.97998,0.97998,0.419922,0.47998,11.87,134.360001
1,406.0,1.0,4012.0,101131.0,79169100.0,506.0,10.12,9.38,426.0,8.52,4.71,1463.0,29.26,56.77,0.97998,0.990234,0.429932,0.459961,12.24,127.809998
2,406.0,2.0,5480.0,401476.0,397452600.0,646.0,12.67,14.28,425.0,8.33,4.43,2314.0,45.369999,89.279999,0.970215,0.959961,0.459961,0.5,9.89,122.779999
3,406.0,3.0,9882.0,1205609.0,1295497000.0,856.0,17.120001,23.09,366.0,7.32,3.81,4059.0,81.18,157.490005,0.990234,0.990234,0.409912,0.429932,14.2,128.080002
4,406.0,4.0,7842.0,1418313.0,1351303000.0,830.0,21.280001,26.370001,274.0,7.03,3.91,3931.0,100.790001,163.429993,0.970215,0.970215,0.399902,0.389893,15.65,137.880005
5,406.0,5.0,7200.0,713334.0,713578700.0,845.0,19.200001,26.08,278.0,6.32,3.76,3887.0,88.339996,154.169998,0.959961,0.959961,0.509766,0.509766,18.530001,123.589996
6,406.0,6.0,7022.0,964035.0,846849900.0,793.0,17.24,23.85,274.0,5.96,4.06,3705.0,80.540001,146.449997,0.959961,0.939941,0.48999,0.47998,18.209999,129.919998
7,406.0,7.0,7504.0,557089.0,471703200.0,871.0,18.530001,25.5,305.0,6.49,3.46,4082.0,86.849998,159.080002,0.950195,0.950195,0.419922,0.419922,14.45,134.059998
8,406.0,8.0,7443.0,957048.0,800463800.0,849.0,18.059999,25.57,288.0,6.13,3.79,4008.0,85.279999,157.020004,0.910156,0.919922,0.48999,0.5,24.040001,116.800003
9,406.0,9.0,7433.0,660902.0,574574000.0,814.0,16.959999,23.969999,300.0,6.25,3.58,3913.0,81.519997,156.779999,0.950195,0.959961,0.419922,0.399902,18.629999,127.769997


In [23]:
len(dfs) # one DataFrame per time series, so this equals the number of all configured time series

89

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as one NumPy array.
- Its shape follows rules similar to those of DataLoader batches.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [24]:
# Config for the NumPy examples: all features are loaded and no batch sizes are
# passed (batch size has no effect when loading as a single array).
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:06:07,947][series_config][INFO] - Quick validation succeeded.
[2025-09-06 17:06:07,957][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:06:07,961][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1694.38it/s]
[2025-09-06 17:06:08,015][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 17:06:08,016][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [505 109 306  19 288 ... 486 119 383 143 359], Length=54
        Val time series IDS: [323 516 302 443 532 ... 449 240 153 413 379], Length=25
        Test time series IDS [520 461 217 464 197 270 264 265  82 358], Length=10
        All time series IDS [505 109 306  19 288 ... 270 264 265  82 358], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [25]:
# Load the whole train set as a single array.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# Shape is (time series, time steps, columns); the 20 columns are presumably the
# 18 taken features plus the series id and time id - confirm against the library docs.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [26]:
# Load the whole val set as a single array.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# Shape is (time series, time steps, columns).
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [27]:
# Load the whole test set as a single array.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# Shape is (time series, time steps, columns).
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [28]:
# Load all configured time series as a single array.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# Shape is (time series, time steps, columns).
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [29]:
# Same as the previous NumPy config, but with time_format=TimeFormat.DATETIME so that
# the time axis is returned separately from the data.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:06:08,186][series_config][INFO] - Quick validation succeeded.
[2025-09-06 17:06:08,197][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:06:08,201][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2021.81it/s]
[2025-09-06 17:06:08,246][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 17:06:08,247][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [296 541 352 388 198 ...  39 117 402 315 178], Length=54
        Val time series IDS: [416 422 522 520 472 ... 348 212 384 217 292], Length=25
        Test time series IDS [105 128  27 293   3 479 209 475 424 265], Length=10
        All time series IDS [296 541 352 388 198 ... 479 209 475 424 265], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [30]:
# With TimeFormat.DATETIME the call returns two values: the data array and the time axis.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

# The data array has one column fewer than with ID_TIME (19 instead of 20),
# because time is returned separately in `times`.
display(numpy_array.shape)
# One datetime per time step of the selected time period.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns the time series in the same order as they are set in the config.

In [31]:
# Explicitly request sequential ordering for the train set (this is also the default).
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:06:08,303][series_config][INFO] - Quick validation succeeded.
[2025-09-06 17:06:08,314][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:06:08,317][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2733.85it/s]
[2025-09-06 17:06:08,351][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 17:06:08,352][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [166 309 517 360 105 ... 537 252 145 227 546], Length=54
        Val time series IDS: [463 447 150  99 498 ... 441 190 187  27 111], Length=25
        Test time series IDS [541 547 110 359 208 530 489 383 265   2], Length=10
        All time series IDS [166 309 517 360 105 ... 530 489 383 265   2], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [32]:
# Iterate the train dataloader once (tqdm shows the progress) and keep every batch.
dataloader = series_based_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Show the first batch; its series ids follow the order of the train ids in the config.
batches[0]

[2025-09-06 17:06:08,360][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 104.44it/s]


array([[[1.6600e+02, 0.0000e+00, 8.0000e+01],
        [1.6600e+02, 1.0000e+00, 9.8000e+01],
        [1.6600e+02, 2.0000e+00, 6.8000e+01],
        ...,
        [1.6600e+02, 3.3560e+03, 5.3000e+01],
        [1.6600e+02, 3.3570e+03, 1.4500e+02],
        [1.6600e+02, 3.3580e+03, 9.0000e+01]],

       [[3.0900e+02, 0.0000e+00, 7.2200e+02],
        [3.0900e+02, 1.0000e+00, 8.7500e+02],
        [3.0900e+02, 2.0000e+00, 7.3500e+02],
        ...,
        [3.0900e+02, 3.3560e+03, 6.0500e+02],
        [3.0900e+02, 3.3570e+03, 6.9100e+02],
        [3.0900e+02, 3.3580e+03, 5.6900e+02]],

       [[5.1700e+02, 0.0000e+00, 7.0000e+02],
        [5.1700e+02, 1.0000e+00, 5.1800e+02],
        [5.1700e+02, 2.0000e+00, 1.7980e+03],
        ...,
        [5.1700e+02, 3.3560e+03, 5.3500e+02],
        [5.1700e+02, 3.3570e+03, 5.2000e+02],
        [5.1700e+02, 3.3580e+03, 4.1800e+02]],

       ...,

       [[2.0500e+02, 0.0000e+00, 5.7780e+03],
        [2.0500e+02, 1.0000e+00, 5.9820e+03],
        [2.0500e+02, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the shuffled batches are reproducible (the same on every run).

In [33]:
# Series-based config whose train dataloader returns time series in shuffled order.
config = SeriesBasedConfig(
    # Data selection: 0.5 of the time period (the config details printed by this cell
    # report range(0, 3359)), 54/25/10 time series for train/val/test, `n_flows` only.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Workers: 0 means loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Shuffle the train time series; random_state is left unset (None) here.
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 17:06:08,386][series_config][INFO] - Quick validation succeeded.
[2025-09-06 17:06:08,396][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 17:06:08,399][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2799.87it/s]
[2025-09-06 17:06:08,431][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 17:06:08,431][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [544 191 527 480 207 ... 208 399 524 314  83], Length=54
        Val time series IDS: [424  20 357  95 200 ... 250  67 455  33 228], Length=25
        Test time series IDS [137 222  80  36  42 243 492  32 417 292], Length=10
        All time series IDS [544 191 527 480 207 ... 243 492  32 417 292], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [34]:
# Train dataloader for the config set above (train_dataloader_order=DataloaderOrder.RANDOM);
# workers="config" presumably defers to the config's `train_workers` (TODO confirm).
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Run through the dataloader once, showing progress, and collect all batches.
batches = list(tqdm(dataloader))

# Show the first batch as the cell output.
batches[0]

[2025-09-06 17:06:08,439][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 82.80it/s]


array([[[5.470e+02, 0.000e+00, 1.469e+03],
        [5.470e+02, 1.000e+00, 1.527e+03],
        [5.470e+02, 2.000e+00, 1.244e+03],
        ...,
        [5.470e+02, 3.356e+03, 1.705e+03],
        [5.470e+02, 3.357e+03, 1.959e+03],
        [5.470e+02, 3.358e+03, 1.917e+03]],

       [[1.690e+02, 0.000e+00, 1.120e+02],
        [1.690e+02, 1.000e+00, 9.800e+01],
        [1.690e+02, 2.000e+00, 8.800e+01],
        ...,
        [1.690e+02, 3.356e+03, 2.050e+02],
        [1.690e+02, 3.357e+03, 1.510e+02],
        [1.690e+02, 3.358e+03, 1.680e+02]],

       [[4.230e+02, 0.000e+00, 2.370e+02],
        [4.230e+02, 1.000e+00, 3.340e+02],
        [4.230e+02, 2.000e+00, 4.340e+02],
        ...,
        [4.230e+02, 3.356e+03, 5.770e+02],
        [4.230e+02, 3.357e+03, 7.800e+02],
        [4.230e+02, 3.358e+03, 2.660e+02]],

       ...,

       [[4.290e+02, 0.000e+00, 1.750e+02],
        [4.290e+02, 1.000e+00, 1.420e+02],
        [4.290e+02, 2.000e+00, 1.500e+02],
        ...,
        [4.290e+02, 3.356e