# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Show INFO-level records so the dataset's progress messages are visible;
# each record is prefixed with timestamp, logger name and severity.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Open CESNET-TimeSeries24 (institution subnets, 1-hour aggregation) as a series-based dataset.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",  # placeholder path - replace with your own data directory
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,  # prints the "Dataset details" summary
)

[2025-11-14 18:37:57,427][cesnet_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size affects how many time series will be in one batch (no effect when loading a specific time series).
- A batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, the batch is one NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period,)` containing the time).

In [4]:
# Config for the DataLoader examples: a single feature, time expressed as ID_TIME.
config = SeriesBasedConfig(
    time_period=0.5,  # the printed config reports range(0, 3359) out of 6717 time indices
    # Number of time series in each set (see "Length=" in the printed config).
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set; 0 means loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    random_state=111,
    # Batch sizes per set (number of time series in one batch).
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config and print it as plain text.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details="text",
    workers=0,
)

[2025-11-14 18:37:57,432][series_config][INFO] - Quick validation succeeded.
[2025-11-14 18:37:57,440][cesnet_dataset][INFO] - Updating config for train set and fitting values.
[2025-11-14 18:37:57,440][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 1438.04it/s]
[2025-11-14 18:37:57,491][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 25/25 [00:00<00:00, 1561.45it/s]
[2025-11-14 18:37:57,515][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 10/10 [00:00<00:00, 1330.68it/s]
[2025-11-14 18:37:57,524][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-11-14 18:37:57,525][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
        Val time series IDS: [395 456 318 362 179 ... 370 309 389 421 539], Length=25
        Test time series IDS [264  15 505 409 495 335 359 467 390 377], Length=10
        All time series IDS [ 54 226 135 160 236 ... 335 359 467 390 377], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: NoFiller
    Transformers
        Transformer type: NoTransformer
    Anomaly handler
        Anomaly handler type (train set): NoAnomalyHandler   
    Batch sizes
        Train batch size: 32
        Val batch s

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Variant 1: the general config-update entry point.
# NOTE(review): "config" presumably keeps the batch size already stored in the config - confirm in the library docs.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

# Variant 2: the dedicated setter, called with the same values.
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-11-14 18:37:57,534][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:37:57,536][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:37:57,536][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Variant 1: the general config-update entry point.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Variant 2: the dedicated setter, called with the same values.
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-11-14 18:37:57,542][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:37:57,544][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-11-14 18:37:57,544][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Train dataloader; workers="config" defers to the `train_workers` value from the config.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Iterating the dataloader yields one batch at a time; keep them all (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first batch: (time series in batch, time steps, id + time id + features).
display(batches[0].shape)

[2025-11-14 18:37:57,552][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 53.29it/s]


(33, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Val dataloader; workers="config" defers to the `val_workers` value from the config.
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Iterating the dataloader yields one batch at a time; keep them all (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first batch: (time series in batch, time steps, id + time id + features).
display(batches[0].shape)

[2025-11-14 18:37:57,601][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 66.56it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Test dataloader; workers="config" defers to the `test_workers` value from the config.
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Iterating the dataloader yields one batch at a time; keep them all (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first batch: (time series in batch, time steps, id + time id + features).
display(batches[0].shape)

[2025-11-14 18:37:57,627][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 133.21it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Dataloader over all configured time series; workers="config" defers to `all_workers`.
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Iterating the dataloader yields one batch at a time; keep them all (tqdm shows progress).
batches = list(tqdm(dataloader))

# Shape of the first batch: (time series in batch, time steps, id + time id + features).
display(batches[0].shape)

[2025-11-14 18:37:57,646][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 42.50it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME DataLoader example, but time is returned as datetime objects.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,  # batches become a (data, time) tuple
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    random_state=111,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config and print it as plain text.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details="text",
    workers=0,
)

[2025-11-14 18:37:57,677][series_config][INFO] - Quick validation succeeded.
[2025-11-14 18:37:57,684][cesnet_dataset][INFO] - Updating config for train set and fitting values.
[2025-11-14 18:37:57,685][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 4693.95it/s]
[2025-11-14 18:37:57,714][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 25/25 [00:00<00:00, 3842.49it/s]
[2025-11-14 18:37:57,730][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 10/10 [00:00<00:00, 2494.68it/s]
[2025-11-14 18:37:57,734][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-11-14 18:37:57,734][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
        Val time series IDS: [395 456 318 362 179 ... 370 309 389 421 539], Length=25
        Test time series IDS [264  15 505 409 495 335 359 467 390 377], Length=10
        All time series IDS [ 54 226 135 160 236 ... 335 359 467 390 377], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: NoFiller
    Transformers
        Transformer type: NoTransformer
    Anomaly handler
        Anomaly handler type (train set): NoAnomalyHandler   
    Batch sizes
        Train batch size: 32
        Val batch 

In [12]:
# Train dataloader; workers="config" defers to the `train_workers` value from the config.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Iterating the dataloader yields one batch at a time; keep them all (tqdm shows progress).
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME every batch is a tuple of (data, time).
display(batches[0][0].shape)  # data array, no time column
display(batches[0][1].shape)  # time values shared by the whole batch

[2025-11-14 18:37:57,793][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 110.86it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has the parameter `ts_id`.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the single time series with the specified id.

In [13]:
# Train set given as an explicit list of time series ids; no val/test sets are configured.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    random_state=111,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config and print it as plain text.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details="text",
    workers=0,
)

[2025-11-14 18:37:57,819][series_config][INFO] - Quick validation succeeded.
[2025-11-14 18:37:57,825][cesnet_dataset][INFO] - Updating config for train set and fitting values.
[2025-11-14 18:37:57,825][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 4/4 [00:00<00:00, 1594.19it/s]
[2025-11-14 18:37:57,833][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-11-14 18:37:57,833][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: NoFiller
    Transformers
        Transformer type: NoTransformer
    Anomaly handler
        Anomaly handler type (train set): NoAnomalyHandler   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test wo

In [14]:
# Restrict the train dataloader to the single time series with id 177.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Iterating the dataloader yields one batch at a time; keep them all (tqdm shows progress).
batches = list(tqdm(dataloader))

# Only one time series is loaded, so the first axis of the batch has length 1.
display(batches[0].shape)

[2025-11-14 18:37:57,841][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 1000.79it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [15]:
# Config for the DataFrame examples: take every feature; batch sizes are omitted
# because they have no effect on DataFrame loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    random_state=111,
)

# Apply the config and print it as plain text.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details="text",
    workers=0,
)

[2025-11-14 18:37:57,848][series_config][INFO] - Quick validation succeeded.
[2025-11-14 18:37:57,855][cesnet_dataset][INFO] - Updating config for train set and fitting values.
[2025-11-14 18:37:57,856][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 2911.18it/s]
[2025-11-14 18:37:57,882][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 25/25 [00:00<00:00, 2940.23it/s]
[2025-11-14 18:37:57,896][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 10/10 [00:00<00:00, 2856.96it/s]
[2025-11-14 18:37:57,900][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-11-14 18:37:57,901][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
        Val time series IDS: [395 456 318 362 179 ... 370 309 389 421 539], Length=25
        Test time series IDS [264  15 505 409 495 335 359 467 390 377], Length=10
        All time series IDS [ 54 226 135 160 236 ... 335 359 467 390 377], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Load the train set in both layouts: everything in one DataFrame, and a
# collection holding one DataFrame per time series.
df = series_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = series_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,54.0,0.0,217.0,455.0,43446.0,157.0,13.08,6.53,83.0,6.92,3.18,209.0,17.42,9.84,0.469971,0.389893,0.700195,0.72998,5.31,92.480003
1,54.0,1.0,226.0,609.0,56118.0,167.0,13.92,7.35,72.0,6.0,2.86,215.0,17.92,11.74,0.360107,0.310059,0.700195,0.720215,19.93,80.010002
2,54.0,2.0,230.0,827.0,94466.0,163.0,13.58,5.87,89.0,7.42,4.81,209.0,17.42,8.52,0.52002,0.509766,0.720215,0.759766,15.01,78.870003
3,54.0,3.0,216.0,684.0,75534.0,159.0,13.25,5.26,89.0,7.42,5.57,202.0,16.83,7.0,0.429932,0.409912,0.740234,0.77002,7.61,83.330002
4,54.0,4.0,184.0,601.0,66754.0,144.0,12.0,4.75,68.0,5.67,4.23,177.0,14.75,6.03,0.5,0.48999,0.660156,0.669922,9.73,70.360001
5,54.0,5.0,160.0,566.0,61906.0,127.0,10.58,3.09,69.0,5.75,4.09,152.0,12.67,3.55,0.389893,0.389893,0.759766,0.77002,11.68,89.139999
6,54.0,6.0,111.0,141.0,9620.0,91.0,7.58,4.36,46.0,3.83,1.64,108.0,9.0,6.08,0.48999,0.429932,0.689941,0.700195,5.24,92.489998
7,54.0,7.0,131.0,369.0,40961.0,106.0,8.83,5.04,46.0,3.83,2.44,122.0,10.17,6.41,0.429932,0.439941,0.72998,0.740234,9.02,85.360001
8,54.0,8.0,176.0,550.0,57364.0,123.0,10.25,4.27,78.0,6.5,4.52,165.0,13.75,6.06,0.409912,0.379883,0.689941,0.75,10.89,88.709999
9,54.0,9.0,157.0,582.0,65721.0,117.0,9.75,3.79,67.0,5.58,4.94,147.0,12.25,5.67,0.429932,0.449951,0.839844,0.850098,18.540001,73.389999


In [17]:
len(dfs)  # one DataFrame per train time series, so this equals the train set size

54

#### Val set

- Affected by `val_workers`.

In [18]:
# Load the val set in both layouts: everything in one DataFrame, and a
# collection holding one DataFrame per time series.
df = series_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = series_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,395.0,0.0,1110.0,23665.0,17341520.0,245.0,10.65,7.24,255.0,11.09,6.88,489.0,21.26,22.450001,0.799805,0.810059,0.429932,0.370117,12.96,121.879997
1,395.0,1.0,1404.0,35116.0,19138100.0,289.0,12.04,9.47,427.0,17.790001,19.83,603.0,25.120001,30.57,0.850098,0.910156,0.439941,0.370117,10.23,129.929993
2,395.0,2.0,4099.0,185777.0,188244700.0,494.0,22.450001,18.43,631.0,28.68,34.459999,1629.0,74.050003,79.330002,0.890137,0.910156,0.360107,0.26001,6.18,121.800003
3,395.0,3.0,13934.0,629171.0,612120700.0,708.0,33.709999,27.93,682.0,32.48,35.169998,4079.0,194.240005,194.110001,0.930176,0.950195,0.379883,0.27002,6.67,120.550003
4,395.0,4.0,18799.0,805995.0,585219100.0,763.0,40.16,29.719999,560.0,29.469999,28.52,5464.0,287.579987,260.429993,0.879883,0.879883,0.350098,0.170044,8.2,100.059998
5,395.0,5.0,19978.0,811484.0,763507700.0,803.0,33.459999,31.969999,460.0,19.17,19.309999,5768.0,240.330002,271.040009,0.919922,0.939941,0.409912,0.300049,19.18,119.989998
6,395.0,6.0,19406.0,783391.0,675522800.0,829.0,39.48,33.299999,471.0,22.43,20.190001,5638.0,268.480011,281.109985,0.850098,0.819824,0.370117,0.22998,8.84,125.419998
7,395.0,7.0,21774.0,928544.0,853389200.0,914.0,35.150002,35.790001,515.0,19.809999,20.92,6333.0,243.580002,302.529999,0.899902,0.919922,0.48999,0.409912,18.0,121.099998
8,395.0,8.0,20638.0,1144186.0,1055584000.0,886.0,32.810001,34.049999,554.0,20.52,23.93,6278.0,232.520004,280.450012,0.939941,0.959961,0.47998,0.389893,37.580002,111.550003
9,395.0,9.0,18875.0,719398.0,621841600.0,824.0,34.330002,31.49,571.0,23.790001,26.049999,5648.0,235.330002,261.670013,0.899902,0.930176,0.469971,0.379883,14.56,115.980003


In [19]:
len(dfs)  # one DataFrame per val time series, so this equals the val set size

25

#### Test set

- Affected by `test_workers`.

In [20]:
# Load the test set in both layouts: everything in one DataFrame, and a
# collection holding one DataFrame per time series.
df = series_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = series_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,264.0,0.0,10310.0,512654.0,251354900.0,1811.0,9.9,11.4,2058.0,11.25,14.95,4718.0,25.780001,64.459999,0.879883,0.870117,0.409912,0.429932,3.73,146.460007
1,264.0,1.0,11968.0,947350.0,453898900.0,2149.0,9.59,12.07,2324.0,10.38,13.58,5658.0,25.26,73.970001,0.850098,0.819824,0.379883,0.409912,3.12,142.419998
2,264.0,2.0,34052.0,6666384.0,3961873000.0,2328.0,11.09,19.120001,2246.0,10.7,14.98,9458.0,45.040001,185.130005,0.879883,0.859863,0.429932,0.439941,5.35,144.949997
3,264.0,3.0,61598.0,12421866.0,7313597000.0,2513.0,12.56,26.790001,2346.0,11.73,17.219999,13196.0,65.980003,290.559998,0.890137,0.879883,0.449951,0.449951,11.4,136.190002
4,264.0,4.0,60929.0,6752997.0,3819981000.0,2464.0,13.69,29.16,2252.0,12.51,20.59,13214.0,73.410004,309.869995,0.870117,0.859863,0.5,0.47998,10.24,126.970001
5,264.0,5.0,48781.0,6665884.0,3530089000.0,2533.0,13.26,28.709999,2012.0,10.53,15.48,12167.0,63.700001,263.76001,0.859863,0.859863,0.48999,0.48999,13.78,131.789993
6,264.0,6.0,47264.0,6822807.0,4415642000.0,2528.0,13.17,27.01,2050.0,10.68,16.110001,11916.0,62.060001,253.860001,0.830078,0.819824,0.509766,0.509766,15.11,127.050003
7,264.0,7.0,52247.0,4583420.0,2252856000.0,2688.0,13.05,26.530001,2243.0,10.89,15.76,12372.0,60.060001,252.470001,0.839844,0.830078,0.48999,0.48999,13.22,124.0
8,264.0,8.0,47781.0,5217293.0,2595123000.0,2624.0,11.93,25.68,2196.0,9.98,14.42,12032.0,54.689999,239.259995,0.839844,0.830078,0.48999,0.48999,11.8,126.910004
9,264.0,9.0,49405.0,5741806.0,3705850000.0,2612.0,12.32,25.83,2339.0,11.03,16.65,11833.0,55.82,235.149994,0.810059,0.799805,0.469971,0.459961,12.8,127.190002


In [21]:
len(dfs)  # one DataFrame per test time series, so this equals the test set size

10

#### All set

- Affected by `all_workers`.

In [22]:
# Load all configured time series in both layouts: everything in one DataFrame,
# and a collection holding one DataFrame per time series.
df = series_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = series_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,54.0,0.0,217.0,455.0,43446.0,157.0,13.08,6.53,83.0,6.92,3.18,209.0,17.42,9.84,0.469971,0.389893,0.700195,0.72998,5.31,92.480003
1,54.0,1.0,226.0,609.0,56118.0,167.0,13.92,7.35,72.0,6.0,2.86,215.0,17.92,11.74,0.360107,0.310059,0.700195,0.720215,19.93,80.010002
2,54.0,2.0,230.0,827.0,94466.0,163.0,13.58,5.87,89.0,7.42,4.81,209.0,17.42,8.52,0.52002,0.509766,0.720215,0.759766,15.01,78.870003
3,54.0,3.0,216.0,684.0,75534.0,159.0,13.25,5.26,89.0,7.42,5.57,202.0,16.83,7.0,0.429932,0.409912,0.740234,0.77002,7.61,83.330002
4,54.0,4.0,184.0,601.0,66754.0,144.0,12.0,4.75,68.0,5.67,4.23,177.0,14.75,6.03,0.5,0.48999,0.660156,0.669922,9.73,70.360001
5,54.0,5.0,160.0,566.0,61906.0,127.0,10.58,3.09,69.0,5.75,4.09,152.0,12.67,3.55,0.389893,0.389893,0.759766,0.77002,11.68,89.139999
6,54.0,6.0,111.0,141.0,9620.0,91.0,7.58,4.36,46.0,3.83,1.64,108.0,9.0,6.08,0.48999,0.429932,0.689941,0.700195,5.24,92.489998
7,54.0,7.0,131.0,369.0,40961.0,106.0,8.83,5.04,46.0,3.83,2.44,122.0,10.17,6.41,0.429932,0.439941,0.72998,0.740234,9.02,85.360001
8,54.0,8.0,176.0,550.0,57364.0,123.0,10.25,4.27,78.0,6.5,4.52,165.0,13.75,6.06,0.409912,0.379883,0.689941,0.75,10.89,88.709999
9,54.0,9.0,157.0,582.0,65721.0,117.0,9.75,3.79,67.0,5.58,4.94,147.0,12.25,5.67,0.429932,0.449951,0.839844,0.850098,18.540001,73.389999


In [23]:
len(dfs)  # one DataFrame per time series across train, val and test combined

89

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as one NumPy array.
- Follows similar rules to DataLoader batches regarding shape.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [24]:
# Config for the NumPy examples: take every feature; batch sizes are omitted
# because they have no effect on NumPy loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    random_state=111,
)

# Apply the config and print it as plain text.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details="text",
    workers=0,
)

[2025-11-14 18:37:58,348][series_config][INFO] - Quick validation succeeded.
[2025-11-14 18:37:58,354][cesnet_dataset][INFO] - Updating config for train set and fitting values.
[2025-11-14 18:37:58,354][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 2903.04it/s]
[2025-11-14 18:37:58,380][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 25/25 [00:00<00:00, 2778.13it/s]
[2025-11-14 18:37:58,394][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 10/10 [00:00<00:00, 2500.33it/s]
[2025-11-14 18:37:58,399][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-11-14 18:37:58,399][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
        Val time series IDS: [395 456 318 362 179 ... 370 309 389 421 539], Length=25
        Test time series IDS [264  15 505 409 495 335 359 467 390 377], Length=10
        All time series IDS [ 54 226 135 160 236 ... 335 359 467 390 377], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [25]:
# Load the whole train set as a single NumPy array.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# Axes: (time series, time steps, id + time id + features).
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [26]:
# Load the whole val set as a single NumPy array.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# Axes: (time series, time steps, id + time id + features).
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [27]:
# Load the whole test set as a single NumPy array.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# Axes: (time series, time steps, id + time id + features).
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [28]:
# Load every configured time series (train + val + test) as a single NumPy array.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# Axes: (time series, time steps, id + time id + features).
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [29]:
# Same setup as the ID_TIME NumPy example, but time is returned as datetime objects.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,  # get_*_numpy then returns (data, time)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    random_state=111,
)

# Apply the config and print it as plain text.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details="text",
    workers=0,
)

[2025-11-14 18:37:58,595][series_config][INFO] - Quick validation succeeded.
[2025-11-14 18:37:58,602][cesnet_dataset][INFO] - Updating config for train set and fitting values.
[2025-11-14 18:37:58,603][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 2997.76it/s]
[2025-11-14 18:37:58,638][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 25/25 [00:00<00:00, 2939.33it/s]
[2025-11-14 18:37:58,655][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 10/10 [00:00<00:00, 1816.19it/s]
[2025-11-14 18:37:58,661][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-11-14 18:37:58,661][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
        Val time series IDS: [395 456 318 362 179 ... 370 309 389 421 539], Length=25
        Test time series IDS [264  15 505 409 495 335 359 467 390 377], Length=10
        All time series IDS [ 54 226 135 160 236 ... 335 359 467 390 377], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [30]:
# With TimeFormat.DATETIME the call returns two arrays: the data and the time values.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

# The data array has one column fewer than with ID_TIME because time is returned separately.
display(numpy_array.shape)
# One datetime per time step, shared by all time series.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns time series in the same order as they are set in the config.

In [31]:
# Config that requests the sequential train order explicitly (it is also the default).
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    random_state=111,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)

# Apply the config and print it as plain text.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details="text",
    workers=0,
)

[2025-11-14 18:37:58,779][series_config][INFO] - Quick validation succeeded.
[2025-11-14 18:37:58,787][cesnet_dataset][INFO] - Updating config for train set and fitting values.
[2025-11-14 18:37:58,787][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 4319.08it/s]
[2025-11-14 18:37:58,808][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 25/25 [00:00<00:00, 4538.90it/s]
[2025-11-14 18:37:58,817][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 10/10 [00:00<00:00, 2853.66it/s]
[2025-11-14 18:37:58,822][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-11-14 18:37:58,822][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
        Val time series IDS: [395 456 318 362 179 ... 370 309 389 421 539], Length=25
        Test time series IDS [264  15 505 409 495 335 359 467 390 377], Length=10
        All time series IDS [ 54 226 135 160 236 ... 335 359 467 390 377], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: NoFiller
    Transformers
        Transformer type: NoTransformer
    Anomaly handler
        Anomaly handler type (train set): NoAnomalyHandler   
    Batch sizes
        Train batch size: 32
        Val batch s

In [32]:
# Get the train dataloader for the currently initialized config.
# NOTE(review): workers="config" presumably reuses `train_workers` from the config — confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so that individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Display the first batch; with SEQUENTIAL order it starts with the first train time series from the config.
batches[0]

[2025-11-14 18:37:58,833][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 148.87it/s]


array([[[5.4000e+01, 0.0000e+00, 2.1700e+02],
        [5.4000e+01, 1.0000e+00, 2.2600e+02],
        [5.4000e+01, 2.0000e+00, 2.3000e+02],
        ...,
        [5.4000e+01, 3.3560e+03, 2.3200e+02],
        [5.4000e+01, 3.3570e+03, 2.6900e+02],
        [5.4000e+01, 3.3580e+03, 2.6700e+02]],

       [[2.2600e+02, 0.0000e+00, 6.2000e+01],
        [2.2600e+02, 1.0000e+00, 1.0300e+02],
        [2.2600e+02, 2.0000e+00, 9.8000e+01],
        ...,
        [2.2600e+02, 3.3560e+03, 4.2000e+01],
        [2.2600e+02, 3.3570e+03, 3.8000e+01],
        [2.2600e+02, 3.3580e+03, 7.9000e+01]],

       [[1.3500e+02, 0.0000e+00, 1.0350e+03],
        [1.3500e+02, 1.0000e+00, 9.9300e+02],
        [1.3500e+02, 2.0000e+00, 1.0710e+03],
        ...,
        [1.3500e+02, 3.3560e+03, 1.5690e+03],
        [1.3500e+02, 3.3570e+03, 1.6460e+03],
        [1.3500e+02, 3.3580e+03, 1.8060e+03]],

       ...,

       [[7.4000e+01, 0.0000e+00, 2.7000e+01],
        [7.4000e+01, 1.0000e+00, 2.3000e+01],
        [7.4000e+01, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same on every run.

In [33]:
# RANDOM order: the train dataloader shuffles time series; random_state makes the shuffle repeatable.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=111,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details="text",
    workers=0,
)

[2025-11-14 18:37:58,856][series_config][INFO] - Quick validation succeeded.
[2025-11-14 18:37:58,861][cesnet_dataset][INFO] - Updating config for train set and fitting values.
[2025-11-14 18:37:58,862][cesnet_dataset][INFO] - Starting fitting cycle 1/1.
100%|██████████| 54/54 [00:00<00:00, 4694.14it/s]
[2025-11-14 18:37:58,881][cesnet_dataset][INFO] - Updating config for val set.
100%|██████████| 25/25 [00:00<00:00, 5000.84it/s]
[2025-11-14 18:37:58,891][cesnet_dataset][INFO] - Updating config for test set.
100%|██████████| 10/10 [00:00<00:00, 5004.54it/s]
[2025-11-14 18:37:58,895][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-11-14 18:37:58,895][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 54 226 135 160 236 ...   7 118 322 275  86], Length=54
        Val time series IDS: [395 456 318 362 179 ... 370 309 389 421 539], Length=25
        Test time series IDS [264  15 505 409 495 335 359 467 390 377], Length=10
        All time series IDS [ 54 226 135 160 236 ... 335 359 467 390 377], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: NoFiller
    Transformers
        Transformer type: NoTransformer
    Anomaly handler
        Anomaly handler type (train set): NoAnomalyHandler   
    Batch sizes
        Train batch size: 32
        Val batch s

In [34]:
# Get the train dataloader for the currently initialized config.
# NOTE(review): workers="config" presumably reuses `train_workers` from the config — confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so that individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Display the first batch; with RANDOM order the time series in it are shuffled.
batches[0]

[2025-11-14 18:37:58,905][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 135.93it/s]


array([[[1.23000e+02, 0.00000e+00, 2.68000e+02],
        [1.23000e+02, 1.00000e+00, 6.60000e+02],
        [1.23000e+02, 2.00000e+00, 6.73000e+02],
        ...,
        [1.23000e+02, 3.35600e+03, 6.94000e+02],
        [1.23000e+02, 3.35700e+03, 4.87000e+02],
        [1.23000e+02, 3.35800e+03, 4.33000e+02]],

       [[2.66000e+02, 0.00000e+00, 6.72000e+02],
        [2.66000e+02, 1.00000e+00, 7.75000e+02],
        [2.66000e+02, 2.00000e+00, 7.30000e+02],
        ...,
        [2.66000e+02, 3.35600e+03, 5.53000e+02],
        [2.66000e+02, 3.35700e+03, 7.75000e+02],
        [2.66000e+02, 3.35800e+03, 5.43000e+02]],

       [[2.00000e+02, 0.00000e+00, 6.71462e+05],
        [2.00000e+02, 1.00000e+00, 6.91311e+05],
        [2.00000e+02, 2.00000e+00, 6.53332e+05],
        ...,
        [2.00000e+02, 3.35600e+03, 7.54613e+05],
        [2.00000e+02, 3.35700e+03, 7.71318e+05],
        [2.00000e+02, 3.35800e+03, 6.98418e+05]],

       ...,

       [[4.53000e+02, 0.00000e+00, 2.19000e+03],
        [4.