# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Show INFO-and-above records from the library, prefixed with timestamp, logger name and level.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

### Preparing dataset

In [3]:
# Open the hourly-aggregated institution-subnet data as a series-based dataset.
# `data_root` is a placeholder path -- replace it with your own directory.
# `display_details=True` prints the dataset summary shown below.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,
)

[2025-09-15 11:59:36,726][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch `DataLoader`.
- The last batch is never dropped.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time series are in one batch (it has no effect when loading a specific time series).
- A batch consists of:
    - When `time_format` is not `TimeFormat.DATETIME`, the batch is one NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is `TimeFormat.DATETIME`, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period)` containing the times).

In [4]:
# Series-based config: first half of the available time range, a single feature, integer time ids.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Number of time series per batch for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:59:36,730][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:59:36,743][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:59:36,747][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1536.04it/s]
[2025-09-15 11:59:36,813][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:59:36,813][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [224  93 278 217 445 ...  88 120  22 163 398], Length=54
        Val time series IDS: [357 216 381 130  11 ... 520 451 435 503 190], Length=25
        Test time series IDS [456 479 273  18 437 146  80 333 132 187], Length=10
        All time series IDS [224  93 278 217 445 ... 146  80 333 132 187], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# New batch sizes; "config" presumably keeps the value already stored in the config -- confirm in the library docs.
new_batch_sizes = {
    "train_batch_size": 33,
    "val_batch_size": 65,
    "test_batch_size": "config",
    "all_batch_size": "config",
}

# Either call applies the change; both are run here to demonstrate the two alternatives.
series_based_dataset.update_dataset_config_and_initialize(**new_batch_sizes)
# Or
series_based_dataset.set_batch_sizes(**new_batch_sizes)

[2025-09-15 11:59:36,818][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:59:36,819][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:59:36,820][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:59:36,820][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:59:36,822][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Worker counts to apply to every set (0 = load in the main process).
new_worker_counts = {
    "train_workers": 0,
    "val_workers": 0,
    "test_workers": 0,
    "all_workers": 0,
    "init_workers": 0,
}

# Either call applies the change; both are run here to demonstrate the two alternatives.
series_based_dataset.update_dataset_config_and_initialize(**new_worker_counts)
# Or
series_based_dataset.set_workers(**new_worker_counts)

[2025-09-15 11:59:36,827][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:59:36,827][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:59:36,828][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:59:36,828][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:59:36,829][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Iterate the train dataloader once and keep every batch (the last, smaller batch is not dropped).
# workers="config" presumably falls back to `train_workers` from the config.
dataloader = series_based_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, id columns + features).
display(batches[0].shape)

[2025-09-15 11:59:36,837][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 50.66it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Iterate the validation dataloader once and keep every batch.
# workers="config" presumably falls back to `val_workers` from the config.
dataloader = series_based_dataset.get_val_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, id columns + features).
display(batches[0].shape)

[2025-09-15 11:59:36,888][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 53.59it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Iterate the test dataloader once and keep every batch.
# workers="config" presumably falls back to `test_workers` from the config.
dataloader = series_based_dataset.get_test_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, id columns + features).
display(batches[0].shape)

[2025-09-15 11:59:36,917][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 133.08it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Iterate the dataloader over all configured time series once and keep every batch.
# workers="config" presumably falls back to `all_workers` from the config.
dataloader = series_based_dataset.get_all_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch: (series in batch, time steps, id columns + features).
display(batches[0].shape)

[2025-09-15 11:59:36,934][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 38.39it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as before, but times are returned as datetimes instead of integer time ids.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Number of time series per batch for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:59:36,968][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:59:36,979][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:59:36,982][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1971.08it/s]
[2025-09-15 11:59:37,030][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:59:37,030][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [512 401  29 502 101 ... 507 415  25 116 334], Length=54
        Val time series IDS: [300  91  95  92 217 ... 519 147 388 188 240], Length=25
        Test time series IDS [ 22 100 191 419  53 373 377 320 159  16], Length=10
        All time series IDS [512 401  29 502 101 ... 373 377 320 159  16], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test bat

In [12]:
# With TimeFormat.DATETIME every batch is a pair: (data array, array of times).
dataloader = series_based_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-09-15 11:59:37,039][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 120.94it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` method has a `ts_id` parameter.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the time series with that id.

In [13]:
# Train set given as explicit time series ids; no validation or test set is configured.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Number of time series per batch for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:59:37,062][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:59:37,073][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:59:37,076][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 2002.05it/s]
[2025-09-15 11:59:37,080][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:59:37,082][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All

In [14]:
# Request a single time series (id 177) from the train set; batch size has no effect here.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")
batches = list(tqdm(dataloader))

# The only batch holds exactly one series: (1, time steps, id columns + features).
display(batches[0].shape)

[2025-09-15 11:59:37,089][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 20.37it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set for the specified `time_period`.
- Data is returned as a pandas `DataFrame`.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
# Take every available feature; batch sizes are left unset because they do not affect DataFrame loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:59:37,145][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:59:37,153][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:59:37,157][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1616.22it/s]
[2025-09-15 11:59:37,215][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:59:37,216][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 77  94   8 494 332 ... 171 298 204 293 323], Length=54
        Val time series IDS: [238 369  48 338 111 ... 313 296 535  87 168], Length=25
        Test time series IDS [ 44  64 181  75 458 444 215 431  90 311], Length=10
        All time series IDS [ 77  94   8 494 332 ... 444 215 431  90 311], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Train set as one combined DataFrame ...
df = series_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = series_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,77.0,0.0,84.0,599.0,116422.0,62.0,4.77,1.17,68.0,5.23,1.42,71.0,5.46,1.85,0.919922,0.919922,0.429932,0.280029,17.76,163.759995
1,77.0,1.0,77.0,504.0,98007.0,53.0,4.42,1.93,61.0,5.08,2.57,64.0,5.33,2.67,0.850098,0.839844,0.509766,0.459961,22.540001,138.460007
2,77.0,2.0,72.0,464.0,71261.0,55.0,5.0,2.0,55.0,5.0,1.79,60.0,5.45,2.34,0.819824,0.810059,0.409912,0.340088,38.700001,154.860001
3,77.0,3.0,44.0,306.0,70475.0,34.0,3.4,1.51,38.0,3.8,1.99,40.0,4.0,2.11,0.680176,0.660156,0.529785,0.439941,11.83,127.610001
4,77.0,4.0,87.0,5457.0,5333548.0,47.0,4.7,1.7,44.0,4.4,1.26,63.0,6.3,5.5,0.810059,0.77002,0.439941,0.340088,12.06,152.029999
5,77.0,5.0,78.0,1738.0,1368775.0,48.0,4.0,2.13,46.0,3.83,1.27,64.0,5.33,3.65,0.899902,0.850098,0.389893,0.300049,25.09,129.080002
6,77.0,6.0,53.0,610.0,99878.0,41.0,3.15,1.57,41.0,3.15,1.63,44.0,3.38,1.98,0.660156,0.629883,0.549805,0.469971,47.509998,120.900002
7,77.0,7.0,60.0,12774.0,12675012.0,45.0,3.21,1.31,41.0,2.93,1.33,48.0,3.43,1.65,0.759766,0.720215,0.540039,0.47998,60.380001,117.059998
8,77.0,8.0,68.0,866.0,140200.0,49.0,3.77,2.05,42.0,3.23,1.42,58.0,4.46,2.44,0.899902,0.890137,0.5,0.389893,50.349998,135.289993
9,77.0,9.0,70.0,888.0,130403.0,52.0,3.71,2.27,47.0,3.36,1.69,58.0,4.14,2.85,0.779785,0.77002,0.52002,0.399902,61.119999,112.300003


In [17]:
len(dfs) # one DataFrame per train time series, so this equals the train set size

54

#### Val set

- Affected by `val_workers`.

In [18]:
# Validation set as one combined DataFrame ...
df = series_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = series_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,238.0,0.0,89366.0,1847431.0,1467887000.0,58345.0,4.66,3.46,71365.0,5.7,28.120001,73646.0,5.89,21.58,0.939941,0.930176,0.449951,0.419922,13.84,186.979996
1,238.0,1.0,93766.0,1859698.0,1462549000.0,58588.0,4.61,3.15,75718.0,5.95,39.43,74813.0,5.88,22.290001,0.950195,0.930176,0.449951,0.429932,14.18,184.449997
2,238.0,2.0,96753.0,6985598.0,6387581000.0,59938.0,4.61,3.72,71278.0,5.48,28.360001,78797.0,6.06,21.469999,0.939941,0.930176,0.449951,0.429932,14.44,180.449997
3,238.0,3.0,103469.0,8215977.0,7766366000.0,61631.0,4.46,4.11,66434.0,4.8,24.610001,84556.0,6.11,20.030001,0.939941,0.930176,0.469971,0.429932,16.74,169.809998
4,238.0,4.0,116825.0,11198014.0,11456990000.0,69863.0,4.26,4.03,68015.0,4.15,18.84,96554.0,5.88,16.790001,0.939941,0.930176,0.48999,0.439941,19.84,156.389999
5,238.0,5.0,123820.0,15330249.0,15983630000.0,74222.0,4.03,3.92,68414.0,3.71,16.73,102763.0,5.58,15.02,0.930176,0.919922,0.5,0.439941,22.549999,151.449997
6,238.0,6.0,128752.0,19224702.0,17811640000.0,77580.0,3.97,4.02,69234.0,3.54,15.63,107014.0,5.48,14.81,0.939941,0.919922,0.509766,0.449951,24.719999,148.330002
7,238.0,7.0,141303.0,14935254.0,14127880000.0,84785.0,4.19,3.98,76191.0,3.77,16.18,117065.0,5.79,15.21,0.930176,0.919922,0.5,0.439941,24.82,150.300003
8,238.0,8.0,131990.0,21089426.0,18800820000.0,79236.0,4.04,3.97,70717.0,3.6,15.73,109135.0,5.56,14.97,0.939941,0.930176,0.509766,0.449951,25.27,148.800003
9,238.0,9.0,131328.0,10731997.0,9643846000.0,77596.0,3.99,4.15,70155.0,3.61,17.0,107675.0,5.54,15.72,0.939941,0.919922,0.509766,0.449951,25.969999,146.940002


In [19]:
len(dfs) # one DataFrame per validation time series, so this equals the val set size

25

#### Test set

- Affected by `test_workers`.

In [20]:
# Test set as one combined DataFrame ...
df = series_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = series_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,44.0,0.0,3888.0,318495.0,356922900.0,1762.0,8.55,5.43,1933.0,9.38,4.33,2516.0,12.21,9.15,0.919922,0.939941,0.469971,0.459961,4.0,108.769997
1,44.0,1.0,5118.0,656320.0,769353500.0,2011.0,9.62,5.32,2392.0,11.44,5.43,3064.0,14.66,9.25,0.950195,0.959961,0.419922,0.379883,3.61,100.110001
2,44.0,2.0,7028.0,263713.0,214580800.0,2034.0,9.78,5.2,2716.0,13.06,9.03,3605.0,17.33,11.04,0.950195,0.970215,0.429932,0.399902,4.57,99.389999
3,44.0,3.0,4319.0,310325.0,327572700.0,1664.0,8.12,5.1,1958.0,9.55,5.03,2641.0,12.88,9.22,0.919922,0.939941,0.449951,0.449951,4.0,101.040001
4,44.0,4.0,2539.0,840309.0,1070947000.0,1303.0,6.58,4.23,1401.0,7.08,3.39,1785.0,9.02,6.62,0.930176,0.950195,0.459961,0.459961,5.34,104.040001
5,44.0,5.0,2220.0,365286.0,448779600.0,1185.0,6.55,4.52,1243.0,6.87,3.53,1563.0,8.64,6.67,0.919922,0.939941,0.459961,0.439941,4.56,101.209999
6,44.0,6.0,1994.0,360228.0,451670400.0,1047.0,5.6,3.73,1068.0,5.71,3.0,1376.0,7.36,5.75,0.899902,0.930176,0.469971,0.469971,4.45,105.669998
7,44.0,7.0,2636.0,92002.0,78352510.0,1271.0,6.42,4.33,1421.0,7.18,3.68,1829.0,9.24,6.73,0.910156,0.919922,0.439941,0.399902,4.28,106.370003
8,44.0,8.0,3098.0,119940.0,100090300.0,1260.0,6.53,4.23,1538.0,7.97,4.58,2055.0,10.65,6.87,0.919922,0.939941,0.459961,0.429932,4.7,97.220001
9,44.0,9.0,3487.0,19138990.0,25920700000.0,1470.0,7.24,4.15,1795.0,8.84,5.0,2359.0,11.62,7.0,0.910156,0.930176,0.459961,0.429932,4.88,98.860001


In [21]:
len(dfs) # one DataFrame per test time series, so this equals the test set size

10

#### All set

- Affected by `all_workers`.

In [22]:
# All configured time series as one combined DataFrame ...
df = series_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = series_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,77.0,0.0,84.0,599.0,116422.0,62.0,4.77,1.17,68.0,5.23,1.42,71.0,5.46,1.85,0.919922,0.919922,0.429932,0.280029,17.76,163.759995
1,77.0,1.0,77.0,504.0,98007.0,53.0,4.42,1.93,61.0,5.08,2.57,64.0,5.33,2.67,0.850098,0.839844,0.509766,0.459961,22.540001,138.460007
2,77.0,2.0,72.0,464.0,71261.0,55.0,5.0,2.0,55.0,5.0,1.79,60.0,5.45,2.34,0.819824,0.810059,0.409912,0.340088,38.700001,154.860001
3,77.0,3.0,44.0,306.0,70475.0,34.0,3.4,1.51,38.0,3.8,1.99,40.0,4.0,2.11,0.680176,0.660156,0.529785,0.439941,11.83,127.610001
4,77.0,4.0,87.0,5457.0,5333548.0,47.0,4.7,1.7,44.0,4.4,1.26,63.0,6.3,5.5,0.810059,0.77002,0.439941,0.340088,12.06,152.029999
5,77.0,5.0,78.0,1738.0,1368775.0,48.0,4.0,2.13,46.0,3.83,1.27,64.0,5.33,3.65,0.899902,0.850098,0.389893,0.300049,25.09,129.080002
6,77.0,6.0,53.0,610.0,99878.0,41.0,3.15,1.57,41.0,3.15,1.63,44.0,3.38,1.98,0.660156,0.629883,0.549805,0.469971,47.509998,120.900002
7,77.0,7.0,60.0,12774.0,12675012.0,45.0,3.21,1.31,41.0,2.93,1.33,48.0,3.43,1.65,0.759766,0.720215,0.540039,0.47998,60.380001,117.059998
8,77.0,8.0,68.0,866.0,140200.0,49.0,3.77,2.05,42.0,3.23,1.42,58.0,4.46,2.44,0.899902,0.890137,0.5,0.389893,50.349998,135.289993
9,77.0,9.0,70.0,888.0,130403.0,52.0,3.71,2.27,47.0,3.36,1.69,58.0,4.14,2.85,0.779785,0.77002,0.52002,0.399902,61.119999,112.300003


In [23]:
len(dfs) # one DataFrame per time series across train, val and test combined

89

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set for the specified `time_period`.
- Data is returned as one NumPy array.
- The shape follows rules similar to those for DataLoader batches.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [24]:
# Take every available feature; batch sizes are left unset because they do not affect NumPy loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:59:37,540][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:59:37,551][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:59:37,554][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1730.53it/s]
[2025-09-15 11:59:37,607][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:59:37,608][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [524 132 216 430  92 ... 115  25 206  13  53], Length=54
        Val time series IDS: [398 289 139 528 546 ... 505 503  83 478 403], Length=25
        Test time series IDS [131  45 235  48 426 457 424 241 411 266], Length=10
        All time series IDS [524 132 216 430  92 ... 457 424 241 411 266], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [25]:
# Load the whole train set as a single NumPy array.
# workers="config" presumably falls back to `train_workers` from the config.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# Expected layout: (time series, time steps, id columns + features).
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [26]:
# Load the whole validation set as a single NumPy array.
# workers="config" presumably falls back to `val_workers` from the config.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# Expected layout: (time series, time steps, id columns + features).
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [27]:
# Load the whole test set as a single NumPy array.
# workers="config" presumably falls back to `test_workers` from the config.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# Expected layout: (time series, time steps, id columns + features).
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [28]:
# Load all configured time series (train + val + test) as a single NumPy array.
# workers="config" presumably falls back to `all_workers` from the config.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# Expected layout: (time series, time steps, id columns + features).
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [29]:
# Every feature again, but with times returned as datetimes instead of integer time ids.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:59:37,757][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:59:37,769][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:59:37,772][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2310.88it/s]
[2025-09-15 11:59:37,812][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:59:37,813][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [149  42 168 386 406 ... 102 542 178  47 268], Length=54
        Val time series IDS: [212 130 226 231 437 ... 412  13  93 400 124], Length=25
        Test time series IDS [519 114 175 337 329 189  40 239 424  56], Length=10
        All time series IDS [149  42 168 386 406 ... 189  40 239 424  56], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [30]:
# With TimeFormat.DATETIME the call returns a pair: the data array (without the time column)
# and a separate array holding the datetimes of the time steps.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

# One column fewer than with ID_TIME, because time is no longer part of the data array.
display(numpy_array.shape)
# One datetime per time step, shared by all time series.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns time series in the same order as they are set in the config.

In [31]:
# Explicitly request the default order: train series come back in the order stored in the config.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Number of time series per batch for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:59:37,858][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:59:37,868][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:59:37,872][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3289.68it/s]
[2025-09-15 11:59:37,900][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:59:37,901][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [260 452 462 338 545 ...  48   1 129 148  35], Length=54
        Val time series IDS: [221 427 183  12 509 ... 467  37 419 319 547], Length=25
        Test time series IDS [213 178  64 423 330 198 153   9 340  42], Length=10
        All time series IDS [260 452 462 338 545 ... 198 153   9 340  42], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [32]:
# Collect all train batches; with SEQUENTIAL order the series ids follow the config order.
dataloader = series_based_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Show the first batch; each row is (time series id, time id, n_flows).
batches[0]

[2025-09-15 11:59:37,908][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 124.92it/s]


array([[[2.600e+02, 0.000e+00, 0.000e+00],
        [2.600e+02, 1.000e+00, 7.000e+00],
        [2.600e+02, 2.000e+00, 0.000e+00],
        ...,
        [2.600e+02, 3.356e+03, 0.000e+00],
        [2.600e+02, 3.357e+03, 0.000e+00],
        [2.600e+02, 3.358e+03, 8.000e+00]],

       [[4.520e+02, 0.000e+00, 1.555e+03],
        [4.520e+02, 1.000e+00, 1.551e+03],
        [4.520e+02, 2.000e+00, 1.743e+03],
        ...,
        [4.520e+02, 3.356e+03, 6.200e+02],
        [4.520e+02, 3.357e+03, 7.800e+02],
        [4.520e+02, 3.358e+03, 7.300e+02]],

       [[4.620e+02, 0.000e+00, 3.300e+01],
        [4.620e+02, 1.000e+00, 3.500e+01],
        [4.620e+02, 2.000e+00, 3.000e+01],
        ...,
        [4.620e+02, 3.356e+03, 3.200e+01],
        [4.620e+02, 3.357e+03, 7.000e+00],
        [4.620e+02, 3.358e+03, 2.600e+01]],

       ...,

       [[5.030e+02, 0.000e+00, 5.300e+01],
        [5.030e+02, 1.000e+00, 4.200e+01],
        [5.030e+02, 2.000e+00, 8.900e+01],
        ...,
        [5.030e+02, 3.356e

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same each time.

In [33]:
config = SeriesBasedConfig(
    # Data selection: time period, time series per set, features, time format.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts; 0 means loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch size per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Shuffle the time series in train batches; random_state is left unset,
    # so the result is not pinned to a seed.
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:59:37,932][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:59:37,942][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:59:37,946][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2685.33it/s]
[2025-09-15 11:59:37,983][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:59:37,983][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 35 545 234 184 225 ... 481 540 389 224 258], Length=54
        Val time series IDS: [246 228 362 291  54 ... 482 375 174 229 493], Length=25
        Test time series IDS [416 498 367 310  95  32  26 216 359   8], Length=10
        All time series IDS [ 35 545 234 184 225 ...  32  26 216 359   8], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [34]:
# Build the train dataloader for the currently active config.
# NOTE(review): workers="config" presumably defers to `train_workers` from the
# active config - confirm against the cesnet_tszoo documentation.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once (tqdm shows progress) and keep every batch.
batches = list(tqdm(dataloader))

# Show the first batch as the cell output.
batches[0]

[2025-09-15 11:59:37,990][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 108.06it/s]


array([[[2.6400e+02, 0.0000e+00, 1.0310e+04],
        [2.6400e+02, 1.0000e+00, 1.1968e+04],
        [2.6400e+02, 2.0000e+00, 3.4052e+04],
        ...,
        [2.6400e+02, 3.3560e+03, 9.7650e+03],
        [2.6400e+02, 3.3570e+03, 1.0968e+04],
        [2.6400e+02, 3.3580e+03, 1.0474e+04]],

       [[3.8900e+02, 0.0000e+00, 7.6300e+02],
        [3.8900e+02, 1.0000e+00, 7.5500e+02],
        [3.8900e+02, 2.0000e+00, 8.9200e+02],
        ...,
        [3.8900e+02, 3.3560e+03, 7.0000e+02],
        [3.8900e+02, 3.3570e+03, 6.0400e+02],
        [3.8900e+02, 3.3580e+03, 5.1000e+02]],

       [[1.6900e+02, 0.0000e+00, 1.1200e+02],
        [1.6900e+02, 1.0000e+00, 9.8000e+01],
        [1.6900e+02, 2.0000e+00, 8.8000e+01],
        ...,
        [1.6900e+02, 3.3560e+03, 2.0500e+02],
        [1.6900e+02, 3.3570e+03, 1.5100e+02],
        [1.6900e+02, 3.3580e+03, 1.6800e+02]],

       ...,

       [[4.1900e+02, 0.0000e+00, 2.1210e+03],
        [4.1900e+02, 1.0000e+00, 2.2890e+03],
        [4.1900e+02, 2