# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Show INFO-level (and above) records, prefixed with timestamp, logger name and level.
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

### Preparing dataset

In [3]:
# Open the hourly-aggregated institution-subnet data as a series-based dataset.
# NOTE: "/some_directory/" is presumably a placeholder; point data_root at your own data directory.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,  # prints the "Dataset details" summary
)

[2025-09-14 15:55:26,465][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- Last batch is never dropped.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size affects how many time series will be in one batch (no effect when loading a specific time series).
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, Numpy array of shape `(time_period)` of time)

In [4]:
# Sizes of the train/val/test sets, the single feature to load, and the
# per-set worker counts and batch sizes used by the dataloaders below.
config = SeriesBasedConfig(
    time_period=0.5,  # the printed config reports range(0, 3359) out of 6717 time indices
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,  # prints the "Config Details" summary
    workers=0,
)

[2025-09-14 15:55:26,471][series_config][INFO] - Quick validation succeeded.
[2025-09-14 15:55:26,482][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:55:26,486][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1364.27it/s]
[2025-09-14 15:55:26,559][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 15:55:26,560][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [  5 462 475 202 445 ...  24 387 449 375 381], Length=54
        Val time series IDS: [ 65 178 290 311 280 ... 164 546 517 151 450], Length=25
        Test time series IDS [455 363 294 245 468 236  70 459 444 513], Length=10
        All time series IDS [  5 462 475 202 445 ... 236  70 459 444 513], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val batch size: 6

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# The two calls below are alternatives that receive the same arguments; one of them is enough.
# "config" presumably keeps the value already stored in the config — TODO confirm.
new_batch_sizes = dict(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
series_based_dataset.update_dataset_config_and_initialize(**new_batch_sizes)
# Or
series_based_dataset.set_batch_sizes(**new_batch_sizes)

[2025-09-14 15:55:26,566][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:55:26,566][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:55:26,567][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:55:26,567][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:55:26,569][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# The two calls below are alternatives that receive the same arguments; one of them is enough.
new_workers = dict(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.update_dataset_config_and_initialize(**new_workers)
# Or
series_based_dataset.set_workers(**new_workers)

[2025-09-14 15:55:26,573][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:55:26,574][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:55:26,575][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 15:55:26,576][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 15:55:26,576][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# workers="config" presumably uses the worker count stored in the config — TODO confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader and keep every batch; tqdm shows the progress.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-14 15:55:26,584][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 49.96it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# workers="config" presumably uses the worker count stored in the config — TODO confirm.
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Drain the dataloader and keep every batch; tqdm shows the progress.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-14 15:55:26,636][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 52.58it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# workers="config" presumably uses the worker count stored in the config — TODO confirm.
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Drain the dataloader and keep every batch; tqdm shows the progress.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-14 15:55:26,666][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 105.11it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# workers="config" presumably uses the worker count stored in the config — TODO confirm.
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Drain the dataloader and keep every batch; tqdm shows the progress.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-14 15:55:26,684][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 37.70it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but with time returned as datetimes.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:55:26,719][series_config][INFO] - Quick validation succeeded.
[2025-09-14 15:55:26,731][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:55:26,735][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1819.53it/s]
[2025-09-14 15:55:26,787][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 15:55:26,788][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [256 343 363  19  74 ... 319 461 475 186 271], Length=54
        Val time series IDS: [499 501 239 147  81 ...  53 445  92   8 175], Length=25
        Test time series IDS [ 45 180 502 313 142 440  37 339 421 135], Length=10
        All time series IDS [256 343 363  19  74 ... 440  37 339 421 135], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val batch size: 

In [12]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader and keep every batch; tqdm shows the progress.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch holds the data at index 0 and the time at index 1.
first_batch = batches[0]
display(first_batch[0].shape) # data without time
display(first_batch[1].shape) # time

[2025-09-14 15:55:26,796][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 111.03it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a parameter `ts_id`.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the one time series with the specified id.

In [13]:
# Explicit train IDs; val_ts/test_ts are not passed, and the printed config
# reports None for both of those sets.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:55:26,822][series_config][INFO] - Quick validation succeeded.
[2025-09-14 15:55:26,832][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:55:26,835][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 2000.38it/s]
[2025-09-14 15:55:26,839][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 15:55:26,839][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker c

In [14]:
# Request only the time series with id 177 from the train set.
dataloader = series_based_dataset.get_train_dataloader(
    ts_id=177,
    workers="config",
)

# Drain the dataloader and keep every batch; tqdm shows the progress.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-14 15:55:26,847][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 887.31it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in set with specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [15]:
# Load every feature this time; batch sizes are not passed in this config.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:55:26,855][series_config][INFO] - Quick validation succeeded.
[2025-09-14 15:55:26,865][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:55:26,869][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1615.88it/s]
[2025-09-14 15:55:26,926][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 15:55:26,926][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [292 324 263 514 428 ... 442 233 168 149  61], Length=54
        Val time series IDS: [183 433 267 491 449 ... 253 370  77  89  46], Length=25
        Test time series IDS [476  60  96  48 100 236 362 212 300 521], Length=10
        All time series IDS [292 324 263 514 428 ... 236 362 212 300 521], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Fetch the train set twice: once as a single DataFrame, once as separate per-series DataFrames.
df = series_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = series_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,292.0,0.0,26.0,32.0,1378.0,23.0,4.6,1.82,25.0,5.0,2.24,25.0,5.0,2.24,0.97998,0.950195,0.370117,0.360107,0.02,220.830002
1,292.0,1.0,32.0,37.0,1588.0,31.0,6.2,3.03,32.0,6.4,3.05,32.0,6.4,3.05,0.97998,0.97998,0.320068,0.300049,0.05,211.100006
2,292.0,2.0,20.0,24.0,980.0,20.0,4.0,1.0,20.0,4.0,1.0,20.0,4.0,1.0,1.0,1.0,0.48999,0.47998,0.0,188.940002
3,292.0,3.0,15.0,18.0,904.0,11.0,3.67,0.58,14.0,4.67,1.15,13.0,4.33,1.53,0.959961,0.850098,0.399902,0.350098,0.01,241.490005
4,292.0,4.0,14.0,15.0,620.0,13.0,2.6,1.14,14.0,2.8,1.48,14.0,2.8,1.48,1.0,1.0,0.509766,0.5,0.0,208.690002
5,292.0,5.0,16.0,21.0,837.0,16.0,3.2,1.64,16.0,3.2,1.64,16.0,3.2,1.64,0.97998,0.97998,0.389893,0.389893,0.01,217.199997
6,292.0,6.0,17.0,18.0,724.0,16.0,4.0,1.15,17.0,4.25,1.5,17.0,4.25,1.5,1.0,1.0,0.439941,0.439941,0.0,250.550003
7,292.0,7.0,18.0,23.0,1168.0,13.0,2.6,1.14,14.0,2.8,1.1,18.0,3.6,1.95,0.779785,0.759766,0.549805,0.540039,0.76,207.610001
8,292.0,8.0,9.0,12.0,492.0,7.0,2.33,0.58,8.0,2.67,1.15,8.0,2.67,1.15,1.0,1.0,0.340088,0.340088,0.0,162.25
9,292.0,9.0,16.0,33.0,1368.0,16.0,3.2,0.84,16.0,3.2,0.84,16.0,3.2,0.84,0.709961,0.720215,0.669922,0.649902,0.4,189.029999


In [17]:
# Count of per-series DataFrames returned with as_single_dataframe=False for the train set.
len(dfs) # every time series has its own dataframe

54

#### Val set

- Affected by `val_workers`.

In [18]:
# Fetch the val set twice: once as a single DataFrame, once as separate per-series DataFrames.
df = series_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = series_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,183.0,0.0,38.0,126.0,24223.0,27.0,3.38,1.19,32.0,4.0,1.31,32.0,4.0,1.07,0.580078,0.560059,0.379883,0.439941,2.16,131.089996
1,183.0,1.0,42.0,24126.0,15417273.0,33.0,4.71,2.75,37.0,5.29,2.75,39.0,5.57,2.76,0.569824,0.509766,0.48999,0.549805,16.6,169.520004
2,183.0,2.0,42.0,69693.0,55801517.0,37.0,4.62,2.92,40.0,5.0,3.02,40.0,5.0,3.02,0.439941,0.360107,0.320068,0.419922,40.84,167.149994
3,183.0,3.0,29.0,23171.0,14987599.0,25.0,4.17,1.47,27.0,4.5,1.76,27.0,4.5,1.64,0.439941,0.399902,0.600098,0.77002,9.09,151.580002
4,183.0,4.0,11.0,28326.0,20286631.0,6.0,3.0,1.41,7.0,3.5,2.12,8.0,4.0,2.83,0.080017,0.049988,0.589844,0.660156,75.169998,89.360001
5,183.0,5.0,20.0,43.0,6679.0,16.0,3.2,1.64,18.0,3.6,2.07,18.0,3.6,2.07,0.509766,0.48999,0.680176,0.759766,1.33,125.230003
6,183.0,6.0,55.0,67893.0,55780246.0,20.0,2.86,0.9,51.0,7.29,10.52,24.0,3.43,1.4,0.449951,0.399902,0.459961,0.560059,2.67,123.419998
7,183.0,7.0,21.0,186311.0,155691841.0,15.0,3.0,1.73,16.0,3.2,1.79,19.0,3.8,2.17,0.099976,0.059998,0.47998,0.600098,60.560001,125.309998
8,183.0,8.0,13.0,268795.0,245469967.0,12.0,4.0,1.73,12.0,4.0,1.73,13.0,4.33,2.08,0.25,0.22998,0.320068,0.409912,50.580002,177.990005
9,183.0,9.0,14.0,133572.0,118235305.0,13.0,3.25,2.06,13.0,3.25,2.06,14.0,3.5,2.08,0.189941,0.140015,0.589844,0.700195,25.690001,120.650002


In [19]:
# Count of per-series DataFrames returned with as_single_dataframe=False for the val set.
len(dfs) # every time series has its own dataframe

25

#### Test set

- Affected by `test_workers`.

In [20]:
# Fetch the test set twice: once as a single DataFrame, once as separate per-series DataFrames.
df = series_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = series_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,476.0,0.0,466.0,10563.0,6760310.0,258.0,7.37,4.03,224.0,6.4,2.76,377.0,10.77,10.32,0.75,0.75,0.370117,0.330078,11.2,169.710007
1,476.0,1.0,562.0,7706.0,2652022.0,307.0,8.08,4.75,275.0,7.24,3.1,444.0,11.68,10.96,0.669922,0.620117,0.439941,0.439941,14.82,166.199997
2,476.0,2.0,655.0,28679.0,19523320.0,284.0,7.47,6.48,210.0,5.53,2.32,520.0,13.68,19.879999,0.709961,0.680176,0.439941,0.399902,10.35,160.820007
3,476.0,3.0,1098.0,52203.0,35600811.0,320.0,8.89,9.26,199.0,5.53,2.91,810.0,22.5,43.279999,0.669922,0.669922,0.459961,0.429932,14.64,170.940002
4,476.0,4.0,2274.0,279701.0,238531000.0,369.0,10.25,14.04,212.0,5.89,4.46,1507.0,41.860001,83.699997,0.649902,0.629883,0.439941,0.389893,9.4,154.580002
5,476.0,5.0,1587.0,94535.0,63135973.0,294.0,9.19,11.44,170.0,5.31,3.8,1067.0,33.34,62.950001,0.740234,0.709961,0.399902,0.340088,12.23,159.050003
6,476.0,6.0,2476.0,258391.0,209849749.0,324.0,11.57,14.48,163.0,5.82,4.71,1497.0,53.459999,101.540001,0.620117,0.600098,0.469971,0.409912,18.809999,134.800003
7,476.0,7.0,3629.0,998413.0,977659599.0,398.0,10.76,16.299999,189.0,5.11,3.66,2095.0,56.619999,123.639999,0.709961,0.700195,0.459961,0.419922,15.54,170.419998
8,476.0,8.0,1531.0,135517.0,131122786.0,305.0,10.89,12.51,157.0,5.61,3.51,1075.0,38.389999,65.489998,0.620117,0.629883,0.47998,0.419922,13.46,151.399994
9,476.0,9.0,2631.0,293544.0,253740690.0,371.0,11.24,16.790001,173.0,5.24,4.83,1729.0,52.389999,104.849998,0.640137,0.640137,0.48999,0.429932,14.65,162.669998


In [21]:
# Count of per-series DataFrames returned with as_single_dataframe=False for the test set.
len(dfs) # every time series has its own dataframe

10

#### All set

- Affected by `all_workers`.

In [22]:
# Fetch the "all" set twice: once as a single DataFrame, once as separate per-series DataFrames.
df = series_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
dfs = series_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,292.0,0.0,26.0,32.0,1378.0,23.0,4.6,1.82,25.0,5.0,2.24,25.0,5.0,2.24,0.97998,0.950195,0.370117,0.360107,0.02,220.830002
1,292.0,1.0,32.0,37.0,1588.0,31.0,6.2,3.03,32.0,6.4,3.05,32.0,6.4,3.05,0.97998,0.97998,0.320068,0.300049,0.05,211.100006
2,292.0,2.0,20.0,24.0,980.0,20.0,4.0,1.0,20.0,4.0,1.0,20.0,4.0,1.0,1.0,1.0,0.48999,0.47998,0.0,188.940002
3,292.0,3.0,15.0,18.0,904.0,11.0,3.67,0.58,14.0,4.67,1.15,13.0,4.33,1.53,0.959961,0.850098,0.399902,0.350098,0.01,241.490005
4,292.0,4.0,14.0,15.0,620.0,13.0,2.6,1.14,14.0,2.8,1.48,14.0,2.8,1.48,1.0,1.0,0.509766,0.5,0.0,208.690002
5,292.0,5.0,16.0,21.0,837.0,16.0,3.2,1.64,16.0,3.2,1.64,16.0,3.2,1.64,0.97998,0.97998,0.389893,0.389893,0.01,217.199997
6,292.0,6.0,17.0,18.0,724.0,16.0,4.0,1.15,17.0,4.25,1.5,17.0,4.25,1.5,1.0,1.0,0.439941,0.439941,0.0,250.550003
7,292.0,7.0,18.0,23.0,1168.0,13.0,2.6,1.14,14.0,2.8,1.1,18.0,3.6,1.95,0.779785,0.759766,0.549805,0.540039,0.76,207.610001
8,292.0,8.0,9.0,12.0,492.0,7.0,2.33,0.58,8.0,2.67,1.15,8.0,2.67,1.15,1.0,1.0,0.340088,0.340088,0.0,162.25
9,292.0,9.0,16.0,33.0,1368.0,16.0,3.2,0.84,16.0,3.2,0.84,16.0,3.2,0.84,0.709961,0.720215,0.669922,0.649902,0.4,189.029999


In [23]:
# Count of per-series DataFrames returned with as_single_dataframe=False for the "all" set.
len(dfs) # every time series has its own dataframe

89

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in set with specified `time_period`.
- Data is returned as one Numpy array.
- Follows similar rules to Dataloader batches, regarding shape.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [24]:
# Load every feature with integer time ids; batch sizes are not passed in this config.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:55:27,253][series_config][INFO] - Quick validation succeeded.
[2025-09-14 15:55:27,264][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:55:27,318][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1772.05it/s]
[2025-09-14 15:55:27,373][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 15:55:27,373][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 87 348 110  78 176 ... 529 179 269 401 174], Length=54
        Val time series IDS: [123 136  73 450 314 ... 338 464   9  51  20], Length=25
        Test time series IDS [481 211 160 345  85 373 229 451 384  46], Length=10
        All time series IDS [ 87 348 110  78 176 ... 373 229 451 384  46], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [25]:
# Load the whole train set as a single array.
# workers="config" presumably uses the worker count stored in the config — TODO confirm.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# The displayed shape is (time series, time steps, columns).
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [26]:
# Load the whole val set as a single array.
# workers="config" presumably uses the worker count stored in the config — TODO confirm.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# The displayed shape is (time series, time steps, columns).
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [27]:
# Load the whole test set as a single array.
# workers="config" presumably uses the worker count stored in the config — TODO confirm.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# The displayed shape is (time series, time steps, columns).
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [28]:
# Load the whole "all" set as a single array.
# workers="config" presumably uses the worker count stored in the config — TODO confirm.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# The displayed shape is (time series, time steps, columns).
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [29]:
# Load every feature with time returned as datetimes; batch sizes are not passed in this config.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:55:27,537][series_config][INFO] - Quick validation succeeded.
[2025-09-14 15:55:27,549][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:55:27,554][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2042.27it/s]
[2025-09-14 15:55:27,600][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 15:55:27,601][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 30  20  34 479 159 ... 157 167 256 465 365], Length=54
        Val time series IDS: [304  39 391 459 514 ...   3 387 364  68 346], Length=25
        Test time series IDS [290 334 298  36 358  81 373 286 474 245], Length=10
        All time series IDS [ 30  20  34 479 159 ...  81 373 286 474 245], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [30]:
# With TimeFormat.DATETIME the call returns two values: the data array and a separate array of times.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

# The data has one column fewer than in the ID_TIME example, because time is returned separately.
display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns data in the same order as they are set in config.

In [31]:
# Train data is served in sequential order.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:55:27,647][series_config][INFO] - Quick validation succeeded.
[2025-09-14 15:55:27,659][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:55:27,662][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2869.12it/s]
[2025-09-14 15:55:27,694][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 15:55:27,695][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 58 194 534 457 265 ... 192 231  50  18 359], Length=54
        Val time series IDS: [229 414 116 272 137 ... 528 391 529  56 501], Length=25
        Test time series IDS [ 77 519 182 159 186 136 342 148 514 356], Length=10
        All time series IDS [ 58 194 534 457 265 ... 136 342 148 514 356], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val batch size: 6

In [32]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader and keep every batch; tqdm shows the progress.
batches = list(tqdm(dataloader))

# Show the first batch; its leading series ids follow the train ID order printed in the config.
batches[0]

[2025-09-14 15:55:27,703][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 120.48it/s]


array([[[5.8000e+01, 0.0000e+00, 9.1740e+03],
        [5.8000e+01, 1.0000e+00, 1.0388e+04],
        [5.8000e+01, 2.0000e+00, 9.2480e+03],
        ...,
        [5.8000e+01, 3.3560e+03, 6.0470e+03],
        [5.8000e+01, 3.3570e+03, 6.7010e+03],
        [5.8000e+01, 3.3580e+03, 6.3300e+03]],

       [[1.9400e+02, 0.0000e+00, 1.7720e+03],
        [1.9400e+02, 1.0000e+00, 2.6210e+03],
        [1.9400e+02, 2.0000e+00, 5.0880e+03],
        ...,
        [1.9400e+02, 3.3560e+03, 1.6920e+03],
        [1.9400e+02, 3.3570e+03, 1.4970e+03],
        [1.9400e+02, 3.3580e+03, 1.7000e+03]],

       [[5.3400e+02, 0.0000e+00, 1.7200e+02],
        [5.3400e+02, 1.0000e+00, 1.0700e+02],
        [5.3400e+02, 2.0000e+00, 1.9400e+02],
        ...,
        [5.3400e+02, 3.3560e+03, 1.8600e+02],
        [5.3400e+02, 3.3570e+03, 9.2000e+01],
        [5.3400e+02, 3.3580e+03, 1.4800e+02]],

       ...,

       [[1.7100e+02, 0.0000e+00, 6.8090e+03],
        [1.7100e+02, 1.0000e+00, 9.5330e+03],
        [1.7100e+02, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same.

In [33]:
# Train data is served in random order.
# random_state=None presumably leaves the shuffle unseeded — TODO confirm.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 15:55:27,727][series_config][INFO] - Quick validation succeeded.
[2025-09-14 15:55:27,738][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 15:55:27,742][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2832.14it/s]
[2025-09-14 15:55:27,776][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 15:55:27,777][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [351 107 275 117 447 ... 417 342 286 231 243], Length=54
        Val time series IDS: [440 170  76 439 383 ...   3 353 151 192 246], Length=25
        Test time series IDS [341 216 128 120 331 515 438 431 138 327], Length=10
        All time series IDS [351 107 275 117 447 ... 515 438 431 138 327], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val batch size: 6

In [34]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader and keep every batch; tqdm shows the progress.
batches = list(tqdm(dataloader))

# Show the first batch; with RANDOM order the series no longer follow the configured ID order.
batches[0]

[2025-09-14 15:55:27,785][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 111.01it/s]


array([[[2.750e+02, 0.000e+00, 6.230e+02],
        [2.750e+02, 1.000e+00, 1.226e+03],
        [2.750e+02, 2.000e+00, 4.325e+03],
        ...,
        [2.750e+02, 3.356e+03, 1.342e+03],
        [2.750e+02, 3.357e+03, 1.496e+03],
        [2.750e+02, 3.358e+03, 1.523e+03]],

       [[2.790e+02, 0.000e+00, 4.550e+02],
        [2.790e+02, 1.000e+00, 5.800e+02],
        [2.790e+02, 2.000e+00, 4.420e+02],
        ...,
        [2.790e+02, 3.356e+03, 2.730e+02],
        [2.790e+02, 3.357e+03, 3.050e+02],
        [2.790e+02, 3.358e+03, 2.900e+02]],

       [[7.800e+01, 0.000e+00, 9.600e+01],
        [7.800e+01, 1.000e+00, 1.160e+02],
        [7.800e+01, 2.000e+00, 8.200e+01],
        ...,
        [7.800e+01, 3.356e+03, 6.800e+01],
        [7.800e+01, 3.357e+03, 6.000e+01],
        [7.800e+01, 3.358e+03, 9.100e+01]],

       ...,

       [[1.150e+02, 0.000e+00, 5.420e+02],
        [1.150e+02, 1.000e+00, 3.950e+02],
        [1.150e+02, 2.000e+00, 2.000e+02],
        ...,
        [1.150e+02, 3.356e