# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Show INFO-level (and above) records, each prefixed with timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Series-based dataset over the hourly institution-subnet data.
# display_details=True prints the "Dataset details" summary.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,
)

[2025-09-15 11:34:07,439][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped.
- Workers determine how many processes are used for loading data of a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size determines how many time series are in one batch (it has no effect when loading a specific time series).
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, Numpy array of shape `(time_period)` of time)

In [4]:
# Dataloader example config: a single feature with ID-based time, every worker
# count set to 0, and a separate batch size per set.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:34:07,444][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:34:07,455][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:34:07,459][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1412.98it/s]
[2025-09-15 11:34:07,530][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:34:07,531][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [316 188 527 375 292 ...  18 467 300 380  36], Length=54
        Val time series IDS: [192 404 250 529 204 ...  38 298 447 224 453], Length=25
        Test time series IDS [490 128 515 145  98  74 419 183 118   2], Length=10
        All time series IDS [316 188 527 375 292 ...  74 419 183 118   2], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val bat

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Two ways of changing the batch sizes after initialization.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm against the library docs.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Or
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-09-15 11:34:07,537][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:34:07,537][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:34:07,539][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:34:07,539][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:34:07,539][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Two ways of changing the worker counts after initialization.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Or
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-09-15 11:34:07,544][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:34:07,545][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:34:07,546][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-15 11:34:07,546][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-15 11:34:07,547][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every train batch; tqdm reports progress while iterating.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-15 11:34:07,555][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 47.17it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Collect every val batch; tqdm reports progress while iterating.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-15 11:34:07,608][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 55.51it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Collect every test batch; tqdm reports progress while iterating.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-15 11:34:07,636][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 117.48it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Collect every batch of the "all" set; tqdm reports progress while iterating.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-15 11:34:07,654][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 35.52it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same dataloader setup as before, but with time_format=TimeFormat.DATETIME.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:34:07,689][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:34:07,700][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:34:07,704][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2023.67it/s]
[2025-09-15 11:34:07,751][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:34:07,751][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [545 220 115 148  58 ... 454 310 189  98 283], Length=54
        Val time series IDS: [437 458  74 158  75 ...  97 531 227 229 200], Length=25
        Test time series IDS [263 114 337 413 213 138 449 326 248 208], Length=10
        All time series IDS [545 220 115 148  58 ... 138 449 326 248 208], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val ba

In [12]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every train batch; tqdm reports progress while iterating.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch is a (data, time) pair.
first_data, first_times = batches[0]
display(first_data.shape) # data without time
display(first_times.shape) # time

[2025-09-15 11:34:07,759][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 120.95it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has the parameter `ts_id`.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, it returns only the single time series with the specified id.

In [13]:
# train_ts is given as an explicit list of time series ids; val_ts and test_ts are left unset.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:34:07,782][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:34:07,792][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:34:07,796][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 1332.90it/s]
[2025-09-15 11:34:07,800][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:34:07,800][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Tes

In [14]:
# Request only the time series with id 177 from the train set.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Collect every batch; tqdm reports progress while iterating.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-15 11:34:07,808][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<?, ?it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as a Pandas DataFrame.
- Workers determine how many processes are used for loading data of a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [15]:
# DataFrame example config: all features, ID-based time, every worker count set to 0.
# Batch sizes are not set here.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:34:07,815][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:34:07,826][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:34:07,829][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1691.97it/s]
[2025-09-15 11:34:07,884][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:34:07,884][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [541 209 346  36 213 ... 190  52 311 398 287], Length=54
        Val time series IDS: [185   9 365 211 507 ... 191 485 344  92 225], Length=25
        Test time series IDS [399 170 307 240   2  25 352 331  14 480], Length=10
        All time series IDS [541 209 346  36 213 ...  25 352 331  14 480], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Load the train set in both forms: `df` is one DataFrame holding the whole set,
# `dfs` holds one DataFrame per time series.
df = series_based_dataset.get_train_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,541.0,0.0,419.0,5376.0,2122937.0,176.0,8.8,6.4,135.0,6.75,3.31,346.0,17.299999,19.309999,0.959961,0.950195,0.529785,0.5,19.799999,196.990005
1,541.0,1.0,526.0,15698.0,9113469.0,187.0,8.13,5.85,145.0,6.3,2.01,419.0,18.219999,22.469999,0.930176,0.950195,0.419922,0.360107,16.43,198.070007
2,541.0,2.0,612.0,9362.0,4178464.0,221.0,9.61,9.07,153.0,6.65,3.51,493.0,21.43,29.110001,0.950195,0.930176,0.459961,0.399902,11.53,190.380005
3,541.0,3.0,423.0,11421.0,4869419.0,198.0,8.61,7.25,141.0,6.13,2.7,357.0,15.52,18.43,0.879883,0.890137,0.449951,0.389893,16.4,179.669998
4,541.0,4.0,558.0,15396.0,7198935.0,220.0,10.48,12.02,108.0,5.14,2.57,473.0,22.52,32.720001,0.950195,0.959961,0.47998,0.419922,16.280001,190.360001
5,541.0,5.0,400.0,29822.0,9551678.0,173.0,9.11,8.9,104.0,5.47,3.42,350.0,18.42,22.200001,0.830078,0.819824,0.48999,0.429932,16.65,176.300003
6,541.0,6.0,559.0,18844.0,10420377.0,213.0,10.65,13.05,98.0,4.9,3.26,483.0,24.15,35.290001,0.919922,0.919922,0.409912,0.350098,18.68,199.320007
7,541.0,7.0,651.0,20259.0,12341248.0,228.0,12.0,14.55,102.0,5.37,3.42,530.0,27.889999,39.459999,0.939941,0.950195,0.5,0.439941,13.22,184.039993
8,541.0,8.0,485.0,21952.0,18822770.0,172.0,7.82,8.73,92.0,4.18,2.68,399.0,18.139999,26.690001,0.919922,0.910156,0.419922,0.360107,17.18,183.779999
9,541.0,9.0,493.0,19469.0,13193971.0,183.0,9.63,10.02,101.0,5.32,3.4,429.0,22.58,29.950001,0.939941,0.959961,0.459961,0.419922,20.35,175.130005


In [17]:
len(dfs) # one DataFrame per train time series, so this is the number of train time series

54

#### Val set

- Affected by `val_workers`.

In [18]:
# Load the val set in both forms: `df` is one DataFrame holding the whole set,
# `dfs` holds one DataFrame per time series.
df = series_based_dataset.get_val_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,185.0,0.0,56.0,83.0,3526.0,50.0,4.55,1.44,52.0,4.73,1.68,53.0,4.82,1.72,0.899902,0.890137,0.429932,0.429932,0.56,206.470001
1,185.0,1.0,77.0,114.0,4778.0,64.0,5.33,2.46,72.0,6.0,2.34,73.0,6.08,2.54,0.950195,0.930176,0.409912,0.399902,0.03,211.399994
2,185.0,2.0,81.0,112.0,4778.0,62.0,5.17,1.9,75.0,6.25,2.8,67.0,5.58,1.78,0.939941,0.919922,0.340088,0.320068,0.03,199.550003
3,185.0,3.0,62.0,86.0,3596.0,54.0,4.91,2.3,59.0,5.36,2.42,58.0,5.27,2.37,0.97998,0.970215,0.47998,0.459961,0.08,206.270004
4,185.0,4.0,44.0,66.0,3132.0,37.0,4.62,1.69,43.0,5.38,1.6,44.0,5.5,1.77,0.890137,0.830078,0.399902,0.399902,0.9,199.410004
5,185.0,5.0,32.0,54.0,2276.0,31.0,3.44,1.24,31.0,3.44,1.24,31.0,3.44,1.24,1.0,1.0,0.469971,0.459961,0.05,198.649994
6,185.0,6.0,37.0,50.0,2772.0,35.0,3.18,0.87,36.0,3.27,0.79,37.0,3.36,0.81,0.830078,0.77002,0.48999,0.47998,0.39,212.779999
7,185.0,7.0,42.0,57.0,3067.0,36.0,3.6,1.51,41.0,4.1,1.85,40.0,4.0,1.83,0.899902,0.810059,0.48999,0.439941,0.05,197.369995
8,185.0,8.0,36.0,54.0,2207.0,30.0,3.0,1.25,35.0,3.5,1.43,36.0,3.6,1.43,0.97998,0.97998,0.409912,0.409912,0.02,231.75
9,185.0,9.0,44.0,57.0,2793.0,37.0,3.7,1.49,42.0,4.2,1.93,44.0,4.4,2.22,0.899902,0.850098,0.47998,0.459961,0.34,227.809998


In [19]:
len(dfs) # one DataFrame per val time series, so this is the number of val time series

25

#### Test set

- Affected by `test_workers`.

In [20]:
# Load the test set in both forms: `df` is one DataFrame holding the whole set,
# `dfs` holds one DataFrame per time series.
df = series_based_dataset.get_test_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,399.0,0.0,10244.0,128303.0,73406760.0,1983.0,15.02,14.81,2219.0,16.809999,18.799999,6598.0,49.98,57.279999,0.629883,0.740234,0.5,0.540039,21.700001,142.649994
1,399.0,1.0,10842.0,331229.0,276362500.0,2027.0,15.36,14.59,2452.0,18.58,21.209999,6860.0,51.970001,60.950001,0.629883,0.72998,0.5,0.549805,21.120001,142.229996
2,399.0,2.0,11279.0,288662.0,218392700.0,2011.0,15.35,15.92,2709.0,20.68,31.280001,6926.0,52.869999,67.120003,0.649902,0.72998,0.47998,0.52002,19.42,136.429993
3,399.0,3.0,13981.0,1414494.0,1381182000.0,2085.0,16.040001,20.01,2367.0,18.209999,26.360001,7968.0,61.290001,97.510002,0.589844,0.680176,0.48999,0.540039,17.09,135.360001
4,399.0,4.0,17911.0,2012345.0,2284565000.0,2257.0,17.23,25.370001,1889.0,14.42,16.08,9758.0,74.489998,140.759995,0.600098,0.669922,0.48999,0.52002,17.370001,135.899994
5,399.0,5.0,18698.0,1562531.0,1446071000.0,2323.0,17.870001,26.440001,1709.0,13.15,14.94,10081.0,77.550003,147.850006,0.569824,0.629883,0.5,0.52002,17.790001,132.639999
6,399.0,6.0,18809.0,5911058.0,7526244000.0,2338.0,17.709999,25.780001,1669.0,12.64,13.13,10334.0,78.290001,150.789993,0.589844,0.640137,0.47998,0.5,17.84,136.039993
7,399.0,7.0,20364.0,4846245.0,6185210000.0,2409.0,19.120001,27.059999,1824.0,14.48,14.42,10894.0,86.459999,164.210007,0.649902,0.709961,0.52002,0.560059,18.68,131.330002
8,399.0,8.0,21972.0,3773057.0,3490981000.0,2347.0,18.48,27.280001,1704.0,13.42,15.47,10923.0,86.010002,164.550003,0.580078,0.660156,0.52002,0.529785,17.190001,134.910004
9,399.0,9.0,24141.0,1338077.0,1107975000.0,2425.0,18.950001,28.4,1728.0,13.5,16.51,11207.0,87.550003,165.910004,0.569824,0.640137,0.48999,0.509766,17.200001,134.869995


In [21]:
len(dfs) # one DataFrame per test time series, so this is the number of test time series

10

#### All set

- Affected by `all_workers`.

In [22]:
# Load the "all" set in both forms: `df` is one DataFrame holding the whole set,
# `dfs` holds one DataFrame per time series.
df = series_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview the first 10 rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,541.0,0.0,419.0,5376.0,2122937.0,176.0,8.8,6.4,135.0,6.75,3.31,346.0,17.299999,19.309999,0.959961,0.950195,0.529785,0.5,19.799999,196.990005
1,541.0,1.0,526.0,15698.0,9113469.0,187.0,8.13,5.85,145.0,6.3,2.01,419.0,18.219999,22.469999,0.930176,0.950195,0.419922,0.360107,16.43,198.070007
2,541.0,2.0,612.0,9362.0,4178464.0,221.0,9.61,9.07,153.0,6.65,3.51,493.0,21.43,29.110001,0.950195,0.930176,0.459961,0.399902,11.53,190.380005
3,541.0,3.0,423.0,11421.0,4869419.0,198.0,8.61,7.25,141.0,6.13,2.7,357.0,15.52,18.43,0.879883,0.890137,0.449951,0.389893,16.4,179.669998
4,541.0,4.0,558.0,15396.0,7198935.0,220.0,10.48,12.02,108.0,5.14,2.57,473.0,22.52,32.720001,0.950195,0.959961,0.47998,0.419922,16.280001,190.360001
5,541.0,5.0,400.0,29822.0,9551678.0,173.0,9.11,8.9,104.0,5.47,3.42,350.0,18.42,22.200001,0.830078,0.819824,0.48999,0.429932,16.65,176.300003
6,541.0,6.0,559.0,18844.0,10420377.0,213.0,10.65,13.05,98.0,4.9,3.26,483.0,24.15,35.290001,0.919922,0.919922,0.409912,0.350098,18.68,199.320007
7,541.0,7.0,651.0,20259.0,12341248.0,228.0,12.0,14.55,102.0,5.37,3.42,530.0,27.889999,39.459999,0.939941,0.950195,0.5,0.439941,13.22,184.039993
8,541.0,8.0,485.0,21952.0,18822770.0,172.0,7.82,8.73,92.0,4.18,2.68,399.0,18.139999,26.690001,0.919922,0.910156,0.419922,0.360107,17.18,183.779999
9,541.0,9.0,493.0,19469.0,13193971.0,183.0,9.63,10.02,101.0,5.32,3.4,429.0,22.58,29.950001,0.939941,0.959961,0.459961,0.419922,20.35,175.130005


In [23]:
len(dfs) # one DataFrame per time series in the "all" set, so this is the total number of time series

89

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as one NumPy array.
- The shape follows rules similar to those for DataLoader batches.
- Workers determine how many processes are used for loading data of a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [24]:
# NumPy example config: all features, ID-based time, every worker count set to 0.
# Batch sizes are not set here.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:34:08,262][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:34:08,273][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:34:08,277][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2012.37it/s]
[2025-09-15 11:34:08,323][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:34:08,323][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [426 256 202 308 223 ... 423 543 491 233 482], Length=54
        Val time series IDS: [542  93 193 483 281 ... 136  92 479 194 259], Length=25
        Test time series IDS [518 121  61 409 349 404 448 309 521 328], Length=10
        All time series IDS [426 256 202 308 223 ... 404 448 309 521 328], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [25]:
# Load the whole train set as one array.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# In the recorded run the shape is (54 time series, 3359 time steps, 20 columns).
# NOTE(review): 20 is presumably the 18 features plus the series id and time id columns - confirm.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [26]:
# Load the whole val set as one array.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# In the recorded run the shape is (25 time series, 3359 time steps, 20 columns).
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [27]:
# Load the whole test set as one array.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# In the recorded run the shape is (10 time series, 3359 time steps, 20 columns).
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [28]:
# Load the whole "all" set as one array.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# In the recorded run the shape is (89 time series, 3359 time steps, 20 columns).
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [29]:
# Same NumPy setup as before, but with time_format=TimeFormat.DATETIME.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:34:08,562][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:34:08,575][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:34:08,579][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1934.02it/s]
[2025-09-15 11:34:08,627][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:34:08,627][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [393 450 429 180  21 ... 446 236 277 145  33], Length=54
        Val time series IDS: [235 407 291  82 208 ... 496 259 197 525 516], Length=25
        Test time series IDS [333 213 322 520 219 489 381 531 338 453], Length=10
        All time series IDS [393 450 429 180  21 ... 489 381 531 338 453], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [30]:
# With TimeFormat.DATETIME the call returns two values: the data array and a
# separate array of datetimes.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

# In the recorded run the data shape is (54, 3359, 19), one column fewer than with ID_TIME.
display(numpy_array.shape)
# One datetime per time step; shape (3359,) in the recorded run.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns the time series in the same order as they are set in the config.

In [31]:
# Train dataloader order set explicitly to SEQUENTIAL.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:34:08,681][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:34:08,691][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:34:08,695][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2915.17it/s]
[2025-09-15 11:34:08,728][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:34:08,729][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [249 140  19 424 506 ...  59 183  23 163 101], Length=54
        Val time series IDS: [253 452 507 232 148 ... 374 301  38  60 519], Length=25
        Test time series IDS [265 380  85 428 530 216 432 370 119 225], Length=10
        All time series IDS [249 140  19 424 506 ... 216 432 370 119 225], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val bat

In [32]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every train batch; tqdm reports progress while iterating.
batches = list(tqdm(dataloader))

# Show the first batch as the cell output.
batches[0]

[2025-09-15 11:34:08,738][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 107.95it/s]


array([[[2.4900e+02, 0.0000e+00, 4.7740e+03],
        [2.4900e+02, 1.0000e+00, 4.6720e+03],
        [2.4900e+02, 2.0000e+00, 4.2180e+03],
        ...,
        [2.4900e+02, 3.3560e+03, 9.6860e+03],
        [2.4900e+02, 3.3570e+03, 1.0839e+04],
        [2.4900e+02, 3.3580e+03, 1.0692e+04]],

       [[1.4000e+02, 0.0000e+00, 3.0000e+01],
        [1.4000e+02, 1.0000e+00, 3.5000e+01],
        [1.4000e+02, 2.0000e+00, 3.9000e+01],
        ...,
        [1.4000e+02, 3.3560e+03, 1.0000e+01],
        [1.4000e+02, 3.3570e+03, 1.9000e+01],
        [1.4000e+02, 3.3580e+03, 1.1000e+01]],

       [[1.9000e+01, 0.0000e+00, 1.9100e+02],
        [1.9000e+01, 1.0000e+00, 1.8200e+02],
        [1.9000e+01, 2.0000e+00, 2.1300e+02],
        ...,
        [1.9000e+01, 3.3560e+03, 7.1000e+01],
        [1.9000e+01, 3.3570e+03, 1.2200e+02],
        [1.9000e+01, 3.3580e+03, 1.0000e+02]],

       ...,

       [[1.7500e+02, 0.0000e+00, 8.5000e+01],
        [1.7500e+02, 1.0000e+00, 7.2000e+01],
        [1.7500e+02, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- The order is affected by `random_state`.
    - When `random_state` is set, the batches will be the same every time.

In [33]:
# Series-based config whose train dataloader uses DataloaderOrder.RANDOM.
config = SeriesBasedConfig(
    # Time range, number of series per set, and selected features.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts for each set and for initialization.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch size for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Random train order; random_state is intentionally left unset here.
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-15 11:34:08,764][series_config][INFO] - Quick validation succeeded.
[2025-09-15 11:34:08,775][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-15 11:34:08,779][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3161.71it/s]
[2025-09-15 11:34:08,809][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-15 11:34:08,809][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [363 519 465  43  44 ...  11  67 172 351 204], Length=54
        Val time series IDS: [419 117  88 213   2 ... 299 338  28 437 225], Length=25
        Test time series IDS [481  62 259 158  95 135 343 144 105  30], Length=10
        All time series IDS [363 519 465  43  44 ... 135 343 144 105  30], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: no_transformer
    Anomaly handler
        Anomaly handler type (train set): no_anomaly_handler   
    Batch sizes
        Train batch size: 32
        Val bat

In [34]:
# Get the train dataloader for the currently initialized config.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Gather all batches, with a tqdm progress bar over the dataloader.
batches = [batch for batch in tqdm(dataloader)]

# Display the first collected batch.
batches[0]

[2025-09-15 11:34:08,818][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 110.09it/s]


array([[[1.6500e+02, 0.0000e+00, 1.1600e+02],
        [1.6500e+02, 1.0000e+00, 2.0600e+02],
        [1.6500e+02, 2.0000e+00, 4.5900e+02],
        ...,
        [1.6500e+02, 3.3560e+03, 4.3000e+01],
        [1.6500e+02, 3.3570e+03, 5.3000e+01],
        [1.6500e+02, 3.3580e+03, 3.9000e+01]],

       [[2.0000e+01, 0.0000e+00, 8.7040e+03],
        [2.0000e+01, 1.0000e+00, 1.0403e+04],
        [2.0000e+01, 2.0000e+00, 9.1530e+03],
        ...,
        [2.0000e+01, 3.3560e+03, 9.6240e+03],
        [2.0000e+01, 3.3570e+03, 1.0599e+04],
        [2.0000e+01, 3.3580e+03, 1.1317e+04]],

       [[4.4000e+01, 0.0000e+00, 3.8880e+03],
        [4.4000e+01, 1.0000e+00, 5.1180e+03],
        [4.4000e+01, 2.0000e+00, 7.0280e+03],
        ...,
        [4.4000e+01, 3.3560e+03, 5.9740e+03],
        [4.4000e+01, 3.3570e+03, 5.8390e+03],
        [4.4000e+01, 3.3580e+03, 6.7280e+03]],

       ...,

       [[4.4700e+02, 0.0000e+00, 1.6040e+03],
        [4.4700e+02, 1.0000e+00, 1.1540e+03],
        [4.4700e+02, 2