# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Show INFO-and-above records, each prefixed with timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Request the series-based variant of CESNET-TimeSeries24 (institution subnets, 1-hour aggregation).
# `data_root` is a placeholder path here; presumably it should point at your local dataset directory.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,  # print the dataset summary
)

[2025-08-17 13:57:12,772][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch `DataLoader`.
- The last batch is never dropped.
- Workers determine how many processes are used to load data for a given set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured worker count can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time series are in one batch (it has no effect when loading a specific time series).
- A batch consists of:
    - When `time_format` is not `TimeFormat.DATETIME`, the batch is a single NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is `TimeFormat.DATETIME`, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period)` containing the times).

In [4]:
# time_period=0.5 selects the first half of the available time indices.
# Integer *_ts values ask for that many time series per set.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Number of time series per dataloader batch for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 13:57:12,777][series_config][INFO] - Quick validation succeeded.
[2025-08-17 13:57:12,788][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 13:57:12,793][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1486.48it/s]
[2025-08-17 13:57:12,858][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-17 13:57:12,859][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 87 229 154 521 534 ... 126 280 413 222 507], Length=54
        Val time series IDS: [248 334 302  76 155 ... 172 146 407 240 139], Length=25
        Test time series IDS [451 339 485 478 372 311  13 508 526 293], Length=10
        All time series IDS [ 87 229 154 521 534 ... 311  13 508 526 293], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Two alternative ways to change batch sizes after initialization; both are run here for demonstration.
# NOTE(review): "config" presumably keeps the value currently stored in the config — confirm in the docs.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Or, equivalently, through the dedicated setter:
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-08-17 13:57:12,865][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 13:57:12,866][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 13:57:12,866][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 13:57:12,867][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 13:57:12,867][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured worker counts later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Two alternative ways to change worker counts after initialization; both are run here for demonstration.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Or, equivalently, through the dedicated setter:
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-08-17 13:57:12,871][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 13:57:12,871][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 13:57:12,872][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-17 13:57:12,872][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-17 13:57:12,873][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# workers="config" presumably means "use the worker count stored in the config" — confirm in the docs.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Consume the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# NOTE(review): the recorded output shows 32 series in the first batch although
# train_batch_size was set to 33 in an earlier cell — verify whether the output is stale.
display(batches[0].shape)

[2025-08-17 13:57:12,881][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 46.97it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# workers="config" presumably means "use the worker count stored in the config" — confirm in the docs.
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Consume the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# The val set has fewer series than the val batch size, so one batch holds all of them.
display(batches[0].shape)

[2025-08-17 13:57:12,938][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 58.77it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# workers="config" presumably means "use the worker count stored in the config" — confirm in the docs.
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Consume the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# The test set has fewer series than the test batch size, so one batch holds all of them.
display(batches[0].shape)

[2025-08-17 13:57:12,966][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 142.89it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# workers="config" presumably means "use the worker count stored in the config" — confirm in the docs.
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Consume the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# "All" covers the train, val and test series together (89 in this configuration).
display(batches[0].shape)

[2025-08-17 13:57:12,983][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 36.74it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but time is returned as datetimes,
# so batches become (data, time) tuples.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # 0 workers -> loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Number of time series per dataloader batch for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 13:57:13,016][series_config][INFO] - Quick validation succeeded.
[2025-08-17 13:57:13,027][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 13:57:13,031][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1954.15it/s]
[2025-08-17 13:57:13,077][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-17 13:57:13,078][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [272 258 202  21 367 ... 151 488 316 340 159], Length=54
        Val time series IDS: [399 222 226 174 348 ... 505 368 501 165 336], Length=25
        Test time series IDS [245 285 225 186 323 240 116 306 224  45], Length=10
        All time series IDS [272 258 202  21 367 ... 240 116 306 224  45], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        T

In [12]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Consume the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch is a tuple:
#   index 0 -> data without time, index 1 -> time
display(batches[0][0].shape, batches[0][1].shape)

[2025-08-17 13:57:13,086][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 117.22it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a `ts_id` parameter.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, it returns only the single time series with that id.

In [13]:
# train_ts is given as an explicit list of time series IDs; no val/test sets are configured.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Number of time series per dataloader batch for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 13:57:13,110][series_config][INFO] - Quick validation succeeded.
[2025-08-17 13:57:13,118][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 13:57:13,122][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 1139.68it/s]
[2025-08-17 13:57:13,127][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-17 13:57:13,128][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All worker count: 0
        Init worker count: 0
    Other
        Nan th

In [14]:
# Restrict the train dataloader to one time series; 177 is one of the IDs listed in train_ts.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Consume the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Only the requested series is returned, so the leading dimension is 1.
display(batches[0].shape)

[2025-08-17 13:57:13,136][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 999.36it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set for the specified `time_period`.
- Data is returned as a pandas `DataFrame` (or as a list of DataFrames when `as_single_dataframe=False`).
- Workers determine how many processes are used to load data for a given set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured worker count can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
# Take every feature this time; batch sizes are omitted because they do not affect DataFrame loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 13:57:13,144][series_config][INFO] - Quick validation succeeded.
[2025-08-17 13:57:13,154][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 13:57:13,158][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1560.20it/s]
[2025-08-17 13:57:13,216][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-17 13:57:13,216][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 17 268 528   5 492 ...  55 416 511 199 149], Length=54
        Val time series IDS: [373 540 390 474 437 ... 353 182 292 243 512], Length=25
        Test time series IDS [  2 466 238 408 458 232 328 431 194 174], Length=10
        All time series IDS [ 17 268 528   5 492 ... 232 328 431 194 174], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# The whole train set as a single DataFrame ...
df = series_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a list of DataFrames (shown in the next cell).
dfs = series_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,17.0,0.0,661810.0,27256441.0,29750540000.0,188208.0,6.71,33.119999,399507.0,14.24,312.070007,462409.0,16.49,336.779999,0.790039,0.77002,0.449951,0.459961,12.07,145.949997
1,17.0,1.0,711480.0,23075833.0,22606100000.0,194656.0,6.71,32.32,415972.0,14.34,338.440002,495970.0,17.09,368.76001,0.779785,0.759766,0.449951,0.459961,12.41,142.259995
2,17.0,2.0,791648.0,29617104.0,28155490000.0,205098.0,6.53,30.280001,468957.0,14.92,336.540009,515043.0,16.389999,352.899994,0.790039,0.779785,0.459961,0.449951,13.64,141.160004
3,17.0,3.0,780649.0,47170245.0,48267970000.0,240607.0,6.04,25.709999,414693.0,10.4,269.100006,557662.0,13.99,296.26001,0.819824,0.810059,0.469971,0.439941,16.4,133.889999
4,17.0,4.0,878839.0,71064434.0,72758030000.0,298122.0,5.35,20.040001,442726.0,7.94,201.949997,657850.0,11.8,222.850006,0.839844,0.830078,0.48999,0.439941,21.59,129.589996
5,17.0,5.0,897077.0,74533792.0,75108900000.0,323472.0,4.89,17.67,450004.0,6.8,168.600006,691812.0,10.45,184.350006,0.839844,0.830078,0.5,0.449951,24.9,126.980003
6,17.0,6.0,967014.0,80750346.0,79796390000.0,357687.0,4.71,17.18,481002.0,6.33,168.139999,740919.0,9.76,181.960007,0.850098,0.839844,0.5,0.449951,27.549999,124.220001
7,17.0,7.0,1070342.0,79271432.0,74943490000.0,396834.0,5.03,18.629999,532229.0,6.75,171.190002,819204.0,10.38,187.860001,0.850098,0.839844,0.5,0.449951,26.23,125.230003
8,17.0,8.0,1051438.0,89899033.0,91262150000.0,385248.0,4.88,18.040001,519742.0,6.58,162.440002,791258.0,10.01,177.970001,0.850098,0.839844,0.5,0.449951,27.92,124.129997
9,17.0,9.0,1090639.0,90338547.0,88348760000.0,396581.0,4.91,18.129999,536627.0,6.65,164.070007,816100.0,10.11,178.110001,0.850098,0.839844,0.5,0.449951,27.370001,121.050003


In [17]:
# Show the list returned with as_single_dataframe=False.
# NOTE(review): consider `dfs[0].head()` to keep the rendered output short.
dfs

[      id_institution_subnet  id_time    n_flows   n_packets       n_bytes  \
 0                      17.0      0.0   661810.0  27256441.0  2.975054e+10   
 1                      17.0      1.0   711480.0  23075833.0  2.260610e+10   
 2                      17.0      2.0   791648.0  29617104.0  2.815549e+10   
 3                      17.0      3.0   780649.0  47170245.0  4.826797e+10   
 4                      17.0      4.0   878839.0  71064434.0  7.275803e+10   
 ...                     ...      ...        ...         ...           ...   
 3354                   17.0   3354.0  1184791.0  69730807.0  6.989515e+10   
 3355                   17.0   3355.0  1253491.0  72252651.0  7.201448e+10   
 3356                   17.0   3356.0  1198521.0  68012849.0  6.824209e+10   
 3357                   17.0   3357.0  1279711.0  72369016.0  7.381966e+10   
 3358                   17.0   3358.0  1188603.0  54860255.0  5.679931e+10   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_d

#### Val set

- Affected by `val_workers`.

In [18]:
# The whole val set as a single DataFrame ...
df = series_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a list of DataFrames (shown in the next cell).
dfs = series_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,373.0,0.0,1466.0,24665.0,13653180.0,594.0,9.43,4.56,738.0,11.71,9.37,1115.0,17.700001,10.82,0.709961,0.740234,0.469971,0.52002,29.030001,135.210007
1,373.0,1.0,1388.0,26648.0,15409540.0,589.0,9.5,4.31,719.0,11.6,7.95,1049.0,16.92,8.83,0.709961,0.740234,0.509766,0.560059,32.279999,130.710007
2,373.0,2.0,1274.0,14669.0,6007035.0,533.0,9.03,4.01,610.0,10.34,7.45,990.0,16.780001,8.78,0.660156,0.689941,0.509766,0.569824,33.790001,124.769997
3,373.0,3.0,2240.0,194586.0,159798300.0,582.0,9.86,6.04,603.0,10.22,6.66,1468.0,24.879999,36.5,0.720215,0.75,0.48999,0.5,24.549999,121.010002
4,373.0,4.0,11053.0,1227568.0,980162200.0,1112.0,17.940001,18.440001,599.0,9.66,4.92,6062.0,97.769997,168.660004,0.680176,0.709961,0.469971,0.429932,22.950001,120.379997
5,373.0,5.0,12439.0,1718785.0,1584893000.0,1269.0,19.52,23.309999,588.0,9.05,5.53,6905.0,106.230003,187.070007,0.629883,0.649902,0.469971,0.429932,29.120001,119.940002
6,373.0,6.0,12704.0,1135011.0,816228800.0,1168.0,18.25,21.969999,600.0,9.38,5.29,6723.0,105.050003,183.970001,0.620117,0.660156,0.540039,0.549805,30.25,126.339996
7,373.0,7.0,14327.0,1724529.0,1344559000.0,1287.0,19.799999,26.620001,627.0,9.65,6.83,7656.0,117.779999,232.119995,0.580078,0.620117,0.48999,0.449951,26.57,124.559998
8,373.0,8.0,13351.0,1639653.0,1068663000.0,1302.0,20.67,28.27,574.0,9.11,6.34,7563.0,120.050003,240.949997,0.580078,0.600098,0.5,0.459961,36.060001,127.07
9,373.0,9.0,14256.0,2176574.0,1519920000.0,1302.0,20.030001,28.23,618.0,9.51,6.58,7737.0,119.029999,233.639999,0.540039,0.569824,0.540039,0.549805,31.200001,123.019997


In [19]:
# Show the list returned with as_single_dataframe=False.
# NOTE(review): consider `dfs[0].head()` to keep the rendered output short.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets      n_bytes  \
 0                     373.0      0.0   1466.0    24665.0   13653181.0   
 1                     373.0      1.0   1388.0    26648.0   15409539.0   
 2                     373.0      2.0   1274.0    14669.0    6007035.0   
 3                     373.0      3.0   2240.0   194586.0  159798272.0   
 4                     373.0      4.0  11053.0  1227568.0  980162246.0   
 ...                     ...      ...      ...        ...          ...   
 3354                  373.0   3354.0   1395.0    67797.0   57529127.0   
 3355                  373.0   3355.0   1351.0    70260.0   60748248.0   
 3356                  373.0   3356.0   1302.0    71305.0   61699872.0   
 3357                  373.0   3357.0   1436.0    52865.0   39491518.0   
 3358                  373.0   3358.0   1588.0    86109.0   71639503.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0              594.0        9.4300

#### Test set

- Affected by `test_workers`.

In [20]:
# The whole test set as a single DataFrame ...
df = series_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a list of DataFrames.
dfs = series_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,2.0,0.0,164.0,228.0,8625.0,56.0,4.67,1.15,50.0,4.17,1.03,150.0,12.5,10.86,0.549805,0.560059,0.389893,0.379883,3.57,200.830002
1,2.0,1.0,104.0,130.0,5506.0,56.0,4.67,0.89,59.0,4.92,0.9,100.0,8.33,4.01,0.640137,0.620117,0.340088,0.310059,2.69,192.830002
2,2.0,2.0,78.0,108.0,4234.0,42.0,3.82,1.25,39.0,3.55,1.51,76.0,6.91,2.39,0.399902,0.419922,0.47998,0.459961,7.81,183.630005
3,2.0,3.0,52.0,70.0,2686.0,36.0,3.6,1.35,38.0,3.8,1.55,51.0,5.1,1.1,0.560059,0.580078,0.399902,0.389893,0.16,205.809998
4,2.0,4.0,61.0,86.0,3834.0,39.0,3.9,1.73,35.0,3.5,1.51,60.0,6.0,3.56,0.569824,0.589844,0.449951,0.439941,3.56,187.419998
5,2.0,5.0,65.0,91.0,3832.0,33.0,3.0,1.48,28.0,2.55,1.29,64.0,5.82,4.17,0.560059,0.569824,0.540039,0.529785,4.49,184.600006
6,2.0,6.0,56.0,77.0,3127.0,36.0,3.27,1.19,34.0,3.09,1.22,52.0,4.73,1.95,0.540039,0.540039,0.419922,0.409912,3.62,181.880005
7,2.0,7.0,51.0,64.0,3892.0,38.0,3.8,1.62,39.0,3.9,1.73,50.0,5.0,1.56,0.560059,0.469971,0.360107,0.320068,0.23,163.169998
8,2.0,8.0,49.0,72.0,3228.0,29.0,2.9,1.37,28.0,2.8,1.48,48.0,4.8,2.3,0.509766,0.52002,0.5,0.5,4.92,181.570007
9,2.0,9.0,81.0,121.0,4694.0,44.0,4.4,1.17,34.0,3.4,1.58,77.0,7.7,3.5,0.469971,0.47998,0.459961,0.449951,2.97,189.770004


#### All set

- Affected by `all_workers`.

In [21]:
# Every configured time series (train + val + test) as a single DataFrame ...
df = series_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a list of DataFrames (shown in the next cell).
dfs = series_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,17.0,0.0,661810.0,27256441.0,29750540000.0,188208.0,6.71,33.119999,399507.0,14.24,312.070007,462409.0,16.49,336.779999,0.790039,0.77002,0.449951,0.459961,12.07,145.949997
1,17.0,1.0,711480.0,23075833.0,22606100000.0,194656.0,6.71,32.32,415972.0,14.34,338.440002,495970.0,17.09,368.76001,0.779785,0.759766,0.449951,0.459961,12.41,142.259995
2,17.0,2.0,791648.0,29617104.0,28155490000.0,205098.0,6.53,30.280001,468957.0,14.92,336.540009,515043.0,16.389999,352.899994,0.790039,0.779785,0.459961,0.449951,13.64,141.160004
3,17.0,3.0,780649.0,47170245.0,48267970000.0,240607.0,6.04,25.709999,414693.0,10.4,269.100006,557662.0,13.99,296.26001,0.819824,0.810059,0.469971,0.439941,16.4,133.889999
4,17.0,4.0,878839.0,71064434.0,72758030000.0,298122.0,5.35,20.040001,442726.0,7.94,201.949997,657850.0,11.8,222.850006,0.839844,0.830078,0.48999,0.439941,21.59,129.589996
5,17.0,5.0,897077.0,74533792.0,75108900000.0,323472.0,4.89,17.67,450004.0,6.8,168.600006,691812.0,10.45,184.350006,0.839844,0.830078,0.5,0.449951,24.9,126.980003
6,17.0,6.0,967014.0,80750346.0,79796390000.0,357687.0,4.71,17.18,481002.0,6.33,168.139999,740919.0,9.76,181.960007,0.850098,0.839844,0.5,0.449951,27.549999,124.220001
7,17.0,7.0,1070342.0,79271432.0,74943490000.0,396834.0,5.03,18.629999,532229.0,6.75,171.190002,819204.0,10.38,187.860001,0.850098,0.839844,0.5,0.449951,26.23,125.230003
8,17.0,8.0,1051438.0,89899033.0,91262150000.0,385248.0,4.88,18.040001,519742.0,6.58,162.440002,791258.0,10.01,177.970001,0.850098,0.839844,0.5,0.449951,27.92,124.129997
9,17.0,9.0,1090639.0,90338547.0,88348760000.0,396581.0,4.91,18.129999,536627.0,6.65,164.070007,816100.0,10.11,178.110001,0.850098,0.839844,0.5,0.449951,27.370001,121.050003


In [22]:
# Show the list returned with as_single_dataframe=False.
# NOTE(review): consider `dfs[0].head()` to keep the rendered output short.
dfs

[      id_institution_subnet  id_time    n_flows   n_packets       n_bytes  \
 0                      17.0      0.0   661810.0  27256441.0  2.975054e+10   
 1                      17.0      1.0   711480.0  23075833.0  2.260610e+10   
 2                      17.0      2.0   791648.0  29617104.0  2.815549e+10   
 3                      17.0      3.0   780649.0  47170245.0  4.826797e+10   
 4                      17.0      4.0   878839.0  71064434.0  7.275803e+10   
 ...                     ...      ...        ...         ...           ...   
 3354                   17.0   3354.0  1184791.0  69730807.0  6.989515e+10   
 3355                   17.0   3355.0  1253491.0  72252651.0  7.201448e+10   
 3356                   17.0   3356.0  1198521.0  68012849.0  6.824209e+10   
 3357                   17.0   3357.0  1279711.0  72369016.0  7.381966e+10   
 3358                   17.0   3358.0  1188603.0  54860255.0  5.679931e+10   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_d

### Loading data as a single NumPy array

- Batch size has no effect.
- Returns every time series in the set for the specified `time_period`.
- Data is returned as one NumPy array.
- The array shape follows the same rules as the DataLoader batches.
- Workers determine how many processes are used to load data for a given set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured worker count can be overridden in `get_*_numpy` with the `workers` parameter.

In [23]:
# Take every feature; batch sizes are omitted because they do not affect NumPy loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 13:57:14,222][series_config][INFO] - Quick validation succeeded.
[2025-08-17 13:57:14,232][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 13:57:14,236][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2018.32it/s]
[2025-08-17 13:57:14,281][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-17 13:57:14,282][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [485   7 411 206 332 ... 354  35 191  31 173], Length=54
        Val time series IDS: [493  83 267 393 266 ... 509 402 375 132  85], Length=25
        Test time series IDS [486  33 345  99 276 292 542 527 188 416], Length=10
        All time series IDS [485   7 411 206 332 ... 292 542 527 188 416], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [24]:
# Whole train set as one array; with ID_TIME the last axis holds the features plus the id columns.
numpy_array = series_based_dataset.get_train_numpy(
    workers="config",
)
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [25]:
# Whole val set as one array; with ID_TIME the last axis holds the features plus the id columns.
numpy_array = series_based_dataset.get_val_numpy(
    workers="config",
)
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [26]:
# Whole test set as one array; with ID_TIME the last axis holds the features plus the id columns.
numpy_array = series_based_dataset.get_test_numpy(
    workers="config",
)
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [27]:
# Every configured time series (train + val + test) as one array.
numpy_array = series_based_dataset.get_all_numpy(
    workers="config",
)
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [28]:
# Same as the previous NumPy setup, but time is returned as datetimes in a separate array.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # 0 workers -> loading runs in the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 13:57:14,429][series_config][INFO] - Quick validation succeeded.
[2025-08-17 13:57:14,489][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 13:57:14,493][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2250.92it/s]
[2025-08-17 13:57:14,535][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-17 13:57:14,535][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [515 545  84 124 173 ... 544 502 277 329 402], Length=54
        Val time series IDS: [287 484 236 164 336 ... 181 434 478  43 122], Length=25
        Test time series IDS [158  60 223 350 457 135 481 528 452 175], Length=10
        All time series IDS [515 545  84 124 173 ... 135 481 528 452 175], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [29]:
# With TimeFormat.DATETIME the call yields the data array and a separate array of datetimes.
numpy_array, times = series_based_dataset.get_train_numpy(
    workers="config",
)

# First the data shape (time column excluded), then the datetime values themselves.
display(numpy_array.shape, times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- Has no effect when `get_train_dataloader` is called with a specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns time series in the same order as they are set in the config.

In [30]:
config = SeriesBasedConfig(
    # Data selection: time period, size of each time series set, feature and time format.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Workers set to 0 means loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch size used by the dataloader of each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # SEQUENTIAL (the default) keeps the train time series in config order.
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 13:57:14,582][series_config][INFO] - Quick validation succeeded.
[2025-08-17 13:57:14,592][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 13:57:14,595][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3176.34it/s]
[2025-08-17 13:57:14,624][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-17 13:57:14,624][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [385 351  74 138 507 ... 200 248 235 315 155], Length=54
        Val time series IDS: [ 36 474 471 484 258 ... 480 472 542 161 522], Length=25
        Test time series IDS [187 266  17 422  22 448 339   2 431 452], Length=10
        All time series IDS [385 351  74 138 507 ... 448 339   2 431 452], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

In [31]:
# Build the train dataloader; the worker count is taken from the config.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Display the first batch as the cell output.
batches[0]

[2025-08-17 13:57:14,632][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 124.93it/s]


array([[[3.850e+02, 0.000e+00, 3.823e+03],
        [3.850e+02, 1.000e+00, 3.687e+03],
        [3.850e+02, 2.000e+00, 4.951e+03],
        ...,
        [3.850e+02, 3.356e+03, 1.983e+03],
        [3.850e+02, 3.357e+03, 2.314e+03],
        [3.850e+02, 3.358e+03, 2.248e+03]],

       [[3.510e+02, 0.000e+00, 0.000e+00],
        [3.510e+02, 1.000e+00, 2.000e+01],
        [3.510e+02, 2.000e+00, 1.700e+01],
        ...,
        [3.510e+02, 3.356e+03, 0.000e+00],
        [3.510e+02, 3.357e+03, 0.000e+00],
        [3.510e+02, 3.358e+03, 0.000e+00]],

       [[7.400e+01, 0.000e+00, 2.700e+01],
        [7.400e+01, 1.000e+00, 2.300e+01],
        [7.400e+01, 2.000e+00, 3.300e+01],
        ...,
        [7.400e+01, 3.356e+03, 6.300e+01],
        [7.400e+01, 3.357e+03, 2.800e+01],
        [7.400e+01, 3.358e+03, 1.200e+01]],

       ...,

       [[5.000e+02, 0.000e+00, 1.490e+03],
        [5.000e+02, 1.000e+00, 2.319e+03],
        [5.000e+02, 2.000e+00, 9.623e+03],
        ...,
        [5.000e+02, 3.356e

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same.

In [32]:
config = SeriesBasedConfig(
    # Data selection: time period, size of each time series set, feature and time format.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Workers set to 0 means loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch size used by the dataloader of each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # RANDOM shuffles the train time series; random_state is deliberately left unset here.
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-17 13:57:14,656][series_config][INFO] - Quick validation succeeded.
[2025-08-17 13:57:14,666][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-17 13:57:14,669][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3407.61it/s]
[2025-08-17 13:57:14,697][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-17 13:57:14,697][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [258 349  84 510  47 ...  69 441 230 338 299], Length=54
        Val time series IDS: [ 38 117 267 175  75 ...  33 361  97 194 132], Length=25
        Test time series IDS [131 543 471  39 298  36 107 344  70 123], Length=10
        All time series IDS [258 349  84 510  47 ...  36 107 344  70 123], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

In [33]:
# Build the train dataloader; the worker count is taken from the config.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Display the first batch as the cell output.
batches[0]

[2025-08-17 13:57:14,704][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 103.58it/s]


array([[[4.4100e+02, 0.0000e+00, 1.0990e+03],
        [4.4100e+02, 1.0000e+00, 1.1600e+03],
        [4.4100e+02, 2.0000e+00, 2.0900e+03],
        ...,
        [4.4100e+02, 3.3560e+03, 1.1230e+03],
        [4.4100e+02, 3.3570e+03, 1.2960e+03],
        [4.4100e+02, 3.3580e+03, 1.3140e+03]],

       [[5.1000e+02, 0.0000e+00, 1.1200e+02],
        [5.1000e+02, 1.0000e+00, 8.1000e+01],
        [5.1000e+02, 2.0000e+00, 1.0900e+02],
        ...,
        [5.1000e+02, 3.3560e+03, 5.4000e+01],
        [5.1000e+02, 3.3570e+03, 5.6000e+01],
        [5.1000e+02, 3.3580e+03, 6.2000e+01]],

       [[3.3800e+02, 0.0000e+00, 5.3150e+03],
        [3.3800e+02, 1.0000e+00, 5.3700e+03],
        [3.3800e+02, 2.0000e+00, 1.0269e+04],
        ...,
        [3.3800e+02, 3.3560e+03, 8.5200e+03],
        [3.3800e+02, 3.3570e+03, 9.5620e+03],
        [3.3800e+02, 3.3580e+03, 8.0650e+03]],

       ...,

       [[4.8000e+02, 0.0000e+00, 2.1730e+03],
        [4.8000e+02, 1.0000e+00, 2.6360e+03],
        [4.8000e+02, 2