# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Configure the root logger: show INFO and above, prefixed with timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Series-based dataset over institution subnets, aggregated by 1 hour.
# data_root is a placeholder path here; display_details=True prints the dataset summary.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    is_series_based=True,
    display_details=True,
)

[2025-08-05 19:44:49,043][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using the PyTorch DataLoader.
- The last batch is never dropped.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size affects how many time series will be in one batch (it has no effect when loading a specific time series).
- A batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, the batch is one NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period)` containing the time).

In [4]:
# 54/25/10 train/val/test time series, a single feature, time returned as ID_TIME.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-05 19:44:49,053][config][INFO] - Quick validation succeeded.
[2025-08-05 19:44:49,064][config][INFO] - Finalization and validation completed successfully.
[2025-08-05 19:44:49,068][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1446.01it/s]
[2025-08-05 19:44:49,137][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-05 19:44:49,137][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 12 409 521 503 422 ... 256  47 198 248  92], Length=54
        Val time series IDS: [327 104 206 414 391 ... 143 113 219 429 141], Length=25
        Test time series IDS [135 294 209 343  32  29 519 372  25  38], Length=10
        All time series IDS [ 12 409 521 503 422 ...  29 519 372  25  38], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Change batch sizes through the general config update ...
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# ... or through the dedicated batch-size setter.
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-08-05 19:44:49,145][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-05 19:44:49,145][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-05 19:44:49,146][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-05 19:44:49,146][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-05 19:44:49,147][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Change worker counts through the general config update ...
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# ... or through the dedicated worker setter.
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-08-05 19:44:49,160][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-05 19:44:49,160][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-05 19:44:49,161][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-05 19:44:49,161][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-05 19:44:49,162][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# workers="config": presumably uses the train worker count from the config (TODO confirm).
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader into a list, showing a progress bar.
batches = list(tqdm(dataloader))

# Shape of the first train batch.
display(batches[0].shape)

[2025-08-05 19:44:49,180][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 47.31it/s]


(33, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Drain the dataloader into a list, showing a progress bar.
batches = list(tqdm(dataloader))

# Shape of the first val batch.
display(batches[0].shape)

[2025-08-05 19:44:49,242][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 52.57it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Drain the dataloader into a list, showing a progress bar.
batches = list(tqdm(dataloader))

# Shape of the first test batch.
display(batches[0].shape)

[2025-08-05 19:44:49,288][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 98.31it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Drain the dataloader into a list, showing a progress bar.
batches = list(tqdm(dataloader))

# Shape of the first batch of the "all" set.
display(batches[0].shape)

[2025-08-05 19:44:49,318][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 35.22it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as before, but time is returned as datetimes (TimeFormat.DATETIME).
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-05 19:44:49,361][config][INFO] - Quick validation succeeded.
[2025-08-05 19:44:49,372][config][INFO] - Finalization and validation completed successfully.
[2025-08-05 19:44:49,376][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1781.48it/s]
[2025-08-05 19:44:49,429][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-05 19:44:49,429][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 15 311 504 395 371 ... 259 195 359 388 350], Length=54
        Val time series IDS: [194 537 292 536 220 ... 469 173 435 140  42], Length=25
        Test time series IDS [ 10  40 126 492 107  97 330 499 167 257], Length=10
        All time series IDS [ 15 311 504 395 371 ...  97 330 499 167 257], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        T

In [12]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# With TimeFormat.DATETIME each batch is a (data, time) tuple.
batches = list(tqdm(dataloader))

display(batches[0][0].shape)  # data without time
display(batches[0][1].shape)  # time

[2025-08-05 19:44:49,442][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 117.49it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a parameter `ts_id`.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, it returns only the one time series with the specified id.

In [13]:
# Train set given as an explicit list of time series ids; no val/test set is configured.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-05 19:44:49,470][config][INFO] - Quick validation succeeded.
[2025-08-05 19:44:49,479][config][INFO] - Finalization and validation completed successfully.
[2025-08-05 19:44:49,484][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 2002.77it/s]
[2025-08-05 19:44:49,488][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-05 19:44:49,489][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All worker count: 0
        Init worker count: 0
    Other
        Nan th

In [14]:
# Restrict the train dataloader to the single time series with id 177.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Drain the dataloader into a list, showing a progress bar.
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-08-05 19:44:49,506][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 665.76it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [15]:
# Take every available feature; batch sizes are not passed here.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-05 19:44:49,517][config][INFO] - Quick validation succeeded.
[2025-08-05 19:44:49,527][config][INFO] - Finalization and validation completed successfully.
[2025-08-05 19:44:49,531][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1651.11it/s]
[2025-08-05 19:44:49,587][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-05 19:44:49,587][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [223 230 367 179 411 ... 505 455 284 206 112], Length=54
        Val time series IDS: [479 297 252 341 268 ...   5 380 250 319 232], Length=25
        Test time series IDS [465  37 318 484  86 475 278 483 182 364], Length=10
        All time series IDS [223 230 367 179 411 ... 475 278 483 182 364], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Train set as one combined DataFrame ...
df = series_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a list of DataFrames.
dfs = series_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
1,223.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
2,223.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
3,223.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
4,223.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
5,223.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
6,223.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
7,223.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
8,223.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
9,223.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0


In [17]:
# Show the list of DataFrames obtained with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets  n_bytes  \
 0                     223.0      0.0      0.0        0.0      0.0   
 1                     223.0      1.0      0.0        0.0      0.0   
 2                     223.0      2.0      0.0        0.0      0.0   
 3                     223.0      3.0      0.0        0.0      0.0   
 4                     223.0      4.0      0.0        0.0      0.0   
 ...                     ...      ...      ...        ...      ...   
 3354                  223.0   3354.0     37.0       62.0   2489.0   
 3355                  223.0   3355.0     23.0       38.0   1460.0   
 3356                  223.0   3356.0     53.0       71.0   2624.0   
 3357                  223.0   3357.0     43.0       60.0   2264.0   
 3358                  223.0   3358.0    105.0      182.0   7931.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0                0.0            0.00            0.00               0.0   
 1      

#### Val set

- Affected by `val_workers`.

In [18]:
# Val set as one combined DataFrame ...
df = series_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a list of DataFrames.
dfs = series_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,479.0,0.0,1465.0,18548.0,8541053.0,508.0,13.73,15.25,599.0,16.190001,20.68,1101.0,29.76,30.370001,0.77002,0.72998,0.429932,0.449951,12.1,124.870003
1,479.0,1.0,2187.0,69737.0,54844960.0,585.0,18.870001,20.889999,563.0,18.16,19.940001,1462.0,47.16,53.98,0.799805,0.799805,0.52002,0.509766,11.31,111.970001
2,479.0,2.0,4325.0,174765.0,127216600.0,769.0,23.299999,26.09,1487.0,45.060001,159.130005,2104.0,63.759998,70.540001,0.75,0.740234,0.47998,0.47998,7.82,110.089996
3,479.0,3.0,9407.0,1556188.0,1268688000.0,897.0,27.18,29.33,1822.0,55.209999,154.350006,3647.0,110.519997,133.369995,0.77002,0.759766,0.47998,0.469971,7.11,111.599998
4,479.0,4.0,8718.0,1912507.0,1490566000.0,973.0,27.799999,29.110001,920.0,26.290001,40.700001,3882.0,110.910004,140.990005,0.72998,0.700195,0.509766,0.48999,8.99,107.32
5,479.0,5.0,8816.0,1937153.0,1499715000.0,1022.0,26.889999,31.18,771.0,20.290001,37.07,4063.0,106.919998,146.770004,0.740234,0.72998,0.529785,0.5,11.36,106.349998
6,479.0,6.0,7785.0,912131.0,729120800.0,1046.0,27.530001,32.439999,628.0,16.530001,23.889999,4124.0,108.529999,152.610001,0.720215,0.709961,0.549805,0.529785,13.65,106.589996
7,479.0,7.0,8654.0,1233656.0,705455700.0,1073.0,24.950001,32.25,746.0,17.35,28.540001,4535.0,105.470001,159.869995,0.799805,0.790039,0.52002,0.48999,9.93,119.720001
8,479.0,8.0,8118.0,1337165.0,1081996000.0,1096.0,24.91,32.380001,656.0,14.91,23.15,4228.0,96.089996,147.059998,0.689941,0.680176,0.48999,0.48999,9.28,112.739998
9,479.0,9.0,7298.0,860759.0,675445000.0,1088.0,29.41,33.560001,655.0,17.700001,25.15,3932.0,106.269997,138.960007,0.799805,0.790039,0.47998,0.449951,10.99,112.730003


In [19]:
# Show the list of DataFrames obtained with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets       n_bytes  \
 0                     479.0      0.0   1465.0    18548.0  8.541053e+06   
 1                     479.0      1.0   2187.0    69737.0  5.484496e+07   
 2                     479.0      2.0   4325.0   174765.0  1.272166e+08   
 3                     479.0      3.0   9407.0  1556188.0  1.268688e+09   
 4                     479.0      4.0   8718.0  1912507.0  1.490566e+09   
 ...                     ...      ...      ...        ...           ...   
 3354                  479.0   3354.0  10386.0   353722.0  2.061706e+08   
 3355                  479.0   3355.0   1773.0    72165.0  4.803945e+07   
 3356                  479.0   3356.0   1400.0    49224.0  3.312486e+07   
 3357                  479.0   3357.0   1784.0   146826.0  1.118188e+08   
 3358                  479.0   3358.0   1785.0   148006.0  1.388537e+08   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0              508.0  

#### Test set

- Affected by `test_workers`.

In [20]:
# Test set as one combined DataFrame ...
df = series_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a list of DataFrames.
dfs = series_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,465.0,0.0,1626.0,11555.0,3852742.0,303.0,6.31,3.88,341.0,7.1,4.61,513.0,10.69,12.3,0.680176,0.660156,0.47998,0.429932,14.7,119.730003
1,465.0,1.0,1741.0,10271.0,3644584.0,305.0,6.93,3.64,339.0,7.7,4.92,520.0,11.82,12.75,0.720215,0.709961,0.469971,0.429932,11.29,116.279999
2,465.0,2.0,1730.0,23066.0,18637220.0,343.0,6.86,4.14,366.0,7.32,4.41,552.0,11.04,12.81,0.77002,0.779785,0.459961,0.439941,9.28,117.639999
3,465.0,3.0,1569.0,39001.0,33310840.0,308.0,6.29,4.17,374.0,7.63,6.85,516.0,10.53,13.04,0.790039,0.779785,0.459961,0.459961,9.75,121.459999
4,465.0,4.0,1832.0,29171.0,14139680.0,375.0,6.47,4.35,402.0,6.93,4.94,691.0,11.91,16.870001,0.759766,0.759766,0.5,0.449951,14.9,110.870003
5,465.0,5.0,2192.0,440224.0,84156560.0,455.0,8.58,6.61,333.0,6.28,3.76,921.0,17.379999,21.01,0.790039,0.790039,0.449951,0.379883,15.61,109.050003
6,465.0,6.0,1948.0,656826.0,895812900.0,443.0,7.38,5.16,470.0,7.83,6.39,857.0,14.28,16.379999,0.879883,0.899902,0.47998,0.439941,16.43,121.209999
7,465.0,7.0,2377.0,222369.0,166417600.0,495.0,7.98,5.61,499.0,8.05,6.52,1007.0,16.24,19.709999,0.779785,0.77002,0.469971,0.429932,20.5,121.330002
8,465.0,8.0,2410.0,1808021.0,1808931000.0,489.0,8.43,7.3,400.0,6.9,4.89,983.0,16.950001,22.780001,0.799805,0.810059,0.560059,0.5,18.790001,116.019997
9,465.0,9.0,2156.0,52891.0,33509350.0,476.0,7.93,6.58,426.0,7.1,5.54,896.0,14.93,20.85,0.75,0.77002,0.560059,0.549805,25.219999,118.040001


#### All set

- Affected by `all_workers`.

In [21]:
# "All" set as one combined DataFrame ...
df = series_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a list of DataFrames.
dfs = series_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

# Preview the first ten rows of the combined frame.
df.head(n=10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,223.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
1,223.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
2,223.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
3,223.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
4,223.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
5,223.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
6,223.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
7,223.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
8,223.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0
9,223.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.5,0.5,0.0,0.0


In [22]:
# Show the list of DataFrames obtained with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets  n_bytes  \
 0                     223.0      0.0      0.0        0.0      0.0   
 1                     223.0      1.0      0.0        0.0      0.0   
 2                     223.0      2.0      0.0        0.0      0.0   
 3                     223.0      3.0      0.0        0.0      0.0   
 4                     223.0      4.0      0.0        0.0      0.0   
 ...                     ...      ...      ...        ...      ...   
 3354                  223.0   3354.0     37.0       62.0   2489.0   
 3355                  223.0   3355.0     23.0       38.0   1460.0   
 3356                  223.0   3356.0     53.0       71.0   2624.0   
 3357                  223.0   3357.0     43.0       60.0   2264.0   
 3358                  223.0   3358.0    105.0      182.0   7931.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0                0.0            0.00            0.00               0.0   
 1      

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as one NumPy array.
- The shape follows rules similar to those for DataLoader batches.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run on the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [23]:
# Take every available feature, with time returned as ID_TIME.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-05 19:44:50,776][config][INFO] - Quick validation succeeded.
[2025-08-05 19:44:50,786][config][INFO] - Finalization and validation completed successfully.
[2025-08-05 19:44:50,790][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1673.74it/s]
[2025-08-05 19:44:50,847][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-05 19:44:50,847][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [127 333 269 440 337 ... 166 133 469 232 374], Length=54
        Val time series IDS: [535  84 314 371 353 ... 221 156 428 143 425], Length=25
        Test time series IDS [326 473 168 123  64 174 361 502 349 383], Length=10
        All time series IDS [127 333 269 440 337 ... 174 361 502 349 383], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [24]:
# Whole train set as a single array.
numpy_array = series_based_dataset.get_train_numpy(
    workers="config",
)

display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [25]:
# Whole val set as a single array.
numpy_array = series_based_dataset.get_val_numpy(
    workers="config",
)

display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [26]:
# Whole test set as a single array.
numpy_array = series_based_dataset.get_test_numpy(
    workers="config",
)

display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [27]:
# Whole "all" set as a single array.
numpy_array = series_based_dataset.get_all_numpy(
    workers="config",
)

display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [28]:
# Take every available feature, with time returned as datetimes.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-05 19:44:51,053][config][INFO] - Quick validation succeeded.
[2025-08-05 19:44:51,066][config][INFO] - Finalization and validation completed successfully.
[2025-08-05 19:44:51,070][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1778.69it/s]
[2025-08-05 19:44:51,121][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-05 19:44:51,122][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [447   6 313 523 211 ... 114 303 175 418 379], Length=54
        Val time series IDS: [441  41 286 135  35 ... 297 437 157 202 529], Length=25
        Test time series IDS [449 311 118 377 490 244 145 429 246 533], Length=10
        All time series IDS [447   6 313 523 211 ... 244 145 429 246 533], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [29]:
# With TimeFormat.DATETIME the result is a pair: the data array and a separate array of times.
numpy_array, times = series_based_dataset.get_train_numpy(
    workers="config",
)

display(numpy_array.shape)  # data without the time column
display(times)              # datetime values

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns the time series in the same order as they are set in the config.

In [30]:
# Same setup as the first dataloader example, with the train order set explicitly to SEQUENTIAL.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-05 19:44:51,193][config][INFO] - Quick validation succeeded.
[2025-08-05 19:44:51,204][config][INFO] - Finalization and validation completed successfully.
[2025-08-05 19:44:51,208][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3052.67it/s]
[2025-08-05 19:44:51,239][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-05 19:44:51,240][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [360  54 488 401 108 ... 446 511  58 210  41], Length=54
        Val time series IDS: [207 392 103 443 191 ... 367 254 281 314 134], Length=25
        Test time series IDS [ 62 398 118 144 310  74 371 186 302 435], Length=10
        All time series IDS [360  54 488 401 108 ...  74 371 186 302 435], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

In [31]:
# Fetch the train dataloader.
# workers="config" presumably defers to the worker count stored in the config - TODO confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the loader once, with a progress bar, keeping every batch in order.
batches = list(tqdm(dataloader))

# Show the first batch as the cell output.
batches[0]

[2025-08-05 19:44:51,258][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 114.26it/s]


array([[[3.6000e+02, 0.0000e+00, 2.4090e+03],
        [3.6000e+02, 1.0000e+00, 3.4070e+03],
        [3.6000e+02, 2.0000e+00, 6.5520e+03],
        ...,
        [3.6000e+02, 3.3560e+03, 1.9870e+03],
        [3.6000e+02, 3.3570e+03, 2.9940e+03],
        [3.6000e+02, 3.3580e+03, 1.9560e+03]],

       [[5.4000e+01, 0.0000e+00, 2.1700e+02],
        [5.4000e+01, 1.0000e+00, 2.2600e+02],
        [5.4000e+01, 2.0000e+00, 2.3000e+02],
        ...,
        [5.4000e+01, 3.3560e+03, 2.3200e+02],
        [5.4000e+01, 3.3570e+03, 2.6900e+02],
        [5.4000e+01, 3.3580e+03, 2.6700e+02]],

       [[4.8800e+02, 0.0000e+00, 5.3430e+03],
        [4.8800e+02, 1.0000e+00, 5.3470e+03],
        [4.8800e+02, 2.0000e+00, 4.9960e+03],
        ...,
        [4.8800e+02, 3.3560e+03, 1.2780e+03],
        [4.8800e+02, 3.3570e+03, 1.4440e+03],
        [4.8800e+02, 3.3580e+03, 1.3500e+03]],

       ...,

       [[3.8700e+02, 0.0000e+00, 1.4856e+04],
        [3.8700e+02, 1.0000e+00, 1.9115e+04],
        [3.8700e+02, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same on every run.

In [32]:
# Config for the RANDOM ordering demo: identical to the sequential setup apart
# from the train dataloader order and the explicit random_state.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.RANDOM,
    # No seed is supplied here; the notebook text states that batches repeat
    # only when random_state is set.
    random_state=None,
)

# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-05 19:44:51,286][config][INFO] - Quick validation succeeded.
[2025-08-05 19:44:51,295][config][INFO] - Finalization and validation completed successfully.
[2025-08-05 19:44:51,298][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2773.78it/s]
[2025-08-05 19:44:51,334][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-05 19:44:51,334][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [439 255 398 299 463 ...  15 309 496  25 242], Length=54
        Val time series IDS: [473 350  75 289 253 ... 106 237 327 261 494], Length=25
        Test time series IDS [525 547 144 330 139 465 154 336 485 156], Length=10
        All time series IDS [439 255 398 299 463 ... 465 154 336 485 156], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

In [33]:
# Fetch the train dataloader for the current (RANDOM order) config.
# workers="config" presumably defers to the worker count stored in the config - TODO confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the loader once, with a progress bar, keeping every batch as yielded.
batches = list(tqdm(dataloader))

# Show the first batch as the cell output.
batches[0]

[2025-08-05 19:44:51,352][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 83.01it/s]


array([[[1.21000e+02, 0.00000e+00, 1.26000e+02],
        [1.21000e+02, 1.00000e+00, 1.91000e+02],
        [1.21000e+02, 2.00000e+00, 1.11000e+02],
        ...,
        [1.21000e+02, 3.35600e+03, 9.60000e+01],
        [1.21000e+02, 3.35700e+03, 8.80000e+01],
        [1.21000e+02, 3.35800e+03, 7.10000e+01]],

       [[2.99000e+02, 0.00000e+00, 9.96700e+03],
        [2.99000e+02, 1.00000e+00, 1.12660e+04],
        [2.99000e+02, 2.00000e+00, 1.23010e+04],
        ...,
        [2.99000e+02, 3.35600e+03, 6.98700e+03],
        [2.99000e+02, 3.35700e+03, 7.52700e+03],
        [2.99000e+02, 3.35800e+03, 7.56400e+03]],

       [[4.39000e+02, 0.00000e+00, 1.81700e+03],
        [4.39000e+02, 1.00000e+00, 2.15600e+03],
        [4.39000e+02, 2.00000e+00, 2.13500e+03],
        ...,
        [4.39000e+02, 3.35600e+03, 1.11900e+03],
        [4.39000e+02, 3.35700e+03, 1.25200e+03],
        [4.39000e+02, 3.35800e+03, 1.13000e+03]],

       ...,

       [[6.30000e+01, 0.00000e+00, 1.98000e+02],
        [6.