# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Surface INFO-level records so the library's progress messages appear in the cell outputs.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Series-based view of CESNET-TimeSeries24: institution subnets, 1-hour aggregation.
# NOTE: "/some_directory/" looks like a placeholder; point `data_root` at your own data directory.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,
)

[2025-08-26 09:16:30,100][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time series are in one batch (no effect when loading a specific time series).
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, Numpy array of shape `(time_period)` of time)

In [4]:
# Single feature (n_flows) with integer time ids; every worker count is 0.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 09:16:30,105][series_config][INFO] - Quick validation succeeded.
[2025-08-26 09:16:30,116][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 09:16:30,120][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1346.53it/s]
[2025-08-26 09:16:30,193][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 09:16:30,193][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 55 236   1 240 165 ... 385 388  84 288 480], Length=54
        Val time series IDS: [457 186 282 152 373 ... 444 175 132 245 379], Length=25
        Test time series IDS [381  53 281 238 405 153 380 362  63 366], Length=10
        All time series IDS [ 55 236   1 240 165 ... 153 380 362  63 366], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: the general config-update entry point.
# NOTE(review): "config" presumably keeps the value already stored in the config — confirm in the library docs.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Option 2: the dedicated batch-size setter, called with the same arguments.
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-08-26 09:16:30,197][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-26 09:16:30,198][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-26 09:16:30,198][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-26 09:16:30,199][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-26 09:16:30,199][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: the general config-update entry point.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Option 2: the dedicated worker setter, called with the same arguments.
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-08-26 09:16:30,205][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-26 09:16:30,206][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-26 09:16:30,206][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-26 09:16:30,207][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-26 09:16:30,207][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# NOTE(review): workers="config" presumably defers to `train_workers` from the config — confirm in the library docs.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader into a list; tqdm reports progress per batch.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-08-26 09:16:30,215][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 45.91it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# NOTE(review): workers="config" presumably defers to `val_workers` from the config — confirm in the library docs.
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Drain the dataloader into a list; tqdm reports progress per batch.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-08-26 09:16:30,271][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 50.21it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# NOTE(review): workers="config" presumably defers to `test_workers` from the config — confirm in the library docs.
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Drain the dataloader into a list; tqdm reports progress per batch.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-08-26 09:16:30,301][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 128.92it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# NOTE(review): workers="config" presumably defers to `all_workers` from the config — confirm in the library docs.
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Drain the dataloader into a list; tqdm reports progress per batch.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-08-26 09:16:30,319][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 37.22it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but with time_format=TimeFormat.DATETIME.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 09:16:30,353][series_config][INFO] - Quick validation succeeded.
[2025-08-26 09:16:30,364][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 09:16:30,367][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1677.75it/s]
[2025-08-26 09:16:30,422][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 09:16:30,422][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 13  44 462 269  58 ... 295 204 316 273  49], Length=54
        Val time series IDS: [ 82 437 123 320 191 ... 223 303  50 407 415], Length=25
        Test time series IDS [357 331  80  46  55 168 144 451 510  83], Length=10
        All time series IDS [ 13  44 462 269  58 ... 168 144 451 510  83], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test bat

In [12]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader into a list; with TimeFormat.DATETIME each batch is indexed as (data, time).
batches = list(tqdm(dataloader))

display(batches[0][0].shape)  # data without time
display(batches[0][1].shape)  # time

[2025-08-26 09:16:30,430][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 128.32it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a `ts_id` parameter.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, it returns only the single time series with the specified id.

In [13]:
# Train set given as an explicit list of time series ids; no val/test sets are configured.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 09:16:30,453][series_config][INFO] - Quick validation succeeded.
[2025-08-26 09:16:30,462][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 09:16:30,466][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 2000.86it/s]
[2025-08-26 09:16:30,470][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 09:16:30,470][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All

In [14]:
# Restrict loading to one time series (id 177) from the train set.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Drain the dataloader into a list; tqdm reports progress per batch.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-08-26 09:16:30,477][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set, over the specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
# All features this time; batch sizes are left unset (the DataFrame loaders ignore them).
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 09:16:30,487][series_config][INFO] - Quick validation succeeded.
[2025-08-26 09:16:30,496][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 09:16:30,499][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1585.20it/s]
[2025-08-26 09:16:30,557][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 09:16:30,557][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 70 457 517 208 453 ... 196 277 262 378 469], Length=54
        Val time series IDS: [488 425 113 129 509 ...  78 225 327 183  66], Length=25
        Test time series IDS [139 415 483 232 145 321 205 370 178  98], Length=10
        All time series IDS [ 70 457 517 208 453 ... 321 205 370 178  98], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Train set as one combined DataFrame ...
df = series_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as separate DataFrames (as_single_dataframe=False).
dfs = series_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,70.0,0.0,34541.0,116541.0,16856236.0,18896.0,7.62,3.88,21825.0,8.8,4.72,28076.0,11.32,8.56,0.72998,0.720215,0.459961,0.5,3.68,122.580002
1,70.0,1.0,35648.0,101475.0,11964163.0,19332.0,7.68,3.84,23032.0,9.15,4.93,29240.0,11.62,8.97,0.75,0.72998,0.47998,0.5,3.58,120.169998
2,70.0,2.0,29927.0,85991.0,12593681.0,17189.0,7.0,3.54,19238.0,7.83,4.13,25122.0,10.23,8.0,0.72998,0.709961,0.47998,0.5,3.91,117.760002
3,70.0,3.0,25206.0,72020.0,9261641.0,15308.0,6.3,3.15,16857.0,6.94,3.69,21575.0,8.88,6.61,0.740234,0.72998,0.47998,0.509766,4.28,117.970001
4,70.0,4.0,20274.0,73554.0,25988794.0,13091.0,5.54,2.89,13783.0,5.84,3.08,17806.0,7.54,5.66,0.72998,0.720215,0.5,0.52002,4.05,116.239998
5,70.0,5.0,17214.0,51141.0,5871328.0,11323.0,4.85,2.6,11622.0,4.98,2.7,15418.0,6.6,5.07,0.720215,0.709961,0.5,0.529785,4.52,115.690002
6,70.0,6.0,17459.0,62215.0,9116394.0,11175.0,4.8,2.61,11522.0,4.95,2.69,15170.0,6.51,5.18,0.720215,0.709961,0.5,0.52002,4.51,115.669998
7,70.0,7.0,18243.0,57569.0,9248663.0,11546.0,4.93,2.66,11864.0,5.06,2.76,16175.0,6.9,5.34,0.72998,0.720215,0.509766,0.529785,4.35,114.239998
8,70.0,8.0,18962.0,71590.0,10361839.0,11904.0,5.09,2.94,12792.0,5.46,3.28,16534.0,7.06,5.49,0.720215,0.720215,0.5,0.52002,4.35,113.449997
9,70.0,9.0,20426.0,64409.0,7675545.0,12847.0,5.45,3.26,13465.0,5.71,3.42,18015.0,7.64,6.06,0.740234,0.740234,0.509766,0.540039,4.29,109.470001


In [17]:
# Result of the `as_single_dataframe=False` call in the previous cell; rendered below as a list of DataFrames.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets     n_bytes  \
 0                      70.0      0.0  34541.0   116541.0  16856236.0   
 1                      70.0      1.0  35648.0   101475.0  11964163.0   
 2                      70.0      2.0  29927.0    85991.0  12593681.0   
 3                      70.0      3.0  25206.0    72020.0   9261641.0   
 4                      70.0      4.0  20274.0    73554.0  25988794.0   
 ...                     ...      ...      ...        ...         ...   
 3354                   70.0   3354.0  23542.0    65002.0   6450891.0   
 3355                   70.0   3355.0  25793.0    70041.0   6571362.0   
 3356                   70.0   3356.0  25243.0    63064.0   5869493.0   
 3357                   70.0   3357.0  28878.0    79485.0   7535866.0   
 3358                   70.0   3358.0  27897.0    81503.0   7608691.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0            18896.0            7.62          

#### Val set

- Affected by `val_workers`.

In [18]:
# Val set as one combined DataFrame ...
df = series_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as separate DataFrames (as_single_dataframe=False).
dfs = series_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,488.0,0.0,5343.0,33690.0,11763618.0,321.0,6.98,3.71,3671.0,79.800003,191.759995,537.0,11.67,12.35,0.740234,0.740234,0.469971,0.429932,1.92,159.339996
1,488.0,1.0,5347.0,38956.0,20723615.0,357.0,8.11,3.89,3867.0,87.889999,204.660004,550.0,12.5,10.36,0.709961,0.700195,0.48999,0.439941,2.33,155.610001
2,488.0,2.0,4996.0,32482.0,12304350.0,323.0,7.02,3.74,3396.0,73.830002,177.419998,550.0,11.96,12.23,0.720215,0.720215,0.47998,0.47998,3.16,149.809998
3,488.0,3.0,4298.0,705123.0,995485397.0,318.0,7.23,5.35,2836.0,64.449997,150.169998,581.0,13.2,16.01,0.689941,0.680176,0.469971,0.439941,4.5,156.050003
4,488.0,4.0,3535.0,63417.0,22726193.0,309.0,7.36,6.77,2330.0,55.48,126.040001,591.0,14.07,18.719999,0.740234,0.740234,0.469971,0.419922,4.59,149.75
5,488.0,5.0,3188.0,26381.0,10329641.0,297.0,7.82,7.0,2095.0,55.130001,117.900002,578.0,15.21,19.76,0.709961,0.72998,0.5,0.439941,5.75,146.429993
6,488.0,6.0,3606.0,159412.0,121603977.0,325.0,8.12,8.88,1949.0,48.720001,105.900002,890.0,22.25,38.470001,0.72998,0.72998,0.469971,0.419922,4.3,146.110001
7,488.0,7.0,4017.0,75551.0,54338781.0,317.0,8.34,8.98,2160.0,56.84,121.139999,946.0,24.889999,39.810001,0.790039,0.740234,0.449951,0.409912,2.67,159.979996
8,488.0,8.0,4194.0,354831.0,332140134.0,356.0,8.09,10.2,2136.0,48.549999,112.300003,1007.0,22.889999,41.130001,0.709961,0.700195,0.509766,0.449951,7.15,152.020004
9,488.0,9.0,4320.0,104837.0,69084679.0,319.0,9.11,10.38,2179.0,62.259998,128.160004,1047.0,29.91,48.630001,0.660156,0.629883,0.5,0.449951,4.03,147.369995


In [19]:
# Result of the `as_single_dataframe=False` call in the previous cell; rendered below as a list of DataFrames.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets      n_bytes  \
 0                     488.0      0.0   5343.0    33690.0   11763618.0   
 1                     488.0      1.0   5347.0    38956.0   20723615.0   
 2                     488.0      2.0   4996.0    32482.0   12304350.0   
 3                     488.0      3.0   4298.0   705123.0  995485397.0   
 4                     488.0      4.0   3535.0    63417.0   22726193.0   
 ...                     ...      ...      ...        ...          ...   
 3354                  488.0   3354.0   1312.0    16854.0    8665611.0   
 3355                  488.0   3355.0   1336.0    16041.0    5651840.0   
 3356                  488.0   3356.0   1278.0    16045.0    2089048.0   
 3357                  488.0   3357.0   1444.0    23701.0    8743536.0   
 3358                  488.0   3358.0   1350.0    18585.0    5741758.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0              321.0            6.

#### Test set

- Affected by `test_workers`.

In [20]:
# Test set as one combined DataFrame ...
df = series_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as separate DataFrames (as_single_dataframe=False).
dfs = series_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,139.0,0.0,434.0,16758.0,12849949.0,113.0,18.83,4.4,412.0,68.669998,13.62,299.0,49.830002,2.32,0.990234,0.990234,0.609863,0.950195,8.02,93.82
1,139.0,1.0,379.0,44284.0,50845494.0,111.0,18.5,3.02,368.0,61.330002,5.47,278.0,46.330002,3.83,1.0,1.0,0.779785,0.97998,5.72,91.959999
2,139.0,2.0,499.0,23199.0,19666146.0,117.0,19.5,2.66,487.0,81.169998,31.959999,303.0,50.5,4.68,1.0,1.0,0.620117,0.959961,5.26,85.900002
3,139.0,3.0,386.0,31090.0,28402732.0,130.0,21.67,4.97,376.0,62.669998,14.79,285.0,47.5,7.06,1.0,1.0,0.649902,0.959961,7.94,92.870003
4,139.0,4.0,432.0,75358.0,77984115.0,144.0,24.0,4.77,422.0,70.330002,18.98,356.0,59.330002,13.49,1.0,1.0,0.689941,0.97998,6.48,92.230003
5,139.0,5.0,435.0,43765.0,40946981.0,147.0,24.5,5.5,428.0,71.330002,31.08,358.0,59.669998,24.209999,1.0,1.0,0.700195,0.97998,8.07,94.300003
6,139.0,6.0,426.0,44726.0,45396645.0,153.0,25.5,5.24,420.0,70.0,7.16,323.0,53.830002,6.62,1.0,1.0,0.75,0.97998,8.15,91.400002
7,139.0,7.0,441.0,42765.0,43532492.0,145.0,24.17,2.23,433.0,72.169998,5.23,329.0,54.830002,5.08,1.0,1.0,0.740234,0.97998,8.2,86.360001
8,139.0,8.0,404.0,48138.0,47068580.0,150.0,25.0,4.15,394.0,65.669998,10.11,319.0,53.169998,9.93,1.0,1.0,0.700195,0.97998,10.75,93.730003
9,139.0,9.0,414.0,77986.0,83712839.0,138.0,23.0,5.9,404.0,67.330002,10.73,334.0,55.669998,6.65,1.0,1.0,0.759766,0.97998,10.03,94.150002


#### All set

- Affected by `all_workers`.

In [21]:
# All configured time series as one combined DataFrame ...
df = series_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as separate DataFrames (as_single_dataframe=False).
dfs = series_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,70.0,0.0,34541.0,116541.0,16856236.0,18896.0,7.62,3.88,21825.0,8.8,4.72,28076.0,11.32,8.56,0.72998,0.720215,0.459961,0.5,3.68,122.580002
1,70.0,1.0,35648.0,101475.0,11964163.0,19332.0,7.68,3.84,23032.0,9.15,4.93,29240.0,11.62,8.97,0.75,0.72998,0.47998,0.5,3.58,120.169998
2,70.0,2.0,29927.0,85991.0,12593681.0,17189.0,7.0,3.54,19238.0,7.83,4.13,25122.0,10.23,8.0,0.72998,0.709961,0.47998,0.5,3.91,117.760002
3,70.0,3.0,25206.0,72020.0,9261641.0,15308.0,6.3,3.15,16857.0,6.94,3.69,21575.0,8.88,6.61,0.740234,0.72998,0.47998,0.509766,4.28,117.970001
4,70.0,4.0,20274.0,73554.0,25988794.0,13091.0,5.54,2.89,13783.0,5.84,3.08,17806.0,7.54,5.66,0.72998,0.720215,0.5,0.52002,4.05,116.239998
5,70.0,5.0,17214.0,51141.0,5871328.0,11323.0,4.85,2.6,11622.0,4.98,2.7,15418.0,6.6,5.07,0.720215,0.709961,0.5,0.529785,4.52,115.690002
6,70.0,6.0,17459.0,62215.0,9116394.0,11175.0,4.8,2.61,11522.0,4.95,2.69,15170.0,6.51,5.18,0.720215,0.709961,0.5,0.52002,4.51,115.669998
7,70.0,7.0,18243.0,57569.0,9248663.0,11546.0,4.93,2.66,11864.0,5.06,2.76,16175.0,6.9,5.34,0.72998,0.720215,0.509766,0.529785,4.35,114.239998
8,70.0,8.0,18962.0,71590.0,10361839.0,11904.0,5.09,2.94,12792.0,5.46,3.28,16534.0,7.06,5.49,0.720215,0.720215,0.5,0.52002,4.35,113.449997
9,70.0,9.0,20426.0,64409.0,7675545.0,12847.0,5.45,3.26,13465.0,5.71,3.42,18015.0,7.64,6.06,0.740234,0.740234,0.509766,0.540039,4.29,109.470001


In [22]:
# Result of the `as_single_dataframe=False` call in the previous cell; rendered below as a list of DataFrames.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets     n_bytes  \
 0                      70.0      0.0  34541.0   116541.0  16856236.0   
 1                      70.0      1.0  35648.0   101475.0  11964163.0   
 2                      70.0      2.0  29927.0    85991.0  12593681.0   
 3                      70.0      3.0  25206.0    72020.0   9261641.0   
 4                      70.0      4.0  20274.0    73554.0  25988794.0   
 ...                     ...      ...      ...        ...         ...   
 3354                   70.0   3354.0  23542.0    65002.0   6450891.0   
 3355                   70.0   3355.0  25793.0    70041.0   6571362.0   
 3356                   70.0   3356.0  25243.0    63064.0   5869493.0   
 3357                   70.0   3357.0  28878.0    79485.0   7535866.0   
 3358                   70.0   3358.0  27897.0    81503.0   7608691.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0            18896.0            7.62          

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set, over the specified `time_period`.
- Data is returned as one NumPy array.
- The array shape follows rules similar to those for DataLoader batches.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [23]:
# All features with integer time ids; batch sizes are left unset (the NumPy loaders ignore them).
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 09:16:31,616][series_config][INFO] - Quick validation succeeded.
[2025-08-26 09:16:31,627][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 09:16:31,630][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1631.85it/s]
[2025-08-26 09:16:31,688][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 09:16:31,688][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [  9 497 358 142 256 ...  88 445 522 211  83], Length=54
        Val time series IDS: [345 328 214 137 388 ... 529 202 311  15 124], Length=25
        Test time series IDS [213 362   6  33  74 133 455  61 123 316], Length=10
        All time series IDS [  9 497 358 142 256 ... 133 455  61 123 316], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [24]:
# Load the whole train set as a single array.
# NOTE(review): workers="config" presumably defers to `train_workers` — confirm in the library docs.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# Per the notes above, the shape is (time series, time steps, features + id columns).
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [25]:
# Load the whole val set as a single array.
# NOTE(review): workers="config" presumably defers to `val_workers` — confirm in the library docs.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# Per the notes above, the shape is (time series, time steps, features + id columns).
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [26]:
# Load the whole test set as a single array.
# NOTE(review): workers="config" presumably defers to `test_workers` — confirm in the library docs.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# Per the notes above, the shape is (time series, time steps, features + id columns).
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [27]:
# Load every configured time series as a single array.
# NOTE(review): workers="config" presumably defers to `all_workers` — confirm in the library docs.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# Per the notes above, the shape is (time series, time steps, features + id columns).
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [28]:
# All features again, now with time_format=TimeFormat.DATETIME.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 09:16:31,854][series_config][INFO] - Quick validation succeeded.
[2025-08-26 09:16:31,917][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 09:16:31,921][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1993.88it/s]
[2025-08-26 09:16:31,967][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 09:16:31,967][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [416 214 521 123 440 ... 523  98 253 384 500], Length=54
        Val time series IDS: [406 507 506 302 199 ... 116  27 483 225 137], Length=25
        Test time series IDS [401 434 361  80 427  22 505 270  10 317], Length=10
        All time series IDS [416 214 521 123 440 ...  22 505 270  10 317], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [29]:
# With TimeFormat.DATETIME the call is unpacked into two values: the data array and a separate array of datetimes.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

# The data array has one column fewer than in the ID_TIME example, because time is returned separately.
display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns data in the same order as the time series are set in the config.

In [30]:
# Config demonstrating the default SEQUENTIAL train dataloader order.
config = SeriesBasedConfig(
    # Time range and how many time series go into each set.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    # Keep a single feature so the printed batches stay readable.
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch size per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Train time series are served in the order they are set in the config.
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 09:16:32,021][series_config][INFO] - Quick validation succeeded.
[2025-08-26 09:16:32,031][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 09:16:32,035][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3137.23it/s]
[2025-08-26 09:16:32,065][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 09:16:32,066][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [182 259 515 328 443 ...  56 461 407 369 506], Length=54
        Val time series IDS: [238   5 130  30 486 ...  73 495 241 519 425], Length=25
        Test time series IDS [219 218 375 418 510  15 337 426 174 363], Length=10
        All time series IDS [182 259 515 328 443 ...  15 337 426 174 363], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [31]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# First batch: with SEQUENTIAL order its time series follow the config order.
batches[0]

[2025-08-26 09:16:32,073][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 121.14it/s]


array([[[1.8200e+02, 0.0000e+00, 5.1600e+02],
        [1.8200e+02, 1.0000e+00, 9.9400e+02],
        [1.8200e+02, 2.0000e+00, 2.3790e+03],
        ...,
        [1.8200e+02, 3.3560e+03, 8.3600e+02],
        [1.8200e+02, 3.3570e+03, 1.2140e+03],
        [1.8200e+02, 3.3580e+03, 6.2700e+02]],

       [[2.5900e+02, 0.0000e+00, 7.0000e+01],
        [2.5900e+02, 1.0000e+00, 5.0000e+01],
        [2.5900e+02, 2.0000e+00, 4.2000e+01],
        ...,
        [2.5900e+02, 3.3560e+03, 1.8000e+02],
        [2.5900e+02, 3.3570e+03, 5.9000e+01],
        [2.5900e+02, 3.3580e+03, 7.6000e+01]],

       [[5.1500e+02, 0.0000e+00, 1.0750e+03],
        [5.1500e+02, 1.0000e+00, 1.1190e+03],
        [5.1500e+02, 2.0000e+00, 1.0170e+03],
        ...,
        [5.1500e+02, 3.3560e+03, 7.8400e+02],
        [5.1500e+02, 3.3570e+03, 9.1800e+02],
        [5.1500e+02, 3.3580e+03, 8.3000e+02]],

       ...,

       [[2.6100e+02, 0.0000e+00, 3.8224e+04],
        [2.6100e+02, 1.0000e+00, 4.0583e+04],
        [2.6100e+02, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same each time.

In [32]:
# Config demonstrating the RANDOM train dataloader order.
config = SeriesBasedConfig(
    # Time range and how many time series go into each set.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    # Keep a single feature so the printed batches stay readable.
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch size per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Train batches are built from shuffled time series.
    train_dataloader_order=DataloaderOrder.RANDOM,
    # No random state is set here; set one to get the same batches each time.
    random_state=None,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-26 09:16:32,098][series_config][INFO] - Quick validation succeeded.
[2025-08-26 09:16:32,107][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-26 09:16:32,110][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3119.22it/s]
[2025-08-26 09:16:32,142][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-26 09:16:32,142][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [431 448 537 303 501 ... 314 365  75 159 438], Length=54
        Val time series IDS: [514 457 456 463 241 ... 505 504  15  94  20], Length=25
        Test time series IDS [481 525 325 102 113 418 198 380  22 363], Length=10
        All time series IDS [431 448 537 303 501 ... 418 198 380  22 363], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [33]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# First batch: with RANDOM order its time series are shuffled.
batches[0]

[2025-08-26 09:16:32,151][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 99.92it/s]


array([[[2.0600e+02, 0.0000e+00, 2.3597e+04],
        [2.0600e+02, 1.0000e+00, 2.5923e+04],
        [2.0600e+02, 2.0000e+00, 2.4718e+04],
        ...,
        [2.0600e+02, 3.3560e+03, 3.0484e+04],
        [2.0600e+02, 3.3570e+03, 3.8880e+04],
        [2.0600e+02, 3.3580e+03, 3.3151e+04]],

       [[9.2000e+01, 0.0000e+00, 6.3000e+02],
        [9.2000e+01, 1.0000e+00, 7.0000e+02],
        [9.2000e+01, 2.0000e+00, 5.7100e+02],
        ...,
        [9.2000e+01, 3.3560e+03, 8.2000e+02],
        [9.2000e+01, 3.3570e+03, 6.9900e+02],
        [9.2000e+01, 3.3580e+03, 7.6500e+02]],

       [[3.0300e+02, 0.0000e+00, 1.4976e+04],
        [3.0300e+02, 1.0000e+00, 1.5967e+04],
        [3.0300e+02, 2.0000e+00, 1.7295e+04],
        ...,
        [3.0300e+02, 3.3560e+03, 1.9844e+04],
        [3.0300e+02, 3.3570e+03, 1.9258e+04],
        [3.0300e+02, 3.3580e+03, 1.8608e+04]],

       ...,

       [[2.4500e+02, 0.0000e+00, 1.2100e+02],
        [2.4500e+02, 1.0000e+00, 1.3300e+02],
        [2.4500e+02, 2