# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Show INFO-level (and above) records as "[time][logger][level] - message".
LOG_FORMAT = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

### Preparing dataset

In [3]:
# Series-based view of CESNET-TimeSeries24: institution subnets aggregated per hour.
# "/some_directory/" is a placeholder path; replace it with your own data directory.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,
)

[2025-09-14 14:41:17,735][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped.
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time series are in one batch (it has no effect when loading a specific time series).
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, the batch is one NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period,)` holding the time values).

In [4]:
config = SeriesBasedConfig(
    # Data selection: 54/25/10 time series for train/val/test, only the "n_flows" feature.
    # time_period=0.5 shows up as range(0, 3359) in the printed config details.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker processes per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:41:17,740][series_config][INFO] - Quick validation succeeded.
[2025-09-14 14:41:17,751][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:41:17,756][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1140.78it/s]
[2025-09-14 14:41:17,843][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 14:41:17,843][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [342 343 439 210 468 ... 104 272 489 364 428], Length=54
        Val time series IDS: [280 405 225 162 357 ...   3 267 295 318 245], Length=25
        Test time series IDS [455 427  66 485 363 341 534 501 260 187], Length=10
        All time series IDS [342 343 439 210 468 ... 341 534 501 260 187], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# The same batch-size changes, applied through either entry point.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm.
new_batch_sizes = dict(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

series_based_dataset.update_dataset_config_and_initialize(**new_batch_sizes)
# Or
series_based_dataset.set_batch_sizes(**new_batch_sizes)

[2025-09-14 14:41:17,856][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:41:17,857][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:41:17,859][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:41:17,860][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:41:17,861][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# The same worker settings, applied through either entry point.
worker_settings = dict(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

series_based_dataset.update_dataset_config_and_initialize(**worker_settings)
# Or
series_based_dataset.set_workers(**worker_settings)

[2025-09-14 14:41:17,870][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:41:17,871][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:41:17,873][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-14 14:41:17,873][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-14 14:41:17,874][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# workers="config" uses the worker count stored in the config for the train set.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Run through the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (time series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-14 14:41:17,884][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 40.77it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# workers="config" uses the worker count stored in the config for the val set.
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Run through the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (time series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-14 14:41:17,946][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 47.56it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# workers="config" uses the worker count stored in the config for the test set.
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Run through the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (time series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-14 14:41:17,978][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 108.00it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# workers="config" uses the worker count stored in the config for the all set.
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Run through the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (time series in batch, time steps, columns).
display(batches[0].shape)

[2025-09-14 14:41:17,998][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 37.70it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
config = SeriesBasedConfig(
    # Same selection as the ID_TIME example, but time is returned as datetime values.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Worker processes per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:41:18,032][series_config][INFO] - Quick validation succeeded.
[2025-09-14 14:41:18,044][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:41:18,048][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1743.81it/s]
[2025-09-14 14:41:18,100][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 14:41:18,101][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [108 472 313 256 382 ...  56 243 393 385 189], Length=54
        Val time series IDS: [455  58 500 493 247 ...   0 250 480 374 135], Length=25
        Test time series IDS [359 405 192 222 151 531 418 526 138  14], Length=10
        All time series IDS [108 472 313 256 382 ... 531 418 526 138  14], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Tes

In [12]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Run through the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME a batch holds the data at index 0 and the time values at index 1.
first_batch = batches[0]
display(first_batch[0].shape) # data without time
display(first_batch[1].shape) # time

[2025-09-14 14:41:18,109][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 113.21it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a `ts_id` parameter.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the single time series with the specified id.

In [13]:
config = SeriesBasedConfig(
    # Explicit train time series ids; val and test sets are left unset.
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker processes per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:41:18,134][series_config][INFO] - Quick validation succeeded.
[2025-09-14 14:41:18,143][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:41:18,147][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 2001.10it/s]
[2025-09-14 14:41:18,150][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 14:41:18,151][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
      

In [14]:
# Restrict the train dataloader to the single time series with id 177.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Run through the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

display(batches[0].shape)

[2025-09-14 14:41:18,159][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 999.83it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set over the specified `time_period`.
- Data is returned as a pandas DataFrame (or as one DataFrame per time series when `as_single_dataframe=False`).
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
config = SeriesBasedConfig(
    # 54/25/10 time series for train/val/test, with every available feature.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker processes per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:41:18,166][series_config][INFO] - Quick validation succeeded.
[2025-09-14 14:41:18,175][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:41:18,179][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1574.23it/s]
[2025-09-14 14:41:18,239][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 14:41:18,239][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [413 129 172 400 366 ...  12  24 265 397 497], Length=54
        Val time series IDS: [307  11 138 237 107 ... 419 354 387  91 450], Length=25
        Test time series IDS [120  21 244 513 228  42 250 441 358 529], Length=10
        All time series IDS [413 129 172 400 366 ...  42 250 441 358 529], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# All train time series stacked into one DataFrame ...
df = series_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... versus a collection with one DataFrame per time series.
dfs = series_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,413.0,0.0,4175.0,188154.0,75528940.0,717.0,16.67,15.57,495.0,11.51,12.05,2208.0,51.349998,64.260002,0.839844,0.850098,0.439941,0.360107,12.49,144.279999
1,413.0,1.0,7987.0,929424.0,603463200.0,989.0,19.389999,21.950001,831.0,16.290001,17.200001,3544.0,69.489998,106.870003,0.839844,0.850098,0.52002,0.5,6.97,139.380005
2,413.0,2.0,14870.0,1974697.0,1544723000.0,1315.0,29.889999,36.669998,512.0,11.64,11.51,6098.0,138.589996,216.690002,0.890137,0.899902,0.459961,0.389893,6.56,143.300003
3,413.0,3.0,16629.0,2572737.0,1755707000.0,1474.0,33.5,42.200001,464.0,10.55,12.1,7085.0,161.020004,247.380005,0.850098,0.859863,0.540039,0.449951,6.3,148.279999
4,413.0,4.0,14207.0,1819461.0,1288797000.0,1427.0,32.43,39.970001,376.0,8.55,8.94,6616.0,150.360001,225.740005,0.830078,0.830078,0.469971,0.389893,16.9,136.410004
5,413.0,5.0,13232.0,1461740.0,1023883000.0,1446.0,28.35,38.02,492.0,9.65,11.22,6444.0,126.349998,210.259995,0.790039,0.799805,0.459961,0.409912,13.58,136.350006
6,413.0,6.0,14329.0,1728942.0,1084642000.0,1420.0,30.209999,38.860001,535.0,11.38,12.61,6783.0,144.320007,230.830002,0.910156,0.919922,0.47998,0.439941,12.97,135.300003
7,413.0,7.0,12652.0,1668491.0,1377631000.0,1350.0,32.139999,38.139999,390.0,9.29,8.69,6231.0,148.360001,218.699997,0.810059,0.810059,0.449951,0.370117,14.84,131.130005
8,413.0,8.0,12563.0,1624366.0,1347698000.0,1384.0,32.950001,39.349998,337.0,8.02,8.43,6175.0,147.020004,215.990005,0.810059,0.819824,0.509766,0.419922,12.67,136.479996
9,413.0,9.0,13348.0,1579630.0,1283445000.0,1434.0,35.849998,42.799999,348.0,8.7,9.02,6506.0,162.649994,232.169998,0.839844,0.859863,0.48999,0.379883,11.62,138.830002


In [17]:
len(dfs) # one DataFrame per time series, so this is the number of train time series

54

#### Val set

- Affected by `val_workers`.

In [18]:
# All val time series stacked into one DataFrame ...
df = series_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... versus a collection with one DataFrame per time series.
dfs = series_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,307.0,0.0,11718.0,758957.0,674056800.0,2473.0,7.49,5.6,3737.0,11.32,17.5,5481.0,16.610001,24.950001,0.709961,0.720215,0.429932,0.48999,8.51,136.110001
1,307.0,1.0,11968.0,829743.0,664436300.0,2586.0,7.91,6.14,3784.0,11.57,17.459999,5831.0,17.83,25.76,0.669922,0.680176,0.409912,0.449951,8.64,137.429993
2,307.0,2.0,11009.0,874128.0,675415600.0,2494.0,7.36,5.95,3553.0,10.48,16.389999,5689.0,16.780001,27.73,0.669922,0.689941,0.419922,0.459961,8.86,135.169998
3,307.0,3.0,10499.0,1249875.0,750990400.0,2184.0,7.28,7.69,3110.0,10.37,16.32,5335.0,17.780001,35.200001,0.649902,0.660156,0.449951,0.48999,8.48,127.300003
4,307.0,4.0,12152.0,2471160.0,1696159000.0,2202.0,7.2,10.27,2849.0,9.31,14.23,6475.0,21.16,61.299999,0.669922,0.689941,0.449951,0.48999,10.0,128.720001
5,307.0,5.0,16806.0,2233975.0,1865119000.0,2492.0,8.06,15.17,3031.0,9.81,15.43,8778.0,28.41,98.699997,0.720215,0.740234,0.459961,0.509766,9.15,126.25
6,307.0,6.0,17983.0,2425362.0,1776008000.0,2522.0,8.55,18.43,2718.0,9.21,14.21,8864.0,30.049999,107.150002,0.700195,0.720215,0.47998,0.52002,10.39,127.550003
7,307.0,7.0,19398.0,3747747.0,2384432000.0,2678.0,8.42,17.030001,2941.0,9.25,13.33,9292.0,29.219999,103.339996,0.680176,0.689941,0.449951,0.5,9.25,132.720001
8,307.0,8.0,17343.0,6151914.0,3486004000.0,2724.0,8.49,18.139999,3386.0,10.55,21.02,9023.0,28.110001,98.510002,0.649902,0.669922,0.469971,0.5,9.58,120.900002
9,307.0,9.0,18931.0,3999605.0,1914538000.0,3192.0,9.56,24.200001,3377.0,10.11,19.459999,9952.0,29.799999,112.470001,0.649902,0.660156,0.459961,0.5,9.78,129.089996


In [19]:
len(dfs) # one DataFrame per time series, so this is the number of val time series

25

#### Test set

- Affected by `test_workers`.

In [20]:
# All test time series stacked into one DataFrame ...
df = series_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... versus a collection with one DataFrame per time series.
dfs = series_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,120.0,0.0,990.0,1462.0,73117.0,669.0,5.4,2.16,695.0,5.6,2.3,923.0,7.44,4.58,0.709961,0.669922,0.449951,0.449951,2.22,142.410004
1,120.0,1.0,969.0,1634.0,96248.0,699.0,5.55,2.17,698.0,5.54,2.16,898.0,7.13,3.86,0.669922,0.609863,0.419922,0.429932,3.8,137.160004
2,120.0,2.0,830.0,1206.0,60113.0,567.0,4.65,1.96,581.0,4.76,2.03,764.0,6.26,3.62,0.700195,0.669922,0.419922,0.409912,1.77,137.429993
3,120.0,3.0,424.0,632.0,32939.0,270.0,4.22,1.72,259.0,4.05,1.47,394.0,6.16,4.24,0.580078,0.549805,0.409912,0.409912,4.04,143.619995
4,120.0,4.0,238.0,406.0,22290.0,159.0,3.79,1.73,155.0,3.69,1.76,215.0,5.12,3.06,0.569824,0.52002,0.399902,0.389893,5.35,154.5
5,120.0,5.0,412.0,694.0,35341.0,327.0,3.55,1.57,331.0,3.6,1.72,393.0,4.27,2.46,0.75,0.720215,0.48999,0.48999,2.6,132.419998
6,120.0,6.0,402.0,631.0,32049.0,306.0,3.44,1.57,318.0,3.57,1.78,384.0,4.31,2.68,0.72998,0.700195,0.439941,0.419922,1.28,143.850006
7,120.0,7.0,490.0,721.0,33649.0,394.0,3.94,1.71,409.0,4.09,1.79,466.0,4.66,2.5,0.779785,0.75,0.439941,0.439941,1.49,139.059998
8,120.0,8.0,364.0,612.0,32794.0,287.0,3.19,1.31,288.0,3.2,1.46,353.0,3.92,2.25,0.689941,0.629883,0.509766,0.540039,1.28,133.539993
9,120.0,9.0,458.0,751.0,39657.0,344.0,3.55,1.5,356.0,3.67,1.62,430.0,4.43,2.69,0.700195,0.640137,0.47998,0.469971,1.82,133.529999


In [21]:
len(dfs) # one DataFrame per time series, so this is the number of test time series

10

#### All set

- Affected by `all_workers`.

In [22]:
# Every time series of the all set stacked into one DataFrame ...
df = series_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
# ... versus a collection with one DataFrame per time series.
dfs = series_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,413.0,0.0,4175.0,188154.0,75528940.0,717.0,16.67,15.57,495.0,11.51,12.05,2208.0,51.349998,64.260002,0.839844,0.850098,0.439941,0.360107,12.49,144.279999
1,413.0,1.0,7987.0,929424.0,603463200.0,989.0,19.389999,21.950001,831.0,16.290001,17.200001,3544.0,69.489998,106.870003,0.839844,0.850098,0.52002,0.5,6.97,139.380005
2,413.0,2.0,14870.0,1974697.0,1544723000.0,1315.0,29.889999,36.669998,512.0,11.64,11.51,6098.0,138.589996,216.690002,0.890137,0.899902,0.459961,0.389893,6.56,143.300003
3,413.0,3.0,16629.0,2572737.0,1755707000.0,1474.0,33.5,42.200001,464.0,10.55,12.1,7085.0,161.020004,247.380005,0.850098,0.859863,0.540039,0.449951,6.3,148.279999
4,413.0,4.0,14207.0,1819461.0,1288797000.0,1427.0,32.43,39.970001,376.0,8.55,8.94,6616.0,150.360001,225.740005,0.830078,0.830078,0.469971,0.389893,16.9,136.410004
5,413.0,5.0,13232.0,1461740.0,1023883000.0,1446.0,28.35,38.02,492.0,9.65,11.22,6444.0,126.349998,210.259995,0.790039,0.799805,0.459961,0.409912,13.58,136.350006
6,413.0,6.0,14329.0,1728942.0,1084642000.0,1420.0,30.209999,38.860001,535.0,11.38,12.61,6783.0,144.320007,230.830002,0.910156,0.919922,0.47998,0.439941,12.97,135.300003
7,413.0,7.0,12652.0,1668491.0,1377631000.0,1350.0,32.139999,38.139999,390.0,9.29,8.69,6231.0,148.360001,218.699997,0.810059,0.810059,0.449951,0.370117,14.84,131.130005
8,413.0,8.0,12563.0,1624366.0,1347698000.0,1384.0,32.950001,39.349998,337.0,8.02,8.43,6175.0,147.020004,215.990005,0.810059,0.819824,0.509766,0.419922,12.67,136.479996
9,413.0,9.0,13348.0,1579630.0,1283445000.0,1434.0,35.849998,42.799999,348.0,8.7,9.02,6506.0,162.649994,232.169998,0.839844,0.859863,0.48999,0.379883,11.62,138.830002


In [23]:
len(dfs) # one DataFrame per time series, so this is the number of time series in the all set

89

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set over the specified `time_period`.
- Data is returned as one NumPy array.
- The shape follows rules similar to those for DataLoader batches.
- Workers determine how many processes are used for loading data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [24]:
config = SeriesBasedConfig(
    # 54/25/10 time series for train/val/test, with every available feature.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker processes per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:41:18,592][series_config][INFO] - Quick validation succeeded.
[2025-09-14 14:41:18,602][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:41:18,605][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1727.21it/s]
[2025-09-14 14:41:18,660][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 14:41:18,661][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [104 171 511  83 461 ... 160 255 490 309 123], Length=54
        Val time series IDS: [ 35 472 144 536 217 ... 180 340 254 265  10], Length=25
        Test time series IDS [304 105 430 537  74 154 267 495 520 434], Length=10
        All time series IDS [104 171 511  83 461 ... 154 267 495 520 434], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [25]:
# Whole train set as one array; workers="config" uses the worker count stored in the config.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# Shape is (time series, time steps, columns); here the columns are the 18 features plus the id and time columns.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [26]:
# Whole val set as one array; workers="config" uses the worker count stored in the config.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# Shape is (time series, time steps, columns); here the columns are the 18 features plus the id and time columns.
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [27]:
# Whole test set as one array; workers="config" uses the worker count stored in the config.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# Shape is (time series, time steps, columns); here the columns are the 18 features plus the id and time columns.
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [28]:
# Whole all set as one array; workers="config" uses the worker count stored in the config.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# Shape is (time series, time steps, columns); here the columns are the 18 features plus the id and time columns.
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [29]:
config = SeriesBasedConfig(
    # Every available feature, with time returned as datetime values.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker processes per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:41:18,879][series_config][INFO] - Quick validation succeeded.
[2025-09-14 14:41:18,892][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:41:18,896][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1955.47it/s]
[2025-09-14 14:41:18,945][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 14:41:18,945][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [367 547 531 332 432 ... 419  74 211 382 254], Length=54
        Val time series IDS: [443 426 479 351 138 ... 406 207 475  71 112], Length=25
        Test time series IDS [ 96 248  89 526  95 116 480 376 362 272], Length=10
        All time series IDS [367 547 531 332 432 ... 116 480 376 362 272], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [30]:
# With TimeFormat.DATETIME the call returns the data array and a separate array of time values.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

# The data array has one column fewer than with ID_TIME because time is no longer part of it.
display(numpy_array.shape)
# One datetime per time step of the selected time period.
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns the time series in the same order as they are set in the config.

In [31]:
config = SeriesBasedConfig(
    # Data selection.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker processes per set (0 = load in the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Train data is returned in the order the time series are set in the config.
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:41:18,998][series_config][INFO] - Quick validation succeeded.
[2025-09-14 14:41:19,009][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:41:19,012][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2967.33it/s]
[2025-09-14 14:41:19,044][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 14:41:19,044][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [277 262 372  67 544 ... 450 220 138  83  27], Length=54
        Val time series IDS: [345 464 362 484 452 ... 123  93 377  59 234], Length=25
        Test time series IDS [265 250 137 311 368  85 443 290 522 271], Length=10
        All time series IDS [277 262 372  67 544 ...  85 443 290 522 271], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test

In [32]:
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Run through the dataloader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Show the first batch itself so the order of the time series ids is visible.
batches[0]

[2025-09-14 14:41:19,053][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 114.22it/s]


array([[[2.7700e+02, 0.0000e+00, 1.2246e+04],
        [2.7700e+02, 1.0000e+00, 2.2513e+04],
        [2.7700e+02, 2.0000e+00, 3.1937e+04],
        ...,
        [2.7700e+02, 3.3560e+03, 7.8430e+03],
        [2.7700e+02, 3.3570e+03, 7.7100e+03],
        [2.7700e+02, 3.3580e+03, 7.1210e+03]],

       [[2.6200e+02, 0.0000e+00, 4.0400e+02],
        [2.6200e+02, 1.0000e+00, 4.2000e+02],
        [2.6200e+02, 2.0000e+00, 3.7100e+02],
        ...,
        [2.6200e+02, 3.3560e+03, 3.0100e+02],
        [2.6200e+02, 3.3570e+03, 3.9000e+02],
        [2.6200e+02, 3.3580e+03, 3.4100e+02]],

       [[3.7200e+02, 0.0000e+00, 5.1225e+04],
        [3.7200e+02, 1.0000e+00, 5.1596e+04],
        [3.7200e+02, 2.0000e+00, 5.0173e+04],
        ...,
        [3.7200e+02, 3.3560e+03, 4.0853e+04],
        [3.7200e+02, 3.3570e+03, 4.6939e+04],
        [3.7200e+02, 3.3580e+03, 4.6034e+04]],

       ...,

       [[1.8300e+02, 0.0000e+00, 3.8000e+01],
        [1.8300e+02, 1.0000e+00, 4.2000e+01],
        [1.8300e+02, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same on every run.

In [33]:
# Series-based config whose train dataloader uses DataloaderOrder.RANDOM.
config = SeriesBasedConfig(
    # What to load: time period, number of series per set, features and time format.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts (0 everywhere).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Order of the train dataloader; random_state is explicitly None here, i.e. not set.
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resulting configuration details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-14 14:41:19,078][series_config][INFO] - Quick validation succeeded.
[2025-09-14 14:41:19,088][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-14 14:41:19,092][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2897.74it/s]
[2025-09-14 14:41:19,125][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-14 14:41:19,125][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [346 387 177 475 236 ... 411 349 322  93 406], Length=54
        Val time series IDS: [532 350 184 108 304 ...  97 479 421 197 303], Length=25
        Test time series IDS [541 206 115 453 294 525 538 510 545  32], Length=10
        All time series IDS [346 387 177 475 236 ... 525 538 510 545  32], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: no_filler
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test

In [34]:
# Ask the dataset for its train dataloader. workers="config" is passed through as-is;
# presumably it means "use the worker count stored in the config" -- TODO confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once (with a progress bar) and keep every batch in memory.
batches = list(tqdm(dataloader))

# Show the first batch as the cell output.
batches[0]

[2025-09-14 14:41:19,134][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 97.82it/s]


array([[[1.4000e+01, 0.0000e+00, 5.0480e+03],
        [1.4000e+01, 1.0000e+00, 5.5120e+03],
        [1.4000e+01, 2.0000e+00, 1.2028e+04],
        ...,
        [1.4000e+01, 3.3560e+03, 8.2053e+04],
        [1.4000e+01, 3.3570e+03, 9.6078e+04],
        [1.4000e+01, 3.3580e+03, 9.1393e+04]],

       [[2.8700e+02, 0.0000e+00, 7.7910e+03],
        [2.8700e+02, 1.0000e+00, 1.0505e+04],
        [2.8700e+02, 2.0000e+00, 3.9630e+04],
        ...,
        [2.8700e+02, 3.3560e+03, 9.1190e+03],
        [2.8700e+02, 3.3570e+03, 9.8320e+03],
        [2.8700e+02, 3.3580e+03, 9.4430e+03]],

       [[4.1100e+02, 0.0000e+00, 1.5310e+03],
        [4.1100e+02, 1.0000e+00, 1.9940e+03],
        [4.1100e+02, 2.0000e+00, 4.2270e+03],
        ...,
        [4.1100e+02, 3.3560e+03, 2.3940e+03],
        [4.1100e+02, 3.3570e+03, 1.7420e+03],
        [4.1100e+02, 3.3580e+03, 1.6350e+03]],

       ...,

       [[1.7700e+02, 0.0000e+00, 1.3840e+03],
        [1.7700e+02, 1.0000e+00, 2.1160e+03],
        [1.7700e+02, 2