# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Configure the root logger: show INFO and above, and prefix every message
# with its timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Create the series-based dataset object used by every cell below.
# `data_root` is a placeholder path here - replace it with your own directory.
# `display_details=True` prints the "Dataset details" summary shown in the output.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,
)

[2025-08-31 12:06:39,344][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the parameter `workers`.
- Batch size affects how many time series will be in one batch (it has no effect when loading a specific time series).
- A batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, the batch is one NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period,)` holding the time values)

In [4]:
# Config for the DataLoader examples: a single feature, time given as id_time.
config = SeriesBasedConfig(
    # Which data to use
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load in the main process)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Default batch sizes per set (number of time series per batch)
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config; `display_config_details=True` prints the "Config Details" summary.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:06:39,350][series_config][INFO] - Quick validation succeeded.
[2025-08-31 12:06:39,360][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:06:39,364][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1489.06it/s]
[2025-08-31 12:06:39,432][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-31 12:06:39,433][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [151 473 411  16 277 ... 115 254 489  27  61], Length=54
        Val time series IDS: [323 350 491 438 242 ... 466 374  59 210 164], Length=25
        Test time series IDS [ 68 206 534 310 396 169 275 165 385  39], Length=10
        All time series IDS [151 473 411  16 277 ... 169 275 165 385  39], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Two interchangeable ways of changing batch sizes on an already initialized dataset.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Or
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-08-31 12:06:39,437][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:06:39,438][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:06:39,438][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:06:39,438][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:06:39,439][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Two interchangeable ways of changing worker counts on an already initialized dataset.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Or
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-08-31 12:06:39,444][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:06:39,444][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:06:39,445][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-31 12:06:39,446][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-31 12:06:39,446][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# workers="config" uses the worker count stored in the config (train_workers).
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Walk the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (batch_size, time_period, features_to_take + used ids)
display(batches[0].shape)

[2025-08-31 12:06:39,454][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 47.96it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# workers="config" uses the worker count stored in the config (val_workers).
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Walk the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (batch_size, time_period, features_to_take + used ids)
display(batches[0].shape)

[2025-08-31 12:06:39,508][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 52.61it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# workers="config" uses the worker count stored in the config (test_workers).
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Walk the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (batch_size, time_period, features_to_take + used ids)
display(batches[0].shape)

[2025-08-31 12:06:39,538][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 133.36it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# workers="config" uses the worker count stored in the config (all_workers).
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Walk the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Shape of the first batch: (batch_size, time_period, features_to_take + used ids)
display(batches[0].shape)

[2025-08-31 12:06:39,554][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 36.33it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the earlier DataLoader example, but time is returned as datetimes.
config = SeriesBasedConfig(
    # Which data to use
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Default worker counts per set (0 = load in the main process)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Default batch sizes per set (number of time series per batch)
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config; `display_config_details=True` prints the "Config Details" summary.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:06:39,588][series_config][INFO] - Quick validation succeeded.
[2025-08-31 12:06:39,603][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:06:39,609][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1520.00it/s]
[2025-08-31 12:06:39,670][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-31 12:06:39,671][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [345 508 298 457 397 ... 502 330 431 495 163], Length=54
        Val time series IDS: [ 53 398 166 266 378 ... 221 302 323 108  10], Length=25
        Test time series IDS [ 31 197  21 453 373 418 138 300 511 327], Length=10
        All time series IDS [345 508 298 457 397 ... 418 138 300 511 327], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test bat

In [12]:
# workers="config" uses the worker count stored in the config (train_workers).
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Walk the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME each batch is a pair: element 0 holds the data,
# element 1 holds the time values.
display(batches[0][0].shape)  # data without time
display(batches[0][1].shape)  # time

[2025-08-31 12:06:39,678][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 117.58it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, the dataloader returns only the one time series with the specified id.

In [13]:
# Train set given as an explicit list of time series ids; no val/test sets are configured.
config = SeriesBasedConfig(
    # Which data to use
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load in the main process)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Default batch sizes per set (number of time series per batch)
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)

# Apply the config; `display_config_details=True` prints the "Config Details" summary.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:06:39,702][series_config][INFO] - Quick validation succeeded.
[2025-08-31 12:06:39,712][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:06:39,715][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 1140.76it/s]
[2025-08-31 12:06:39,720][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-31 12:06:39,721][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All

In [14]:
# Passing `ts_id` restricts the loader to that single time series.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")

# Walk the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# The leading dimension of the batch is 1: only the requested series is returned.
display(batches[0].shape)

[2025-08-31 12:06:39,728][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 1001.03it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_df` with the parameter `workers`.

In [15]:
# Config for the DataFrame examples: all features, no batch sizes
# (batch size has no effect when loading DataFrames).
config = SeriesBasedConfig(
    # Which data to use
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load in the main process)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config; `display_config_details=True` prints the "Config Details" summary.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:06:39,735][series_config][INFO] - Quick validation succeeded.
[2025-08-31 12:06:39,745][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:06:39,748][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 893.91it/s]
[2025-08-31 12:06:39,850][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-31 12:06:39,850][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [388 367 430 519 164 ... 366 426 269 265 391], Length=54
        Val time series IDS: [402 150 338 494  73 ... 486 288 322 155 299], Length=25
        Test time series IDS [435  63 505 535 160 466 496 390 208 507], Length=10
        All time series IDS [388 367 430 519 164 ... 466 496 390 208 507], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Load the train set twice to show both return forms:
# - as_single_dataframe=True  -> one DataFrame holding all train time series
# - as_single_dataframe=False -> a collection with one DataFrame per time series
df = series_based_dataset.get_train_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,388.0,0.0,679.0,17699.0,9137883.0,143.0,11.92,7.33,101.0,8.42,3.73,300.0,25.0,20.49,0.97998,0.939941,0.409912,0.23999,9.73,160.339996
1,388.0,1.0,620.0,14105.0,6761003.0,139.0,11.58,7.25,97.0,8.08,3.06,296.0,24.67,22.469999,0.939941,0.910156,0.370117,0.22998,12.73,160.449997
2,388.0,2.0,1312.0,544225.0,738767768.0,221.0,22.1,18.82,92.0,9.2,5.22,737.0,73.699997,74.239998,0.790039,0.779785,0.300049,0.199951,16.92,165.389999
3,388.0,3.0,2463.0,219134.0,229962357.0,293.0,24.42,21.6,140.0,11.67,7.49,1387.0,115.580002,126.32,0.919922,0.919922,0.360107,0.23999,13.31,144.690002
4,388.0,4.0,3593.0,422291.0,452458531.0,316.0,26.33,24.91,124.0,10.33,7.7,1804.0,150.330002,158.320007,0.870117,0.839844,0.439941,0.340088,11.82,149.009995
5,388.0,5.0,2898.0,288039.0,259821202.0,334.0,33.400002,25.290001,129.0,12.9,7.5,1664.0,166.399994,142.759995,0.810059,0.759766,0.429932,0.280029,16.280001,140.190002
6,388.0,6.0,3445.0,368034.0,383137472.0,315.0,28.639999,25.49,116.0,10.55,7.5,1834.0,166.729996,165.550003,0.819824,0.839844,0.310059,0.209961,12.85,125.32
7,388.0,7.0,2597.0,167910.0,139096245.0,302.0,25.17,23.969999,124.0,10.33,7.45,1481.0,123.419998,134.139999,0.859863,0.810059,0.379883,0.26001,14.52,171.639999
8,388.0,8.0,2920.0,421457.0,463314176.0,334.0,27.83,25.879999,110.0,9.17,5.73,1684.0,140.330002,144.339996,0.77002,0.700195,0.310059,0.209961,13.29,131.940002
9,388.0,9.0,1562.0,193100.0,89362048.0,256.0,23.27,23.08,136.0,12.36,10.04,942.0,85.639999,103.779999,0.939941,0.910156,0.419922,0.25,14.75,156.580002


In [17]:
len(dfs) # one DataFrame per time series of the train set

54

#### Val set

- Affected by `val_workers`.

In [18]:
# Load the val set twice to show both return forms:
# - as_single_dataframe=True  -> one DataFrame holding all val time series
# - as_single_dataframe=False -> a collection with one DataFrame per time series
df = series_based_dataset.get_val_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,402.0,0.0,13579.0,444671.0,255621300.0,2131.0,15.33,15.43,2311.0,16.629999,16.459999,4435.0,31.91,64.940002,0.640137,0.629883,0.459961,0.549805,22.66,113.940002
1,402.0,1.0,16812.0,601725.0,413793000.0,2471.0,17.780001,20.280001,2516.0,18.1,16.91,5931.0,42.669998,101.110001,0.669922,0.649902,0.469971,0.589844,22.540001,114.279999
2,402.0,2.0,27842.0,945098.0,628232700.0,2512.0,18.34,26.360001,2480.0,18.1,17.48,7408.0,54.07,149.679993,0.689941,0.660156,0.459961,0.569824,23.43,109.639999
3,402.0,3.0,39886.0,1869894.0,1223225000.0,3146.0,22.799999,37.02,2542.0,18.42,17.49,11610.0,84.129997,240.419998,0.660156,0.640137,0.509766,0.629883,25.440001,103.029999
4,402.0,4.0,32353.0,2206741.0,1607171000.0,3295.0,24.41,38.849998,2688.0,19.91,20.42,12039.0,89.18,240.570007,0.660156,0.640137,0.540039,0.640137,29.99,96.730003
5,402.0,5.0,29260.0,1950809.0,1525337000.0,3001.0,22.559999,38.32,2172.0,16.33,14.55,11116.0,83.580002,234.490005,0.640137,0.620117,0.48999,0.589844,27.790001,99.519997
6,402.0,6.0,28396.0,4297351.0,4456770000.0,3038.0,23.190001,37.939999,2108.0,16.09,14.65,11455.0,87.440002,239.539993,0.660156,0.649902,0.509766,0.620117,30.85,101.330002
7,402.0,7.0,30479.0,1978539.0,1546917000.0,3227.0,23.049999,40.189999,2426.0,17.33,16.4,11828.0,84.489998,242.729996,0.620117,0.609863,0.52002,0.609863,30.9,98.739998
8,402.0,8.0,29829.0,2239542.0,1744617000.0,3231.0,22.440001,37.860001,2419.0,16.799999,16.459999,11695.0,81.220001,233.509995,0.620117,0.609863,0.529785,0.629883,29.48,96.690002
9,402.0,9.0,28262.0,2447174.0,2066795000.0,3001.0,22.23,38.0,2245.0,16.629999,15.22,11185.0,82.849998,237.25,0.660156,0.640137,0.509766,0.620117,31.809999,94.550003


In [19]:
len(dfs) # one DataFrame per time series of the val set

25

#### Test set

- Affected by `test_workers`.

In [20]:
# Load the test set twice to show both return forms:
# - as_single_dataframe=True  -> one DataFrame holding all test time series
# - as_single_dataframe=False -> a collection with one DataFrame per time series
df = series_based_dataset.get_test_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,435.0,0.0,205.0,2201.0,791062.0,99.0,7.07,3.22,96.0,6.86,3.03,160.0,11.43,9.09,0.919922,0.939941,0.399902,0.340088,3.36,152.520004
1,435.0,1.0,151.0,1867.0,458112.0,92.0,6.57,3.08,92.0,6.57,3.2,128.0,9.14,6.04,0.930176,0.950195,0.449951,0.419922,8.48,158.600006
2,435.0,2.0,167.0,1259.0,523845.0,93.0,6.2,2.81,97.0,6.47,2.2,141.0,9.4,5.42,0.959961,0.970215,0.459961,0.429932,4.21,159.720001
3,435.0,3.0,1022.0,164052.0,124209469.0,191.0,14.69,15.55,90.0,6.92,5.63,695.0,53.459999,68.360001,0.850098,0.839844,0.439941,0.340088,24.620001,120.150002
4,435.0,4.0,1176.0,118831.0,104197208.0,232.0,14.5,16.26,111.0,6.94,5.4,818.0,51.119999,66.360001,0.899902,0.890137,0.449951,0.379883,12.87,145.509995
5,435.0,5.0,1503.0,137914.0,114411837.0,233.0,16.639999,17.23,92.0,6.57,5.18,944.0,67.43,79.309998,0.879883,0.879883,0.449951,0.360107,26.540001,130.389999
6,435.0,6.0,2530.0,396042.0,358812145.0,306.0,20.4,21.780001,111.0,7.4,4.93,1413.0,94.199997,119.720001,0.899902,0.850098,0.409912,0.350098,11.93,132.179993
7,435.0,7.0,3536.0,669870.0,616926388.0,353.0,22.059999,25.549999,134.0,8.38,6.96,1944.0,121.5,158.270004,0.919922,0.899902,0.52002,0.429932,14.08,143.039993
8,435.0,8.0,2941.0,488155.0,458571884.0,300.0,20.0,21.959999,112.0,7.47,5.66,1532.0,102.129997,127.779999,0.830078,0.810059,0.399902,0.360107,11.34,143.089996
9,435.0,9.0,2970.0,321596.0,240495089.0,320.0,22.860001,23.709999,124.0,8.86,6.86,1604.0,114.57,134.720001,0.870117,0.850098,0.509766,0.439941,13.87,148.0


In [21]:
len(dfs) # one DataFrame per time series of the test set

10

#### All set

- Affected by `all_workers`.

In [22]:
# Load the "all" set twice to show both return forms:
# - as_single_dataframe=True  -> one DataFrame holding every time series of the set
# - as_single_dataframe=False -> a collection with one DataFrame per time series
df = series_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the combined frame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,388.0,0.0,679.0,17699.0,9137883.0,143.0,11.92,7.33,101.0,8.42,3.73,300.0,25.0,20.49,0.97998,0.939941,0.409912,0.23999,9.73,160.339996
1,388.0,1.0,620.0,14105.0,6761003.0,139.0,11.58,7.25,97.0,8.08,3.06,296.0,24.67,22.469999,0.939941,0.910156,0.370117,0.22998,12.73,160.449997
2,388.0,2.0,1312.0,544225.0,738767768.0,221.0,22.1,18.82,92.0,9.2,5.22,737.0,73.699997,74.239998,0.790039,0.779785,0.300049,0.199951,16.92,165.389999
3,388.0,3.0,2463.0,219134.0,229962357.0,293.0,24.42,21.6,140.0,11.67,7.49,1387.0,115.580002,126.32,0.919922,0.919922,0.360107,0.23999,13.31,144.690002
4,388.0,4.0,3593.0,422291.0,452458531.0,316.0,26.33,24.91,124.0,10.33,7.7,1804.0,150.330002,158.320007,0.870117,0.839844,0.439941,0.340088,11.82,149.009995
5,388.0,5.0,2898.0,288039.0,259821202.0,334.0,33.400002,25.290001,129.0,12.9,7.5,1664.0,166.399994,142.759995,0.810059,0.759766,0.429932,0.280029,16.280001,140.190002
6,388.0,6.0,3445.0,368034.0,383137472.0,315.0,28.639999,25.49,116.0,10.55,7.5,1834.0,166.729996,165.550003,0.819824,0.839844,0.310059,0.209961,12.85,125.32
7,388.0,7.0,2597.0,167910.0,139096245.0,302.0,25.17,23.969999,124.0,10.33,7.45,1481.0,123.419998,134.139999,0.859863,0.810059,0.379883,0.26001,14.52,171.639999
8,388.0,8.0,2920.0,421457.0,463314176.0,334.0,27.83,25.879999,110.0,9.17,5.73,1684.0,140.330002,144.339996,0.77002,0.700195,0.310059,0.209961,13.29,131.940002
9,388.0,9.0,1562.0,193100.0,89362048.0,256.0,23.27,23.08,136.0,12.36,10.04,942.0,85.639999,103.779999,0.939941,0.910156,0.419922,0.25,14.75,156.580002


In [23]:
len(dfs) # one DataFrame per time series of the "all" set

89

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as one NumPy array.
- Follows similar rules to DataLoader batches regarding shape.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the parameter `workers`.

In [24]:
# Config for the NumPy examples: all features, no batch sizes
# (batch size has no effect when loading a single NumPy array).
config = SeriesBasedConfig(
    # Which data to use
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load in the main process)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config; `display_config_details=True` prints the "Config Details" summary.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:06:40,193][series_config][INFO] - Quick validation succeeded.
[2025-08-31 12:06:40,204][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:06:40,208][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1583.68it/s]
[2025-08-31 12:06:40,266][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-31 12:06:40,266][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 57 173 512 292 432 ... 516 508  54 261 306], Length=54
        Val time series IDS: [225 172 336 178  53 ...  77 145  61 155 171], Length=25
        Test time series IDS [154 346 324 406  20 257 521 341 510 315], Length=10
        All time series IDS [ 57 173 512 292 432 ... 257 521 341 510 315], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [25]:
# Load the whole train set as one array; workers="config" uses train_workers from the config.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# First dimension = number of train time series, second = length of the time period.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [26]:
# Load the whole val set as one array; workers="config" uses val_workers from the config.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# First dimension = number of val time series, second = length of the time period.
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [27]:
# Load the whole test set as one array; workers="config" uses test_workers from the config.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# First dimension = number of test time series, second = length of the time period.
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [28]:
# Load the whole "all" set as one array; workers="config" uses all_workers from the config.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# First dimension = number of time series in the set, second = length of the time period.
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [29]:
# Same setup as the earlier NumPy example, but time is returned as datetimes.
config = SeriesBasedConfig(
    # Which data to use
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Default worker counts per set (0 = load in the main process)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

# Apply the config; `display_config_details=True` prints the "Config Details" summary.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:06:40,433][series_config][INFO] - Quick validation succeeded.
[2025-08-31 12:06:40,444][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:06:40,446][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2020.76it/s]
[2025-08-31 12:06:40,494][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-31 12:06:40,494][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [299 513 521 465 479 ... 309 118 349 202   2], Length=54
        Val time series IDS: [416 185 215  23 261 ... 506 267 433 367 535], Length=25
        Test time series IDS [ 92   0 251 436 407 454  13 179 315 370], Length=10
        All time series IDS [299 513 521 465 479 ... 454  13 179 315 370], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [30]:
# With TimeFormat.DATETIME the call returns two values: the data array (without
# the time column) and a separate array holding the datetimes of the time period.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns the time series in the same order as they are set in the config.

In [31]:
# Config that spells out the default train order, DataloaderOrder.SEQUENTIAL.
config = SeriesBasedConfig(
    # Which data to use
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Default worker counts per set (0 = load in the main process)
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Default batch sizes per set (number of time series per batch)
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Order in which train time series are returned
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)

# Apply the config; `display_config_details=True` prints the "Config Details" summary.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:06:40,548][series_config][INFO] - Quick validation succeeded.
[2025-08-31 12:06:40,558][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:06:40,561][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2736.67it/s]
[2025-08-31 12:06:40,595][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-31 12:06:40,595][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [217 403 511 145 523 ...  16 184 286 393 346], Length=54
        Val time series IDS: [ 10 125 212 500 402 ...  35 261 417 343 408], Length=25
        Test time series IDS [ 20 239 536 201 275 470 147 378 194 176], Length=10
        All time series IDS [217 403 511 145 523 ... 470 147 378 194 176], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [32]:
# workers="config" uses the worker count stored in the config (train_workers).
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Walk the loader once (with a progress bar) and keep every batch.
batches = list(tqdm(dataloader))

# Show the first batch itself so the order of the time series ids can be inspected.
batches[0]

[2025-08-31 12:06:40,603][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 120.47it/s]


array([[[2.1700e+02, 0.0000e+00, 3.7000e+01],
        [2.1700e+02, 1.0000e+00, 1.6000e+01],
        [2.1700e+02, 2.0000e+00, 2.1000e+01],
        ...,
        [2.1700e+02, 3.3560e+03, 4.6000e+01],
        [2.1700e+02, 3.3570e+03, 7.7000e+01],
        [2.1700e+02, 3.3580e+03, 5.4000e+01]],

       [[4.0300e+02, 0.0000e+00, 4.5420e+03],
        [4.0300e+02, 1.0000e+00, 5.1620e+03],
        [4.0300e+02, 2.0000e+00, 7.0830e+03],
        ...,
        [4.0300e+02, 3.3560e+03, 4.9380e+03],
        [4.0300e+02, 3.3570e+03, 5.0570e+03],
        [4.0300e+02, 3.3580e+03, 4.2300e+03]],

       [[5.1100e+02, 0.0000e+00, 1.8100e+02],
        [5.1100e+02, 1.0000e+00, 1.8900e+02],
        [5.1100e+02, 2.0000e+00, 1.6600e+02],
        ...,
        [5.1100e+02, 3.3560e+03, 9.8000e+01],
        [5.1100e+02, 3.3570e+03, 1.5900e+02],
        [5.1100e+02, 3.3580e+03, 1.1700e+02]],

       ...,

       [[3.3400e+02, 0.0000e+00, 1.1400e+03],
        [3.3400e+02, 1.0000e+00, 1.4590e+03],
        [3.3400e+02, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- The shuffling is affected by `random_state`.
    - When `random_state` is set, the batches will be the same each time.

In [33]:
# Series-based config that serves the train set in shuffled order.
config = SeriesBasedConfig(
    # Which part of the data to use: time period and number of series per set.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    # Returned columns: a single feature, with time expressed as ID_TIME.
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers -> loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch size for each set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Shuffle the train time series between batches.
    # NOTE(review): random_state=None leaves the shuffle unseeded, so the
    # batches presumably differ from run to run - confirm.
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-31 12:06:40,628][series_config][INFO] - Quick validation succeeded.
[2025-08-31 12:06:40,638][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-31 12:06:40,641][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3292.87it/s]
[2025-08-31 12:06:40,670][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-31 12:06:40,670][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [437 449 138 318 191 ... 511 370  75  87 342], Length=54
        Val time series IDS: [ 88 400 402 220  23 ... 121  16  20 474 526], Length=25
        Test time series IDS [104 313 352 411 208 297 100 396 269 423], Length=10
        All time series IDS [437 449 138 318 191 ... 297 100 396 269 423], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [34]:
# Fetch the train dataloader for the current config (DataloaderOrder.RANDOM).
# NOTE(review): workers="config" presumably defers to the `train_workers`
# value of the active config (0 here) - confirm against the library docs.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch in the order the dataloader yields them;
# tqdm only adds the progress bar.
batches = [batch for batch in tqdm(dataloader)]

# Show the first batch as the cell output; with RANDOM order the time series
# in it are shuffled relative to the train ID list in the config.
batches[0]

[2025-08-31 12:06:40,677][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 93.00it/s]


array([[[4.4900e+02, 0.0000e+00, 2.8340e+03],
        [4.4900e+02, 1.0000e+00, 2.7730e+03],
        [4.4900e+02, 2.0000e+00, 3.0080e+03],
        ...,
        [4.4900e+02, 3.3560e+03, 2.2610e+03],
        [4.4900e+02, 3.3570e+03, 2.4430e+03],
        [4.4900e+02, 3.3580e+03, 4.0000e+03]],

       [[1.5900e+02, 0.0000e+00, 4.1900e+02],
        [1.5900e+02, 1.0000e+00, 3.3800e+02],
        [1.5900e+02, 2.0000e+00, 3.3700e+02],
        ...,
        [1.5900e+02, 3.3560e+03, 3.2600e+02],
        [1.5900e+02, 3.3570e+03, 3.0800e+02],
        [1.5900e+02, 3.3580e+03, 4.0100e+02]],

       [[4.3700e+02, 0.0000e+00, 7.3630e+03],
        [4.3700e+02, 1.0000e+00, 8.4980e+03],
        [4.3700e+02, 2.0000e+00, 1.4758e+04],
        ...,
        [4.3700e+02, 3.3560e+03, 9.2940e+03],
        [4.3700e+02, 3.3570e+03, 9.5680e+03],
        [4.3700e+02, 3.3580e+03, 8.1860e+03]],

       ...,

       [[3.7600e+02, 0.0000e+00, 4.8700e+02],
        [3.7600e+02, 1.0000e+00, 4.9400e+02],
        [3.7600e+02, 2