# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Show INFO-and-above log records, prefixed with timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Open the hourly-aggregated institution-subnet data as a series-based dataset
# and print the dataset details. "/some_directory/" is a placeholder path.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,
)

[2025-09-06 19:24:43,305][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- Last batch is never dropped.
- Workers determine how many processes are used to load data for a given set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time series are in one batch (it has no effect when loading a specific time series).
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, then batch is one Numpy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, then batch is a tuple: (Numpy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, Numpy array of shape `(time_period)` of time)

In [4]:
# One feature, integer time ids, and half of the available time period.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set (number of time series per batch).
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:24:43,309][series_config][INFO] - Quick validation succeeded.
[2025-09-06 19:24:43,321][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:24:43,326][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1252.29it/s]
[2025-09-06 19:24:43,402][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 19:24:43,403][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [366 393 154 336 283 ... 524 295 144  86   4], Length=54
        Val time series IDS: [491 539 165  43 270 ... 455 102 292 301 306], Length=25
        Test time series IDS [369  67  18 261  44 294 307 380 407 296], Length=10
        All time series IDS [366 393 154 336 283 ... 294 307 380 407 296], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Change train/val batch sizes through the general config update.
# NOTE(review): "config" presumably keeps the value already stored in the config - confirm.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Or use the dedicated setter for batch sizes.
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-09-06 19:24:43,408][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:24:43,408][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:24:43,409][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:24:43,410][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:24:43,410][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Change the worker counts through the general config update.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Or use the dedicated setter for workers.
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-09-06 19:24:43,415][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:24:43,416][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:24:43,416][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-09-06 19:24:43,417][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-09-06 19:24:43,417][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# DataLoader over the train set.
# NOTE(review): workers="config" presumably uses the worker count stored in the config - confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so the first one can be inspected below.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)

# With a non-DATETIME time format each batch is one array of shape
# (batch_size, time_period, features_to_take + used ids).
display(batches[0].shape)

[2025-09-06 19:24:43,425][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 44.95it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# DataLoader over the validation set.
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Collect every batch so the first one can be inspected below.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)

# Shape: (time series in batch, time_period, features_to_take + used ids).
display(batches[0].shape)

[2025-09-06 19:24:43,481][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 52.61it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# DataLoader over the test set.
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Collect every batch so the first one can be inspected below.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)

# Shape: (time series in batch, time_period, features_to_take + used ids).
display(batches[0].shape)

[2025-09-06 19:24:43,511][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 142.54it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# DataLoader over the "all" set (train, val and test time series together).
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Collect every batch so the first one can be inspected below.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)

# Shape: (time series in batch, time_period, features_to_take + used ids).
display(batches[0].shape)

[2025-09-06 19:24:43,528][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 38.45it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as before, but time is returned as datetime objects.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set (number of time series per batch).
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:24:43,562][series_config][INFO] - Quick validation succeeded.
[2025-09-06 19:24:43,574][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:24:43,577][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1382.82it/s]
[2025-09-06 19:24:43,643][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 19:24:43,644][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [363  12 137 440 435 ... 347 535  45 110 395], Length=54
        Val time series IDS: [194 333 140 287 198 ... 191 390  75  63 250], Length=25
        Test time series IDS [ 42 479   4 209 486 496 520 430 340  76], Length=10
        All time series IDS [363  12 137 440 435 ... 496 520 430 340  76], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test bat

In [12]:
# DataLoader over the train set; with TimeFormat.DATETIME each batch is a (data, time) tuple.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so the first one can be inspected below.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)

display(batches[0][0].shape)  # data array, without the time column
display(batches[0][1].shape)  # time array of shape (time_period,)

[2025-09-06 19:24:43,651][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 118.25it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has parameter `ts_id`.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, it returns only the time series with the specified ID.

In [13]:
# Train set given as an explicit list of time series IDs; no val/test sets are configured.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set (number of time series per batch).
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:24:43,676][series_config][INFO] - Quick validation succeeded.
[2025-09-06 19:24:43,685][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:24:43,689][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 2001.34it/s]
[2025-09-06 19:24:43,693][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 19:24:43,693][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All

In [14]:
# Load only the train time series with ID 177 (batch size has no effect here).
dataloader = series_based_dataset.get_train_dataloader(
    ts_id=177,
    workers="config",
)

# Collect every batch so the first one can be inspected below.
batches = []
for batch in tqdm(dataloader):
    batches.append(batch)

# A single time series, so the first dimension is 1.
display(batches[0].shape)

[2025-09-06 19:24:43,701][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 999.36it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in set with specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers determine how many processes are used to load data for a given set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
# All features, integer time ids; batch sizes are not set because they do not affect DataFrame loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:24:43,709][series_config][INFO] - Quick validation succeeded.
[2025-09-06 19:24:43,719][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:24:43,722][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 764.65it/s]
[2025-09-06 19:24:43,841][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 19:24:43,841][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [130 328 169 140 332 ...  52 266 419 445  79], Length=54
        Val time series IDS: [ 15 330  94 260 291 ...  74 340 399 162 262], Length=25
        Test time series IDS [373 432 123 484 539 201 456 159 243  95], Length=10
        All time series IDS [130 328 169 140 332 ... 201 456 159 243  95], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Train set as one combined DataFrame ...
df = series_based_dataset.get_train_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = series_based_dataset.get_train_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,130.0,0.0,2118.0,28878.0,12816980.0,239.0,10.39,7.11,203.0,8.83,3.73,710.0,30.870001,41.43,0.799805,0.799805,0.419922,0.409912,10.92,175.210007
1,130.0,1.0,2214.0,33738.0,17826110.0,228.0,9.12,7.33,193.0,7.72,4.77,685.0,27.4,39.759998,0.799805,0.799805,0.47998,0.469971,9.6,158.339996
2,130.0,2.0,2697.0,136038.0,134841600.0,327.0,11.28,8.55,251.0,8.66,6.98,1090.0,37.59,53.400002,0.810059,0.799805,0.449951,0.409912,9.26,163.710007
3,130.0,3.0,14452.0,1504216.0,1521991000.0,664.0,25.540001,28.08,214.0,8.23,4.16,4883.0,187.809998,264.679993,0.790039,0.779785,0.360107,0.27002,10.56,151.410004
4,130.0,4.0,26459.0,3405807.0,3470389000.0,937.0,33.459999,37.290001,251.0,8.96,7.19,8292.0,296.140015,350.869995,0.879883,0.870117,0.429932,0.379883,10.22,146.839996
5,130.0,5.0,20250.0,1712374.0,1570317000.0,856.0,38.91,37.16,275.0,12.5,9.39,6923.0,314.679993,322.910004,0.899902,0.890137,0.429932,0.300049,14.43,154.179993
6,130.0,6.0,19889.0,2009139.0,2075146000.0,919.0,38.290001,39.110001,302.0,12.58,10.99,7336.0,305.670013,327.709991,0.839844,0.839844,0.419922,0.320068,13.99,135.100006
7,130.0,7.0,21920.0,2714611.0,2757594000.0,1030.0,39.619999,41.389999,302.0,11.62,8.3,8083.0,310.880005,351.929993,0.839844,0.830078,0.389893,0.300049,13.49,152.190002
8,130.0,8.0,19240.0,2303394.0,2405494000.0,938.0,34.740002,38.509998,224.0,8.3,5.59,7313.0,270.850006,322.929993,0.790039,0.790039,0.459961,0.379883,13.71,140.270004
9,130.0,9.0,18590.0,1745734.0,1677400000.0,857.0,32.959999,34.040001,209.0,8.04,4.83,7007.0,269.5,306.480011,0.77002,0.759766,0.429932,0.340088,12.54,153.889999


In [17]:
len(dfs)  # one DataFrame per train time series

54

#### Val set

- Affected by `val_workers`.

In [18]:
# Validation set as one combined DataFrame ...
df = series_based_dataset.get_val_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = series_based_dataset.get_val_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,15.0,0.0,558.0,1551.0,308126.0,256.0,5.45,1.77,250.0,5.32,1.9,520.0,11.06,8.12,0.649902,0.640137,0.48999,0.47998,4.11,197.089996
1,15.0,1.0,559.0,1417.0,186956.0,256.0,5.57,2.14,258.0,5.61,2.33,497.0,10.8,8.55,0.629883,0.649902,0.419922,0.399902,4.66,198.619995
2,15.0,2.0,563.0,1485.0,218414.0,255.0,5.43,2.39,256.0,5.45,2.29,513.0,10.91,7.58,0.660156,0.669922,0.47998,0.439941,2.78,203.520004
3,15.0,3.0,824.0,6881.0,1382020.0,296.0,6.73,6.16,218.0,4.95,2.68,747.0,16.98,21.58,0.589844,0.629883,0.5,0.449951,3.58,204.589996
4,15.0,4.0,674.0,6262.0,1445632.0,268.0,6.23,5.46,212.0,4.93,2.24,617.0,14.35,19.0,0.680176,0.709961,0.48999,0.449951,3.2,197.229996
5,15.0,5.0,655.0,8550.0,1156376.0,233.0,5.68,6.25,154.0,3.76,2.09,584.0,14.24,21.040001,0.660156,0.689941,0.580078,0.529785,3.52,207.110001
6,15.0,6.0,635.0,5999.0,1175863.0,247.0,5.88,6.82,164.0,3.9,2.12,548.0,13.05,20.15,0.660156,0.689941,0.48999,0.439941,3.49,203.580002
7,15.0,7.0,714.0,7436.0,1274207.0,270.0,6.28,7.3,177.0,4.12,2.17,632.0,14.7,24.32,0.640137,0.680176,0.52002,0.469971,3.6,200.720001
8,15.0,8.0,688.0,6999.0,1211895.0,269.0,6.56,7.93,179.0,4.37,2.62,607.0,14.8,22.43,0.640137,0.660156,0.5,0.439941,3.67,191.169998
9,15.0,9.0,674.0,8831.0,1529635.0,260.0,5.65,6.84,184.0,4.0,2.68,586.0,12.74,18.719999,0.649902,0.669922,0.569824,0.529785,3.39,198.809998


In [19]:
len(dfs)  # one DataFrame per validation time series

25

#### Test set

- Affected by `test_workers`.

In [20]:
# Test set as one combined DataFrame ...
df = series_based_dataset.get_test_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = series_based_dataset.get_test_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,373.0,0.0,1466.0,24665.0,13653180.0,594.0,9.43,4.56,738.0,11.71,9.37,1115.0,17.700001,10.82,0.709961,0.740234,0.469971,0.52002,29.030001,135.210007
1,373.0,1.0,1388.0,26648.0,15409540.0,589.0,9.5,4.31,719.0,11.6,7.95,1049.0,16.92,8.83,0.709961,0.740234,0.509766,0.560059,32.279999,130.710007
2,373.0,2.0,1274.0,14669.0,6007035.0,533.0,9.03,4.01,610.0,10.34,7.45,990.0,16.780001,8.78,0.660156,0.689941,0.509766,0.569824,33.790001,124.769997
3,373.0,3.0,2240.0,194586.0,159798300.0,582.0,9.86,6.04,603.0,10.22,6.66,1468.0,24.879999,36.5,0.720215,0.75,0.48999,0.5,24.549999,121.010002
4,373.0,4.0,11053.0,1227568.0,980162200.0,1112.0,17.940001,18.440001,599.0,9.66,4.92,6062.0,97.769997,168.660004,0.680176,0.709961,0.469971,0.429932,22.950001,120.379997
5,373.0,5.0,12439.0,1718785.0,1584893000.0,1269.0,19.52,23.309999,588.0,9.05,5.53,6905.0,106.230003,187.070007,0.629883,0.649902,0.469971,0.429932,29.120001,119.940002
6,373.0,6.0,12704.0,1135011.0,816228800.0,1168.0,18.25,21.969999,600.0,9.38,5.29,6723.0,105.050003,183.970001,0.620117,0.660156,0.540039,0.549805,30.25,126.339996
7,373.0,7.0,14327.0,1724529.0,1344559000.0,1287.0,19.799999,26.620001,627.0,9.65,6.83,7656.0,117.779999,232.119995,0.580078,0.620117,0.48999,0.449951,26.57,124.559998
8,373.0,8.0,13351.0,1639653.0,1068663000.0,1302.0,20.67,28.27,574.0,9.11,6.34,7563.0,120.050003,240.949997,0.580078,0.600098,0.5,0.459961,36.060001,127.07
9,373.0,9.0,14256.0,2176574.0,1519920000.0,1302.0,20.030001,28.23,618.0,9.51,6.58,7737.0,119.029999,233.639999,0.540039,0.569824,0.540039,0.549805,31.200001,123.019997


In [21]:
len(dfs)  # one DataFrame per test time series

10

#### All set

- Affected by `all_workers`.

In [22]:
# "All" set as one combined DataFrame ...
df = series_based_dataset.get_all_df(
    as_single_dataframe=True,
    workers="config",
)
# ... and as a collection with one DataFrame per time series.
dfs = series_based_dataset.get_all_df(
    as_single_dataframe=False,
    workers="config",
)

df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,130.0,0.0,2118.0,28878.0,12816980.0,239.0,10.39,7.11,203.0,8.83,3.73,710.0,30.870001,41.43,0.799805,0.799805,0.419922,0.409912,10.92,175.210007
1,130.0,1.0,2214.0,33738.0,17826110.0,228.0,9.12,7.33,193.0,7.72,4.77,685.0,27.4,39.759998,0.799805,0.799805,0.47998,0.469971,9.6,158.339996
2,130.0,2.0,2697.0,136038.0,134841600.0,327.0,11.28,8.55,251.0,8.66,6.98,1090.0,37.59,53.400002,0.810059,0.799805,0.449951,0.409912,9.26,163.710007
3,130.0,3.0,14452.0,1504216.0,1521991000.0,664.0,25.540001,28.08,214.0,8.23,4.16,4883.0,187.809998,264.679993,0.790039,0.779785,0.360107,0.27002,10.56,151.410004
4,130.0,4.0,26459.0,3405807.0,3470389000.0,937.0,33.459999,37.290001,251.0,8.96,7.19,8292.0,296.140015,350.869995,0.879883,0.870117,0.429932,0.379883,10.22,146.839996
5,130.0,5.0,20250.0,1712374.0,1570317000.0,856.0,38.91,37.16,275.0,12.5,9.39,6923.0,314.679993,322.910004,0.899902,0.890137,0.429932,0.300049,14.43,154.179993
6,130.0,6.0,19889.0,2009139.0,2075146000.0,919.0,38.290001,39.110001,302.0,12.58,10.99,7336.0,305.670013,327.709991,0.839844,0.839844,0.419922,0.320068,13.99,135.100006
7,130.0,7.0,21920.0,2714611.0,2757594000.0,1030.0,39.619999,41.389999,302.0,11.62,8.3,8083.0,310.880005,351.929993,0.839844,0.830078,0.389893,0.300049,13.49,152.190002
8,130.0,8.0,19240.0,2303394.0,2405494000.0,938.0,34.740002,38.509998,224.0,8.3,5.59,7313.0,270.850006,322.929993,0.790039,0.790039,0.459961,0.379883,13.71,140.270004
9,130.0,9.0,18590.0,1745734.0,1677400000.0,857.0,32.959999,34.040001,209.0,8.04,4.83,7007.0,269.5,306.480011,0.77002,0.759766,0.429932,0.340088,12.54,153.889999


In [23]:
len(dfs)  # one DataFrame per time series in the "all" set

89

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in set with specified `time_period`.
- Data is returned as one NumPy array.
- The shape follows rules similar to those for DataLoader batches.
- Workers determine how many processes are used to load data for a given set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [24]:
# All features, integer time ids; batch sizes are not set because they do not affect NumPy loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:24:44,218][series_config][INFO] - Quick validation succeeded.
[2025-09-06 19:24:44,228][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:24:44,232][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1677.02it/s]
[2025-09-06 19:24:44,288][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 19:24:44,288][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [296 189 431 340 425 ... 365 287 201 406 532], Length=54
        Val time series IDS: [374 291 258 116 449 ... 295   2 210 500 529], Length=25
        Test time series IDS [175 512 522 106 314 178 227  48 261 191], Length=10
        All time series IDS [296 189 431 340 425 ... 178 227  48 261 191], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [25]:
# Whole train set as a single NumPy array.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# Shape: (number of time series, time_period, features_to_take + used ids).
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [26]:
# Whole validation set as a single NumPy array.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# Shape: (number of time series, time_period, features_to_take + used ids).
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [27]:
# Whole test set as a single NumPy array.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# Shape: (number of time series, time_period, features_to_take + used ids).
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [28]:
# Whole "all" set as a single NumPy array.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# Shape: (number of time series, time_period, features_to_take + used ids).
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [29]:
# All features, with time returned as datetime objects.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:24:44,456][series_config][INFO] - Quick validation succeeded.
[2025-09-06 19:24:44,468][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:24:44,471][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1814.93it/s]
[2025-09-06 19:24:44,522][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 19:24:44,522][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [491 323  22   4  91 ... 344 537  98 514  67], Length=54
        Val time series IDS: [ 42 139 135 198 187 ... 319 506 310 427 237], Length=25
        Test time series IDS [  0 536 485 454 120 393 166 122 231 242], Length=10
        All time series IDS [491 323  22   4  91 ... 393 166 122 231 242], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [30]:
# With TimeFormat.DATETIME the call returns two arrays: the data (without a time column)
# and the datetimes of the selected time period.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

display(numpy_array.shape)  # (number of time series, time_period, features + used ids without time)
display(times)  # array of timezone-aware datetime objects, shape (time_period,)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- No effect when using `get_train_dataloader` with specified `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns data in the same order as it is set in the config.

In [31]:
# Sequential train order: time series come back in the order listed in the config.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts per set (0 = load on the main process).
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set (number of time series per batch).
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:24:44,578][series_config][INFO] - Quick validation succeeded.
[2025-09-06 19:24:44,586][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:24:44,591][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2778.49it/s]
[2025-09-06 19:24:44,625][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 19:24:44,625][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [402   0   2 517 187 ... 327 363  89   1 490], Length=54
        Val time series IDS: [519 489  18 195 246 ... 451 349 371  12 269], Length=25
        Test time series IDS [495 194 469 141 116 411 316 234 198 152], Length=10
        All time series IDS [402   0   2 517 187 ... 411 316 234 198 152], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [32]:
# DataLoader over the train set, using the order set by train_dataloader_order.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so the first one can be inspected below.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)

# Show the first batch; the first column holds the time series ID, so the order is visible.
batches[0]

[2025-09-06 19:24:44,633][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 111.00it/s]


array([[[4.02000e+02, 0.00000e+00, 1.35790e+04],
        [4.02000e+02, 1.00000e+00, 1.68120e+04],
        [4.02000e+02, 2.00000e+00, 2.78420e+04],
        ...,
        [4.02000e+02, 3.35600e+03, 1.34260e+04],
        [4.02000e+02, 3.35700e+03, 1.59980e+04],
        [4.02000e+02, 3.35800e+03, 1.47120e+04]],

       [[0.00000e+00, 0.00000e+00, 1.39436e+05],
        [0.00000e+00, 1.00000e+00, 1.57535e+05],
        [0.00000e+00, 2.00000e+00, 1.88005e+05],
        ...,
        [0.00000e+00, 3.35600e+03, 3.89916e+05],
        [0.00000e+00, 3.35700e+03, 3.80709e+05],
        [0.00000e+00, 3.35800e+03, 2.72751e+05]],

       [[2.00000e+00, 0.00000e+00, 1.64000e+02],
        [2.00000e+00, 1.00000e+00, 1.04000e+02],
        [2.00000e+00, 2.00000e+00, 7.80000e+01],
        ...,
        [2.00000e+00, 3.35600e+03, 6.80000e+01],
        [2.00000e+00, 3.35700e+03, 4.00000e+01],
        [2.00000e+00, 3.35800e+03, 7.00000e+01]],

       ...,

       [[1.45000e+02, 0.00000e+00, 7.93900e+03],
        [1.

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- The shuffling is affected by `random_state`.
    - When `random_state` is set, the batches will be the same across runs.

In [33]:
# Series-based config whose train dataloader returns time series in shuffled order.
config = SeriesBasedConfig(
    # Data selection: a 0.5 time period (range(0, 3359) in the recorded details),
    # 54/25/10 train/val/test series, and only the 'n_flows' feature.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Worker counts: 0 means loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Shuffle the train series; random_state is left unset here, so the
    # shuffled batches are not pinned to a fixed order.
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-09-06 19:24:44,660][series_config][INFO] - Quick validation succeeded.
[2025-09-06 19:24:44,670][series_config][INFO] - Finalization and validation completed successfully.
[2025-09-06 19:24:44,674][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2867.05it/s]
[2025-09-06 19:24:44,706][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-09-06 19:24:44,706][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [336 365 210 232 388 ...  28  14 469 389 109], Length=54
        Val time series IDS: [393 298 366 523 266 ... 268  24 163 406 497], Length=25
        Test time series IDS [ 56 356 534 225 498 188 535 103 419 228], Length=10
        All time series IDS [336 365 210 232 388 ... 188 535 103 419 228], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Anomaly handler
        Anomaly handler type (train set): None   
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batc

In [34]:
# Fetch the train dataloader. workers="config" presumably defers to the worker
# count stored in the config (train_workers) - confirm against the library docs.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so that individual batches can be inspected afterwards.
# The recorded run yields 2 batches (54 train series with a train batch size of 32).
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Show the first batch. In the recorded output each row looks like
# [time series id, time id, n_flows]; the series come out as 61, 336, 85, ...,
# which differs from the order of the train IDs in the config details (336, 365, 210, ...).
batches[0]

[2025-09-06 19:24:44,714][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 95.15it/s]


array([[[6.1000e+01, 0.0000e+00, 5.7300e+02],
        [6.1000e+01, 1.0000e+00, 5.4400e+02],
        [6.1000e+01, 2.0000e+00, 5.2200e+02],
        ...,
        [6.1000e+01, 3.3560e+03, 3.9400e+02],
        [6.1000e+01, 3.3570e+03, 4.4400e+02],
        [6.1000e+01, 3.3580e+03, 5.2600e+02]],

       [[3.3600e+02, 0.0000e+00, 1.2284e+04],
        [3.3600e+02, 1.0000e+00, 1.3489e+04],
        [3.3600e+02, 2.0000e+00, 1.8412e+04],
        ...,
        [3.3600e+02, 3.3560e+03, 4.3660e+03],
        [3.3600e+02, 3.3570e+03, 4.3260e+03],
        [3.3600e+02, 3.3580e+03, 3.6280e+03]],

       [[8.5000e+01, 0.0000e+00, 3.1170e+03],
        [8.5000e+01, 1.0000e+00, 3.2750e+03],
        [8.5000e+01, 2.0000e+00, 2.8440e+03],
        ...,
        [8.5000e+01, 3.3560e+03, 3.2840e+03],
        [8.5000e+01, 3.3570e+03, 3.6780e+03],
        [8.5000e+01, 3.3580e+03, 3.4950e+03]],

       ...,

       [[2.1000e+02, 0.0000e+00, 1.0700e+02],
        [2.1000e+02, 1.0000e+00, 2.3900e+02],
        [2.1000e+02, 2