# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder, DatasetType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Configure root logging once for the whole notebook: INFO and above,
# each record prefixed with its timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Open CESNET-TimeSeries24 as a series-based dataset (institution subnets,
# 1-hour aggregation) and print the dataset details.
# "/some_directory/" is a placeholder; replace it with your own data directory.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    dataset_type=DatasetType.SERIES_BASED,
    display_details=True,
)

[2025-08-19 12:03:50,713][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using the PyTorch DataLoader.
- The last batch is never dropped.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size affects how many time series will be in one batch (it has no effect when loading a specific time series).
- A batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, the batch is one NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period,)` containing the times).

In [4]:
# Series-based config for the DataLoader examples:
#   - time_period=0.5 selects range(0, 3359) of the 6717 available time indices
#     (see the recorded config details below),
#   - 54 / 25 / 10 time series go to the train / val / test sets,
#   - only the `n_flows` feature is loaded, with time returned as ID_TIME.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:03:50,718][series_config][INFO] - Quick validation succeeded.
[2025-08-19 12:03:50,729][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:03:50,733][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1060.93it/s]
[2025-08-19 12:03:50,824][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-19 12:03:50,825][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [210 233 192 215  85 ... 500 407 382 475  93], Length=54
        Val time series IDS: [297 301 328 114  42 ...  63 390 155 410 311], Length=25
        Test time series IDS [ 57 170 400  34 208 129 313  82 169   8], Length=10
        All time series IDS [210 233 192 215  85 ... 129 313  82 169   8], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Change batch sizes through the general config-update entry point.
# NOTE(review): "config" presumably keeps the value already stored in the
# config for that set - confirm in the library documentation.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Or use the dedicated batch-size setter with the same arguments.
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-08-19 12:03:50,831][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:03:50,832][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:03:50,833][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:03:50,834][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:03:50,834][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Change worker counts through the general config-update entry point.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Or use the dedicated worker setter with the same arguments.
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-08-19 12:03:50,840][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:03:50,841][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:03:50,841][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-08-19 12:03:50,842][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-08-19 12:03:50,842][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Request the train dataloader. workers="config" presumably defers to the
# `train_workers` value stored in the config - confirm in the library docs.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so that individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Show the shape of the first batch.
# NOTE(review): the recorded output is (32, 3359, 3) although an earlier cell
# set train_batch_size=33 - verify whether the output is stale.
display(batches[0].shape)

[2025-08-19 12:03:50,852][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 45.12it/s]


(32, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Request the val dataloader. workers="config" presumably defers to the
# `val_workers` value stored in the config - confirm in the library docs.
dataloader = series_based_dataset.get_val_dataloader(workers="config")

# Collect every batch so that individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Show the shape of the first batch (the recorded run yields a single batch).
display(batches[0].shape)

[2025-08-19 12:03:50,908][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 49.92it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Request the test dataloader. workers="config" presumably defers to the
# `test_workers` value stored in the config - confirm in the library docs.
dataloader = series_based_dataset.get_test_dataloader(workers="config")

# Collect every batch so that individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Show the shape of the first batch (the recorded run yields a single batch).
display(batches[0].shape)

[2025-08-19 12:03:50,937][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 117.53it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Request the dataloader over the "all" set. workers="config" presumably defers
# to the `all_workers` value stored in the config - confirm in the library docs.
dataloader = series_based_dataset.get_all_dataloader(workers="config")

# Collect every batch so that individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Show the shape of the first batch (the recorded run yields a single batch).
display(batches[0].shape)

[2025-08-19 12:03:50,956][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 36.17it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the first DataLoader example, except that time is returned
# as datetimes (TimeFormat.DATETIME) instead of time IDs.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:03:50,990][series_config][INFO] - Quick validation succeeded.
[2025-08-19 12:03:51,001][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:03:51,004][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1646.19it/s]
[2025-08-19 12:03:51,059][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-19 12:03:51,060][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [534 360 370 527 157 ... 127 407 532 416 365], Length=54
        Val time series IDS: [195  48 500  14 249 ... 481 380 210  21  29], Length=25
        Test time series IDS [208  59 388 542 538 386 176 394 359 223], Length=10
        All time series IDS [534 360 370 527 157 ... 386 176 394 359 223], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        T

In [12]:
# Request the train dataloader under the TimeFormat.DATETIME config.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so that individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# With TimeFormat.DATETIME each batch is indexed as a pair:
# element 0 is the data array, element 1 is the array of times.
display(batches[0][0].shape) # data without time
display(batches[0][1].shape) # time

[2025-08-19 12:03:51,068][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 120.85it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a `ts_id` parameter.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, it returns only the one time series with the specified ID.

In [13]:
# Pick four explicit time series IDs for the train set; no val or test
# time series are passed in this config.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:03:51,092][series_config][INFO] - Quick validation succeeded.
[2025-08-19 12:03:51,102][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:03:51,106][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 1998.24it/s]
[2025-08-19 12:03:51,110][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-19 12:03:51,110][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All worker count: 0
        Init worker count: 0
    Other
        Nan th

In [14]:
# Restrict the train dataloader to the single time series with ID 177.
dataloader = series_based_dataset.get_train_dataloader(
    ts_id=177,
    workers="config",
)

# Collect every batch so that individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)

# Show the shape of the first batch.
display(batches[0].shape)

[2025-08-19 12:03:51,120][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 999.12it/s]


(1, 3359, 3)

### Loading data as DataFrame

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
# Config for the DataFrame examples: all features are loaded and no batch
# sizes are passed.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:03:51,128][series_config][INFO] - Quick validation succeeded.
[2025-08-19 12:03:51,138][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:03:51,141][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1692.86it/s]
[2025-08-19 12:03:51,196][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-19 12:03:51,196][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [484  69 324 290 445 ...  31  32 526 363  11], Length=54
        Val time series IDS: [452 291 151  79 154 ... 222 388 477 340 242], Length=25
        Test time series IDS [397 249  94  91 194 538 160 137 512 111], Length=10
        All time series IDS [484  69 324 290 445 ... 538 160 137 512 111], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Load the train set twice to show both return forms:
#   - as_single_dataframe=True  -> `df`, displayed below with head(10)
#   - as_single_dataframe=False -> `dfs`, a list of DataFrames
#     (presumably one per time series - confirm in the library docs).
df = series_based_dataset.get_train_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_train_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,484.0,0.0,4846.0,107664.0,69537630.0,1093.0,15.84,11.08,3088.0,44.75,36.950001,2890.0,41.880001,47.450001,0.930176,0.930176,0.459961,0.589844,9.81,102.089996
1,484.0,1.0,6244.0,272962.0,201708100.0,1192.0,15.89,14.33,3398.0,45.310001,45.5,3692.0,49.23,71.550003,0.910156,0.899902,0.459961,0.580078,9.05,101.879997
2,484.0,2.0,8814.0,737368.0,738107400.0,1348.0,17.74,19.709999,3317.0,43.639999,43.060001,4884.0,64.260002,127.93,0.950195,0.930176,0.469971,0.589844,8.57,108.010002
3,484.0,3.0,11545.0,1093531.0,1067791000.0,1378.0,18.879999,24.57,3265.0,44.73,44.900002,5751.0,78.779999,170.389999,0.910156,0.910156,0.469971,0.600098,9.51,99.68
4,484.0,4.0,7976.0,705660.0,577283400.0,1262.0,18.290001,22.549999,2768.0,40.119999,40.490002,4496.0,65.160004,128.649994,0.919922,0.910156,0.5,0.609863,10.71,93.410004
5,484.0,5.0,7417.0,471816.0,360111700.0,1163.0,16.860001,21.43,2621.0,37.990002,39.07,4138.0,59.970001,118.849998,0.939941,0.930176,0.48999,0.600098,10.96,99.949997
6,484.0,6.0,7284.0,775307.0,654271000.0,1166.0,15.97,21.190001,2606.0,35.700001,38.369999,4203.0,57.580002,120.169998,0.939941,0.930176,0.5,0.620117,13.04,99.0
7,484.0,7.0,7809.0,759133.0,678151800.0,1199.0,17.9,21.65,2743.0,40.939999,39.57,4297.0,64.129997,123.639999,0.939941,0.939941,0.459961,0.589844,11.39,97.739998
8,484.0,8.0,8108.0,678962.0,620498500.0,1237.0,17.42,22.629999,2736.0,38.540001,40.860001,4509.0,63.509998,131.330002,0.910156,0.899902,0.47998,0.620117,11.61,96.349998
9,484.0,9.0,7619.0,830686.0,851341000.0,1207.0,15.88,20.610001,2834.0,37.290001,42.119999,4339.0,57.09,117.760002,0.959961,0.959961,0.5,0.629883,11.86,101.029999


In [17]:
# Display the list returned by the `as_single_dataframe=False` call above.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets       n_bytes  \
 0                     484.0      0.0   4846.0   107664.0  6.953763e+07   
 1                     484.0      1.0   6244.0   272962.0  2.017081e+08   
 2                     484.0      2.0   8814.0   737368.0  7.381074e+08   
 3                     484.0      3.0  11545.0  1093531.0  1.067791e+09   
 4                     484.0      4.0   7976.0   705660.0  5.772834e+08   
 ...                     ...      ...      ...        ...           ...   
 3354                  484.0   3354.0   7625.0   700457.0  6.632737e+08   
 3355                  484.0   3355.0   7818.0   933245.0  8.989697e+08   
 3356                  484.0   3356.0   6458.0   470614.0  4.373639e+08   
 3357                  484.0   3357.0   6711.0   219344.0  1.564904e+08   
 3358                  484.0   3358.0   5979.0   134168.0  1.059670e+08   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0             1093.0  

#### Val set

- Affected by `val_workers`.

In [18]:
# Load the val set twice to show both return forms:
#   - as_single_dataframe=True  -> `df`, displayed below with head(10)
#   - as_single_dataframe=False -> `dfs`, a list of DataFrames
#     (presumably one per time series - confirm in the library docs).
df = series_based_dataset.get_val_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_val_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,452.0,0.0,1555.0,2723.0,378429.0,462.0,38.5,42.049999,231.0,19.25,16.389999,809.0,67.419998,79.879997,0.26001,0.23999,0.449951,0.320068,2.84,131.919998
1,452.0,1.0,1551.0,2427.0,341497.0,425.0,38.639999,37.48,251.0,22.82,18.870001,804.0,73.089996,71.480003,0.180054,0.130005,0.439941,0.320068,3.69,139.710007
2,452.0,2.0,1743.0,2898.0,348745.0,386.0,35.09,30.08,235.0,21.360001,17.6,1123.0,102.089996,93.099998,0.180054,0.160034,0.429932,0.310059,2.07,125.019997
3,452.0,3.0,2386.0,3829.0,474914.0,452.0,37.669998,36.200001,199.0,16.58,14.95,1426.0,118.830002,117.050003,0.059998,0.049988,0.469971,0.360107,4.46,123.730003
4,452.0,4.0,1822.0,2777.0,379889.0,415.0,51.880001,31.42,123.0,15.38,9.29,1194.0,149.25,94.379997,0.059998,0.029999,0.5,0.350098,3.01,82.790001
5,452.0,5.0,1828.0,2946.0,356941.0,406.0,36.91,33.310001,136.0,12.36,10.11,1202.0,109.269997,101.790001,0.150024,0.140015,0.469971,0.350098,2.15,124.739998
6,452.0,6.0,1626.0,2534.0,330348.0,410.0,45.560001,31.1,120.0,13.33,7.66,1103.0,122.559998,88.089996,0.150024,0.140015,0.439941,0.280029,1.77,113.849998
7,452.0,7.0,1825.0,2803.0,374676.0,437.0,54.619999,30.700001,125.0,15.62,7.27,1142.0,142.75,85.889999,0.160034,0.140015,0.459961,0.300049,2.53,87.809998
8,452.0,8.0,1839.0,2890.0,377678.0,403.0,33.580002,32.07,136.0,11.33,9.38,1195.0,99.580002,101.739998,0.170044,0.150024,0.459961,0.379883,1.74,118.489998
9,452.0,9.0,1706.0,2826.0,366681.0,403.0,40.299999,32.049999,145.0,14.5,10.02,1128.0,112.800003,91.900002,0.130005,0.119995,0.469971,0.320068,1.39,116.339996


In [19]:
# Display the list returned by the `as_single_dataframe=False` call above.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets   n_bytes  \
 0                     452.0      0.0   1555.0     2723.0  378429.0   
 1                     452.0      1.0   1551.0     2427.0  341497.0   
 2                     452.0      2.0   1743.0     2898.0  348745.0   
 3                     452.0      3.0   2386.0     3829.0  474914.0   
 4                     452.0      4.0   1822.0     2777.0  379889.0   
 ...                     ...      ...      ...        ...       ...   
 3354                  452.0   3354.0    660.0     2323.0  216276.0   
 3355                  452.0   3355.0    636.0     2350.0  206599.0   
 3356                  452.0   3356.0    620.0     2252.0  177939.0   
 3357                  452.0   3357.0    780.0     3519.0  289053.0   
 3358                  452.0   3358.0    730.0     3159.0  273266.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0              462.0       38.500000       42.049999             231.0

#### Test set

- Affected by `test_workers`.

In [20]:
# Load the test set in both return forms: `df` is the single-DataFrame form,
# `dfs` is the form returned with as_single_dataframe=False.
df = series_based_dataset.get_test_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_test_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,397.0,0.0,535.0,5100.0,987490.0,195.0,6.5,3.16,495.0,16.5,21.85,218.0,7.27,3.41,0.810059,0.759766,0.429932,0.429932,3.64,140.160004
1,397.0,1.0,438.0,4140.0,1011000.0,202.0,6.52,2.64,359.0,11.58,10.86,236.0,7.61,5.3,0.819824,0.77002,0.439941,0.429932,6.34,140.639999
2,397.0,2.0,1235.0,123035.0,104802073.0,338.0,9.94,8.29,339.0,9.97,8.01,694.0,20.41,28.870001,0.899902,0.870117,0.449951,0.340088,12.09,128.509995
3,397.0,3.0,2822.0,240860.0,197504663.0,456.0,13.03,10.92,379.0,10.83,9.64,1323.0,37.799999,51.900002,0.859863,0.819824,0.429932,0.340088,11.04,126.860001
4,397.0,4.0,2486.0,197376.0,167559457.0,425.0,12.14,12.33,506.0,14.46,19.450001,1199.0,34.259998,50.080002,0.879883,0.850098,0.48999,0.419922,13.45,124.839996
5,397.0,5.0,1991.0,82563.0,66200261.0,382.0,11.24,11.71,420.0,12.35,17.549999,991.0,29.15,42.610001,0.850098,0.830078,0.48999,0.409912,12.75,124.07
6,397.0,6.0,1775.0,92132.0,64287658.0,372.0,11.27,11.71,365.0,11.06,14.69,957.0,29.0,42.330002,0.870117,0.850098,0.47998,0.399902,16.6,135.020004
7,397.0,7.0,1962.0,124572.0,57924393.0,444.0,13.06,12.37,309.0,9.09,7.47,1090.0,32.060001,39.5,0.879883,0.830078,0.47998,0.399902,16.59,128.050003
8,397.0,8.0,1435.0,97041.0,60553877.0,377.0,11.42,11.02,254.0,7.7,6.53,813.0,24.639999,32.75,0.839844,0.810059,0.469971,0.409912,15.53,132.270004
9,397.0,9.0,1514.0,78949.0,44374854.0,357.0,10.5,10.13,266.0,7.82,6.83,841.0,24.74,32.310001,0.910156,0.890137,0.47998,0.409912,14.37,129.619995


#### All set

- Affected by `all_workers`.

In [21]:
# Load the "all" set twice to show both return forms:
#   - as_single_dataframe=True  -> `df`, displayed below with head(10)
#   - as_single_dataframe=False -> `dfs`, a list of DataFrames
#     (presumably one per time series - confirm in the library docs).
df = series_based_dataset.get_all_df(as_single_dataframe=True, workers="config")
dfs = series_based_dataset.get_all_df(as_single_dataframe=False, workers="config")

# Preview the first ten rows of the single-DataFrame form.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,484.0,0.0,4846.0,107664.0,69537630.0,1093.0,15.84,11.08,3088.0,44.75,36.950001,2890.0,41.880001,47.450001,0.930176,0.930176,0.459961,0.589844,9.81,102.089996
1,484.0,1.0,6244.0,272962.0,201708100.0,1192.0,15.89,14.33,3398.0,45.310001,45.5,3692.0,49.23,71.550003,0.910156,0.899902,0.459961,0.580078,9.05,101.879997
2,484.0,2.0,8814.0,737368.0,738107400.0,1348.0,17.74,19.709999,3317.0,43.639999,43.060001,4884.0,64.260002,127.93,0.950195,0.930176,0.469971,0.589844,8.57,108.010002
3,484.0,3.0,11545.0,1093531.0,1067791000.0,1378.0,18.879999,24.57,3265.0,44.73,44.900002,5751.0,78.779999,170.389999,0.910156,0.910156,0.469971,0.600098,9.51,99.68
4,484.0,4.0,7976.0,705660.0,577283400.0,1262.0,18.290001,22.549999,2768.0,40.119999,40.490002,4496.0,65.160004,128.649994,0.919922,0.910156,0.5,0.609863,10.71,93.410004
5,484.0,5.0,7417.0,471816.0,360111700.0,1163.0,16.860001,21.43,2621.0,37.990002,39.07,4138.0,59.970001,118.849998,0.939941,0.930176,0.48999,0.600098,10.96,99.949997
6,484.0,6.0,7284.0,775307.0,654271000.0,1166.0,15.97,21.190001,2606.0,35.700001,38.369999,4203.0,57.580002,120.169998,0.939941,0.930176,0.5,0.620117,13.04,99.0
7,484.0,7.0,7809.0,759133.0,678151800.0,1199.0,17.9,21.65,2743.0,40.939999,39.57,4297.0,64.129997,123.639999,0.939941,0.939941,0.459961,0.589844,11.39,97.739998
8,484.0,8.0,8108.0,678962.0,620498500.0,1237.0,17.42,22.629999,2736.0,38.540001,40.860001,4509.0,63.509998,131.330002,0.910156,0.899902,0.47998,0.620117,11.61,96.349998
9,484.0,9.0,7619.0,830686.0,851341000.0,1207.0,15.88,20.610001,2834.0,37.290001,42.119999,4339.0,57.09,117.760002,0.959961,0.959961,0.5,0.629883,11.86,101.029999


In [22]:
# Display the list returned by the `as_single_dataframe=False` call above.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets       n_bytes  \
 0                     484.0      0.0   4846.0   107664.0  6.953763e+07   
 1                     484.0      1.0   6244.0   272962.0  2.017081e+08   
 2                     484.0      2.0   8814.0   737368.0  7.381074e+08   
 3                     484.0      3.0  11545.0  1093531.0  1.067791e+09   
 4                     484.0      4.0   7976.0   705660.0  5.772834e+08   
 ...                     ...      ...      ...        ...           ...   
 3354                  484.0   3354.0   7625.0   700457.0  6.632737e+08   
 3355                  484.0   3355.0   7818.0   933245.0  8.989697e+08   
 3356                  484.0   3356.0   6458.0   470614.0  4.373639e+08   
 3357                  484.0   3357.0   6711.0   219344.0  1.564904e+08   
 3358                  484.0   3358.0   5979.0   134168.0  1.059670e+08   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0             1093.0  

### Loading data as a single NumPy array

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as one NumPy array.
- Follows rules similar to those for DataLoader batches regarding shape.
- Workers affect how many processes will be used for loading data for a specific set.
    - Setting workers to 0 means loading will run in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [23]:
# Config for the NumPy examples: all features are loaded, time is returned
# as ID_TIME, and no batch sizes are passed.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:03:52,247][series_config][INFO] - Quick validation succeeded.
[2025-08-19 12:03:52,259][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:03:52,262][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1740.51it/s]
[2025-08-19 12:03:52,316][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-19 12:03:52,317][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [215  57 516 306 422 ... 438 304 281 241 128], Length=54
        Val time series IDS: [254 209 390 391 232 ... 135 272 179  40 362], Length=25
        Test time series IDS [309 282 122 407 217 310 470 160 190 277], Length=10
        All time series IDS [215  57 516 306 422 ... 310 470 160 190 277], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [24]:
# Load the whole train set as one NumPy array.
numpy_array = series_based_dataset.get_train_numpy(workers="config")

# Recorded shape: (54, 3359, 20) with 54 train series and all 18 features.
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [25]:
# Load the whole val set as one NumPy array.
numpy_array = series_based_dataset.get_val_numpy(workers="config")

# Recorded shape: (25, 3359, 20) with 25 val series and all 18 features.
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [26]:
# Load the whole test set as one NumPy array.
numpy_array = series_based_dataset.get_test_numpy(workers="config")

# Recorded shape: (10, 3359, 20) with 10 test series and all 18 features.
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [27]:
# Load the whole "all" set as one NumPy array.
numpy_array = series_based_dataset.get_all_numpy(workers="config")

# Recorded shape: (89, 3359, 20) with 89 series in total and all 18 features.
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [28]:
# Same as the previous NumPy config, except that time is returned as
# datetimes (TimeFormat.DATETIME) instead of time IDs.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:03:52,496][series_config][INFO] - Quick validation succeeded.
[2025-08-19 12:03:52,562][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:03:52,566][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1908.53it/s]
[2025-08-19 12:03:52,616][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-19 12:03:52,616][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [438 480 509 443 476 ... 386 356 200 226 454], Length=54
        Val time series IDS: [ 99 255 516 512  65 ... 406 459  39 124  46], Length=25
        Test time series IDS [130 260  38 533 539  17 460 224  54  80], Length=10
        All time series IDS [438 480 509 443 476 ...  17 460 224  54  80], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [29]:
# With TimeFormat.DATETIME the call is unpacked into two values: the data
# array and a separate array of times.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")

# Recorded run: data shape (54, 3359, 19) and 3359 timezone-aware datetimes.
display(numpy_array.shape)
display(times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      shape=(3359,), dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- Has no effect when `get_train_dataloader` is called with a specific `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns data in the same order as it is set in the config.

In [30]:
# Config for the SEQUENTIAL (default) train dataloader order demo.
config = SeriesBasedConfig(
    # Time period and number of time series per set.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    # Single feature, with time returned in ID_TIME format.
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Workers set to 0: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Order in which the train set is returned.
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:03:52,670][series_config][INFO] - Quick validation succeeded.
[2025-08-19 12:03:52,680][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:03:52,683][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2687.80it/s]
[2025-08-19 12:03:52,719][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-19 12:03:52,719][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [281  10  88 392 396 ...   5 175 413 448 131], Length=54
        Val time series IDS: [282  30 518 179 229 ... 211 205 515  58 260], Length=25
        Test time series IDS [ 22 370 289 521 115 355 296 330 144 111], Length=10
        All time series IDS [281  10  88 392 396 ... 355 296 330 144 111], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

In [31]:
# Train dataloader for the currently initialized config.
# NOTE(review): workers="config" presumably takes the worker count from the
# config — confirm against the library docs.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, keeping every batch; tqdm reports progress.
batches = list(tqdm(dataloader))

# Display the first batch as the cell output.
batches[0]

[2025-08-19 12:03:52,727][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 117.55it/s]


array([[[2.8100e+02, 0.0000e+00, 1.1168e+04],
        [2.8100e+02, 1.0000e+00, 1.1266e+04],
        [2.8100e+02, 2.0000e+00, 1.1489e+04],
        ...,
        [2.8100e+02, 3.3560e+03, 8.5620e+03],
        [2.8100e+02, 3.3570e+03, 9.3670e+03],
        [2.8100e+02, 3.3580e+03, 8.5590e+03]],

       [[1.0000e+01, 0.0000e+00, 3.5410e+03],
        [1.0000e+01, 1.0000e+00, 3.5230e+03],
        [1.0000e+01, 2.0000e+00, 4.0480e+03],
        ...,
        [1.0000e+01, 3.3560e+03, 4.2600e+03],
        [1.0000e+01, 3.3570e+03, 4.4920e+03],
        [1.0000e+01, 3.3580e+03, 4.0020e+03]],

       [[8.8000e+01, 0.0000e+00, 3.7100e+02],
        [8.8000e+01, 1.0000e+00, 3.9500e+02],
        [8.8000e+01, 2.0000e+00, 3.3400e+02],
        ...,
        [8.8000e+01, 3.3560e+03, 2.5000e+02],
        [8.8000e+01, 3.3570e+03, 2.9400e+02],
        [8.8000e+01, 3.3580e+03, 3.0700e+02]],

       ...,

       [[5.7000e+01, 0.0000e+00, 3.9000e+01],
        [5.7000e+01, 1.0000e+00, 4.5000e+01],
        [5.7000e+01, 2

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same each time.

In [32]:
# Config for the RANDOM train dataloader order demo.
config = SeriesBasedConfig(
    # Time period and number of time series per set.
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    # Single feature, with time returned in ID_TIME format.
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # Workers set to 0: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    # Batch sizes per set.
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # Shuffled train order; random_state is left unset (None) here.
    train_dataloader_order=DataloaderOrder.RANDOM,
    random_state=None,
)

# Apply the config to the dataset and print the resulting config details.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-08-19 12:03:52,753][series_config][INFO] - Quick validation succeeded.
[2025-08-19 12:03:52,762][series_config][INFO] - Finalization and validation completed successfully.
[2025-08-19 12:03:52,766][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 3503.49it/s]
[2025-08-19 12:03:52,794][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-08-19 12:03:52,795][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [397 123 360 222 140 ... 202 365 441 407 353], Length=54
        Val time series IDS: [481  71 281 394 489 ... 168 169 120   4  64], Length=25
        Test time series IDS [445 236 493 157 321 515  38 510 546 252], Length=10
        All time series IDS [397 123 360 222 140 ... 515  38 510 546 252], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Transformers
        Transformer type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Tr

In [33]:
# Train dataloader for the currently initialized config (RANDOM order).
# NOTE(review): workers="config" presumably takes the worker count from the
# config — confirm against the library docs.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Drain the dataloader once, keeping every batch; tqdm reports progress.
batches = list(tqdm(dataloader))

# Display the first batch as the cell output.
batches[0]

[2025-08-19 12:03:52,802][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 106.39it/s]


array([[[2.2000e+01, 0.0000e+00, 1.4700e+02],
        [2.2000e+01, 1.0000e+00, 1.6800e+02],
        [2.2000e+01, 2.0000e+00, 1.1700e+02],
        ...,
        [2.2000e+01, 3.3560e+03, 1.4100e+02],
        [2.2000e+01, 3.3570e+03, 7.9000e+01],
        [2.2000e+01, 3.3580e+03, 5.7000e+01]],

       [[4.4400e+02, 0.0000e+00, 3.1170e+03],
        [4.4400e+02, 1.0000e+00, 3.2210e+03],
        [4.4400e+02, 2.0000e+00, 2.7640e+03],
        ...,
        [4.4400e+02, 3.3560e+03, 2.8210e+03],
        [4.4400e+02, 3.3570e+03, 3.4100e+03],
        [4.4400e+02, 3.3580e+03, 3.3560e+03]],

       [[2.4900e+02, 0.0000e+00, 4.7740e+03],
        [2.4900e+02, 1.0000e+00, 4.6720e+03],
        [2.4900e+02, 2.0000e+00, 4.2180e+03],
        ...,
        [2.4900e+02, 3.3560e+03, 9.6860e+03],
        [2.4900e+02, 3.3570e+03, 1.0839e+04],
        [2.4900e+02, 3.3580e+03, 1.0692e+04]],

       ...,

       [[4.5900e+02, 0.0000e+00, 7.8000e+01],
        [4.5900e+02, 1.0000e+00, 1.0900e+02],
        [4.5900e+02, 2