# Loading data with SeriesBasedCesnetDataset

### Import

In [1]:
from tqdm import tqdm
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, TimeFormat, DataloaderOrder
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import SeriesBasedConfig # Series based dataset MUST use SeriesBasedConfig

### Setting logger

In [2]:
# Configure root logging so INFO-level messages are shown, each prefixed with
# the timestamp, logger name and level.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Create the series-based dataset for institution subnets, aggregated per hour.
# NOTE(review): "/some_directory/" looks like a placeholder path — point
# data_root at your own data directory.
series_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.INSTITUTION_SUBNETS,
    aggregation=AgreggationType.AGG_1_HOUR,
    is_series_based=True,
    display_details=True,
)

[2025-04-09 11:45:39,607][wrapper_dataset][INFO] - Dataset is series-based. Use cesnet_tszoo.configs.SeriesBasedConfig



Dataset details:

    AgreggationType.AGG_1_HOUR
        Time indices: range(0, 6717)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 0, tzinfo=datetime.timezone.utc))

    SourceType.INSTITUTION_SUBNETS
        Time series indices: [0 1 2 3 4 ... 543 544 545 546 547], Length=548; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Loading data with DataLoader

- Load data using a PyTorch DataLoader.
- The last batch is never dropped.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_dataloader` with the `workers` parameter.
- Batch size determines how many time series are in one batch (it has no effect when loading a specific time series).
- Batch consists of:
    - When `time_format` is not TimeFormat.DATETIME, the batch is a single NumPy array of shape `(batch_size, time_period, features_to_take + used ids)`.
    - When `time_format` is TimeFormat.DATETIME, the batch is a tuple: (NumPy array of shape `(batch_size, time_period, features_to_take + used ids (without time))`, NumPy array of shape `(time_period,)` containing the times).

In [4]:
# time_period=0.5 selects half of the available time indices; train/val/test_ts
# give the number of time series placed in each set. All workers are 0, so
# loading runs in the main process.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:45:39,613][config][INFO] - Quick validation succeeded.
[2025-04-09 11:45:39,626][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:45:39,630][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1322.66it/s]
[2025-04-09 11:45:39,705][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-04-09 11:45:39,705][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 96 465 406 222 539 ... 437 533 445 186 174], Length=54
        Val time series IDS: [504 443 469 408 314 ...  88 525  52 537  47], Length=25
        Test time series IDS [282 229 422 402 487 197 540 435 448 542], Length=10
        All time series IDS [ 96 465 406 222 539 ... 197 540 435 448 542], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Scalers
        Scaler type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker

You can also change the configured batch sizes later with `update_dataset_config_and_initialize` or `set_batch_sizes`.

In [5]:
# Option 1: change the batch sizes through the general config-update method.
# NOTE(review): "config" presumably keeps the value already stored in the
# config — confirm against the library documentation.
series_based_dataset.update_dataset_config_and_initialize(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)
# Or
# Option 2: use the dedicated batch-size setter with the same arguments.
series_based_dataset.set_batch_sizes(
    train_batch_size=33,
    val_batch_size=65,
    test_batch_size="config",
    all_batch_size="config",
)

[2025-04-09 11:45:39,712][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-04-09 11:45:39,712][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-04-09 11:45:39,713][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-04-09 11:45:39,714][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-04-09 11:45:39,714][cesnet_dataset][INFO] - Batch sizes has been changed successfuly.


You can also change the configured workers later with `update_dataset_config_and_initialize` or `set_workers`.

In [6]:
# Option 1: change the worker counts through the general config-update method.
series_based_dataset.update_dataset_config_and_initialize(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
# Or
# Option 2: use the dedicated worker setter with the same arguments.
series_based_dataset.set_workers(
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)

[2025-04-09 11:45:39,720][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-04-09 11:45:39,721][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-04-09 11:45:39,722][cesnet_dataset][INFO] - Re-initialization is not needed.
[2025-04-09 11:45:39,722][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-04-09 11:45:39,723][cesnet_dataset][INFO] - Workers has been changed successfuly.


#### Train set

- Affected by `train_batch_size`.
- Affected by `train_workers`.

In [7]:
# Iterate the train dataloader once (with a progress bar) and keep every batch.
dataloader = series_based_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-04-09 11:45:39,733][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 46.37it/s]


(33, 3359, 3)

#### Val set

- Affected by `val_batch_size`.
- Affected by `val_workers`.

In [8]:
# Iterate the val dataloader once (with a progress bar) and keep every batch.
dataloader = series_based_dataset.get_val_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-04-09 11:45:39,789][cesnet_dataset][INFO] - Created new cached val_dataloader.
100%|██████████| 1/1 [00:00<00:00, 49.93it/s]


(25, 3359, 3)

#### Test set

- Affected by `test_batch_size`.
- Affected by `test_workers`.

In [9]:
# Iterate the test dataloader once (with a progress bar) and keep every batch.
dataloader = series_based_dataset.get_test_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-04-09 11:45:39,821][cesnet_dataset][INFO] - Created new cached test_dataloader.
100%|██████████| 1/1 [00:00<00:00, 115.18it/s]


(10, 3359, 3)

#### All set

- Affected by `all_batch_size`.
- Affected by `all_workers`.

In [10]:
# Iterate the "all" dataloader once (with a progress bar) and keep every batch.
dataloader = series_based_dataset.get_all_dataloader(workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-04-09 11:45:39,840][cesnet_dataset][INFO] - Created new cached all_dataloader.
100%|██████████| 1/1 [00:00<00:00, 34.97it/s]


(89, 3359, 3)

#### Using time_format=TimeFormat.DATETIME

In [11]:
# Same setup as the ID_TIME example, but with time_format=TimeFormat.DATETIME,
# which makes each batch a (data, time) tuple.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:45:39,878][config][INFO] - Quick validation succeeded.
[2025-04-09 11:45:39,890][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:45:39,894][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1664.75it/s]
[2025-04-09 11:45:39,949][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-04-09 11:45:39,950][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [452 354 422   3 478 ...  47   4 451 151 521], Length=54
        Val time series IDS: [135 127 179 359 531 ... 483 363 125 505 307], Length=25
        Test time series IDS [401   1 460 134 450 211 526 154 141 455], Length=10
        All time series IDS [452 354 422   3 478 ... 211 526 154 141 455], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.DATETIME
    Fillers         
        Filler type: None
    Scalers
        Scaler type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worke

In [12]:
# Iterate the train dataloader once (with a progress bar) and keep every batch.
dataloader = series_based_dataset.get_train_dataloader(workers="config")
batches = list(tqdm(dataloader))

# With TimeFormat.DATETIME the batch holds the data first and the times second.
first_batch = batches[0]
display(first_batch[0].shape)  # data without time
display(first_batch[1].shape)  # time

[2025-04-09 11:45:39,959][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 105.11it/s]


(32, 3359, 2)

(3359,)

#### Specifying which time series to load

- Every `get_*_dataloader` has a `ts_id` parameter.
    - When `ts_id` is None, the dataloader behaves as in the previous examples.
    - When `ts_id` is not None, only the time series with the specified ID is returned.

In [13]:
# train_ts is given as an explicit list of time series IDs here; val_ts and
# test_ts are not set.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=[177, 176, 319, 267],
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:45:39,986][config][INFO] - Quick validation succeeded.
[2025-04-09 11:45:39,996][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:45:40,001][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 4/4 [00:00<00:00, 661.33it/s]
[2025-04-09 11:45:40,009][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-04-09 11:45:40,010][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [177 176 319 267], Length=4
        Val time series IDS: None
        Test time series IDS None
        All time series IDS [177 176 319 267], Length=4
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Scalers
        Scaler type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker count: 0
        Val worker count: 0
        Test worker count: 0
        All worker count: 0
        Init worker count: 0
    Other
        Nan threshold: 1

In [14]:
# ts_id=177 restricts the dataloader to the single time series with that ID.
dataloader = series_based_dataset.get_train_dataloader(ts_id=177, workers="config")
batches = list(tqdm(dataloader))

# Shape of the first batch.
display(batches[0].shape)

[2025-04-09 11:45:40,022][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 1/1 [00:00<00:00, 1002.70it/s]


(1, 3359, 3)

### Loading data as Dataframe

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as a pandas DataFrame.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_df` with the `workers` parameter.

In [15]:
# features_to_take="all" selects every available feature; no batch sizes are
# passed because they have no effect on DataFrame loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:45:40,033][config][INFO] - Quick validation succeeded.
[2025-04-09 11:45:40,046][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:45:40,050][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1551.41it/s]
[2025-04-09 11:45:40,110][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-04-09 11:45:40,110][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [171  53 186 492  98 ... 541 532 429  57  27], Length=54
        Val time series IDS: [494 265   0 413 137 ... 483 315 143  96 453], Length=25
        Test time series IDS [307 336 380  18  85  90 163 211 166  61], Length=10
        All time series IDS [171  53 186 492  98 ...  90 163 211 166  61], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [16]:
# Load the train set twice: once as a single DataFrame, once with
# as_single_dataframe=False (a list of DataFrames).
df = series_based_dataset.get_train_df(workers="config", as_single_dataframe=True)
dfs = series_based_dataset.get_train_df(workers="config", as_single_dataframe=False)

# Preview the first ten rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,171.0,0.0,6809.0,149516.0,70642290.0,750.0,37.5,53.07,494.0,24.700001,32.619999,2660.0,133.0,201.029999,0.620117,0.609863,0.379883,0.280029,6.02,167.149994
1,171.0,1.0,9533.0,317434.0,141982100.0,899.0,47.32,65.360001,492.0,25.889999,32.709999,3679.0,193.630005,284.089996,0.540039,0.569824,0.439941,0.350098,8.58,155.830002
2,171.0,2.0,16023.0,813824.0,625225100.0,1136.0,49.389999,76.75,539.0,23.43,32.310001,5816.0,252.869995,424.299988,0.52002,0.529785,0.370117,0.290039,4.67,160.100006
3,171.0,3.0,19307.0,1175152.0,942318200.0,1173.0,65.169998,89.949997,461.0,25.610001,32.02,6706.0,372.559998,533.969971,0.580078,0.600098,0.370117,0.26001,7.16,155.75
4,171.0,4.0,15268.0,834485.0,704084500.0,1151.0,67.709999,90.080002,426.0,25.059999,30.5,6241.0,367.119995,504.140015,0.609863,0.609863,0.409912,0.300049,5.83,155.0
5,171.0,5.0,12166.0,595207.0,419208100.0,1060.0,58.889999,81.169998,365.0,20.280001,25.190001,5360.0,297.779999,427.049988,0.640137,0.629883,0.350098,0.219971,4.32,146.869995
6,171.0,6.0,11740.0,905164.0,739909700.0,1047.0,65.440002,83.599998,319.0,19.940001,23.049999,5269.0,329.309998,439.630005,0.709961,0.709961,0.399902,0.27002,5.91,162.690002
7,171.0,7.0,13633.0,823782.0,680232000.0,1139.0,51.77,80.989998,417.0,18.950001,26.27,5894.0,267.910004,440.399994,0.629883,0.660156,0.429932,0.340088,4.51,148.470001
8,171.0,8.0,12750.0,1602647.0,1793204000.0,1113.0,69.559998,88.400002,373.0,23.309999,27.209999,5591.0,349.440002,461.399994,0.740234,0.75,0.399902,0.360107,6.25,139.889999
9,171.0,9.0,13868.0,702336.0,517099500.0,1118.0,53.240002,80.5,369.0,17.57,22.57,5825.0,277.380005,442.0,0.649902,0.629883,0.370117,0.26001,7.64,139.360001


In [17]:
# Show the list of DataFrames returned for the train set with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets      n_bytes  \
 0                     171.0      0.0   6809.0   149516.0   70642294.0   
 1                     171.0      1.0   9533.0   317434.0  141982060.0   
 2                     171.0      2.0  16023.0   813824.0  625225139.0   
 3                     171.0      3.0  19307.0  1175152.0  942318182.0   
 4                     171.0      4.0  15268.0   834485.0  704084456.0   
 ...                     ...      ...      ...        ...          ...   
 3354                  171.0   3354.0   6030.0   260787.0  198902854.0   
 3355                  171.0   3355.0   5581.0   479540.0  513479610.0   
 3356                  171.0   3356.0   6000.0   516360.0  429922338.0   
 3357                  171.0   3357.0   6441.0   376779.0  187221071.0   
 3358                  171.0   3358.0   6124.0   427535.0  323785281.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0              750.0       37.5000

#### Val set

- Affected by `val_workers`.

In [18]:
# Load the val set twice: once as a single DataFrame, once with
# as_single_dataframe=False (a list of DataFrames).
df = series_based_dataset.get_val_df(workers="config", as_single_dataframe=True)
dfs = series_based_dataset.get_val_df(workers="config", as_single_dataframe=False)

# Preview the first ten rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,494.0,0.0,700.0,15432.0,7996389.0,181.0,13.92,3.82,106.0,8.15,2.44,476.0,36.619999,12.17,0.5,0.5,0.439941,0.379883,39.779999,118.989998
1,494.0,1.0,769.0,14173.0,5528280.0,185.0,14.23,3.92,102.0,7.85,2.03,497.0,38.23,15.4,0.509766,0.5,0.580078,0.469971,38.25,109.419998
2,494.0,2.0,999.0,30801.0,21420732.0,218.0,15.57,6.61,114.0,8.14,2.77,605.0,43.209999,19.9,0.790039,0.790039,0.389893,0.340088,31.809999,113.389999
3,494.0,3.0,2032.0,65441.0,41999931.0,331.0,23.639999,9.76,115.0,8.21,2.52,1326.0,94.709999,50.02,0.740234,0.759766,0.320068,0.27002,29.74,112.739998
4,494.0,4.0,2088.0,195543.0,116996194.0,346.0,28.83,7.4,107.0,8.92,2.57,1368.0,114.0,29.83,0.660156,0.709961,0.409912,0.290039,46.799999,103.589996
5,494.0,5.0,2432.0,166089.0,138589106.0,400.0,30.77,10.58,119.0,9.15,3.11,1533.0,117.919998,45.0,0.669922,0.669922,0.370117,0.219971,37.419998,108.75
6,494.0,6.0,2107.0,167831.0,104200413.0,377.0,31.42,7.56,94.0,7.83,2.98,1482.0,123.5,30.200001,0.600098,0.620117,0.399902,0.300049,48.259998,105.0
7,494.0,7.0,2342.0,146761.0,115118387.0,385.0,29.620001,10.52,108.0,8.31,2.5,1536.0,118.150002,42.610001,0.700195,0.680176,0.379883,0.280029,44.389999,110.230003
8,494.0,8.0,2748.0,337735.0,295790591.0,460.0,38.330002,6.44,104.0,8.67,1.44,1866.0,155.5,30.49,0.660156,0.649902,0.459961,0.370117,50.169998,107.309998
9,494.0,9.0,2883.0,103748.0,80195428.0,450.0,37.5,5.52,121.0,10.08,2.15,1912.0,159.330002,24.360001,0.720215,0.669922,0.429932,0.27002,49.509998,110.639999


In [19]:
# Show the list of DataFrames returned for the val set with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets      n_bytes  \
 0                     494.0      0.0    700.0    15432.0    7996389.0   
 1                     494.0      1.0    769.0    14173.0    5528280.0   
 2                     494.0      2.0    999.0    30801.0   21420732.0   
 3                     494.0      3.0   2032.0    65441.0   41999931.0   
 4                     494.0      4.0   2088.0   195543.0  116996194.0   
 ...                     ...      ...      ...        ...          ...   
 3354                  494.0   3354.0    659.0    12569.0    6160056.0   
 3355                  494.0   3355.0    691.0    18598.0    7444131.0   
 3356                  494.0   3356.0    702.0     6765.0    2349776.0   
 3357                  494.0   3357.0    771.0     9161.0    5356661.0   
 3358                  494.0   3358.0    708.0     5503.0    2422639.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0              181.0       13.9200

#### Test set

- Affected by `test_workers`.

In [20]:
# Load the test set twice: once as a single DataFrame, once with
# as_single_dataframe=False.
df = series_based_dataset.get_test_df(workers="config", as_single_dataframe=True)
dfs = series_based_dataset.get_test_df(workers="config", as_single_dataframe=False)

# Preview the first ten rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,307.0,0.0,11718.0,758957.0,674056800.0,2473.0,7.49,5.6,3737.0,11.32,17.5,5481.0,16.610001,24.950001,0.709961,0.720215,0.429932,0.48999,8.51,136.110001
1,307.0,1.0,11968.0,829743.0,664436300.0,2586.0,7.91,6.14,3784.0,11.57,17.459999,5831.0,17.83,25.76,0.669922,0.680176,0.409912,0.449951,8.64,137.429993
2,307.0,2.0,11009.0,874128.0,675415600.0,2494.0,7.36,5.95,3553.0,10.48,16.389999,5689.0,16.780001,27.73,0.669922,0.689941,0.419922,0.459961,8.86,135.169998
3,307.0,3.0,10499.0,1249875.0,750990400.0,2184.0,7.28,7.69,3110.0,10.37,16.32,5335.0,17.780001,35.200001,0.649902,0.660156,0.449951,0.48999,8.48,127.300003
4,307.0,4.0,12152.0,2471160.0,1696159000.0,2202.0,7.2,10.27,2849.0,9.31,14.23,6475.0,21.16,61.299999,0.669922,0.689941,0.449951,0.48999,10.0,128.720001
5,307.0,5.0,16806.0,2233975.0,1865119000.0,2492.0,8.06,15.17,3031.0,9.81,15.43,8778.0,28.41,98.699997,0.720215,0.740234,0.459961,0.509766,9.15,126.25
6,307.0,6.0,17983.0,2425362.0,1776008000.0,2522.0,8.55,18.43,2718.0,9.21,14.21,8864.0,30.049999,107.150002,0.700195,0.720215,0.47998,0.52002,10.39,127.550003
7,307.0,7.0,19398.0,3747747.0,2384432000.0,2678.0,8.42,17.030001,2941.0,9.25,13.33,9292.0,29.219999,103.339996,0.680176,0.689941,0.449951,0.5,9.25,132.720001
8,307.0,8.0,17343.0,6151914.0,3486004000.0,2724.0,8.49,18.139999,3386.0,10.55,21.02,9023.0,28.110001,98.510002,0.649902,0.669922,0.469971,0.5,9.58,120.900002
9,307.0,9.0,18931.0,3999605.0,1914538000.0,3192.0,9.56,24.200001,3377.0,10.11,19.459999,9952.0,29.799999,112.470001,0.649902,0.660156,0.459961,0.5,9.78,129.089996


#### All set

- Affected by `all_workers`.

In [21]:
# Load the "all" set twice: once as a single DataFrame, once with
# as_single_dataframe=False (a list of DataFrames).
df = series_based_dataset.get_all_df(workers="config", as_single_dataframe=True)
dfs = series_based_dataset.get_all_df(workers="config", as_single_dataframe=False)

# Preview the first ten rows of the single DataFrame.
df.head(10)

Unnamed: 0,id_institution_subnet,id_time,n_flows,n_packets,n_bytes,sum_n_dest_asn,avg_n_dest_asn,std_n_dest_asn,sum_n_dest_ports,avg_n_dest_ports,std_n_dest_ports,sum_n_dest_ip,avg_n_dest_ip,std_n_dest_ip,tcp_udp_ratio_packets,tcp_udp_ratio_bytes,dir_ratio_packets,dir_ratio_bytes,avg_duration,avg_ttl
0,171.0,0.0,6809.0,149516.0,70642290.0,750.0,37.5,53.07,494.0,24.700001,32.619999,2660.0,133.0,201.029999,0.620117,0.609863,0.379883,0.280029,6.02,167.149994
1,171.0,1.0,9533.0,317434.0,141982100.0,899.0,47.32,65.360001,492.0,25.889999,32.709999,3679.0,193.630005,284.089996,0.540039,0.569824,0.439941,0.350098,8.58,155.830002
2,171.0,2.0,16023.0,813824.0,625225100.0,1136.0,49.389999,76.75,539.0,23.43,32.310001,5816.0,252.869995,424.299988,0.52002,0.529785,0.370117,0.290039,4.67,160.100006
3,171.0,3.0,19307.0,1175152.0,942318200.0,1173.0,65.169998,89.949997,461.0,25.610001,32.02,6706.0,372.559998,533.969971,0.580078,0.600098,0.370117,0.26001,7.16,155.75
4,171.0,4.0,15268.0,834485.0,704084500.0,1151.0,67.709999,90.080002,426.0,25.059999,30.5,6241.0,367.119995,504.140015,0.609863,0.609863,0.409912,0.300049,5.83,155.0
5,171.0,5.0,12166.0,595207.0,419208100.0,1060.0,58.889999,81.169998,365.0,20.280001,25.190001,5360.0,297.779999,427.049988,0.640137,0.629883,0.350098,0.219971,4.32,146.869995
6,171.0,6.0,11740.0,905164.0,739909700.0,1047.0,65.440002,83.599998,319.0,19.940001,23.049999,5269.0,329.309998,439.630005,0.709961,0.709961,0.399902,0.27002,5.91,162.690002
7,171.0,7.0,13633.0,823782.0,680232000.0,1139.0,51.77,80.989998,417.0,18.950001,26.27,5894.0,267.910004,440.399994,0.629883,0.660156,0.429932,0.340088,4.51,148.470001
8,171.0,8.0,12750.0,1602647.0,1793204000.0,1113.0,69.559998,88.400002,373.0,23.309999,27.209999,5591.0,349.440002,461.399994,0.740234,0.75,0.399902,0.360107,6.25,139.889999
9,171.0,9.0,13868.0,702336.0,517099500.0,1118.0,53.240002,80.5,369.0,17.57,22.57,5825.0,277.380005,442.0,0.649902,0.629883,0.370117,0.26001,7.64,139.360001


In [22]:
# Show the list of DataFrames returned for the "all" set with as_single_dataframe=False.
dfs

[      id_institution_subnet  id_time  n_flows  n_packets      n_bytes  \
 0                     171.0      0.0   6809.0   149516.0   70642294.0   
 1                     171.0      1.0   9533.0   317434.0  141982060.0   
 2                     171.0      2.0  16023.0   813824.0  625225139.0   
 3                     171.0      3.0  19307.0  1175152.0  942318182.0   
 4                     171.0      4.0  15268.0   834485.0  704084456.0   
 ...                     ...      ...      ...        ...          ...   
 3354                  171.0   3354.0   6030.0   260787.0  198902854.0   
 3355                  171.0   3355.0   5581.0   479540.0  513479610.0   
 3356                  171.0   3356.0   6000.0   516360.0  429922338.0   
 3357                  171.0   3357.0   6441.0   376779.0  187221071.0   
 3358                  171.0   3358.0   6124.0   427535.0  323785281.0   
 
       sum_n_dest_asn  avg_n_dest_asn  std_n_dest_asn  sum_n_dest_ports  \
 0              750.0       37.5000

### Loading data as singular Numpy array 

- Batch size has no effect.
- Returns every time series in the set with the specified `time_period`.
- Data is returned as one NumPy array.
- Follows rules similar to those for DataLoader batches regarding shape.
- Workers determine how many processes are used to load data for a specific set.
    - Setting workers to 0 means loading runs in the main process.
    - The configured workers can be overridden in `get_*_numpy` with the `workers` parameter.

In [23]:
# features_to_take="all" selects every available feature; no batch sizes are
# passed because they have no effect on NumPy loading.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.ID_TIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:45:41,201][config][INFO] - Quick validation succeeded.
[2025-04-09 11:45:41,212][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:45:41,216][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1972.11it/s]
[2025-04-09 11:45:41,263][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-04-09 11:45:41,263][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [448  31 133 464 532 ... 125 380 357 231 515], Length=54
        Val time series IDS: [209 243 286 416   1 ... 138   4 393 383 265], Length=25
        Test time series IDS [543 455  57 354 339 333 479 328 129 248], Length=10
        All time series IDS [448  31 133 464 532 ... 333 479 328 129 248], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

#### Train set

- Affected by `train_workers`.

In [24]:
# Fetch the whole train set as one array and show its shape.
numpy_array = series_based_dataset.get_train_numpy(workers="config")
display(numpy_array.shape)

(54, 3359, 20)

#### Val set

- Affected by `val_workers`.

In [25]:
# Fetch the whole val set as one array and show its shape.
numpy_array = series_based_dataset.get_val_numpy(workers="config")
display(numpy_array.shape)

(25, 3359, 20)

#### Test set

- Affected by `test_workers`.

In [26]:
# Fetch the whole test set as one array and show its shape.
numpy_array = series_based_dataset.get_test_numpy(workers="config")
display(numpy_array.shape)

(10, 3359, 20)

#### All set

- Affected by `all_workers`.

In [27]:
# Fetch the whole "all" set as one array and show its shape.
numpy_array = series_based_dataset.get_all_numpy(workers="config")
display(numpy_array.shape)

(89, 3359, 20)

#### Using time_format=TimeFormat.DATETIME

In [28]:
# Same as the previous NumPy config, but with time_format=TimeFormat.DATETIME,
# so the times are returned separately from the data.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take="all",
    time_format=TimeFormat.DATETIME,
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
)
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:45:41,449][config][INFO] - Quick validation succeeded.
[2025-04-09 11:45:41,462][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:45:41,466][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 1588.90it/s]
[2025-04-09 11:45:41,524][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-04-09 11:45:41,524][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 15  51 235 496  85 ... 173  40  93 150 110], Length=54
        Val time series IDS: [493 267 546 172 265 ... 408 179 357 239 201], Length=25
        Test time series IDS [120 495  63 176 500 338 280 112 209 105], Length=10
        All time series IDS [ 15  51 235 496  85 ... 338 280 112 209 105], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 

In [29]:
# With TimeFormat.DATETIME the call returns the data array together with the
# array of datetimes; show the data shape, then the times.
numpy_array, times = series_based_dataset.get_train_numpy(workers="config")
display(numpy_array.shape, times)

(54, 3359, 19)

array([datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 1, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2023, 10, 9, 2, 0, tzinfo=datetime.timezone.utc),
       ...,
       datetime.datetime(2024, 2, 25, 20, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 21, 0, tzinfo=datetime.timezone.utc),
       datetime.datetime(2024, 2, 25, 22, 0, tzinfo=datetime.timezone.utc)],
      dtype=object)

### Setting train_dataloader_order

- Affects `get_train_dataloader`, `get_train_df`, `get_train_numpy`.
- Has no effect when `get_train_dataloader` is called with a specific `ts_id`.

#### Using DataloaderOrder.SEQUENTIAL (default)

- Returns time series in the same order as they are listed in the config.

In [30]:
# Series-based config whose train dataloader keeps the configured series order.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    # SEQUENTIAL is the default order; it is spelled out here for the demonstration.
    train_dataloader_order=DataloaderOrder.SEQUENTIAL,
)

# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:45:41,585][config][INFO] - Quick validation succeeded.
[2025-04-09 11:45:41,595][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:45:41,599][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2324.31it/s]
[2025-04-09 11:45:41,639][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-04-09 11:45:41,641][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [ 53 456 154 386 512 ... 238 402 160 524 464], Length=54
        Val time series IDS: [212 437 443 228 455 ... 544 399 297  50  47], Length=25
        Test time series IDS [330 182  70 473 401 539 419 431 364 260], Length=10
        All time series IDS [ 53 456 154 386 512 ... 539 419 431 364 260], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Scalers
        Scaler type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker

In [31]:
# Build the train dataloader for the config initialized above.
# NOTE(review): workers="config" presumably reuses the worker count stored in the config - confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Display the first batch. With SEQUENTIAL ordering its series are expected to
# follow the order of the train time series IDs printed in the config details.
batches[0]

[2025-04-09 11:45:41,653][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 81.18it/s]


array([[[5.300e+01, 0.000e+00, 2.330e+02],
        [5.300e+01, 1.000e+00, 3.260e+02],
        [5.300e+01, 2.000e+00, 3.370e+02],
        ...,
        [5.300e+01, 3.356e+03, 2.440e+02],
        [5.300e+01, 3.357e+03, 3.800e+02],
        [5.300e+01, 3.358e+03, 3.890e+02]],

       [[4.560e+02, 0.000e+00, 2.041e+03],
        [4.560e+02, 1.000e+00, 2.048e+03],
        [4.560e+02, 2.000e+00, 3.779e+03],
        ...,
        [4.560e+02, 3.356e+03, 4.840e+03],
        [4.560e+02, 3.357e+03, 3.935e+03],
        [4.560e+02, 3.358e+03, 2.703e+03]],

       [[1.540e+02, 0.000e+00, 7.220e+02],
        [1.540e+02, 1.000e+00, 7.050e+02],
        [1.540e+02, 2.000e+00, 2.472e+03],
        ...,
        [1.540e+02, 3.356e+03, 7.510e+02],
        [1.540e+02, 3.357e+03, 7.840e+02],
        [1.540e+02, 3.358e+03, 7.760e+02]],

       ...,

       [[1.050e+02, 0.000e+00, 3.250e+02],
        [1.050e+02, 1.000e+00, 4.140e+02],
        [1.050e+02, 2.000e+00, 2.660e+02],
        ...,
        [1.050e+02, 3.356e

#### Using DataloaderOrder.RANDOM

- Returns batches with shuffled time series.
- Is affected by `random_state`.
    - When `random_state` is set, the batches will be the same on every run.

In [32]:
# Series-based config whose train dataloader returns shuffled time series.
config = SeriesBasedConfig(
    time_period=0.5,
    train_ts=54,
    val_ts=25,
    test_ts=10,
    features_to_take=["n_flows"],
    time_format=TimeFormat.ID_TIME,
    # 0 workers: loading runs on the main process.
    train_workers=0,
    val_workers=0,
    test_workers=0,
    all_workers=0,
    init_workers=0,
    train_batch_size=32,
    val_batch_size=64,
    test_batch_size=128,
    all_batch_size=128,
    train_dataloader_order=DataloaderOrder.RANDOM,
    # Left unset on purpose; setting random_state makes the shuffled batches repeatable.
    random_state=None,
)

# Apply the config to the dataset and print the resolved configuration.
series_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:45:41,689][config][INFO] - Quick validation succeeded.
[2025-04-09 11:45:41,701][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:45:41,706][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time period.
100%|██████████| 89/89 [00:00<00:00, 2959.21it/s]
[2025-04-09 11:45:41,739][cesnet_dataset][INFO] - Dataset initialization complete. Configuration updated.
[2025-04-09 11:45:41,739][cesnet_dataset][INFO] - Config initialized successfully.



Config Details:
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_HOUR
    Source: SourceType.INSTITUTION_SUBNETS

    Time series
        Train time series IDS: [210 244 233 489  51 ... 149 214  67  99 144], Length=54
        Val time series IDS: [217  57 112 270 161 ... 540 232  63 420 248], Length=25
        Test time series IDS [ 95  60   1 117  79 230 468   8 487 453], Length=10
        All time series IDS [210 244 233 489  51 ... 230 468   8 487 453], Length=89
    Time periods
        Time period: range(0, 3359)
    Features
        Taken features: ['n_flows']
        Default values: [0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Fillers         
        Filler type: None
    Scalers
        Scaler type: None
    Batch sizes
        Train batch size: 32
        Val batch size: 64
        Test batch size: 128
        All batch size: 128
    Default workers
        Train worker

In [33]:
# Build the train dataloader for the RANDOM-order config initialized above.
# NOTE(review): workers="config" presumably reuses the worker count stored in the config - confirm.
dataloader = series_based_dataset.get_train_dataloader(workers="config")

# Collect every batch so individual batches can be inspected afterwards.
batches = []

for batch in tqdm(dataloader):
    batches.append(batch)
    
# Display the first batch. With RANDOM ordering the series in it are shuffled,
# so they need not follow the order of the train IDs printed in the config details.
batches[0]

[2025-04-09 11:45:41,748][cesnet_dataset][INFO] - Created new cached train_dataloader.
100%|██████████| 2/2 [00:00<00:00, 21.21it/s]


array([[[4.300e+02, 0.000e+00, 7.300e+01],
        [4.300e+02, 1.000e+00, 7.000e+01],
        [4.300e+02, 2.000e+00, 7.800e+01],
        ...,
        [4.300e+02, 3.356e+03, 2.500e+01],
        [4.300e+02, 3.357e+03, 4.800e+01],
        [4.300e+02, 3.358e+03, 2.500e+01]],

       [[5.360e+02, 0.000e+00, 1.500e+02],
        [5.360e+02, 1.000e+00, 1.530e+02],
        [5.360e+02, 2.000e+00, 1.530e+02],
        ...,
        [5.360e+02, 3.356e+03, 7.900e+01],
        [5.360e+02, 3.357e+03, 8.100e+01],
        [5.360e+02, 3.358e+03, 7.500e+01]],

       [[8.000e+01, 0.000e+00, 7.340e+02],
        [8.000e+01, 1.000e+00, 7.000e+02],
        [8.000e+01, 2.000e+00, 6.510e+02],
        ...,
        [8.000e+01, 3.356e+03, 8.280e+02],
        [8.000e+01, 3.357e+03, 2.850e+02],
        [8.000e+01, 3.358e+03, 2.380e+02]],

       ...,

       [[5.340e+02, 0.000e+00, 1.720e+02],
        [5.340e+02, 1.000e+00, 1.070e+02],
        [5.340e+02, 2.000e+00, 1.940e+02],
        ...,
        [5.340e+02, 3.356e