# Utilities

This notebook will only use TimeBasedCesnetDataset, but all methods work the same way for SeriesBasedCesnetDataset.

### Import

In [1]:
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, ScalerType, SplitType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

### Setting logger

In [2]:
# Show INFO-and-above log records in the notebook output, formatted as
# [time][logger name][level] - message, so the library's messages printed by
# the cells below are visible.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Checking for errors

- Goes through all data in the dataset to check whether everything is in a correct state.
- Can be called when creating the dataset or with the method `check_errors` on an already created dataset.
- Recommended to call at least once after download.

In [3]:
# check_errors=True runs the table-by-table integrity check while the dataset
# is being created (one log line per checked table appears in the output).
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.IP_ADDRESSES_SAMPLE,
    aggregation=AgreggationType.AGG_1_DAY,
    is_series_based=False,
    check_errors=True,
)

[2025-04-09 11:47:15,535][cesnet_dataset][INFO] - Table '/ids_relationship' checked successfully. (275124 rows processed)
[2025-04-09 11:47:15,541][cesnet_dataset][INFO] - Table '/weekends_and_holidays' checked successfully. (91 rows processed)
[2025-04-09 11:47:15,558][cesnet_dataset][INFO] - Table '/ip_addresses_sample/agg_1_day' checked successfully. (159950 rows processed)
[2025-04-09 11:47:15,560][cesnet_dataset][INFO] - Table '/ip_addresses_sample/id_ranges_agg_1_day' checked successfully. (1000 rows processed)
[2025-04-09 11:47:15,561][cesnet_dataset][INFO] - Table '/ip_addresses_sample/identifiers' checked successfully. (1000 rows processed)
[2025-04-09 11:47:15,562][cesnet_dataset][INFO] - Table '/times/times_1_day' checked successfully. (280 rows processed)
[2025-04-09 11:47:15,563][cesnet_dataset][INFO] - Dataset check completed with no errors found.
[2025-04-09 11:47:15,564][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig


In [4]:
# Re-run the integrity check on the already created dataset; each checked
# table is reported through the logger.
time_based_dataset.check_errors()

[2025-04-09 11:47:15,572][cesnet_dataset][INFO] - Table '/ids_relationship' checked successfully. (275124 rows processed)
[2025-04-09 11:47:15,574][cesnet_dataset][INFO] - Table '/weekends_and_holidays' checked successfully. (91 rows processed)
[2025-04-09 11:47:15,587][cesnet_dataset][INFO] - Table '/ip_addresses_sample/agg_1_day' checked successfully. (159950 rows processed)
[2025-04-09 11:47:15,589][cesnet_dataset][INFO] - Table '/ip_addresses_sample/id_ranges_agg_1_day' checked successfully. (1000 rows processed)
[2025-04-09 11:47:15,590][cesnet_dataset][INFO] - Table '/ip_addresses_sample/identifiers' checked successfully. (1000 rows processed)
[2025-04-09 11:47:15,592][cesnet_dataset][INFO] - Table '/times/times_1_day' checked successfully. (280 rows processed)
[2025-04-09 11:47:15,592][cesnet_dataset][INFO] - Dataset check completed with no errors found.


### Dataset details

In [5]:
# display_details=True prints the "Dataset details" summary (time indices,
# time series indices, features with default values, additional data) right
# after the dataset is created.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.IP_ADDRESSES_SAMPLE,
    aggregation=AgreggationType.AGG_1_DAY,
    is_series_based=False,
    display_details=True,
)

[2025-04-09 11:47:15,604][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_DAY
        Time indices: range(0, 279)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 0, 0, tzinfo=datetime.timezone.utc))

    SourceType.IP_ADDRESSES_SAMPLE
        Time series indices: [ 11  20 101 103 118 ... 2003134 2008461 2011839 2022235 2044888], Length=1000; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


#### Displaying all data about selected dataset

In [6]:
# Print the "Dataset details" summary for an existing dataset object; the
# output matches what display_details=True prints at creation time.
time_based_dataset.display_dataset_details()


Dataset details:

    AgreggationType.AGG_1_DAY
        Time indices: range(0, 279)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 0, 0, tzinfo=datetime.timezone.utc))

    SourceType.IP_ADDRESSES_SAMPLE
        Time series indices: [ 11  20 101 103 118 ... 2003134 2008461 2011839 2022235 2044888], Length=1000; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


#### Get list of available features

In [7]:
# Last expression of the cell: the list of feature names is shown as output.
time_based_dataset.get_feature_names()

['id_ip',
 'id_time',
 'n_flows',
 'n_packets',
 'n_bytes',
 'sum_n_dest_asn',
 'avg_n_dest_asn',
 'std_n_dest_asn',
 'sum_n_dest_ports',
 'avg_n_dest_ports',
 'std_n_dest_ports',
 'sum_n_dest_ip',
 'avg_n_dest_ip',
 'std_n_dest_ip',
 'tcp_udp_ratio_packets',
 'tcp_udp_ratio_bytes',
 'dir_ratio_packets',
 'dir_ratio_bytes',
 'avg_duration',
 'avg_ttl']

#### Get numpy array of available dataset time series indices

In [8]:
# Returns a NumPy array with the indices of all time series available in the
# dataset (the dataset details only show an abbreviated list).
time_based_dataset.get_available_ts_indices()

array([(     11,), (     20,), (    101,), (    103,), (    118,),
       (    120,), (    122,), (    171,), (    178,), (    190,),
       (    695,), (   1037,), (   1040,), (   1196,), (   1200,),
       (   1367,), (   1368,), (   1370,), (   1381,), (   1385,),
       (   1553,), (   1554,), (   1580,), (   1605,), (   1656,),
       (   1672,), (   1774,), (   1845,), (   1852,), (   3271,),
       (   3370,), (   4380,), (   5101,), (   6184,), (   6794,),
       (   7516,), (   7782,), (   8089,), (   8989,), (  10125,),
       (  10158,), (  10196,), (  10197,), (  10256,), (  10374,),
       (  10396,), (  10409,), (  10703,), (  10729,), (  10809,),
       (  11069,), (  11188,), (  11204,), (  11212,), (  11254,),
       (  11363,), (  11587,), (  11799,), (  12926,), (  14458,),
       (  14471,), (  15159,), (  15907,), (  18264,), (  18579,),
       (  18683,), (  19288,), (  20150,), (  20342,), (  21756,),
       (  23518,), (  25910,), (  29194,), (  29646,), (  3217

#### Get dictionary of data related to a set

In [9]:
# 20 -> number of time series to use (the output lists 20 ts_ids).
# train_time_period=0.5 presumably selects the first half of the available
# time range as the train set -- TODO confirm against the cesnet_tszoo docs.
config = TimeBasedConfig(20, train_time_period=0.5)
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details=False,
)

# Last expression of the cell: the dictionary describing the train set
# (time series ids and time ids) is displayed as the cell output.
time_based_dataset.get_data_about_set(about=SplitType.TRAIN)

[2025-04-09 11:47:15,631][config][INFO] - Quick validation succeeded.
[2025-04-09 11:47:15,637][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:47:15,641][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 20/20 [00:00<00:00, 3998.57it/s]
[2025-04-09 11:47:15,649][cesnet_dataset][INFO] - Config initialized successfully.


{'ts_ids': array([ 565727, 1808419, 1498142,  546509,   88152,   85341,  756220,
        1538089, 1859462,  382625,  237694,  514728,  330012,  377112,
         587344,  362327,  774906,  151737,    1370,  168060], dtype=uint32),
 <TimeFormat.ID_TIME: 'id_time'>: array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       

### Displaying config details

- Config details can be displayed while calling `set_dataset_config_and_initialize` (with `display_config_details=True`) or afterwards with the method `display_config`.

In [10]:
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.IP_ADDRESSES_SAMPLE,
    aggregation=AgreggationType.AGG_1_DAY,
    is_series_based=False,
)
# Only the number of time series is given; with no train/val/test period set,
# the library logs that all times are used for the all_time_period.
config = TimeBasedConfig(20)

# display_config_details=True prints the "Config Details" summary as part of
# the initialization.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details=True,
)

[2025-04-09 11:47:15,662][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig
[2025-04-09 11:47:15,662][config][INFO] - Quick validation succeeded.
[2025-04-09 11:47:15,668][config][INFO] - Using all times for all_time_period because train_time_period, val_time_period, and test_time_period are all set to None.
[2025-04-09 11:47:15,669][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:47:15,673][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 20/20 [00:00<00:00, 3502.99it/s]
[2025-04-09 11:47:15,679][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_DAY
    Source: SourceType.IP_ADDRESSES_SAMPLE

    Time series
        Time series IDS: [1732661  502550  508635 1508996  663507 ... 1548925 1523783  118055   10396  316980], Length=20
        Test time series IDS: None
    Time periods
        Train time periods: None
        Val time periods: None
        Test time periods: None
        All time periods: range(0, 280)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
       

In [11]:
# Print the "Config Details" summary of the config currently set on the dataset.
time_based_dataset.display_config()


Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_DAY
    Source: SourceType.IP_ADDRESSES_SAMPLE

    Time series
        Time series IDS: [1732661  502550  508635 1508996  663507 ... 1548925 1523783  118055   10396  316980], Length=20
        Test time series IDS: None
    Time periods
        Train time periods: None
        Val time periods: None
        Test time periods: None
        All time periods: range(0, 280)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes', 'sum_n_dest_asn', 'avg_n_dest_asn', 'std_n_dest_asn', 'sum_n_dest_ports', 'avg_n_dest_ports', 'std_n_dest_ports', 'sum_n_dest_ip', 'avg_n_dest_ip', 'std_n_dest_ip', 'tcp_udp_ratio_packets', 'tcp_udp_ratio_bytes', 'dir_ratio_packets', 'dir_ratio_bytes', 'avg_duration', 'avg_ttl']
        Default values: [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.5 0.5 0.  0. ]
        Time series ID included: True
        Time included: True    
       

### Plotting

- Uses the [`Plotly`](https://plotly.com/python/) library.
- You can plot a specific time series with the method `plot`.
- You can set `ts_id` to any time series id used in the config.
- The plot will always contain the time period of the `all` set.
- The config must be set before plotting.

In [12]:
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.IP_ADDRESSES_SAMPLE,
    aggregation=AgreggationType.AGG_1_DAY,
    is_series_based=False,
)
# Two explicit time series ids, three features and min-max scaling.
# train_time_period=1.0 makes the train set cover the whole time range
# (the printed config reports range(0, 280) for it).
config = TimeBasedConfig(
    [1548925, 443967],
    train_time_period=1.0,
    features_to_take=["n_flows", "n_packets", "n_bytes"],
    scale_with=ScalerType.MIN_MAX_SCALER,
)

# The config must be set on the dataset before `plot` can be used.
time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details=True,
)

[2025-04-09 11:47:15,695][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig
[2025-04-09 11:47:15,696][config][INFO] - Quick validation succeeded.
[2025-04-09 11:47:15,702][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:47:15,706][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 1999.67it/s]
[2025-04-09 11:47:15,710][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_DAY
    Source: SourceType.IP_ADDRESSES_SAMPLE

    Time series
        Time series IDS: [1548925  443967], Length=2
        Test time series IDS: None
    Time periods
        Train time periods: range(0, 280)
        Val time periods: None
        Test time periods: None
        All time periods: range(0, 280)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes']
        Default values: [0. 0. 0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: None
        Sliding window prediction size: None
        Sliding window step size: 1
        Set shared size: 0
    Fillers
        Filler type: None
    Scalers
        Scaler type: min_max_scaler
        Is scaler per Time series: True
        Are scalers premade: False
        Are premade scalers partial_fitted: Fals

In [13]:
# features="config" presumably plots the features taken in the config, and
# feature_per_plot=True presumably gives each feature its own plot -- confirm
# in the cesnet_tszoo documentation.
time_based_dataset.plot(
    ts_id=443967,
    plot_type="line",
    features="config",
    feature_per_plot=True,
    time_format="datetime",
    use_scalers=True,
)

In [14]:
# Same call as the previous cell except feature_per_plot=False, which
# presumably draws all features into a single plot -- confirm in the
# cesnet_tszoo documentation.
time_based_dataset.plot(
    ts_id=443967,
    plot_type="line",
    features="config",
    feature_per_plot=False,
    time_format="datetime",
    use_scalers=True,
)

In [15]:
# `features` can also be an explicit list of feature names; here two of the
# three features taken in the config are passed.
time_based_dataset.plot(
    ts_id=443967,
    plot_type="line",
    features=["n_flows", "n_packets"],
    feature_per_plot=True,
    time_format="datetime",
    use_scalers=True,
)

In [16]:
# `features` can also be a single feature name given as a string.
time_based_dataset.plot(
    ts_id=443967,
    plot_type="line",
    features="n_flows",
    feature_per_plot=True,
    time_format="datetime",
    use_scalers=True,
)

### Get additional data

- You can check whether the dataset has additional data with the method `display_dataset_details`, or when creating the dataset as shown below.

In [17]:
# display_details=True prints the dataset summary; its "Additional data" entry
# lists the names that can be passed to `get_additional_data`.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.IP_ADDRESSES_SAMPLE,
    aggregation=AgreggationType.AGG_1_DAY,
    is_series_based=False,
    display_details=True,
)

[2025-04-09 11:47:17,087][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_1_DAY
        Time indices: range(0, 279)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 0, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 0, 0, tzinfo=datetime.timezone.utc))

    SourceType.IP_ADDRESSES_SAMPLE
        Time series indices: [ 11  20 101 103 118 ... 2003134 2008461 2011839 2022235 2044888], Length=1000; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0, 'sum_n_dest_asn': 0, 'avg_n_dest_asn': 0, 'std_n_dest_asn': 0, 'sum_n_dest_ports': 0, 'avg_n_dest_ports': 0, 'std_n_dest_ports': 0, 'sum_n_dest_ip': 0, 'avg_n_dest_ip': 0, 'std_n_dest_ip': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


In [18]:
# Fetch the 'ids_relationship' additional data; the result is displayed as a
# table with the columns id_ip, id_institution and id_institution_subnet.
time_based_dataset.get_additional_data('ids_relationship')

Unnamed: 0,id_ip,id_institution,id_institution_subnet
0,42,0,0
1,52,0,0
2,275,0,0
3,1026,0,0
4,1128,0,0
...,...,...,...
275119,637069,283,546
275120,4821,284,547
275121,64129,284,547
275122,590641,284,547


In [19]:
# Fetch the 'weekends_and_holidays' additional data; the result is displayed
# as a table with the columns Date and Type (Weekend / Holiday).
time_based_dataset.get_additional_data('weekends_and_holidays')

Unnamed: 0,Date,Type
0,2023-10-13 22:00:00+00:00,Weekend
1,2023-10-14 22:00:00+00:00,Weekend
2,2023-10-20 22:00:00+00:00,Weekend
3,2023-10-21 22:00:00+00:00,Weekend
4,2023-10-27 22:00:00+00:00,Weekend
...,...,...
86,2024-07-04 22:00:00+00:00,Holiday
87,2024-07-05 22:00:00+00:00,Weekend
88,2024-07-06 22:00:00+00:00,Weekend
89,2024-07-12 22:00:00+00:00,Weekend


### Get fitted scalers

Returns the scaler(s) used for transforming the data.

In [20]:
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.IP_ADDRESSES_SAMPLE,
    aggregation=AgreggationType.AGG_1_DAY,
    is_series_based=False,
)
# scale_with selects the scaler type; the printed config reports
# "min_max_scaler" with one scaler per time series for the two ids given here.
config = TimeBasedConfig(
    [1548925, 443967],
    train_time_period=1.0,
    features_to_take=["n_flows", "n_packets", "n_bytes"],
    scale_with=ScalerType.MIN_MAX_SCALER,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    workers=0,
    display_config_details=True,
)

[2025-04-09 11:47:17,267][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig
[2025-04-09 11:47:17,268][config][INFO] - Quick validation succeeded.
[2025-04-09 11:47:17,273][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:47:17,277][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 2000.62it/s]
[2025-04-09 11:47:17,280][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_1_DAY
    Source: SourceType.IP_ADDRESSES_SAMPLE

    Time series
        Time series IDS: [1548925  443967], Length=2
        Test time series IDS: None
    Time periods
        Train time periods: range(0, 280)
        Val time periods: None
        Test time periods: None
        All time periods: range(0, 280)
    Features
        Taken features: ['n_flows', 'n_packets', 'n_bytes']
        Default values: [0. 0. 0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: None
        Sliding window prediction size: None
        Sliding window step size: 1
        Set shared size: 0
    Fillers
        Filler type: None
    Scalers
        Scaler type: min_max_scaler
        Is scaler per Time series: True
        Are scalers premade: False
        Are premade scalers partial_fitted: Fals

In [21]:
# Returns the scalers used by the dataset; with the config above this is an
# object array holding two MinMaxScaler instances (two time series configured).
time_based_dataset.get_scalers()

array([<cesnet_tszoo.utils.scaler.MinMaxScaler object at 0x0000022895E55E20>,
       <cesnet_tszoo.utils.scaler.MinMaxScaler object at 0x0000022895CD79B0>],
      dtype=object)