# Using scalers for TimeBasedCesnetDataset

### Import

In [1]:
import numpy as np
import logging

from cesnet_tszoo.utils.enums import AgreggationType, SourceType, ScalerType
from cesnet_tszoo.datasets import CESNET_TimeSeries24
from cesnet_tszoo.configs import TimeBasedConfig # Time based dataset MUST use TimeBasedConfig

from cesnet_tszoo.utils.scaler import Scaler # For creating custom Scaler

### Setting logger

In [2]:
# Configure the root logger once for the notebook: INFO level, with timestamp,
# logger name and level in front of every message.
logging.basicConfig(
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
    level=logging.INFO,
)

### Preparing dataset

In [3]:
# Load the time-based variant of the dataset (is_series_based=False) and print its details.
time_based_dataset = CESNET_TimeSeries24.get_dataset(
    data_root="/some_directory/",
    source_type=SourceType.IP_ADDRESSES_SAMPLE,
    aggregation=AgreggationType.AGG_10_MINUTES,
    is_series_based=False,
    display_details=True,
)

[2025-04-09 11:46:56,937][wrapper_dataset][INFO] - Dataset is time-based. Use cesnet_tszoo.configs.TimeBasedConfig



Dataset details:

    AgreggationType.AGG_10_MINUTES
        Time indices: range(0, 40297)
        Datetime: (datetime.datetime(2023, 10, 9, 0, 3, 49, tzinfo=datetime.timezone.utc), datetime.datetime(2024, 7, 14, 21, 50, 52, tzinfo=datetime.timezone.utc))

    SourceType.IP_ADDRESSES_SAMPLE
        Time series indices: [ 11  20 101 103 118 ... 2003134 2008461 2011839 2022235 2044888], Length=1000; use 'get_available_ts_indices' for full list
        Features with default values: {'n_flows': 0, 'n_packets': 0, 'n_bytes': 0, 'n_dest_ip': 0, 'n_dest_asn': 0, 'n_dest_ports': 0, 'tcp_udp_ratio_packets': 0.5, 'tcp_udp_ratio_bytes': 0.5, 'dir_ratio_packets': 0.5, 'dir_ratio_bytes': 0.5, 'avg_duration': 0, 'avg_ttl': 0}
        
        Additional data: ['ids_relationship', 'weekends_and_holidays']
        


### Scalers

- Scalers are implemented as classes.
    - You can create your own or use a built-in one.
- A scaler must implement `transform`.
- Scalers are applied after `default_values` and fillers have taken care of missing values.
- To use scalers, a train set must be defined (unless the scalers are already fitted).
- You can change the scaler later with `update_dataset_config_and_initialize` or `apply_scaler`.

#### Built-in

In [4]:
# Options

## Support fit and partial_fit -> can be used with create_scaler_per_time_series=True or create_scaler_per_time_series=False
ScalerType.STANDARD_SCALER
ScalerType.L2_NORMALIZER
ScalerType.LOG_SCALER
ScalerType.MAX_ABS_SCALER
ScalerType.MIN_MAX_SCALER

## Support only fit -> can be used only with create_scaler_per_time_series=True
ScalerType.POWER_TRANSFORMER
ScalerType.QUANTILE_TRANSFORMER
ScalerType.ROBUST_SCALER

<ScalerType.ROBUST_SCALER: 'robust_scaler'>

##### Scaler per time series

- One scaler per time series is created.
- Scaler must implement `fit` (unless scalers are already fitted).
- Scalers won't be used on time series from `test_ts_ids`.

In [5]:
# Built-in min-max scaling with a separate scaler for each time series in ts_ids.
config = TimeBasedConfig(
    ts_ids=[1367, 1368],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=ScalerType.MIN_MAX_SCALER,
    create_scaler_per_time_series=True,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:46:56,955][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:57,051][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:57,055][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 66.56it/s]
[2025-04-09 11:46:57,091][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 462.44it/s]
[2025-04-09 11:46:57,094][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_10_MINUTES
    Source: SourceType.IP_ADDRESSES_SAMPLE

    Time series
        Time series IDS: [1367 1368], Length=2
        Test time series IDS: [1370], Length=1
    Time periods
        Train time periods: range(0, 20149)
        Val time periods: range(20149, 28208)
        Test time periods: range(28208, 32237)
        All time periods: range(0, 32237)
    Features
        Taken features: ['n_flows', 'n_packets']
        Default values: [0. 0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: None
        Sliding window prediction size: None
        Sliding window step size: 1
        Set shared size: 0
    Fillers
        Filler type: None
    Scalers
        Scaler type: min_max_scaler
        Is scaler per Time series: True
        Are scalers premade: False
        Are prema

In [6]:
# Preview the first 10 train rows; feature values are min-max scaled per time series.
time_based_dataset.get_train_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,1367.0,0.0,0.166391,0.008877
1,1367.0,1.0,0.19214,0.004397
2,1367.0,2.0,0.156302,0.006889
3,1367.0,3.0,0.148472,0.006699
4,1367.0,4.0,0.142448,0.01236
5,1367.0,5.0,0.16624,0.004723
6,1367.0,6.0,0.18973,0.00934
7,1367.0,7.0,0.192441,0.005277
8,1367.0,8.0,0.232646,0.021287
9,1367.0,9.0,0.213823,0.0309


In [7]:
# Time series from test_ts_ids stay unscaled when one scaler per time series is used.
time_based_dataset.get_test_other_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,1370.0,28208.0,294.0,5411.0
1,1370.0,28209.0,243.0,1141.0
2,1370.0,28210.0,319.0,2583.0
3,1370.0,28211.0,280.0,1702.0
4,1370.0,28212.0,240.0,2589.0
5,1370.0,28213.0,276.0,1637.0
6,1370.0,28214.0,299.0,1922.0
7,1370.0,28215.0,301.0,1540.0
8,1370.0,28216.0,250.0,1316.0
9,1370.0,28217.0,262.0,1669.0


In [8]:
# With one scaler per time series this returns an array of scalers (two here, matching the two ts_ids).
time_based_dataset.get_scalers()

array([<cesnet_tszoo.utils.scaler.MinMaxScaler object at 0x000002537053EDE0>,
       <cesnet_tszoo.utils.scaler.MinMaxScaler object at 0x000002537053ED20>],
      dtype=object)

Or later with:

In [9]:
# Two equivalent ways to change the scaler of an already initialized dataset.
# NOTE(review): "config" presumably keeps the value already stored in the config — confirm.
time_based_dataset.update_dataset_config_and_initialize(
    scale_with=ScalerType.MIN_MAX_SCALER,
    create_scaler_per_time_series=True,
    partial_fit_initialized_scalers="config",
    workers=0,
)
# Or
time_based_dataset.apply_scaler(
    scale_with=ScalerType.MIN_MAX_SCALER,
    create_scaler_per_time_series=True,
    partial_fit_initialized_scalers="config",
    workers=0,
)

[2025-04-09 11:46:57,146][cesnet_dataset][INFO] - Re-initialization is required.
[2025-04-09 11:46:57,230][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:57,234][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 235.19it/s]
[2025-04-09 11:46:57,247][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 999.83it/s]
[2025-04-09 11:46:57,249][cesnet_dataset][INFO] - Config initialized successfully.
[2025-04-09 11:46:57,250][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-04-09 11:46:57,250][cesnet_dataset][INFO] - Re-initialization is required.
[2025-04-09 11:46:57,332][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:57,335][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 275.30it/s

##### One scaler for every time series

- One scaler is used for all time series.
- Scaler must implement `partial_fit` (unless scaler is already fitted).
- Scaler will be used on time series from `test_ts_ids`.

In [10]:
# Built-in min-max scaling with a single scaler shared by all time series.
config = TimeBasedConfig(
    ts_ids=[1367, 1368],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=ScalerType.MIN_MAX_SCALER,
    create_scaler_per_time_series=False,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:46:57,357][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:57,443][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:57,447][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 285.03it/s]
[2025-04-09 11:46:57,461][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 708.14it/s]
[2025-04-09 11:46:57,463][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_10_MINUTES
    Source: SourceType.IP_ADDRESSES_SAMPLE

    Time series
        Time series IDS: [1367 1368], Length=2
        Test time series IDS: [1370], Length=1
    Time periods
        Train time periods: range(0, 20149)
        Val time periods: range(20149, 28208)
        Test time periods: range(28208, 32237)
        All time periods: range(0, 32237)
    Features
        Taken features: ['n_flows', 'n_packets']
        Default values: [0. 0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: None
        Sliding window prediction size: None
        Sliding window step size: 1
        Set shared size: 0
    Fillers
        Filler type: None
    Scalers
        Scaler type: min_max_scaler
        Is scaler per Time series: False
        Are scalers premade: False
        Are prem

In [11]:
# Preview the first 10 train rows scaled by the single shared scaler.
time_based_dataset.get_train_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,1367.0,0.0,0.044306,0.003506
1,1367.0,1.0,0.051163,0.001737
2,1367.0,2.0,0.04162,0.002721
3,1367.0,3.0,0.039535,0.002645
4,1367.0,4.0,0.037931,0.004881
5,1367.0,5.0,0.044266,0.001865
6,1367.0,6.0,0.050521,0.003689
7,1367.0,7.0,0.051243,0.002084
8,1367.0,8.0,0.061949,0.008406
9,1367.0,9.0,0.056937,0.012202


In [12]:
# The shared scaler is applied to time series from test_ts_ids as well.
time_based_dataset.get_test_other_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,1370.0,28208.0,0.010144,0.00061
1,1370.0,28209.0,0.008099,0.00011
2,1370.0,28210.0,0.011147,0.000279
3,1370.0,28211.0,0.009583,0.000175
4,1370.0,28212.0,0.007979,0.000279
5,1370.0,28213.0,0.009423,0.000168
6,1370.0,28214.0,0.010345,0.000201
7,1370.0,28215.0,0.010425,0.000156
8,1370.0,28216.0,0.00838,0.00013
9,1370.0,28217.0,0.008861,0.000172


In [13]:
# With a shared scaler this returns the single scaler object instead of an array.
time_based_dataset.get_scalers()

<cesnet_tszoo.utils.scaler.MinMaxScaler at 0x2537059ff80>

#### Custom

You can create your own custom scaler. It is recommended to derive it from the `Scaler` base class.

In [14]:
class CustomScaler(Scaler):
    """Example min-max scaler that can be fitted in one pass or incrementally.

    Tracks the column-wise (axis 0) minimum and maximum of the data it has been
    fitted on and maps values to ``(data - min) / (max - min)``.

    Attributes:
        max: Column-wise maximum seen so far, or None before fitting.
        min: Column-wise minimum seen so far, or None before fitting.
    """

    def __init__(self):
        super().__init__()

        # Extrema are unknown until fit/partial_fit is called.
        self.max = None
        self.min = None

    def transform(self, data):
        """Scale `data` with the learned extrema.

        Columns whose learned minimum equals the maximum (constant features)
        are divided by 1 instead of 0, so they map to 0 rather than NaN/inf.
        """
        value_range = self.max - self.min
        # Guard against division by zero for constant features.
        safe_range = np.where(value_range == 0, 1, value_range)
        return (data - self.min) / safe_range

    def fit(self, data):
        """Fit on `data` only, discarding any previously learned extrema."""
        # Reset first so refitting an already fitted instance does not keep stale
        # extrema; use partial_fit to accumulate across several batches instead.
        self.max = None
        self.min = None
        self.partial_fit(data)

    def partial_fit(self, data):
        """Update the learned extrema with another batch of `data`."""
        batch_max = np.max(data, axis=0)
        batch_min = np.min(data, axis=0)

        # First batch: nothing to merge with yet.
        if self.max is None or self.min is None:
            self.max = batch_max
            self.min = batch_min
            return

        # Element-wise merge of the running extrema with this batch's extrema.
        self.max = np.maximum(self.max, batch_max)
        self.min = np.minimum(self.min, batch_min)

##### Scaler per time series

- One scaler per time series is created.
- Scaler must implement `fit` (unless scalers are already fitted).
- Scalers won't be used on time series from `test_ts_ids`.

In [15]:
# Custom scaler class; one CustomScaler instance is created for each time series in ts_ids.
config = TimeBasedConfig(
    ts_ids=[1367, 1368],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=CustomScaler,
    create_scaler_per_time_series=True,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:46:57,518][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:57,603][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:57,606][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 284.94it/s]
[2025-04-09 11:46:57,619][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 500.16it/s]
[2025-04-09 11:46:57,622][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_10_MINUTES
    Source: SourceType.IP_ADDRESSES_SAMPLE

    Time series
        Time series IDS: [1367 1368], Length=2
        Test time series IDS: [1370], Length=1
    Time periods
        Train time periods: range(0, 20149)
        Val time periods: range(20149, 28208)
        Test time periods: range(28208, 32237)
        All time periods: range(0, 32237)
    Features
        Taken features: ['n_flows', 'n_packets']
        Default values: [0. 0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: None
        Sliding window prediction size: None
        Sliding window step size: 1
        Set shared size: 0
    Fillers
        Filler type: None
    Scalers
        Scaler type: CustomScaler (Custom)
        Is scaler per Time series: True
        Are scalers premade: False
        Ar

In [16]:
# Preview the first 10 train rows scaled by the per-time-series CustomScaler instances.
time_based_dataset.get_train_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,1367.0,0.0,0.166391,0.008877
1,1367.0,1.0,0.19214,0.004397
2,1367.0,2.0,0.156302,0.006889
3,1367.0,3.0,0.148472,0.006699
4,1367.0,4.0,0.142448,0.01236
5,1367.0,5.0,0.16624,0.004723
6,1367.0,6.0,0.18973,0.00934
7,1367.0,7.0,0.192441,0.005277
8,1367.0,8.0,0.232646,0.021287
9,1367.0,9.0,0.213823,0.0309


In [17]:
# Time series from test_ts_ids stay unscaled when one scaler per time series is used.
time_based_dataset.get_test_other_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,1370.0,28208.0,294.0,5411.0
1,1370.0,28209.0,243.0,1141.0
2,1370.0,28210.0,319.0,2583.0
3,1370.0,28211.0,280.0,1702.0
4,1370.0,28212.0,240.0,2589.0
5,1370.0,28213.0,276.0,1637.0
6,1370.0,28214.0,299.0,1922.0
7,1370.0,28215.0,301.0,1540.0
8,1370.0,28216.0,250.0,1316.0
9,1370.0,28217.0,262.0,1669.0


In [18]:
# Returns an array of CustomScaler instances (two here, matching the two ts_ids).
time_based_dataset.get_scalers()

array([<__main__.CustomScaler object at 0x0000025370DA7500>,
       <__main__.CustomScaler object at 0x0000025370C4CEC0>], dtype=object)

Or later with:

In [19]:
# Two equivalent ways to switch an already initialized dataset to the custom scaler.
# NOTE(review): "config" presumably keeps the value already stored in the config — confirm.
time_based_dataset.update_dataset_config_and_initialize(
    scale_with=CustomScaler,
    create_scaler_per_time_series=True,
    partial_fit_initialized_scalers="config",
    workers=0,
)
# Or
time_based_dataset.apply_scaler(
    scale_with=CustomScaler,
    create_scaler_per_time_series=True,
    partial_fit_initialized_scalers="config",
    workers=0,
)

[2025-04-09 11:46:57,668][cesnet_dataset][INFO] - Re-initialization is required.
[2025-04-09 11:46:57,751][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:57,754][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 307.26it/s]
[2025-04-09 11:46:57,767][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 1000.07it/s]
[2025-04-09 11:46:57,770][cesnet_dataset][INFO] - Config initialized successfully.
[2025-04-09 11:46:57,770][cesnet_dataset][INFO] - Configuration has been changed successfuly.
[2025-04-09 11:46:57,771][cesnet_dataset][INFO] - Re-initialization is required.
[2025-04-09 11:46:57,853][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:57,856][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 243.61it/

##### One scaler for every time series

- One scaler is used for all time series.
- Scaler must implement `partial_fit` (unless scaler is already fitted).
- Scaler will be used on time series from `test_ts_ids`.

In [20]:
# Custom scaler class; a single CustomScaler instance is shared by all time series.
config = TimeBasedConfig(
    ts_ids=[1367, 1368],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=CustomScaler,
    create_scaler_per_time_series=False,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=True,
    workers=0,
)

[2025-04-09 11:46:57,880][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:57,964][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:57,967][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 285.10it/s]
[2025-04-09 11:46:57,980][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 473.40it/s]
[2025-04-09 11:46:57,983][cesnet_dataset][INFO] - Config initialized successfully.



Config Details
    Used for database: CESNET-TimeSeries24
    Aggregation: AgreggationType.AGG_10_MINUTES
    Source: SourceType.IP_ADDRESSES_SAMPLE

    Time series
        Time series IDS: [1367 1368], Length=2
        Test time series IDS: [1370], Length=1
    Time periods
        Train time periods: range(0, 20149)
        Val time periods: range(20149, 28208)
        Test time periods: range(28208, 32237)
        All time periods: range(0, 32237)
    Features
        Taken features: ['n_flows', 'n_packets']
        Default values: [0. 0.]
        Time series ID included: True
        Time included: True    
        Time format: TimeFormat.ID_TIME
    Sliding window
        Sliding window size: None
        Sliding window prediction size: None
        Sliding window step size: 1
        Set shared size: 0
    Fillers
        Filler type: None
    Scalers
        Scaler type: CustomScaler (Custom)
        Is scaler per Time series: False
        Are scalers premade: False
        A

In [21]:
# Preview the first 10 train rows scaled by the single shared CustomScaler.
time_based_dataset.get_train_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,1367.0,0.0,0.044306,0.003506
1,1367.0,1.0,0.051163,0.001737
2,1367.0,2.0,0.04162,0.002721
3,1367.0,3.0,0.039535,0.002645
4,1367.0,4.0,0.037931,0.004881
5,1367.0,5.0,0.044266,0.001865
6,1367.0,6.0,0.050521,0.003689
7,1367.0,7.0,0.051243,0.002084
8,1367.0,8.0,0.061949,0.008406
9,1367.0,9.0,0.056937,0.012202


In [22]:
# With a shared scaler this returns the single CustomScaler object instead of an array.
time_based_dataset.get_scalers()

<__main__.CustomScaler at 0x25370e1f3b0>

#### Using already fitted scaler/s

- When `partial_fit_initialized_scalers` is False (the default value), the scaler/s need neither `partial_fit` nor a train set.

In [23]:
# Fit one CustomScaler per time series and keep the fitted scalers for reuse below.
config = TimeBasedConfig(
    ts_ids=[1367, 1368],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=CustomScaler,
    create_scaler_per_time_series=True,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=False,
    workers=0,
)

list_of_prefitted_scalers = time_based_dataset.get_scalers()

[2025-04-09 11:46:58,066][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:58,152][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:58,155][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 307.59it/s]
[2025-04-09 11:46:58,167][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 1000.07it/s]
[2025-04-09 11:46:58,170][cesnet_dataset][INFO] - Config initialized successfully.


In [24]:
# Fit a single shared CustomScaler and keep the fitted scaler for reuse below.
config = TimeBasedConfig(
    ts_ids=[1367, 1368],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=CustomScaler,
    create_scaler_per_time_series=False,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=False,
    workers=0,
)

one_prefitted_scaler = time_based_dataset.get_scalers()

[2025-04-09 11:46:58,175][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:58,260][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:58,263][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 266.61it/s]
[2025-04-09 11:46:58,277][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 1000.79it/s]
[2025-04-09 11:46:58,280][cesnet_dataset][INFO] - Config initialized successfully.


##### Scaler per time series

- One scaler per time series in `ts_ids`.
- All scalers in list must be of the same type.
- You must provide a list of scalers whose length equals the number of time series in `ts_ids`.
- Scalers won't be used on time series from `test_ts_ids`.

In [25]:
# Reuse the already fitted per-series scalers (fitted on ts 1367 and 1368) for other time series.
config = TimeBasedConfig(
    ts_ids=[103, 118],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=list_of_prefitted_scalers,
    create_scaler_per_time_series=True,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=False,
    workers=0,
)

[2025-04-09 11:46:58,285][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:58,369][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:58,373][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 253.02it/s]
[2025-04-09 11:46:58,387][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 1000.07it/s]
[2025-04-09 11:46:58,390][cesnet_dataset][INFO] - Config initialized successfully.


In [26]:
# Preview the first 10 train rows scaled by the pre-fitted per-series scalers.
time_based_dataset.get_train_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,103.0,0.0,0.092305,0.019907
1,103.0,1.0,0.09095,0.012769
2,103.0,2.0,0.07785,0.032889
3,103.0,3.0,0.079958,0.009592
4,103.0,4.0,0.102997,0.011615
5,103.0,5.0,0.106309,0.029562
6,103.0,6.0,0.107966,0.037642
7,103.0,7.0,0.15585,0.012679
8,103.0,8.0,0.113085,0.025278
9,103.0,9.0,0.128445,0.03264


Below you can see how scalers work even without a train set.

In [27]:
# Same pre-fitted per-series scalers, but without a train set (train_time_period=None).
config = TimeBasedConfig(
    ts_ids=[103, 118],
    train_time_period=None,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=list_of_prefitted_scalers,
    create_scaler_per_time_series=True,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=False,
    workers=0,
)

[2025-04-09 11:46:58,415][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:58,470][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:58,474][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 999.24it/s]
[2025-04-09 11:46:58,481][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 920.81it/s]
[2025-04-09 11:46:58,484][cesnet_dataset][INFO] - Config initialized successfully.


In [28]:
# The validation set is still scaled by the pre-fitted scalers although no train set is configured.
time_based_dataset.get_val_df(workers=0)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,103.0,0.0,0.092305,0.019907
1,103.0,1.0,0.090950,0.012769
2,103.0,2.0,0.077850,0.032889
3,103.0,3.0,0.079958,0.009592
4,103.0,4.0,0.102997,0.011615
...,...,...,...,...
16113,118.0,8054.0,0.005259,0.001669
16114,118.0,8055.0,0.005178,0.007140
16115,118.0,8056.0,0.001254,0.004929
16116,118.0,8057.0,0.000647,0.000593


##### One scaler for every time series

- One scaler is used for all time series.
- Must provide one scaler.
- Scaler will be used on time series from `test_ts_ids`.

In [29]:
# Reuse the already fitted shared scaler for other time series.
config = TimeBasedConfig(
    ts_ids=[103, 118],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=one_prefitted_scaler,
    create_scaler_per_time_series=False,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=False,
    workers=0,
)

[2025-04-09 11:46:58,505][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:58,587][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:58,591][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 331.97it/s]
[2025-04-09 11:46:58,602][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 1003.18it/s]
[2025-04-09 11:46:58,605][cesnet_dataset][INFO] - Config initialized successfully.


In [30]:
# Preview the first 10 train rows scaled by the pre-fitted shared scaler.
time_based_dataset.get_train_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,103.0,0.0,0.024579,0.007862
1,103.0,1.0,0.024218,0.005043
2,103.0,2.0,0.02073,0.012988
3,103.0,3.0,0.021291,0.003788
4,103.0,4.0,0.027426,0.004587
5,103.0,5.0,0.028308,0.011674
6,103.0,6.0,0.028749,0.014865
7,103.0,7.0,0.0415,0.005007
8,103.0,8.0,0.030112,0.009983
9,103.0,9.0,0.034202,0.01289


In [31]:
# The pre-fitted shared scaler is applied to time series from test_ts_ids as well.
time_based_dataset.get_test_other_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,1370.0,28208.0,0.010144,0.00061
1,1370.0,28209.0,0.008099,0.00011
2,1370.0,28210.0,0.011147,0.000279
3,1370.0,28211.0,0.009583,0.000175
4,1370.0,28212.0,0.007979,0.000279
5,1370.0,28213.0,0.009423,0.000168
6,1370.0,28214.0,0.010345,0.000201
7,1370.0,28215.0,0.010425,0.000156
8,1370.0,28216.0,0.00838,0.00013
9,1370.0,28217.0,0.008861,0.000172


Below you can see how the scaler works even without a train set.

In [32]:
# Same pre-fitted shared scaler, but without a train set (train_time_period=None).
config = TimeBasedConfig(
    ts_ids=[103, 118],
    train_time_period=None,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=one_prefitted_scaler,
    create_scaler_per_time_series=False,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=False,
    workers=0,
)

[2025-04-09 11:46:58,646][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:58,704][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:58,709][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 996.75it/s]
[2025-04-09 11:46:58,718][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 1001.03it/s]
[2025-04-09 11:46:58,722][cesnet_dataset][INFO] - Config initialized successfully.


In [33]:
# The validation set is still scaled by the pre-fitted shared scaler although no train set is configured.
time_based_dataset.get_val_df(workers=0)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,103.0,0.0,0.024579,0.007862
1,103.0,1.0,0.024218,0.005043
2,103.0,2.0,0.020730,0.012988
3,103.0,3.0,0.021291,0.003788
4,103.0,4.0,0.027426,0.004587
...,...,...,...,...
16113,118.0,8054.0,0.014034,0.002224
16114,118.0,8055.0,0.013953,0.007691
16115,118.0,8056.0,0.010064,0.005482
16116,118.0,8057.0,0.009463,0.001148


##### Partial fitting on train set

Makes already fitted scaler/s also fit on the new train set. The scaler/s must implement `partial_fit`.

In [34]:
# Continue fitting the single pre-fitted scaler on the new train set.
# A single scaler instance is shared by all time series, so
# create_scaler_per_time_series is False (per-series mode expects a list of scalers).
config = TimeBasedConfig(
    ts_ids=[103, 118],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=one_prefitted_scaler,
    create_scaler_per_time_series=False,
    partial_fit_initialized_scalers=True,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=False,
    workers=0,
)

[2025-04-09 11:46:58,745][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:58,828][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:58,832][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 285.70it/s]
[2025-04-09 11:46:58,845][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 999.12it/s]
[2025-04-09 11:46:58,849][cesnet_dataset][INFO] - Config initialized successfully.


In [35]:
# Preview the first 10 train rows after the pre-fitted scaler was also partially fitted on this train set.
time_based_dataset.get_train_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,103.0,0.0,0.021068,0.007761
1,103.0,1.0,0.020778,0.004986
2,103.0,2.0,0.017976,0.012806
3,103.0,3.0,0.018427,0.003752
4,103.0,4.0,0.023355,0.004538
5,103.0,5.0,0.024064,0.011513
6,103.0,6.0,0.024419,0.014653
7,103.0,7.0,0.034663,0.004951
8,103.0,8.0,0.025514,0.009848
9,103.0,9.0,0.0288,0.012709


In [36]:
# Continue fitting the pre-fitted per-series scalers on the new train set.
config = TimeBasedConfig(
    ts_ids=[103, 118],
    train_time_period=0.5,
    val_time_period=0.2,
    test_time_period=0.1,
    test_ts_ids=[1370],
    features_to_take=['n_flows', 'n_packets'],
    scale_with=list_of_prefitted_scalers,
    create_scaler_per_time_series=True,
    partial_fit_initialized_scalers=True,
)

time_based_dataset.set_dataset_config_and_initialize(
    config,
    display_config_details=False,
    workers=0,
)

[2025-04-09 11:46:58,877][config][INFO] - Quick validation succeeded.
[2025-04-09 11:46:58,963][config][INFO] - Finalization and validation completed successfully.
[2025-04-09 11:46:58,966][cesnet_dataset][INFO] - Updating config on train/val/test/all and selected time series.
100%|██████████| 2/2 [00:00<00:00, 266.09it/s]
[2025-04-09 11:46:58,980][cesnet_dataset][INFO] - Updating config on test_other and selected time series.
100%|██████████| 1/1 [00:00<00:00, 491.54it/s]
[2025-04-09 11:46:58,984][cesnet_dataset][INFO] - Config initialized successfully.


In [37]:
# Preview the first 10 train rows after the pre-fitted per-series scalers were also partially fitted on this train set.
time_based_dataset.get_train_df(workers=0).head(10)

Unnamed: 0,id_ip,id_time,n_flows,n_packets
0,103.0,0.0,0.019837,0.007737
1,103.0,1.0,0.019546,0.004963
2,103.0,2.0,0.01674,0.012783
3,103.0,3.0,0.017192,0.003728
4,103.0,4.0,0.022127,0.004514
5,103.0,5.0,0.022836,0.01149
6,103.0,6.0,0.023191,0.01463
7,103.0,7.0,0.033448,0.004928
8,103.0,8.0,0.024288,0.009825
9,103.0,9.0,0.027578,0.012686
