# Explore data

Select the dataset, its size, and a data root where it will be downloaded.

In [1]:
import os
import sys
# Make the parent directory of the notebook's working directory importable,
# adding it to sys.path only when it is not already listed.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from cesnet_datazoo.datasets import CESNET_QUIC22
from cesnet_datazoo.config import DatasetConfig, ScalerEnum

# Create the CESNET-QUIC22 dataset object in its "XS" size; the dataset files
# are kept under `data_root`.
dataset = CESNET_QUIC22(
    data_root="data/CESNET-QUIC22/",
    size="XS",
)

Downloading CESNET-QUIC22-XS dataset
File size: 2.53GB
Remaining: 2.53GB


100%|██████████| 2.53G/2.53G [05:12<00:00, 8.70MB/s]


Select the configuration. We want training data from the 44th week of 2022, with feature scaling disabled.

In [2]:
# All three scalers are switched off so that features keep their raw values.
no_scaling = ScalerEnum.NO_SCALER

dataset_config = DatasetConfig(
    dataset=dataset,
    train_period="W-2022-44",  # 44th week of 2022
    train_size=1_000_000,
    flowstats_scaler=no_scaling,
    psizes_scaler=no_scaling,
    ipt_scaler=no_scaling,
)

# Apply the configuration, then load the training split as a DataFrame.
dataset.set_dataset_config_and_initialize(dataset_config)
train_df = dataset.get_train_df()

# Displayed as the cell result: (number of rows, number of columns).
train_df.shape

Loading data from dataloader


100%|██████████| 5209/5209 [00:10<00:00, 492.45it/s] 


(1000000, 45)

In [3]:
# Preview the first 100 rows of the training DataFrame.
train_df.head(100)

Unnamed: 0,PPI,BYTES,BYTES_REV,PACKETS,PACKETS_REV,PPI_LEN,PPI_ROUNDTRIPS,PPI_DURATION,DURATION,FEND_IDLE,...,IPT_BIN8,IPT_BIN1_REV,IPT_BIN2_REV,IPT_BIN3_REV,IPT_BIN4_REV,IPT_BIN5_REV,IPT_BIN6_REV,IPT_BIN7_REV,IPT_BIN8_REV,APP
0,"[[0.0, 7.0, 14.0, 0.0, 2.0, 5.0, 0.0, 5.0, 2.0...",6180.0,61062.0,41.0,58.0,30.0,8.0,1.255,1.397947,1.0,...,0.000000,0.842105,0.035088,0.017544,0.017544,0.070175,0.017544,0.000000,0.000000,62
1,"[[0.0, 20.0, 22.0, 7.0, 0.0, 0.0, 6.0, 0.0, 0....",3844.0,2676.0,8.0,7.0,15.0,5.0,0.138,0.138569,1.0,...,0.000000,0.500000,0.000000,0.500000,0.000000,0.000000,0.000000,0.000000,0.000000,91
2,"[[0.0, 1.0, 0.0, 20.0, 0.0, 0.0, 0.0, 6.0, 0.0...",3348.0,2629.0,7.0,7.0,14.0,2.0,0.078,0.077457,1.0,...,0.000000,0.833333,0.000000,0.166667,0.000000,0.000000,0.000000,0.000000,0.000000,46
3,"[[0.0, 0.0, 1.0, 20.0, 0.0, 0.0, 0.0, 3.0, 0.0...",3762.0,3897.0,20.0,21.0,30.0,9.0,0.757,1.173890,1.0,...,0.000000,0.500000,0.050000,0.150000,0.200000,0.000000,0.100000,0.000000,0.000000,59
4,"[[0.0, 2.0, 0.0, 0.0, 1.0, 105.0, 0.0, 0.0, 0....",1228.0,6356.0,1.0,6.0,7.0,1.0,0.108,0.108623,1.0,...,0.000000,0.800000,0.000000,0.000000,0.200000,0.000000,0.000000,0.000000,0.000000,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"[[0.0, 0.0, 0.0, 20.0, 0.0, 1.0, 0.0, 0.0, 21....",2625.0,2877.0,8.0,8.0,16.0,2.0,0.138,0.137810,1.0,...,0.000000,0.714286,0.142857,0.000000,0.142857,0.000000,0.000000,0.000000,0.000000,41
96,"[[0.0, 9.0, 16.0, 0.0, 0.0, 6.0, 0.0, 0.0, 0.0...",9128.0,7653.0,30.0,25.0,30.0,4.0,0.080,60.798359,1.0,...,0.137931,0.541667,0.208333,0.041667,0.000000,0.000000,0.000000,0.041667,0.166667,5
97,"[[0.0, 2.0, 0.0, 0.0, 0.0, 15.0, 0.0, 49.0, 3....",4849.0,5115.0,18.0,11.0,29.0,5.0,0.154,0.154567,1.0,...,0.000000,0.500000,0.400000,0.000000,0.100000,0.000000,0.000000,0.000000,0.000000,28
98,"[[0.0, 21.0, 0.0, 0.0, 5.0, 1.0, 5.0, 12.0, 28...",5014.0,4703.0,11.0,11.0,22.0,5.0,0.352,0.352283,1.0,...,0.000000,0.800000,0.000000,0.100000,0.000000,0.100000,0.000000,0.000000,0.000000,12


In [4]:
# Show the names of all columns in the training DataFrame.
train_df.columns

Index(['PPI', 'BYTES', 'BYTES_REV', 'PACKETS', 'PACKETS_REV', 'PPI_LEN',
       'PPI_ROUNDTRIPS', 'PPI_DURATION', 'DURATION', 'FEND_IDLE',
       'FEND_ACTIVE', 'FEND_OTHER', 'PSIZE_BIN1', 'PSIZE_BIN2', 'PSIZE_BIN3',
       'PSIZE_BIN4', 'PSIZE_BIN5', 'PSIZE_BIN6', 'PSIZE_BIN7', 'PSIZE_BIN8',
       'PSIZE_BIN1_REV', 'PSIZE_BIN2_REV', 'PSIZE_BIN3_REV', 'PSIZE_BIN4_REV',
       'PSIZE_BIN5_REV', 'PSIZE_BIN6_REV', 'PSIZE_BIN7_REV', 'PSIZE_BIN8_REV',
       'IPT_BIN1', 'IPT_BIN2', 'IPT_BIN3', 'IPT_BIN4', 'IPT_BIN5', 'IPT_BIN6',
       'IPT_BIN7', 'IPT_BIN8', 'IPT_BIN1_REV', 'IPT_BIN2_REV', 'IPT_BIN3_REV',
       'IPT_BIN4_REV', 'IPT_BIN5_REV', 'IPT_BIN6_REV', 'IPT_BIN7_REV',
       'IPT_BIN8_REV', 'APP'],
      dtype='object')

The table below describes the features available in the CESNET-QUIC22 dataset.

| **Column Name**          | **Column Description**                                                                                               |
|--------------------------|----------------------------------------------------------------------------------------------------------------------|
| DURATION                 | Duration of the flow in seconds                                                                                      |
| BYTES                    | Number of transmitted bytes from client to server                                                                    |
| BYTES_REV                | Number of transmitted bytes from server to client                                                                    |
| PACKETS                  | Number of packets transmitted from client to server                                                                  |
| PACKETS_REV              | Number of packets transmitted from server to client                                                                  |
| PPI                      | Packet metadata sequence in the format: {[[inter-packet times], [packet directions], [packet sizes]]}                 |
| PPI_LEN                  | Number of packets in the PPI sequence                                                                                |
| PPI_DURATION             | Duration of the PPI sequence in seconds                                                                              |
| PPI_ROUNDTRIPS           | Number of roundtrips in the PPI sequence                                                                             |
| PSIZE_BIN*x*             | Histogram bin *x* of packet sizes from client to server                                                              |
| PSIZE_BIN*x*_REV         | Histogram bin *x* of packet sizes from server to client                                                              |
| IPT_BIN*x*               | Histogram bin *x* of inter-packet times from client to server                                                        |
| IPT_BIN*x*_REV           | Histogram bin *x* of inter-packet times from server to client                                                        |
| FEND_IDLE                | Flow was terminated because it was idle                                                                              |
| FEND_ACTIVE              | Flow was terminated because it reached the active timeout                                                            |
| FEND_OTHER               | Flow was terminated for other reasons                                                                                |
| APP                      | Web service label                                                                                                    |

The following cell shows two examples of a PPI sequence. Sequences shorter than 30 packets are padded with zeros, as can be seen in the second example (`ppi_example2`).

In [5]:
from cesnet_datazoo.constants import IPT_POS, DIR_POS, SIZE_POS


def _print_ppi(title, ppi):
    """Print a PPI array, then the rows at IPT_POS, DIR_POS and SIZE_POS as lists."""
    print(f"{title}:")
    print(ppi)
    print()  # blank line between the raw array and the per-row listing
    for label, row in (
        ("Inter-packet times", IPT_POS),
        ("Packet directions", DIR_POS),
        ("Packet sizes", SIZE_POS),
    ):
        print(f"{label}: {ppi[row, :].tolist()}")


# PPI of the first training row, cast to integers.
ppi_example1 = train_df.iloc[0].PPI.astype(int)
_print_ppi("Example 1", ppi_example1)
print()  # blank line separating the two examples

# PPI of the second training row, cast to integers.
ppi_example2 = train_df.iloc[1].PPI.astype(int)
_print_ppi("Example 2", ppi_example2)


Example 1:
[[   0    7   14    0    2    5    0    5    2   11   17    5  175    5
     1    0    0   14    0    0    6   12  259    0    6  247    6  211
     6  239]
 [   1   -1   -1   -1    1   -1   -1    1    1    1    1   -1    1    1
     1   -1   -1    1    1    1   -1    1    1    1   -1    1   -1    1
    -1    1]
 [1250 1250 1250 1250 1250 1250  783   41   41   41   41   41   83   72
    34  834  121 1149   34   33   27   33   33   33   25   33   25   33
    25   33]]

Inter-packet times: [0, 7, 14, 0, 2, 5, 0, 5, 2, 11, 17, 5, 175, 5, 1, 0, 0, 14, 0, 0, 6, 12, 259, 0, 6, 247, 6, 211, 6, 239]
Packet directions: [1, -1, -1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1]
Packet sizes: [1250, 1250, 1250, 1250, 1250, 1250, 783, 41, 41, 41, 41, 41, 83, 72, 34, 834, 121, 1149, 34, 33, 27, 33, 33, 33, 25, 33, 25, 33, 25, 33]

Example 2:
[[   0   20   22    7    0    0    6    0    0    2   34    3   11    2
    31    0    0    0    0    0 