## Workflow of drift analysis
### QUIC Dataset

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from cesnet_datazoo.datasets import CESNET_TLS_Year22, CESNET_QUIC22
from cesnet_datazoo.config import DatasetConfig, AppSelection
from datetime import datetime, timedelta


from xgboost import XGBClassifier
import sklearn.metrics as metrics
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report,confusion_matrix,f1_score
from sklearn.model_selection import train_test_split
from copy import deepcopy

from scipy import stats
from joblib import dump, load

import warnings
warnings.filterwarnings('ignore') 

Create reference classifier from the first week of data

In [49]:
data = CESNET_QUIC22("/home/dosoukup/Dataset/QUIC", size="XS")
common_params = {
    "dataset": data,
    "train_period_name": "W-2022-44",
    "apps_selection": AppSelection.ALL_KNOWN,
    "use_packet_histograms": True,
}

hist_df = pd.DataFrame()
current_date = datetime(2022, 10, 31)
while current_date <= datetime(2022, 11, 6):
    dataset_config = DatasetConfig(**common_params, test_period_name=current_date.strftime("M-%Y-%m"), test_dates=[current_date.strftime("%Y%m%d")])
    data.set_dataset_config_and_initialize(dataset_config)
    curr_df = data.get_test_df(flatten_ppi=True)
    curr_sample = curr_df.sample(25000, random_state = 42, replace=True)
    curr_sample["date"] = current_date
    hist_df = pd.concat([hist_df,curr_sample])
    current_date += timedelta(days=1)

Xdata = hist_df.drop(columns=["APP","date"])
ydata = hist_df.APP
X_train, X_test, y_train, y_test = train_test_split(Xdata, ydata, test_size=0.33, random_state=42)

ref_clf = XGBClassifier().fit(X_train, y_train)
y_pred = ref_clf.predict(X_test)

print(f"F1 Score: {f1_score(y_test, y_pred, average = 'weighted')}")

Loading data from dataloader


100%|██████████| 105/105 [00:01<00:00, 58.60it/s]


Loading data from dataloader


100%|██████████| 105/105 [00:01<00:00, 61.64it/s]


Loading data from dataloader


100%|██████████| 213/213 [00:03<00:00, 66.10it/s]


Loading data from dataloader


100%|██████████| 212/212 [00:03<00:00, 58.34it/s]


Loading data from dataloader


100%|██████████| 165/165 [00:02<00:00, 58.85it/s]


Loading data from dataloader


100%|██████████| 72/72 [00:01<00:00, 52.11it/s]


Loading data from dataloader


100%|██████████| 87/87 [00:01<00:00, 58.23it/s]


F1 Score: 0.849924739648822


In [53]:
from detector.detector import DriftDetector, Config
from detector.logger import Logger
from detector.test import KSTest, WassersteinTest
from detector.analyser import LastWeekAnalyser

feat_names = [col for col in ref_clf.feature_names_in_ if "FLOW_" not in col and  "week" not in col]

global_config = Config(
    chosen_features = feat_names,
    # Note: comment the line below if no feature importances are needed
    feature_importances = pd.Series(ref_clf.feature_importances_,index = ref_clf.feature_names_in_),
    drift_test=WassersteinTest(drift_threshold_global=0.04,drift_threshold_single = 0.1)
)

class_config = Config(
    chosen_features = feat_names,
    # Note: comment the line below if no feature importances are needed
    feature_importances = pd.Series(ref_clf.feature_importances_,index = ref_clf.feature_names_in_),
    drift_test=KSTest(drift_threshold_global=0.475,drift_threshold_single = 0.05),
    class_name="APP"
)

analyser_config = Config(
    chosen_features = feat_names,
    # Note: comment the line below if no feature importances are needed
    feature_importances = pd.Series(ref_clf.feature_importances_,index = ref_clf.feature_names_in_),
    drift_test=WassersteinTest(drift_threshold_global=0.04)
)

#Logger of the single reference model without retraining
logger_ref = Logger("Reference model trained on week 1 with no retraining")
analyser_ref = LastWeekAnalyser(analyser_config)
detector_ref = DriftDetector(global_config, class_config, logger=logger_ref, analyser=analyser_ref)

#Loggers for the retrained models
logger_drift = Logger("Retraining model")
analyser_drift = LastWeekAnalyser(analyser_config)
detector_drift = DriftDetector(global_config, class_config, logger=logger_drift, analyser=analyser_drift)

test_data = CESNET_QUIC22("/home/dosoukup/Dataset/QUIC", size="XS")
common_params = {
    "dataset": test_data,
    "train_period_name": "W-2022-44",
    "use_packet_histograms": True,
}

retraining_clf = ref_clf
old_clf = None

ref_df = hist_df.copy()

current_date = datetime(2022, 11, 7)
while current_date <= datetime(2022, 11, 27):
    try:
        #Get current data
        dataset_config = DatasetConfig(**common_params, test_period_name=current_date.strftime("M-%Y-%m"), test_dates=[current_date.strftime("%Y%m%d")])
        test_data.set_dataset_config_and_initialize(dataset_config)
        test_df = test_data.get_test_df(flatten_ppi=True)
        # Uncomment for Nettisa features
        #test_df = test_data.get_test_df()
        # Uncomment for Nettisa features
        #test_df = ntc.update_df_with_nettisa_features(test_df)

        print("day lenght",len(test_df),current_date)
        if len(test_df) > 25000:
            test_df = test_df.sample(25000, random_state = 42)

        #Test model with no retraining
        Xdata = test_df.drop(columns=["APP"])
        ydata = test_df.APP
        
        y_pred_ref = ref_clf.predict(Xdata)   
        detector_ref.detect(ref_df,test_df,current_date,y_pred_ref)

        #Test retraining model
        y_pred = retraining_clf.predict(Xdata)
        
        is_drifted = detector_drift.detect(hist_df,test_df,current_date,y_pred)

        #Retrain the model if drift was detected
        if is_drifted:
            print("Drift detected, retraining")

            #Update training dataset
            hist_df = hist_df.tail(len(hist_df)-len(test_df))
            test_df["date"] = current_date
            hist_df = pd.concat([hist_df,test_df])
            Xdata = hist_df.drop(columns=["APP","date"])
            ydata = hist_df.encoded
            retraining_clf = XGBClassifier().fit(Xdata, ydata)

    except Exception as error:
        print("An error occurred:", error)
        print(f"{current_date} missing from the dataset")
    current_date += timedelta(days=1)

Loading data from dataloader



  0%|          | 0/238 [00:00<?, ?it/s][A
  0%|          | 1/238 [00:00<00:29,  7.90it/s][A
  3%|▎         | 8/238 [00:00<00:05, 40.34it/s][A
  7%|▋         | 16/238 [00:00<00:03, 56.25it/s][A
 11%|█         | 25/238 [00:00<00:03, 66.84it/s][A
 14%|█▍        | 34/238 [00:00<00:02, 72.64it/s][A
 18%|█▊        | 43/238 [00:00<00:02, 75.58it/s][A
 21%|██▏       | 51/238 [00:00<00:02, 76.68it/s][A
 25%|██▍       | 59/238 [00:00<00:02, 77.33it/s][A
 29%|██▊       | 68/238 [00:00<00:02, 78.19it/s][A
 32%|███▏      | 76/238 [00:01<00:02, 78.66it/s][A
 36%|███▌      | 85/238 [00:01<00:01, 79.37it/s][A
 39%|███▉      | 94/238 [00:01<00:01, 79.74it/s][A
 43%|████▎     | 102/238 [00:01<00:01, 79.62it/s][A
 47%|████▋     | 111/238 [00:01<00:01, 79.80it/s][A
 50%|█████     | 120/238 [00:01<00:01, 80.29it/s][A
 54%|█████▍    | 129/238 [00:01<00:01, 80.70it/s][A
 58%|█████▊    | 138/238 [00:01<00:01, 80.78it/s][A
 62%|██████▏   | 147/238 [00:01<00:01, 80.22it/s][A
 66%|██████▌   |

day lenght 486527 2022-11-07 00:00:00


 13%|█▎        | 339M/2.53G [09:16<1:01:33, 639kB/s]


Loading data from dataloader


100%|██████████| 242/242 [00:03<00:00, 75.78it/s]


day lenght 495511 2022-11-08 00:00:00
Loading data from dataloader


100%|██████████| 236/236 [00:03<00:00, 75.33it/s]


day lenght 482877 2022-11-09 00:00:00
Loading data from dataloader


100%|██████████| 218/218 [00:02<00:00, 75.45it/s]


day lenght 444706 2022-11-10 00:00:00
Loading data from dataloader


100%|██████████| 159/159 [00:02<00:00, 72.25it/s]


day lenght 324765 2022-11-11 00:00:00
Loading data from dataloader


100%|██████████| 70/70 [00:01<00:00, 65.28it/s]


day lenght 141354 2022-11-12 00:00:00
Drift detected, retraining
An error occurred: 'DataFrame' object has no attribute 'encoded'
2022-11-12 00:00:00 missing from the dataset
Loading data from dataloader


100%|██████████| 87/87 [00:01<00:00, 65.11it/s]


day lenght 176161 2022-11-13 00:00:00
Loading data from dataloader


100%|██████████| 236/236 [00:03<00:00, 70.54it/s]


day lenght 481511 2022-11-14 00:00:00
Loading data from dataloader


100%|██████████| 235/235 [00:03<00:00, 72.60it/s]


day lenght 480095 2022-11-15 00:00:00
Loading data from dataloader


100%|██████████| 193/193 [00:02<00:00, 73.85it/s]


day lenght 394328 2022-11-16 00:00:00
Loading data from dataloader


100%|██████████| 79/79 [00:01<00:00, 65.34it/s]


day lenght 160155 2022-11-17 00:00:00
Loading data from dataloader


100%|██████████| 106/106 [00:01<00:00, 70.36it/s]


day lenght 215866 2022-11-18 00:00:00
Loading data from dataloader


100%|██████████| 60/60 [00:00<00:00, 63.58it/s]


day lenght 122008 2022-11-19 00:00:00
Loading data from dataloader


100%|██████████| 80/80 [00:01<00:00, 66.25it/s]


day lenght 162770 2022-11-20 00:00:00
Loading data from dataloader


100%|██████████| 242/242 [00:03<00:00, 75.60it/s]


day lenght 494697 2022-11-21 00:00:00
Loading data from dataloader


100%|██████████| 251/251 [00:03<00:00, 75.21it/s]


day lenght 513668 2022-11-22 00:00:00
Loading data from dataloader


100%|██████████| 250/250 [00:03<00:00, 76.22it/s]


day lenght 510465 2022-11-23 00:00:00
Loading data from dataloader


100%|██████████| 225/225 [00:02<00:00, 76.03it/s]


day lenght 460114 2022-11-24 00:00:00
Loading data from dataloader


100%|██████████| 162/162 [00:02<00:00, 72.26it/s]


day lenght 331122 2022-11-25 00:00:00
Loading data from dataloader


100%|██████████| 73/73 [00:01<00:00, 65.24it/s]


day lenght 147510 2022-11-26 00:00:00
Loading data from dataloader


100%|██████████| 89/89 [00:01<00:00, 67.69it/s]


day lenght 181865 2022-11-27 00:00:00


# Save experiment result

In [54]:
import pickle
with open('logs_cesnet_quic_22_xs.pkl', 'wb') as outp:
    pickle.dump(logger_drift, outp, pickle.HIGHEST_PROTOCOL)
    pickle.dump(logger_ref, outp, pickle.HIGHEST_PROTOCOL)