# Test the KSWIN function to detect drift in stream

This notebook tests whether KSWIN is effective at drift detection. If it is, we can apply it to IForest ASD to detect drift between windows and update the model when necessary.

## Install skmultiflow if needed
Git must be installed first, because the package is installed with pip directly from its GitHub repository.

In [1]:
#print("scikit-multiflow package installation")
#!pip install -U git+https://github.com/scikit-multiflow/scikit-multiflow

In [2]:
try:
    import skmultiflow
except ImportError as e:
    print("scikit-multiflow package installation")
    !pip install -U git+https://github.com/scikit-multiflow/scikit-multiflow

## Importations and configurations

In [3]:
%matplotlib notebook
# NOTE(review): `plt` is bound to the top-level `matplotlib` package here, not to
# `matplotlib.pyplot`; the only use visible in this section is `plt.interactive(True)`.
import matplotlib as plt
plt.interactive(True)  # switch matplotlib to interactive mode
# Project-local helpers: `func` is the shared Comparison instance the cells below use
# to build streams, save the generated data and reload it as file streams.
from source import functions
func = functions.Comparison()
# Project-local module providing the NDKSWIN detector used by execute_NDKSWIN.
from source import ndkswin as ndk
import datetime  # timestamps the result folder names
import copy

## General parameters for the evaluation

## Execution Function
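- KSWIN implementation in scikit-multiflow: https://github.com/scikit-multiflow/scikit-multiflow/blob/master/src/skmultiflow/drift_detection/kswin.py
- SciPy two-sample Kolmogorov–Smirnov test (`ks_2samp`): https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html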

In [4]:
# ---------------------------- Execution settings ----------------------------

# Total number of samples to examine.
max_sample = 10000
# Number of samples in one window (the window size).
window = 100
# Number of windows to save to the .csv file: max_sample / window, truncated to an int.
# For a quick run this can be set by hand to a small value such as 3.
window_number = int(max_sample / window)


In [5]:
def execute_KSWIN(stream, window_size=100, window_number=1000, alpha=0.01):
    """Run the scikit-multiflow KSWIN detector over a stream, window by window.

    The detector is initialised with the first feature column of the first
    window. For every following window a single value -- the first feature of
    the window's first sample -- is added to the detector. Whenever a change is
    reported, the loop index is recorded and a fresh detector is built from the
    first feature column of the window that triggered the detection.

    Parameters
    ----------
    stream : object
        Source exposing ``next_sample(n)``. Element ``[0]`` of the returned
        value is sliced with ``[:, 0]``, so it must support 2-D array indexing.
    window_size : int
        Number of samples requested from the stream per window.
    window_number : int
        Total number of windows read, including the initial one.
    alpha : float
        Significance level passed to ``KSWIN`` (default 0.01, as before).

    Returns
    -------
    list of int
        Loop indices at which a change was detected. Index 0 is the first
        window read after the initial one.
    """
    # Imported here because the package may only just have been installed by the notebook.
    from skmultiflow.drift_detection import KSWIN

    def new_detector(window_samples):
        # Only the first feature column is used: KSWIN works on 1-D data.
        return KSWIN(alpha=alpha, data=window_samples[:, 0])

    kswin = new_detector(stream.next_sample(window_size)[0])
    detections = []
    for i in range(window_number - 1):
        samples = stream.next_sample(window_size)[0]
        # A single value per window is fed: first sample, first feature.
        kswin.add_element(samples[0][0])
        if kswin.detected_change():
            detections.append(i)
            # Start over from the window that triggered the detection; the old
            # detector is discarded, so resetting it first is unnecessary.
            kswin = new_detector(samples)
    print("Drift detected in window n° "+str(detections))
    print("Number of detections: "+str(len(detections)))
    return detections

In [6]:
def execute_NDKSWIN(stream, window_size=100, window_number=1000, n_dimensions=1,
                    n_tested_samples=0.1, fixed_checked_dimension = True, fixed_checked_sample=True,
                    alpha=0.01):
    """Run the project's NDKSWIN detector over a stream, window by window.

    The detector is initialised with the complete first window (all feature
    columns). Every following window is passed whole to ``add_element``, and the
    loop index is recorded whenever the detector reports a change. Unlike
    ``execute_KSWIN``, the detector is neither reset nor rebuilt after a detection.

    Parameters
    ----------
    stream : object
        Source exposing ``next_sample(n)``; element ``[0]`` of the returned
        value is used as the window data.
    window_size : int
        Number of samples requested from the stream per window.
    window_number : int
        Total number of windows read, including the initial one.
    n_dimensions, n_tested_samples, fixed_checked_dimension, fixed_checked_sample
        Forwarded unchanged to ``ndk.NDKSWIN``.
        NOTE(review): ``n_tested_samples`` looks like a fraction of the window
        (callers pass 0.1 and 1/window) -- confirm against the NDKSWIN source.
    alpha : float
        Significance level passed to ``ndk.NDKSWIN`` (default 0.01, as before).

    Returns
    -------
    list of int
        Loop indices at which a change was detected. Index 0 is the first
        window read after the initial one.
    """
    first_window = stream.next_sample(window_size)[0]
    ndkswin = ndk.NDKSWIN(alpha=alpha, data=first_window, n_dimensions=n_dimensions,
                          n_tested_samples=n_tested_samples,
                          fixed_checked_dimension=fixed_checked_dimension,
                          fixed_checked_sample=fixed_checked_sample)
    detections = []
    for i in range(window_number - 1):
        batch = stream.next_sample(window_size)[0]
        ndkswin.add_element(batch)
        if ndkswin.detected_change():
            detections.append(i)
    print("Drift detected in window n° "+str(detections))
    print("Number of detections: "+str(len(detections)))
    return detections

In [7]:
def execute(file_path, n_dimensions, n_tested_samples):
    """Run KSWIN and five NDKSWIN configurations on the data stored at ``file_path``.

    Six streams are obtained from ``func.get_file_stream6`` so that every
    detector run consumes its own stream. The runs are, in order:

    1. KSWIN;
    2. NDKSWIN with one dimension and ``1/window`` tested samples, fixed
       dimension and fixed sample (the setting labelled "similar to KSWIN");
    3-6. NDKSWIN with the caller's ``n_dimensions`` / ``n_tested_samples`` and
       every combination of fixed or random checked dimension and sample.

    The window size and the number of windows come from the notebook-level
    ``window`` and ``window_number`` settings. Nothing is returned; each run
    prints its own results under a banner line.
    """
    s_kswin, s_like_kswin, s_fixed, s_rand_sample, s_rand_dim, s_full_rand = \
        func.get_file_stream6(path=file_path)

    print("********************************** KSWIN Execution*********************************")
    execute_KSWIN(stream=s_kswin, window_size=window, window_number=window_number)

    print("********************************** NDKSWIN Execution similar to KSWIN*********************************")
    execute_NDKSWIN(stream=s_like_kswin, window_size=window, window_number=window_number,
                    n_dimensions=1, n_tested_samples=(1/window),
                    fixed_checked_dimension=True, fixed_checked_sample=True)

    # Remaining runs share the caller's parameters and differ only in the two flags:
    # (banner, stream, fixed_checked_dimension, fixed_checked_sample)
    variants = (
        ("********************************** NDKSWIN Execution Fixe*********************************",
         s_fixed, True, True),
        ("********************************** NDKSWIN Execution Random sample*********************************",
         s_rand_sample, True, False),
        ("********************************** NDKSWIN Execution Random dimension*********************************",
         s_rand_dim, False, True),
        ("********************************** NDKSWIN Execution Full Random*********************************",
         s_full_rand, False, False),
    )
    for banner, variant_stream, keep_dimension, keep_sample in variants:
        print(banner)
        execute_NDKSWIN(stream=variant_stream, window_size=window, window_number=window_number,
                        n_dimensions=n_dimensions, n_tested_samples=n_tested_samples,
                        fixed_checked_dimension=keep_dimension, fixed_checked_sample=keep_sample)

## Applied On Simple Stream Dataset


In [8]:
# Generate the "Generator" stream through the project helper (no noise, fixed seed),
# save it to disk, then run every detector configuration on the saved data.
dataset_name = "Generator"
# Result folder name: dataset name plus the current timestamp.
test_name = "{}_{}".format(dataset_name, datetime.datetime.now())

stream = func.get_dataset(
    dataset_name=dataset_name,
    classification_function=0,
    noise_percentage=0.0,
    random_state=1,
)

file_path = func.save_stream_data_generated(
    stream=stream,
    window=window,
    result_folder=test_name,
    window_number=window_number,
)

# int(3/2) truncates to 1.
# NOTE(review): 3 is presumably the number of features of this stream -- confirm.
n_dimensions = int(3/2)
n_tested_samples = 0.1

execute(file_path=file_path, n_dimensions=n_dimensions, n_tested_samples=n_tested_samples)


Please find the data used on results/Generator_2020-12-03 14:04:59.673295/Generator_2020-12-03 14:04:59.673295_dataUsed.csv
********************************** KSWIN Execution*********************************
Drift detected in window n° []
Number of detections: 0
********************************** NDKSWIN Execution similar to KSWIN*********************************
Drift detected in window n° []
Number of detections: 0
********************************** NDKSWIN Execution Fixe*********************************
Drift detected in window n° [28, 59, 69, 82, 93]
Number of detections: 5
********************************** NDKSWIN Execution Random sample*********************************
Drift detected in window n° [8, 25, 44, 53, 92]
Number of detections: 5
********************************** NDKSWIN Execution Random dimension*********************************
Drift detected in window n° [7, 41, 53, 65, 86]
Number of detections: 5
********************************** NDKSWIN Execution Full Random***

## Applied On Stream Data Containing Anomalies

In [9]:
# Generate a stream containing anomalies through the project helper, save it to disk,
# then run every detector configuration on the saved data.
dataset_name = "AnomalySineGenerator"
# Result folder name: dataset name plus the current timestamp.
test_name = dataset_name+'_'+str(datetime.datetime.now())

# 10% of the examined samples are anomalies.
anomalies_percentage = 0.1
n_anomalies = int(max_sample*anomalies_percentage)
# Fixed seed (previously random_state=None) so that re-running the notebook
# regenerates the same stream; the other dataset cells also pass random_state=1.
stream = func.get_anomalies_data_generated(n_samples=max_sample, n_anomalies=n_anomalies, contextual=False,
                 n_contextual=0.0, shift=4, noise=0.0, replace=True, random_state=1)

file_path = func.save_stream_data_generated(stream=stream, window = window, result_folder=test_name, 
                                                 window_number = window_number)
# int(2/2) evaluates to 1.
# NOTE(review): 2 is presumably the number of features of this stream -- confirm.
n_dimensions = int(2/2)
n_tested_samples = 0.1

execute(file_path=file_path, n_dimensions=n_dimensions, n_tested_samples=n_tested_samples)


Please find the data used on results/AnomalySineGenerator_2020-12-03 14:05:02.071143/AnomalySineGenerator_2020-12-03 14:05:02.071143_dataUsed.csv
********************************** KSWIN Execution*********************************
Drift detected in window n° [12, 49, 67, 83, 85]
Number of detections: 5
********************************** NDKSWIN Execution similar to KSWIN*********************************
Drift detected in window n° [11, 86]
Number of detections: 2
********************************** NDKSWIN Execution Fixe*********************************
Drift detected in window n° [2, 10, 18, 28, 36, 44, 53, 60, 68, 78, 86]
Number of detections: 11
********************************** NDKSWIN Execution Random sample*********************************
Drift detected in window n° [15, 39, 78]
Number of detections: 3
********************************** NDKSWIN Execution Random dimension*********************************
Drift detected in window n° [3, 11, 19, 28, 36, 45, 53, 62, 70, 77, 85, 92]


## Applied On Simple ConceptDrift Stream Dataset

In [11]:
# Generate a stream with a concept drift through the project helper, save it to disk,
# then run every detector configuration on the saved data.
dataset_name = "DriftStreamGenerator"
# Result folder name: dataset name plus the current timestamp.
test_name = "{}_{}".format(dataset_name, datetime.datetime.now())

# The drift is requested to start at position 101 with a width of 50.
stream = func.get_dataset(
    dataset_name=dataset_name,
    classification_function=0,
    noise_percentage=0.0,
    random_state=1,
    drift_classification_function=3,
    drift_random_state=112,
    drift_noise_percentage=0.0,
    drift_start_position=101,
    drift_width=50,
    n_num_features=2,
    n_cat_features=0,
)

file_path = func.save_stream_data_generated(
    stream=stream,
    window=window,
    result_folder=test_name,
    window_number=window_number,
)

# int(9/2) truncates to 4.
# NOTE(review): 9 is used here while the call above requests n_num_features=2 and
# n_cat_features=0 -- confirm which feature count applies to this stream.
n_dimensions = int(9/2)
n_tested_samples = 0.1

execute(file_path=file_path, n_dimensions=n_dimensions, n_tested_samples=n_tested_samples)


Please find the data used on results/DriftStreamGenerator_2020-12-03 14:06:50.769518/DriftStreamGenerator_2020-12-03 14:06:50.769518_dataUsed.csv
********************************** KSWIN Execution*********************************
Drift detected in window n° []
Number of detections: 0
********************************** NDKSWIN Execution similar to KSWIN*********************************
Drift detected in window n° []
Number of detections: 0
********************************** NDKSWIN Execution Fixe*********************************
Drift detected in window n° [2, 10, 19, 30, 38, 53, 73, 88]
Number of detections: 8
********************************** NDKSWIN Execution Random sample*********************************
Drift detected in window n° [2, 20, 28, 36, 44, 56, 68, 79, 98]
Number of detections: 9
********************************** NDKSWIN Execution Random dimension*********************************
Drift detected in window n° [3, 11, 20, 29, 39, 53, 65, 74, 88]
Number of detections: 9
**