# Test the NDKSWIN function to detect drift in a data stream

This notebook searches for the best parameters for NDKSWIN: the number of dimensions to check (`n_dimensions`) and the fraction of the data to test (`n_tested_samples`).

## Install skmultiflow if needed
You need to have git installed, because the package is installed directly from its GitHub repository.

In [7]:
#print("scikit-multiflow package installation")
#!pip install -U git+https://github.com/scikit-multiflow/scikit-multiflow

In [8]:
try:
    import skmultiflow
except ImportError as e:
    print("scikit-multiflow package installation")
    !pip install -U git+https://github.com/scikit-multiflow/scikit-multiflow

## Importations and configurations

In [9]:
%matplotlib notebook
# NOTE: `plt` is bound to the top-level `matplotlib` package here, not to
# `matplotlib.pyplot` as the alias usually suggests. `interactive` below is
# therefore `matplotlib.interactive`.
import matplotlib as plt
plt.interactive(True)
# Project-local helpers: `func` is used by the cells below to generate datasets,
# save the generated data and re-open it as file streams.
from source import functions
func = functions.Comparison()
# Project-local module that provides the NDKSWIN detector class.
from source import ndkswin as ndk
import datetime
import copy
import pandas as pd
import numpy as np

## General parameters for the evaluation

## Execution Function
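References:

- KSWIN implementation in scikit-multiflow: https://github.com/scikit-multiflow/scikit-multiflow/blob/master/src/skmultiflow/drift_detection/kswin.py
- Two-sample Kolmogorov-Smirnov test in SciPy: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html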

In [10]:
# ---------------------------- Execution settings ----------------------------

# Window size: number of samples contained in one window.
window = 100
# Total number of samples to examine.
max_sample = 10000
# Number of windows to save to the .csv file (max_sample / window).
window_number = int(max_sample / window)
# For a quick run, a small fixed number of windows can be used instead:
# window_number = 3


In [11]:
def execute_NDKSWIN(stream, window_size=100, window_number=1000, n_dimensions=1, n_tested_samples=0.1,
                    alpha=0.01):
    """Run NDKSWIN over a stream, window by window, and count the flagged windows.

    The first ``window_size`` samples drawn from ``stream`` initialise the
    detector. Each of the following ``window_number - 1`` batches is passed to
    ``add_element`` and counted when ``detected_change()`` is true afterwards.
    The detector is neither reset nor re-created after a detection.

    Parameters
    ----------
    stream : object
        Must expose ``next_sample(batch_size)``. Element ``[0]`` of the returned
        value is used as the batch (presumably the feature matrix -- confirm
        against the stream class).
    window_size : int
        Number of samples drawn per window.
    window_number : int
        Total number of windows, including the initial one used to build the
        detector.
    n_dimensions : int
        Forwarded to ``NDKSWIN`` as ``n_dimensions``.
    n_tested_samples : float
        Forwarded to ``NDKSWIN`` as ``n_tested_samples``.
    alpha : float
        Forwarded to ``NDKSWIN`` as ``alpha``. The default keeps the value that
        used to be hard-coded.

    Returns
    -------
    int
        Number of windows for which the detector reported a change.
    """
    # The first window only initialises the detector; it is not tested itself.
    first_window = stream.next_sample(window_size)[0]
    ndkswin = ndk.NDKSWIN(alpha=alpha, data=first_window, n_dimensions=n_dimensions,
                          n_tested_samples=n_tested_samples,
                          fixed_checked_dimension=False, fixed_checked_sample=False)

    # Zero-based indices (counted after the initial window) of the flagged windows.
    detections = []
    for i in range(window_number - 1):
        batch = stream.next_sample(window_size)[0]
        ndkswin.add_element(batch)
        if ndkswin.detected_change():
            detections.append(i)

    print("Number of detections: "+str(len(detections)))
    return len(detections)

## Applied On Simple ConceptDrift Stream Dataset

In [16]:
# Grid search of the NDKSWIN parameters on a generated stream with concept drift.
dataset_name = "DriftStreamGenerator"
test_name = dataset_name+'_'+str(datetime.datetime.now())

stream = func.get_dataset(dataset_name=dataset_name, classification_function=0, noise_percentage=0.0, random_state=1,
                          drift_classification_function=3, drift_random_state=112, drift_noise_percentage=0.0,
                          drift_start_position=101, drift_width=50, n_num_features=2, n_cat_features=0)

# Persist the generated data so that every parameter combination is evaluated on the same samples.
file_path = func.save_stream_data_generated(stream=stream, window=window, result_folder=test_name,
                                            window_number=window_number)

# Fractions 0.1 ... 0.9. np.arange with a float step accumulates rounding error
# (0.30000000000000004, 0.7000000000000001), so the grid is rounded to one decimal
# to keep the log and the result table clean.
tested_sample_grid = np.round(np.arange(0.1, 1, 0.1), 1)

# NOTE(review): the grid goes up to 9 dimensions while n_num_features=2 is requested
# above -- confirm how many feature columns the saved stream really has.
test = []
for n_dimension in range(1, 9 + 1, 1):
    for n_tested_sample in tested_sample_grid:
        # Re-open the saved data for every run; only the third returned stream is used.
        _, _, stream3 = func.get_file_stream2(path=file_path)
        print("")
        print("****NDKSWIN Execution Random with n_dimension = "+str(n_dimension)+" n_tested_sample = "
              +str(n_tested_sample)+"*******")
        detection = execute_NDKSWIN(stream=stream3, window_size=window, window_number=window_number,
                                    n_dimensions=n_dimension, n_tested_samples=n_tested_sample)
        test.append([n_dimension, n_tested_sample, detection])
results_drift = pd.DataFrame(test, columns=["n_dimension", "n_tested_sample(%)", "detection_number"])
print(results_drift)


Please find the data used on results/DriftStreamGenerator_2020-12-03 13:42:14.202151/DriftStreamGenerator_2020-12-03 13:42:14.202151_dataUsed.csv

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.1*******
Number of detections: 9

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.2*******




Number of detections: 8

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.30000000000000004*******
Number of detections: 15

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.4*******
Number of detections: 23

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.5*******
Number of detections: 21

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.6*******
Number of detections: 26

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.7000000000000001*******
Number of detections: 31

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.8*******
Number of detections: 36

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.9*******
Number of detections: 28

****NDKSWIN Execution Random with n_dimension = 2 n_tested_sample = 0.1*******
Number of detections: 6

****NDKSWIN Execution Random with n_dimension = 2 n_tested_sample = 0.2*******
Number of detections: 17



Number of detections: 48

****NDKSWIN Execution Random with n_dimension = 9 n_tested_sample = 0.7000000000000001*******
Number of detections: 52

****NDKSWIN Execution Random with n_dimension = 9 n_tested_sample = 0.8*******
Number of detections: 67

****NDKSWIN Execution Random with n_dimension = 9 n_tested_sample = 0.9*******
Number of detections: 79
    n_dimension  n_tested_sample(%)  detection_number
0             1                 0.1                 9
1             1                 0.2                 8
2             1                 0.3                15
3             1                 0.4                23
4             1                 0.5                21
..          ...                 ...               ...
76            9                 0.5                47
77            9                 0.6                48
78            9                 0.7                52
79            9                 0.8                67
80            9                 0.9                

## Applied On Simple Stream Dataset


In [15]:
# Grid search of the NDKSWIN parameters on a generated stream created without drift arguments.
dataset_name = "Generator"
test_name = dataset_name+'_'+str(datetime.datetime.now())

stream = func.get_dataset(dataset_name=dataset_name, classification_function=0, noise_percentage=0.0, random_state=1)

# Persist the generated data so that every parameter combination is evaluated on the same samples.
file_path = func.save_stream_data_generated(stream=stream, window=window, result_folder=test_name,
                                            window_number=window_number)

# Fractions 0.1 ... 0.9. np.arange with a float step accumulates rounding error
# (0.30000000000000004, 0.7000000000000001), so the grid is rounded to one decimal
# to keep the log and the result table clean.
tested_sample_grid = np.round(np.arange(0.1, 1, 0.1), 1)

test = []
for n_dimension in range(1, 3 + 1, 1):
    for n_tested_sample in tested_sample_grid:
        # Re-open the saved data for every run; only the third returned stream is used.
        _, _, stream3 = func.get_file_stream2(path=file_path)
        print("")
        print("****NDKSWIN Execution Random with n_dimension = "+str(n_dimension)+" n_tested_sample = "
              +str(n_tested_sample)+"*******")
        detection = execute_NDKSWIN(stream=stream3, window_size=window, window_number=window_number,
                                    n_dimensions=n_dimension, n_tested_samples=n_tested_sample)
        test.append([n_dimension, n_tested_sample, detection])
results_simple = pd.DataFrame(test, columns=["n_dimension", "n_tested_sample(%)", "detection_number"])
print(results_simple)


Please find the data used on results/Generator_2020-12-03 13:41:11.399144/Generator_2020-12-03 13:41:11.399144_dataUsed.csv

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.1*******
Number of detections: 6

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.2*******
Number of detections: 7

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.30000000000000004*******
Number of detections: 16

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.4*******
Number of detections: 26

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.5*******
Number of detections: 28

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.6*******
Number of detections: 33

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.7000000000000001*******
Number of detections: 35

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.8*******
Number of detections: 39

****N

## Applied On Stream Data Containing Anomalies

In [14]:
# Grid search of the NDKSWIN parameters on a generated stream that contains anomalies.
dataset_name = "AnomalySineGenerator"
test_name = dataset_name+'_'+str(datetime.datetime.now())

# Number of anomalies requested, as a share of the total number of samples.
anomalies_percentage = 0.1
n_anomalies = int(max_sample*anomalies_percentage)
# NOTE(review): random_state=None presumably leaves the generator unseeded, so the
# data would differ between runs -- confirm; the saved CSV records what was used.
stream = func.get_anomalies_data_generated(n_samples=max_sample, n_anomalies=n_anomalies, contextual=False,
                                           n_contextual=0.0, shift=4, noise=0.0, replace=True, random_state=None)

# Persist the generated data so that every parameter combination is evaluated on the same samples.
file_path = func.save_stream_data_generated(stream=stream, window=window, result_folder=test_name,
                                            window_number=window_number)

# Fractions 0.1 ... 0.9. np.arange with a float step accumulates rounding error
# (0.30000000000000004, 0.7000000000000001), so the grid is rounded to one decimal
# to keep the log and the result table clean.
tested_sample_grid = np.round(np.arange(0.1, 1, 0.1), 1)

test = []
for n_dimension in range(1, 2 + 1, 1):
    for n_tested_sample in tested_sample_grid:
        # Re-open the saved data for every run; only the third returned stream is used.
        _, _, stream3 = func.get_file_stream2(path=file_path)
        print("")
        print("****NDKSWIN Execution Random with n_dimension = "+str(n_dimension)+" n_tested_sample = "
              +str(n_tested_sample)+"*******")
        detection = execute_NDKSWIN(stream=stream3, window_size=window, window_number=window_number,
                                    n_dimensions=n_dimension, n_tested_samples=n_tested_sample)
        test.append([n_dimension, n_tested_sample, detection])
results_anomalies = pd.DataFrame(test, columns=["n_dimension", "n_tested_sample(%)", "detection_number"])
print(results_anomalies)


Please find the data used on results/AnomalySineGenerator_2020-12-03 13:40:31.272243/AnomalySineGenerator_2020-12-03 13:40:31.272243_dataUsed.csv

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.1*******
Number of detections: 7

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.2*******
Number of detections: 10

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.30000000000000004*******
Number of detections: 11

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.4*******
Number of detections: 22

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.5*******
Number of detections: 20

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.6*******
Number of detections: 26

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.7000000000000001*******
Number of detections: 30

****NDKSWIN Execution Random with n_dimension = 1 n_tested_sample = 0.8*******
Number o