In [204]:
import numpy as np
import pandas as pd
from scipy.special import rel_entr, kl_div
from scipy.stats import entropy, ks_2samp
from scipy.spatial.distance import jensenshannon
from skmultiflow.drift_detection import DDM, PageHinkley, ADWIN
from skmultiflow.data import ConceptDriftStream

# Lifelong learning

###### Continual Lifelong Learning with Neural Networks: A Review
https://arxiv.org/abs/1802.07569

# Feature drift

### A survey on feature drift adaptation: Definition, benchmark, challenges and future directions
https://www.sciencedirect.com/science/article/pii/S0164121216301030#bib0046
# $$\Downarrow$$
#### Dynamic Feature Space and Incremental Feature Selection for the Classification of Textual Data Streams
https://www.researchgate.net/publication/250302474_Dynamic_Feature_Space_and_Incremental_Feature_Selection_for_the_Classiflcation_of_Textual_Data_Streams
###### Dataset:
https://spamassassin.apache.org/old/publiccorpus/

###### Another interesting spam dataset (webspam):
https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#webspam

#### Heterogeneous Ensemble for Feature Drifts in Data Streams
https://www.researchgate.net/publication/229067302_Heterogeneous_Ensemble_for_Feature_Drifts_in_Data_Streams
###### Dataset:
http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

### Drift Detection Using Uncertainty Distribution Divergence
https://www.researchgate.net/publication/220766509_Drift_Detection_Using_Uncertainty_Distribution_Divergence

###### Datasets:
1) http://www.daviddlewis.com/resources/testcollections/reuters21578/  
2) http://qwone.com/~jason/20Newsgroups/

## Total Variation Distance
### $TVD(P, Q) = \frac{1}{2}\sum_x |P(x) - Q(x)|$

http://data8.org/fa15/text/3_inference.html \
https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-dg.pdf#clarify-data-bias-metric-total-variation-distance

In [162]:
# Two discrete probability vectors over the same five categories; they are the
# shared inputs for the distance measures computed in the following cells.
dist1 = np.asarray([0.15, 0.18, 0.12, 0.54, 0.01], dtype=float)
dist2 = np.asarray([0.26, 0.08, 0.08, 0.54, 0.04], dtype=float)

In [163]:
# Total variation distance: half of the summed absolute per-category gap.
abs_gap = np.abs(dist1 - dist2)
tvd = sum(abs_gap) / 2
tvd

0.14

## Kullback-Leibler Divergence

### $D_{KL}(P||Q) = \sum P(x)\log(\frac{P(x)}{Q(x)})$

https://machinelearningmastery.com/divergence-between-probability-distributions/

In [164]:
# Three SciPy routes to D_KL(dist1 || dist2).
# Elementwise, kl_div = rel_entr(x, y) - x + y, so its total differs from the
# rel_entr total only by sum(y) - sum(x).
kl_estimates = [
    ('rel_entr: ', sum(rel_entr(dist1, dist2))),
    ('kl_div: ', sum(kl_div(dist1, dist2))),
    ('entropy: ', entropy(dist1, dist2)),
]
for label, estimate in kl_estimates:
    print(label, estimate)

rel_entr:  0.09825335774282917
kl_div:  0.0982533577428292
entropy:  0.09825335774282917


## Jensen-Shannon Divergence

$D_{JS}(P || Q) = \frac{1}{2} D_{KL}(P || M) + \frac{1}{2} D_{KL}(Q || M)$ \
$M = \frac{P + Q}{2}$ \
$DIST_{JS} = \sqrt{D_{JS}(P || Q)}$

In [165]:
def js_div(x, y):
    """Return the Jensen-Shannon divergence between two discrete distributions.

    Computes ``(D_KL(P || M) + D_KL(Q || M)) / 2`` with ``M = (P + Q) / 2``,
    using natural logarithms (the default of ``scipy.stats.entropy``).

    Parameters
    ----------
    x, y : array_like
        Non-negative weights of the two distributions over the same
        categories, laid out along axis 0. They do not need to sum to 1.

    Returns
    -------
    float or ndarray
        The divergence; its square root is the Jensen-Shannon distance.
    """
    # Convert up front so list inputs work: `x + y` on lists would concatenate.
    p = np.asarray(x, dtype=float)
    q = np.asarray(y, dtype=float)
    # Normalise *before* mixing. `entropy` rescales each argument separately,
    # so a mixture built from raw, differently-scaled inputs would not be the
    # midpoint of the two normalised distributions.
    p = p / p.sum(axis=0)
    q = q / q.sum(axis=0)
    mixture = (p + q) / 2
    return (entropy(p, mixture) + entropy(q, mixture)) / 2
# SciPy's jensenshannon reports the JS *distance* (square root of the
# divergence), so it is printed alongside js_div rather than expected to equal it.
js_distance = jensenshannon(dist1, dist2)
js_divergence = js_div(dist1, dist2)
print('jensenshannon: ', js_distance)
print('js_div: ', js_divergence)

jensenshannon:  0.15546410214041506
js_div:  0.024169087054325404


## Kolmogorov-Smirnov test

https://www.datadoghq.com/blog/engineering/robust-statistical-distances-for-machine-learning/

In [166]:
# Two-sample Kolmogorov-Smirnov test applied to the two arrays.
# NOTE(review): ks_2samp treats its inputs as samples of observations, while
# dist1/dist2 hold category probabilities; this therefore compares the
# probability values themselves, not the distributions they describe. Confirm
# this is intended.
ks_2samp(data1=dist1, data2=dist2)

Ks_2sampResult(statistic=0.4, pvalue=0.873015873015873)

# Concept drift

https://scikit-multiflow.readthedocs.io/en/stable/api/api.html#module-skmultiflow.drift_detection

## DDM

https://scikit-multiflow.readthedocs.io/en/stable/api/generated/skmultiflow.drift_detection.DDM.html#skmultiflow.drift_detection.DDM

In [188]:
ddm = DDM()

# Fixed seed so the simulated stream, and therefore the reported index,
# is the same on every Restart & Run All.
rng = np.random.RandomState(42)

# Random 0/1 stream with a constant run of zeros forced over indices 999..1499.
# NOTE(review): the scikit-multiflow docs describe DDM's input as 1 = misclassified,
# so this window lowers the error rate instead of raising it — confirm that is the
# drift direction this demo is meant to show.
data_stream = rng.randint(2, size=2000)
data_stream[999:1500] = 0

for i, element in enumerate(data_stream):
    ddm.add_element(element)
    # ddm.detected_warning_zone() can be polled here in the same way to report
    # the earlier warning stage; it is left out to keep the output short.
    if ddm.detected_change():
        print('Change has been detected in data: ' + str(element) + ' - of index: ' + str(i))

Change has been detected in data: 1 - of index: 1740


In [182]:
# Instantiate scikit-multiflow's ConceptDriftStream with all-default arguments.
# NOTE(review): in the visible cells `stream` is only inspected (n_features),
# never fed to one of the drift detectors.
stream = ConceptDriftStream()

In [183]:
# Display how many features the stream exposes.
# NOTE(review): the original trailing comment "#= 1" looks like a disabled
# attempt to override this value with 1 — confirm and drop if obsolete.
stream.n_features

9

## PageHinkley

https://scikit-multiflow.readthedocs.io/en/stable/api/generated/skmultiflow.drift_detection.PageHinkley.html?highlight=PageHinkley

In [186]:
ph = PageHinkley()

# Fixed seed so the simulated stream, and therefore the reported index,
# is the same on every Restart & Run All.
rng = np.random.RandomState(42)

# First 999 values are drawn from {0, 1}; from index 999 onward the values are
# drawn from {4, 5, 6, 7}, i.e. an abrupt upward shift in the stream's mean.
data_stream = rng.randint(2, size=2000)
data_stream[999:] = rng.randint(4, high=8, size=data_stream.size - 999)

for i, element in enumerate(data_stream):
    ph.add_element(element)
    if ph.detected_change():
        print('Change has been detected in data: ' + str(element) + ' - of index: ' + str(i))

Change has been detected in data: 6 - of index: 1009


## ADWIN

https://scikit-multiflow.readthedocs.io/en/stable/api/generated/skmultiflow.drift_detection.ADWIN.html?highlight=ADWIN

In [202]:
adwin = ADWIN()

# Fixed seed so the simulated stream, and therefore the reported indices,
# are the same on every Restart & Run All.
rng = np.random.RandomState(42)

# First 999 values are drawn from {0, 1}; from index 999 onward the values are
# drawn from {4, 5, 6, 7}, i.e. an abrupt upward shift in the stream's mean.
data_stream = rng.randint(2, size=2000)
data_stream[999:] = rng.randint(4, high=8, size=data_stream.size - 999)

for i, element in enumerate(data_stream):
    adwin.add_element(element)
    if adwin.detected_change():
        print('Change detected in data: ' + str(element) + ' - at index: ' + str(i))

Change detected in data: 6 - at index: 1023
Change detected in data: 6 - at index: 1055
Change detected in data: 4 - at index: 1087
Change detected in data: 6 - at index: 1311


## Paired Learners

http://cs.brown.edu/people/sbach/files/bach-icdm08.pdf