# Stream-based machine learning pipeline

In [None]:
%conda install -c conda-forge scikit-multiflow

## libraries

In [45]:
import numpy as np
import pandas as pd

#https://scikit-multiflow.github.io/scikit-multiflow/documentation.html#learning-methods
from skmultiflow.drift_detection import DDM
from skmultiflow.drift_detection.eddm import EDDM
from skmultiflow.drift_detection import PageHinkley
from skmultiflow.drift_detection.adwin import ADWIN

from skmultiflow.meta import AdaptiveRandomForest
from skmultiflow.evaluation import EvaluatePrequential

from skmultiflow.data import DataStream

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
%matplotlib inline
import glob

In [34]:
dataset = pd.read_csv('workshop_dataset.csv',index_col=False)
dataset_test = pd.read_csv('workshop_dataset_stream_testing.csv',index_col=False)

# 1. Concept Drift Detection
I stole this picture from :

    "Gama, J., Žliobaitė, I., Bifet, A., Pechenizkiy, M., & Bouchachia, A. (2014). A survey on concept drift adaptation. ACM computing surveys (CSUR), 46(4), 1-37."
    
I think it nicely shows categoies of concept drifts.
![missing_image](Pattern_of_change_over_time_(outlier_is_not_concept_drift).png "Pattern of change over time (outlier is not concept drift)")

In [3]:
# check where the fault_id changes - i.e. the concept_drift should occur and mark that sample with "1" (initial sample has NA - fill with 0)
dataset['fault_id_change'] = dataset['fault_id'].diff().fillna(0)

In [4]:
# magnitude of row vectors - concept drift detectors take as input single value not list/vector
dataset['magnitude'] = dataset[dataset.columns[:-1]].apply(np.linalg.norm, axis=1)
data_stream = dataset['magnitude'].values

In [9]:
adwin = ADWIN()
ddm = DDM()
eddm = EDDM()
ph = PageHinkley()

# Adding stream elements to ADWIN and verifying if drift occurred
for i in range(len(data_stream))[:100]:
    #adwin.add_element(data_stream[i])
    #ddm.add_element(data_stream[i])
    #eddm.add_element(data_stream[i])
    ph.add_element(data_stream[i])
    if ph.detected_change():
        print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i))

Change detected in data: 7644.600911861115 - at index: 28
Change detected in data: 7619.368761792521 - at index: 57


# 2. Classification pipeline

## 2.1 Prequential evaluation

In [13]:
samples = dataset.drop(columns=['fault_id', 'fault_id_change', 'magnitude'])
labels  = dataset['fault_id'].to_frame()

stream = DataStream(data = samples, y = labels)
stream.prepare_for_use()

ARF = AdaptiveRandomForest()

evaluator = EvaluatePrequential(n_wait=200, pretrain_size=200, output_file="py_ARF_results.csv")
# Run evaluation
evaluator.evaluate(stream=stream, model=ARF, model_names=['ARF'])

Prequential Evaluation
Evaluating 1 target(s).
Pre-training on 200 sample(s).
Evaluating...
 ###----------------- [15%] [54.88s]

  return sum_value / self.sample_count


 #################### [100%] [413.05s]
Processed samples: 31700
Mean performance:
ARF - Accuracy     : 0.9896
ARF - Kappa        : 0.9891


[AdaptiveRandomForest(binary_split=False, disable_weighted_vote=False,
                      drift_detection_method=ADWIN(delta=0.001), grace_period=50,
                      lambda_value=6, leaf_prediction='nba',
                      max_byte_size=33554432, max_features=7,
                      memory_estimate_period=2000000, n_estimators=10,
                      nb_threshold=0, no_preprune=False, nominal_attributes=None,
                      performance_metric='acc', random_state=None,
                      remove_poor_atts=False, split_confidence=0.01,
                      split_criterion='info_gain', stop_mem_management=False,
                      tie_threshold=0.05,

In [14]:
# skmultiflow saves results to file with leading 5 lines containing configuraiton of evaluation, learner etc
# skmultiflow also did not evaluate last 200 samples
# for the sake of comparisson we shrink the MOA results
# accuracy in MOA is in % and in skmultiflow fraction
py_ARF_results = pd.read_csv('py_ARF_results.csv',skiprows=[0,1,2,3,4],index_col=False)
py_ARF_results['mean_acc_[ARF]'] = py_ARF_results['mean_acc_[ARF]']*100

## 2.2. Real-world scenario

In [39]:
samples_train = dataset.drop(columns=['fault_id']).values
labels_train  = dataset['fault_id'].to_frame().values

samples_test = dataset_test.drop(columns=['fault_id']).values
labels_test  = dataset_test['fault_id'].to_frame().values

In [50]:
ARF.reset()
stream = DataStream(data=samples_train, y=labels_train)
stream.prepare_for_use()

for sample in range(len(labels_train)):
    X, Y = stream.next_sample()
    ARF.partial_fit(X, Y)

stream_test = DataStream(data = samples_test, y = labels_test)
stream_test.prepare_for_use()

labels_test_predicted = []
for sample in range(len(labels_test)):
    X, Y = stream_test.next_sample()
    labels_test_predicted.extend(ARF.predict(X))

In [52]:
print('Classification report :\n' + str(classification_report(labels_test, labels_test_predicted)))

Classification report :
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1376
           1       0.00      0.00      0.00        57
           2       0.00      0.00      0.00       162
           3       0.00      0.00      0.00       378
           4       0.00      0.00      0.00        53
           5       0.00      0.00      0.00       190
           6       0.00      0.00      0.00       115
           7       0.00      0.00      0.00       690
           8       0.00      0.00      0.00       167
           9       0.00      0.00      0.00       449
          10       0.00      0.00      0.00        45
          11       0.00      0.00      0.00      1121
          12       0.00      0.00      0.00       535
          13       0.00      0.00      0.00       395
          14       0.00      0.00      0.00       220
          15       0.00      0.00      0.00       547
          16       0.00      0.00      0.00      1071
   

In [60]:
ARF.predict_proba(X)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1.]])