In [24]:
# classifiers
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.lazy import KNNClassifier, KNNADWINClassifier
from skmultiflow.drift_detection.adwin import ADWIN

from float.feature_selection import OFS, FIRES      # online feature methods
from skmultiflow.data import FileStream             # create stream from file
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ParameterGrid   # hyperparameter combinations
import numpy as np
import time
import copy
import matplotlib.pyplot as plt

In [4]:
data_loader = FileStream(filepath='out.csv')

In [5]:
ref_sample, _ = data_loader.next_sample(50)
data_loader.reset()

ofs_fs = OFS(n_total_features=data_loader.n_features,
            n_selected_features=7,
            baseline='gaussian',
            ref_sample=ref_sample)

In [6]:
stream = data_loader
adwin = ADWIN()

arf_model = AdaptiveRandomForestClassifier()
arf_noadwin_model = AdaptiveRandomForestClassifier(drift_detection_method=None)

In [25]:
def run_prequential(setup_name, classifier, stream, drift_detector, feature_selector, n_pretrain=200):
    """
    Parameters
    ----------
    setup (str): 
    classifier
    etc...
    """
    stream.restart()
    n_samples, correct_predictions = 0, 0
    # correct_predictions = 0
    true_labels, pred_labels = [], []
    # pred_labels = []

    print(f"Evaluating {setup_name} configuration.")

    # pretrain samples
    for _ in range(n_pretrain):
        X, y = stream.next_sample()
        classifier.partial_fit(X, [y[0]])
    
    print(f"Model pretrained on {n_pretrain} samples.")

    while n_samples < 50000 and stream.has_more_samples():
        X, y = stream.next_sample()
        n_samples += 1

        if feature_selector is not None:
            # with dynamic feature selection
            feature_selector.weight_features(copy.copy(X), copy.copy(y))
            X_select = feature_selector.select_features(copy.copy(X), rng=np.random.default_rng())
            y_pred = classifier.predict(X_select)
            
            # Train incrementally
            classifier.partial_fit(copy.copy(X_select), [y[0]])

        else:
            # no feature selection
            y_pred = classifier.predict(X)
            
            # Train incrementally
            classifier.partial_fit(copy.copy(X), [y[0]])
        
        if y_pred == y:
            correct_predictions += 1
        
        true_labels.append(y[0])
        pred_labels.append(y_pred[0])

        # check for drift
        if drift_detector is not None:
            drift_detector.add_element(np.float64(y_pred == y))
            if drift_detector.detected_change():
                print(f"drift detected at {n_samples}")


    # Calculate accuracy
    accuracy = correct_predictions / n_samples
    print(accuracy)

In [15]:
run_prequential(
    setup_name='ARF-ADWIN-OFS',
    classifier=arf_model, 
    stream=stream,
    drift_detector=adwin, 
    feature_selector=ofs_fs
)

Evaluating ARF-ADWIN-OFS configuration.
Model pretrained on 200 samples.
drift detected at 44032


  self.mdbl_width += self.width


0.9985


In [16]:
run_prequential(
    setup_name='ARF-OFS',
    classifier=arf_noadwin_model, 
    stream=stream,
    drift_detector=None, 
    feature_selector=ofs_fs
)

Evaluating ARF-OFS configuration.
Model pretrained on 200 samples.
0.99523


In [17]:
run_prequential(
    setup_name='ARF-ADWIN',
    classifier=arf_model, 
    stream=stream,
    drift_detector=adwin, 
    feature_selector=None
)

Evaluating ARF-ADWIN configuration.
Model pretrained on 200 samples.
drift detected at 13856
drift detected at 55648
drift detected at 68192
0.99958


In [17]:
parameters = {
    'n_selected_features': [5, 10, 15, 20, 25, 30, 50, 70]
}

ofs_grid = ParameterGrid(parameters)

In [18]:
for params in ofs_grid:
    
    print(params)
    

{'n_selected_features': 5}
{'n_selected_features': 10}
{'n_selected_features': 15}
{'n_selected_features': 20}
{'n_selected_features': 25}
{'n_selected_features': 30}
{'n_selected_features': 50}
{'n_selected_features': 70}


In [19]:
for params in ofs_grid:
    ofs_fs = OFS(n_total_features=data_loader.n_features,
                 n_selected_features=params['n_selected_features'],
                 baseline='gaussian',
                 ref_sample=ref_sample)
    
    run_prequential(
        setup_name=f'ARF-ADWIN-OFS-{params["n_selected_features"]}',
        classifier=arf_model, 
        stream=stream,
        drift_detector=adwin, 
        feature_selector=ofs_fs
    )
    

Evaluating ARF-ADWIN-OFS-5 configuration.
Model pretrained on 200 samples.




drift detected at 30720
drift detected at 40960
drift detected at 43008
drift detected at 44032
0.9987
Evaluating ARF-ADWIN-OFS-10 configuration.
Model pretrained on 200 samples.


  self.mdbl_width += self.width


0.99858
Evaluating ARF-ADWIN-OFS-15 configuration.
Model pretrained on 200 samples.
0.99848
Evaluating ARF-ADWIN-OFS-20 configuration.
Model pretrained on 200 samples.
0.9986
Evaluating ARF-ADWIN-OFS-25 configuration.
Model pretrained on 200 samples.
0.99886
Evaluating ARF-ADWIN-OFS-30 configuration.
Model pretrained on 200 samples.
0.99902
Evaluating ARF-ADWIN-OFS-50 configuration.
Model pretrained on 200 samples.
0.9995
Evaluating ARF-ADWIN-OFS-70 configuration.
Model pretrained on 200 samples.
0.9997
