In [1]:
# classifiers
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.lazy import KNNClassifier, KNNADWINClassifier
from skmultiflow.drift_detection.adwin import ADWIN

from float.feature_selection import OFS, FIRES      # online feature methods
from skmultiflow.data import FileStream             # create stream from file
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ParameterGrid   # hyperparameter combinations
import numpy as np
import time
import copy
import matplotlib.pyplot as plt

In [2]:
# Expose the CSV file as a stream; samples are pulled from it with
# next_sample() in the cells below.
data_loader = FileStream(
    filepath='out.csv',
)

In [3]:
# Take the features of the first 50 samples as the reference sample handed to
# FIRES for its 'gaussian' baseline, then reset the stream (presumably so the
# evaluation starts again from the first sample).
# NOTE(review): scikit-multiflow streams also expose restart(); confirm that
# reset() is the intended way to rewind here.
ref_sample = data_loader.next_sample(50)[0]
data_loader.reset()

# Online feature selector: keeps 7 of the stream's features.
fires_fs = FIRES(
    n_total_features=data_loader.n_features,
    n_selected_features=7,
    classes=data_loader.target_values,
    baseline='gaussian',
    ref_sample=ref_sample,
)

In [4]:
# Incremental learner under evaluation, built with the library defaults.
model = AdaptiveRandomForestClassifier()

# Change detector; the evaluation loop feeds it the per-sample prediction outcome.
adwin = ADWIN()

# Alias under which the file stream is passed to the evaluation loop.
stream = data_loader

In [5]:
def run_prequential(classifier, stream, n_pretrain, drift_detector, feature_selector,
                    max_samples=50000, seed=None):
    """Run a test-then-train (prequential) evaluation and print the accuracy.

    The classifier is first fitted on up to ``n_pretrain`` samples. After that,
    every sample drawn from the stream is (optionally) passed through the
    feature selector, predicted, scored, reported to the drift detector and
    finally used to update the classifier.

    Parameters
    ----------
    classifier : object
        Incremental model providing ``partial_fit(X, y)`` and ``predict(X)``.
    stream : object
        Data source providing ``next_sample()`` -> ``(X, y)`` and
        ``has_more_samples()``.
    n_pretrain : int
        Number of samples used to fit the classifier before evaluation starts.
    drift_detector : object
        Detector providing ``add_element(value)`` and ``detected_change()``.
        It receives ``1.0`` for a correct prediction and ``0.0`` otherwise.
    feature_selector : object or None
        Online feature selector providing ``weight_features(X, y)`` and
        ``select_features(X, rng=...)``. ``None`` disables feature selection.
    max_samples : int, default=50000
        Upper bound on the number of evaluated samples.
    seed : int or None, default=None
        Seed of the random generator handed to ``select_features``. ``None``
        keeps the run unseeded.

    Returns
    -------
    None
        Progress, detected drifts and the final accuracy are printed.

    Notes
    -----
    * The feature selector is updated with the true label of a sample before
      that sample is predicted.
    * NOTE(review): pretraining fits on the raw features while the online
      phase fits on the selector's output - confirm this is intended.
    """
    # --- Pretraining -------------------------------------------------------
    n_pretrained = 0
    for _ in range(n_pretrain):
        # Stop early instead of reading past the end of a short stream.
        if not stream.has_more_samples():
            break
        X, y = stream.next_sample()
        classifier.partial_fit(X, [y[0]])
        n_pretrained += 1

    print(f"Model pretrained on {n_pretrained} samples.")

    # --- Prequential evaluation ---------------------------------------------
    # One generator for the whole run rather than a new one per sample.
    rng = np.random.default_rng(seed)
    n_samples = 0
    correct_predictions = 0

    while n_samples < max_samples and stream.has_more_samples():
        X, y = stream.next_sample()

        if feature_selector is not None:
            # Online feature selection; copies keep the selector from
            # modifying the arrays that are used below.
            feature_selector.weight_features(copy.copy(X), copy.copy(y))
            X_select = feature_selector.select_features(copy.copy(X), rng=rng)
        else:
            # No feature selection: predict and train on the raw features.
            X_select = X

        # Always use the classifier passed in, never a notebook global.
        y_pred = classifier.predict(X_select)

        # Compare scalars so the outcome is a plain bool, not an array.
        is_correct = bool(y_pred[0] == y[0])
        correct_predictions += is_correct

        # Check for drift on the stream of prediction outcomes.
        drift_detector.add_element(float(is_correct))
        if drift_detector.detected_change():
            print(f"drift detected at {n_samples}")

        # Train incrementally on exactly what was predicted on.
        classifier.partial_fit(copy.copy(X_select), [y[0]])
        n_samples += 1

    if n_samples == 0:
        # Nothing was evaluated (stream exhausted or max_samples <= 0).
        print("No samples were evaluated; accuracy is undefined.")
        return

    accuracy = correct_predictions / n_samples
    print(accuracy)

In [6]:
# Evaluate the forest on the file stream: 200 pretraining samples, ADWIN as the
# drift detector and FIRES as the online feature selector.
run_prequential(
    classifier=model,
    stream=stream,
    n_pretrain=200,
    drift_detector=adwin,
    feature_selector=fires_fs,
)

Model pretrained on 200 samples.
drift detected at 30719
drift detected at 31743
0.99848


In [7]:
# # Define the parameter grid for ARF
# arf_param_grid = {
#     'n_estimators': [5, 10, 20, 30],    # default=10
#     'grace_period': [50, 100],          # default=50
#     'split_confidence': [0.01, 0.05],   # default=0.01
#     'leaf_prediction': ['mc', 'nba']    # default='nba'
# }

# parameters = {
#     'arf': {
#         'n_estimators': [5, 10, 20, 30],    # default=10
#         'grace_period': [50, 100],          # default=50
#         'split_confidence': [0.01, 0.05],   # default=0.01
#         'leaf_prediction': ['mc', 'nba']    # default='nba'    
#     },
#     # search spaces based on FIRES paper appendix
#     'fires': {
#         'penalty_s': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],    # default=0.01
#         'penalty_r': [0.01, 0.1, 1],          # default=0.01
#         'lr_mu': [0.01, 0.025, 0.1, 1, 10],   # default=0.01
#         'lr_sigma': [0.01, 0.025, 0.1, 1, 10]    # default=0.01    
#     }
# }

# # Generate hyperparameter combinations
# arf_grid = ParameterGrid(arf_param_grid)