In [1]:
import numpy as np
import pandas as pd
from skmultiflow.meta import AdaptiveRandomForestClassifier
from skmultiflow.data import FileStream  # Create stream from file
from sklearn.model_selection import ParameterGrid  # Generate hyperparameter combinations

In [2]:
data_loader = FileStream(filepath='merged_cesnet.csv')

In [3]:
ref_sample, _ = data_loader.next_sample(50)
data_loader.reset()
print(ref_sample)

[[4.43000000e+02 1.33350000e+04 1.00000000e+00 ... 1.49090909e+01
  4.43138693e+01 1.96371901e+03]
 [4.43000000e+02 9.82100000e+03 1.00000000e+00 ... 5.24819214e-04
  5.45202988e-04 2.97246298e-07]
 [4.43000000e+02 1.51690000e+04 1.00000000e+00 ... 6.50000000e+00
  8.17006732e+00 6.67500000e+01]
 ...
 [4.43000000e+02 1.51690000e+04 1.00000000e+00 ... 2.15948695e-03
  5.36846433e-04 2.88204093e-07]
 [4.43000000e+02 1.51690000e+04 1.00000000e+00 ... 1.09090909e+00
  1.37870463e+00 1.90082645e+00]
 [4.43000000e+02 9.82100000e+03 1.00000000e+00 ... 1.48829838e-03
  8.76703398e-04 7.68608848e-07]]


In [4]:
# Define hyperparameter grid
parameters = {
    'n_estimators': [6],  # default=10
    'max_features': [None],  # 'None' is not a valid string for skmultiflow, using None
    'drift_detection_method': [None],  # ADWIN should be passed as an instance, not a string
    'grace_period': [25],  # default=50
    'split_criterion': ['gini'],  # default=info_gain
    'split_confidence': [0.01],  # default=0.01
    'tie_threshold': [0.01],  # default=0.05
    'leaf_prediction': ['nba'],  # default=nba
}

arf_grid = ParameterGrid(parameters)

In [6]:
best_params = None
best_score = 0
results = []

for params in arf_grid:
    print(f"\nEvaluating parameters: {params}")
    
    data_loader.restart()

    train_data, train_labels = data_loader.next_sample(80000)
    test_data, test_labels = data_loader.next_sample(40000)

    print("Train data shape:", train_data.shape)
    print("Train labels shape:", train_labels.shape)
    print("Test data shape:", test_data.shape)
    print("Test labels shape:", test_labels.shape)
    print("Unique train labels:", np.unique(train_labels))
    print("Unique test labels:", np.unique(test_labels))

    arf = AdaptiveRandomForestClassifier(**params)
    arf.fit(train_data, train_labels)

    print("Number of trees in the forest:", len(arf.ensemble))

    predictions = arf.predict(test_data)
    print("Unique predictions:", np.unique(predictions))

    accuracy = np.mean(predictions == test_labels)
    print(f"Accuracy: {accuracy:.8f}")

    results.append({**params, 'accuracy': accuracy})

    if accuracy > best_score:
        best_params = params
        best_score = accuracy



Evaluating parameters: {'drift_detection_method': None, 'grace_period': 25, 'leaf_prediction': 'nba', 'max_features': None, 'n_estimators': 6, 'split_confidence': 0.01, 'split_criterion': 'gini', 'tie_threshold': 0.01}
Train data shape: (80000, 45)
Train labels shape: (80000,)
Test data shape: (40000, 45)
Test labels shape: (40000,)
Unique train labels: [0 1]
Unique test labels: [0 1]
Number of trees in the forest: 6
Unique predictions: [0 1]
Accuracy: 0.78935000


In [7]:
# Print the best parameters
print(f"\nBest Parameters: {best_params}")
print(f"Best Accuracy: {best_score:.8f}\n")

print(pd.DataFrame(results))


Best Parameters: {'drift_detection_method': None, 'grace_period': 25, 'leaf_prediction': 'nba', 'max_features': None, 'n_estimators': 6, 'split_confidence': 0.01, 'split_criterion': 'gini', 'tie_threshold': 0.01}
Best Accuracy: 0.78935000

  drift_detection_method  grace_period leaf_prediction max_features  \
0                   None            25             nba         None   

   n_estimators  split_confidence split_criterion  tie_threshold  accuracy  
0             6              0.01            gini           0.01   0.78935  
