In [1]:
from capymoa.datasets import ElectricityTiny
from capymoa.anomaly import HalfSpaceTrees, TreeBasedUnsupervised, Autoencoder, StreamRHF
from capymoa.evaluation import AnomalyDetectionEvaluator
from capymoa.stream import stream_from_file
import pandas as pd
import gzip
import os

In [2]:
from capymoa.anomaly import StreamRHFParallel

### Convert dataset from paper to .csv file

In [2]:
# Variables
path_to_dataset = r"C:\Users\aleja\OneDrive - Universidad Nacional de Colombia\Documentos\Institut Polytechnique de Paris\courses\P1\Data Streaming\project\actual code\datasets\forStefan\data\public"
dataset_name = "abalone"

# Construct input and output paths
input_path = os.path.join(path_to_dataset, f"{dataset_name}.gz")
output_path = os.path.join(path_to_dataset, f"{dataset_name}.csv")

# Read the gzipped file and save it as a CSV
with gzip.open(input_path, 'rt') as gz_file:
    df = pd.read_csv(gz_file)
    df.to_csv(output_path, index=False)

print(f"CSV saved to: {output_path}")

CSV saved to: C:\Users\aleja\OneDrive - Universidad Nacional de Colombia\Documentos\Institut Polytechnique de Paris\courses\P1\Data Streaming\project\actual code\datasets\forStefan\data\public\abalone.csv


### Create stream from .csv file

In [3]:
stream = stream_from_file(output_path, dataset_name="Abalone")
schema = stream.get_schema()
instance = stream.next_instance()
actual_value = instance.x
label = instance.y_index
print(actual_value)
print(label)
print(schema.get_num_classes())
print(schema.get_num_attributes())
stream.restart()

[0.53   0.42   0.135  0.677  0.2565 0.1415 0.21  ]
0
2
7




In [5]:
schema = stream.get_schema()
learner = Autoencoder(schema)
evaluator = AnomalyDetectionEvaluator(schema)
print('wtf?')
while stream.has_more_instances():
    instance = stream.next_instance()
    #print('x ' + str(instance.x))
    #print("type: ", type(instance.x))
    #print('label ' + str(instance.y_index))
    #print("type: ", type(instance.y_index))
    proba = learner.score_instance(instance)
    print(proba)
    #score – The predicted scores. Should be in the range [0, 1].
    evaluator.update(instance.y_index, proba)
    learner.train(instance)
auc = evaluator.auc()
print(f"AUC: {auc:.2f} with Autoencoder")
stream.restart()

wtf?
0.8774290331160938
0.8555266386800031
0.8177722080146602
0.8620425703650495
0.840001704912347
0.8935968806176906
0.856244771740355
0.8519156527609877
0.8421369360224743
0.805011186559849
0.9088471231590479
0.9090071324811125
0.9255157158843001
0.8751943673371475
0.8836664232953603
0.8402965232205333
0.8690979330324775
0.907888310660789
0.9331363429962979
0.9063852298553087
0.9035558558616646
0.895006113583953
0.9261873732373981
0.8912134946318214
0.9193682667987317
0.9243275306694072
0.9337092920046256
0.920308462162236
0.907860490628564
0.9170747241957155
0.8832024099566724
0.9441135239682296
0.9474747546098574
0.9486755800615829
0.9091320764040367
0.9519625375379709
0.924595709294461
0.9222486258251571
0.9495678688246568
0.9270690640332248
0.9337098330986197
0.9522554026640504
0.9474527848261304
0.947184368011141
0.9065030405630239
0.9391973081402658
0.9243828741686522
0.8988382848436502
0.9152425135502222
0.9200148122882348
0.9654885685651357
0.8897968031200695
0.92795849373320

In [4]:
import time
schema = stream.get_schema()
learner = StreamRHF(schema)
evaluator = AnomalyDetectionEvaluator(schema)
start_time = time.time()
processed_instances = 0
#while stream.has_more_instances():
while processed_instances < 1000:
    instance = stream.next_instance()
    proba = learner.score_instance(instance)
    print(proba)
    #score – The predicted scores. Should be in the range [0, 1].
    evaluator.update(instance.y_index, proba)
    learner.train(instance)
    processed_instances += 1
    # Print progress and elapsed time every 100 instances
    if processed_instances % 100 == 0:
        elapsed_time = time.time() - start_time
        print(f"Processed {processed_instances} instances. Time spent: {elapsed_time:.2f} seconds.")
        start_time = time.time()  # Reset start time for the next 100 instances
auc = evaluator.auc()
print(f"AUC: {auc:.2f} with StreamRHF")
stream.restart()


our StreamRHF initialized
0.0
0.0021560305774557253
0.0009404629183898949
0.0009404629183898949
0.0009404629183898949
0.0009404629183898949
0.0009404629183898949
0.0009404629183898949
0.0035544826006961294
0.0035544826006961294
0.0009404629183898949
0.0009404629183898949
0.0009404629183898949
0.010854083556354532
0.018918236261905585
0.00878252196530871
0.057487802659527576
0.014454748656614114
0.0009404629183898949
0.013124314699970108
0.13941739532700892
0.1694187527653167
0.05184214748476135
0.16668913429178656
0.15142320111581753
0.13529843252588347
0.1275119793972631
0.1873098423628483
0.22069297258888243
0.20886074220492
0.10210110717636023
0.028095071182880127
0.043261071022331676
0.066256873727084
0.0382647848214438
0.06843957575696724
0.19290252820329679
0.00711349279994844
0.10699106090167265
0.19714190044819713
0.09454375164644446
0.09466636814258922
0.14003109306092332
0.10511833638387624
0.05851239640700645
0.14291835334289904
0.09144441118798963
0.04873286800429211
0.0778

In [None]:
stream = ElectricityTiny()
schema = stream.get_schema()
learner = Autoencoder(schema)
evaluator = AnomalyDetectionEvaluator(schema)
while stream.has_more_instances():
    instance = stream.next_instance()
    proba = learner.score_instance(instance)
    #score – The predicted scores. Should be in the range [0, 1].
    evaluator.update(instance.y_index, proba)
    learner.train(instance)
auc = evaluator.auc()
print(f"AUC: {auc:.2f}")

AUC: 0.42


In [7]:
stream = ElectricityTiny()
schema = stream.get_schema()
learner = StreamRHF(schema)
evaluator = AnomalyDetectionEvaluator(schema)
start_time = time.time()
processed_instances = 0
while stream.has_more_instances():
    instance = stream.next_instance()
    proba = learner.score_instance(instance)
    #score – The predicted scores. Should be in the range [0, 1].
    evaluator.update(instance.y_index, proba)
    learner.train(instance)
    processed_instances += 1
    # Print progress and elapsed time every 100 instances
    if processed_instances % 100 == 0:
        elapsed_time = time.time() - start_time
        print(f"Processed {processed_instances} instances. Time spent: {elapsed_time:.2f} seconds.")
        start_time = time.time()  # Reset start time for the next 100 instances
auc = evaluator.auc()
print(f"AUC: {auc:.2f} with StreamRHF")
stream.restart()


our StreamRHF initialized
Processed 100 instances. Time spent: 13.63 seconds.
Processed 200 instances. Time spent: 18.65 seconds.
Processed 300 instances. Time spent: 16.75 seconds.
Processed 400 instances. Time spent: 16.11 seconds.
Processed 500 instances. Time spent: 16.91 seconds.
Processed 600 instances. Time spent: 20.14 seconds.
Processed 700 instances. Time spent: 18.93 seconds.
Processed 800 instances. Time spent: 17.48 seconds.
Processed 900 instances. Time spent: 17.78 seconds.
Processed 1000 instances. Time spent: 17.97 seconds.
Processed 1100 instances. Time spent: 18.38 seconds.
Processed 1200 instances. Time spent: 17.63 seconds.
Processed 1300 instances. Time spent: 17.52 seconds.
Processed 1400 instances. Time spent: 17.25 seconds.
Processed 1500 instances. Time spent: 18.95 seconds.
Processed 1600 instances. Time spent: 18.77 seconds.
Processed 1700 instances. Time spent: 16.37 seconds.
Processed 1800 instances. Time spent: 19.25 seconds.
Processed 1900 instances. Tim

In [8]:
print("ElectricityTiny type: ", type(stream))
print(schema)

ElectricityTiny type:  <class 'capymoa.datasets._datasets.ElectricityTiny'>
@relation electricity

@attribute period numeric
@attribute nswprice numeric
@attribute nswdemand numeric
@attribute vicprice numeric
@attribute vicdemand numeric
@attribute transfer numeric
@attribute class {0,1}

@data


In [12]:
# Define the function to evaluate a given learner on the stream
def evaluate_anomaly_learner(stream, learner, label):
    schema = stream.get_schema()
    evaluator = AnomalyDetectionEvaluator(schema)
    while stream.has_more_instances():
        instance = stream.next_instance()
        proba = learner.score_instance(instance)
        evaluator.update(instance.y_index, proba)
        learner.train(instance)
    auc = evaluator.auc()
    print(f"{label}: AUC = {auc:.2f}")
    return {
        "learner": label,
        "auc": auc,
        "evaluator": evaluator,
    }

# Stream setup
stream = Electricity()

# Define configurations of TreeBasedUnsupervised to evaluate
learners = [
    {"label": "Default Configuration", "params": {}},
    {"label": "More Trees", "params": {"num_trees": 100}},
    {"label": "Deeper Trees", "params": {"max_height": 1000}},
    {"label": "Smaller Window", "params": {"window_size": 50}},
]

# Evaluate all configurations
results = []
for config in learners:
    stream = Electricity()  # Reset the stream for each learner
    learner = TreeBasedUnsupervised(schema=stream.get_schema(), **config["params"])
    result = evaluate_anomaly_learner(stream, learner, config["label"])
    results.append(result)

# Print summary of all results
print("\nSummary of AUC Scores:")
for result in results:
    print(f"{result['learner']}: {result['auc']:.2f}")

# Optional: Access metrics for further analysis or visualization
for result in results:
    print(f"\nMetrics for {result['learner']}:")
    print(result["evaluator"].metrics())


Default Configuration: AUC = 0.50
More Trees: AUC = 0.50
Deeper Trees: AUC = 0.50
Smaller Window: AUC = 0.50

Summary of AUC Scores:
Default Configuration: 0.50
More Trees: 0.50
Deeper Trees: 0.50
Smaller Window: 0.50

Metrics for Default Configuration:
[45312.0, 0.5, -0.5, 0.5754546257062146, 0.0, 1.3554608306908562, 0.0, 1.0, -0.0002079758747985337]

Metrics for More Trees:
[45312.0, 0.5, -0.5, 0.5754546257062146, 0.0, 1.3554608306908562, 0.0, 1.0, -0.0002079758747985337]

Metrics for Deeper Trees:
[45312.0, 0.5, -0.5, 0.5754546257062146, 0.0, 1.3554608306908562, 0.0, 1.0, -0.0002079758747985337]

Metrics for Smaller Window:
[45312.0, 0.5, -0.5, 0.5754546257062146, 0.0, 1.3554608306908562, 0.0, 1.0, -0.0002079758747985337]
