In [1]:
import os

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection

# Data Manipulation

In [2]:
# Root folder of the project; the CSV lives in its 'risorse' sub-folder.
home = '~/UNI/DCML-CPS/'

# Build the full path first, then hand it to pandas to load the monitor dataset.
_dataset_csv = os.path.join(home, 'risorse/dataset_arancino_monitor.csv')
dataset = pd.read_csv(_dataset_csv)

In [12]:
# Features: every column except the timestamp and the ground-truth label.
X = dataset.drop(columns=['_timestamp', 'label'])

# Raw label values exactly as stored in the 'label' column.
y = dataset['label'].values

# Binary target: 0 for rows labelled 'normal', 1 for every other label.
# The previous expression returned 1 in both branches, so the target
# contained a single class and no classifier could learn anything from it.
y_binary = [0 if r == 'normal' else 1 for r in y]

# Supervised Learning

In [82]:
# The estimator is imported under the neutral alias `Model` -- presumably so a
# different classifier can be tried by editing only this import line.
from sklearn.tree import DecisionTreeClassifier as Model

clf = Model()

# Fixed seed: the split is shuffled, so without it every run would produce a
# different train/test partition and the metrics below would not be reproducible.
SPLIT_SEED = 42

# Hold out 33% of the rows for testing; the target is the binary label.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y_binary,
    test_size=0.33,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [83]:
# Train the classifier on the training portion of the split.
clf.fit(X=X_train, y=y_train)

In [84]:
# Hard class predictions for the held-out rows (used by the metric cells).
predicted = clf.predict(X=X_test)
# Per-class probabilities for the same rows.
predicted_prob = clf.predict_proba(X=X_test)

In [None]:
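# Hand-recorded figures per model (columns: model, train, test).
# NOTE(review): these look like fit/predict timings, but the quantity and its
# unit are not recorded here -- confirm before relying on them.
# knn: 0.2, 38.3
# tree: 22.4, 0.1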

In [76]:
# Confusion matrix of the held-out labels against the classifier's predictions.
sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=predicted)

array([[ 5433,  3339],
       [  981, 41067]])

In [77]:
# Fraction of held-out rows whose predicted label matches the true label.
sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predicted)

0.9149940968122786

# Unsupervised Learning

In [13]:
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from pyod.models.copod import COPOD

# Outlier fraction handed to every detector below.
# NOTE(review): 0.173 is hard-coded; presumably the share of non-'normal' rows
# in the dataset -- confirm against the label distribution.
_contamination = 0.173

# Unshuffled split: the rows keep their original order, so the first 67% become
# the training set and the last 33% the test set.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the supervised section.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y_binary,
    shuffle=False,
    test_size=0.33,
)

In [14]:
# One detector per algorithm, all configured with the same contamination rate.
models = [
    detector_cls(contamination=_contamination)
    for detector_cls in (HBOS, ABOD, COPOD)
]

In [15]:
# Fit each detector on the training features only (no labels are passed to
# fit), predict on the held-out rows, and report accuracy against y_test.
for model in models:
    model.fit(X_train)
    predicted = model.predict(X_test)
    score = sklearn.metrics.accuracy_score(y_test, predicted)
    # Prefix the score with the detector's class name; a bare number per line
    # cannot be attributed to a model once several have been evaluated.
    print("%s: %.4f" % (type(model).__name__, score))
    

0.5401
