# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from tqdm import tqdm
from time import time

In [3]:
X_train = np.load('data/x_train.npy')
y_train = np.load('data/y_train.npy')

X_test = np.load('data/x_test.npy')
y_test = np.load('data/y_test.npy')

#X_val = np.load('data/x_val.npy')
#y_val = np.load('data/y_val.npy')

In [4]:
#X_train = np.concatenate((X_train, X_val), axis=0)
#y_train = np.concatenate((y_train, y_val), axis=0)

# Playground

In [5]:
np.unique(y_train, return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 array([ 7928,  8357, 10489,  8184, 11611,   682,   286, 19323,  7913,
         7186,  7679,  8258,  8198,  8006,  7656]))

7 is normal class

In [6]:
normal_data = X_train[y_train == 7] # normal class

# Helpers

In [7]:
def evaluate_model(model, y_test, y_pred):
    positive_class = 'normal'
    
    accuracy = accuracy_score(y_test, y_pred)
    
    precision = precision_score(y_test, y_pred, pos_label=positive_class)
    recall = recall_score(y_test, y_pred, pos_label=positive_class)
    f1 = f1_score(y_test, y_pred, pos_label=positive_class)
    
    metrics = {
        'model': [model],
        'accuracy': [accuracy],
        'precision': [precision],
        'recall': [recall],
        'f1': [f1],
    }
    
    return metrics

# SGD-OCSVM

In [8]:
from sklearn.linear_model import SGDOneClassSVM

In [18]:
sgd_ocsvm_start = time()

sgd_ocsvm = SGDOneClassSVM(nu=0.01, max_iter=1000, tol=0.001, learning_rate='invscaling', eta0=0.1, random_state=42)
sgd_ocsvm.fit(normal_data)
        
print(f"seconds: {time() - sgd_ocsvm_start}")

seconds: 0.016887664794921875


In [19]:
y_pred_test_sgd_ocsvm = sgd_ocsvm.predict(X_test)
y_pred_test_sgd_ocsvm = np.where(y_pred_test_sgd_ocsvm == 1, 'normal', 'anomaly')
y_test_converted_sgd_ocsvm = np.where(y_test == 7, 'normal', 'anomaly')

In [20]:
sgd_ocsvm_df = pd.DataFrame(evaluate_model("sgd_ocsvm", y_test_converted_sgd_ocsvm, y_pred_test_sgd_ocsvm))
sgd_ocsvm_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,sgd_ocsvm,0.99977,1.0,0.998535,0.999267


# PCA Reconstruction

In [21]:
from sklearn.decomposition import PCA

In [35]:
pca_start = time()

pca = PCA(n_components=5, whiten=True, svd_solver='auto')

normal_data_pca = pca.fit_transform(normal_data)
normal_data_reconstructed = pca.inverse_transform(normal_data_pca)

reconstruction_error = np.mean((normal_data - normal_data_reconstructed) ** 2, axis=1)

threshold = np.percentile(reconstruction_error, 95)

X_test_pca = pca.transform(X_test)
X_test_reconstructed = pca.inverse_transform(X_test_pca)
X_test_reconstruction_error = np.mean((X_test - X_test_reconstructed) ** 2, axis=1)

print(f"seconds: {time() - pca_start}")

seconds: 0.5393750667572021


In [36]:
y_pred_test_pca = np.where(X_test_reconstruction_error > threshold, 'anomaly', 'normal')
y_test_converted_pca = np.where(y_test == 7, 'normal', 'anomaly')

In [37]:
pca_df = pd.DataFrame(evaluate_model('PCA Reconstruction', y_test_converted_pca, y_pred_test_pca))
pca_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,PCA Reconstruction,0.993035,1.0,0.95563,0.977312


# Results (2)

In [38]:
results_df = pd.concat([sgd_ocsvm_df, pca_df], ignore_index=True)
results_df.to_csv('results/results_edge_2.csv', index=False)

results_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,sgd_ocsvm,0.99977,1.0,0.998535,0.999267
1,PCA Reconstruction,0.993035,1.0,0.95563,0.977312


# OCSVM

In [33]:
from sklearn.svm import OneClassSVM

In [34]:
ocsvm = OneClassSVM(kernel='rbf', gamma=0.01, nu=0.01)

batch_size = 1000
n_batches = 1# int(np.ceil(len(normal_data) / batch_size))

for i in tqdm(range(n_batches), desc="Training OCSVM"):
    start = i * batch_size
    end = min((i + 1) * batch_size, len(normal_data))
    batch_data = normal_data[start:end]
    
    if i == 0:
        ocsvm.fit(batch_data)
    else:
        ocsvm.fit(np.vstack((ocsvm.support_vectors_, batch_data)))

Training OCSVM: 100%|██████████| 1/1 [00:00<00:00, 467.75it/s]


In [37]:
y_pred_test_ocsvm = ocsvm.predict(X_test)
y_pred_test_ocsvm = np.where(y_pred_test_ocsvm == 1, 'normal', 'anomaly')
y_test_converted_ocsvm = np.where(y_test == 7, 'normal', 'anomaly')

In [38]:
evaluate_model("ocsvm", y_test_converted_ocsvm, y_pred_test_ocsvm)

{'model': ['ocsvm'],
 'accuracy': [0.9966819973718791],
 'precision': [1.0],
 'recall': [0.97886144830473],
 'f1': [0.9893178212585934]}

In [50]:
ocsvm_df = pd.DataFrame(evaluate_model("ocsvm", y_test_converted_ocsvm, y_pred_test_ocsvm)) 
ocsvm_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,ocsvm,0.998127,1.0,0.98807,0.993999


# LOF

In [19]:
from sklearn.neighbors import LocalOutlierFactor

In [20]:
lof_start = time()

lof = LocalOutlierFactor(n_neighbors=50, contamination=0.001, novelty=True)
lof.fit(X_train)

print(f"seconds: {time() - lof_start}")

seconds: 29.015689849853516


In [21]:
y_pred_test_lof = lof.predict(X_test)
y_pred_test_lof = np.where(y_pred_test_lof == 1, 'normal', 'anomaly')
y_test_converted_lof = np.where(y_test == 1, 'normal', 'anomaly')

In [22]:
evaluate_model("lof", y_test_converted_lof, y_pred_test_lof)

{'model': ['lof'],
 'accuracy': [0.07158344283837056],
 'precision': [0.07033126089674002],
 'recall': [1.0],
 'f1': [0.1314196145926176]}

In [91]:
lof_df = pd.DataFrame(evaluate_model("lof", y_test_converted_lof, y_pred_test_lof)) 
lof_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,lof,0.15726,0.156808,0.998116,0.271035


# IF

In [23]:
from sklearn.ensemble import IsolationForest

In [24]:
if_start = time()

iforest = IsolationForest(n_estimators=200, max_samples=128, contamination=0.001, random_state=42)
iforest.fit(X_train)

print(f"seconds: {time() - if_start}")

seconds: 21.571798086166382


In [25]:
y_pred_test_if = iforest.predict(X_test)
y_pred_test_if = np.where(y_pred_test_if == 1, 'normal', 'anomaly')
y_test_converted_if = np.where(y_test == 7, 'normal', 'anomaly')

In [26]:
evaluate_model("if", y_test_converted_if, y_pred_test_if)

{'model': ['if'],
 'accuracy': [0.15653745072273326],
 'precision': [0.1566043316791008],
 'recall': [0.9972791963164503],
 'f1': [0.27070018463286466]}

In [129]:
if_df = pd.DataFrame(evaluate_model("if", y_test_converted_if, y_pred_test_if)) 
if_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,if,0.155946,0.156105,0.993512,0.269816


# Results

In [130]:
results_df = pd.concat([ocsvm_df, lof_df, if_df], ignore_index=True)
results_df.to_csv('results/results_wustl.csv', index=False)

results_df

Unnamed: 0,model,accuracy,precision,recall,f1
0,ocsvm,0.998127,1.0,0.98807,0.993999
1,lof,0.15726,0.156808,0.998116,0.271035
2,if,0.155946,0.156105,0.993512,0.269816
