# PyOD models comparison (LOF, KNN, iForest, GMM, OCSVM)

In [1]:
import glob
import os
import pathlib
import pickle
import sys
import warnings

import IPython.display as ipd
import ipywidgets as widgets
import librosa
import librosa.core
import librosa.display
import librosa.feature
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import scipy
import yaml
from dataset import Mimii_due, Toyadmos
from ipywidgets import interact, interact_manual
from keras.layers import Dense, Input
from keras.models import Model
from pyod.models.gmm import GMM
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.utils.data import evaluate_print, generate_data
from pyod.utils.example import visualize
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# from import
from tqdm import tqdm

import preprocess as preproc


########################################################################


########################################################################
# import additional python-library
########################################################################


warnings.filterwarnings("ignore")

In [2]:
import torch
from torch import nn
from torch.autograd import Variable
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset, TensorDataset

## 1D Feature representation - amplitude values

### Dataset MIMII_DUE

In [3]:
# train data dir
target_dir = r"datasets\MIMII_DUE\dev_data\gearbox"
section_name = "section_00"
dir_name_train = r"\train"
# target_test data dir
dir_name_test = r"\target_test"

In [4]:
dataset_train_mimii_ts = Mimii_due(
    target_dir, section_name, dir_name_train, extraction_type="amplitude"
)
dataset_test_mimii_ts = Mimii_due(
    target_dir, section_name, dir_name_test, extraction_type="amplitude"
)

100%|██████████████████████████████████████████████████████████████████████████████| 1004/1004 [00:34<00:00, 29.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:06<00:00, 29.26it/s]


In [5]:
X_train_mimii_ts, y_train_mimii_ts = (
    dataset_train_mimii_ts.data,
    dataset_train_mimii_ts.labels,
)
X_test_mimii_ts, y_test_mimii_ts = (
    dataset_test_mimii_ts.data,
    dataset_test_mimii_ts.labels,
)
X_train_mimii_ts.shape, X_test_mimii_ts.shape, y_train_mimii_ts.shape, y_test_mimii_ts.shape

((1004, 160000), (204, 160000), (1004,), (204,))

In [6]:
# Anomaly data ratio
contamination_mimii = np.round(y_test_mimii_ts.sum() / y_test_mimii_ts.shape, 2)
print(f"Mimii_due_anomaly ratio = {contamination_mimii[0]}")
contamination_mimii = 0.4

Mimii_due_anomaly ratio = 0.53


### Dataset ToyAdmos2

In [7]:
# data dir
target_dir_toyadm = r"datasets\ToyAdmos2"
dir_name_toyadm_anomaly = r"\toyad2_car_A_anomaly"
dir_name_toyadm_normal = r"\toyad2_car_A1_normal"

In [8]:
dataset_toy = Toyadmos(
    target_dir_toyadm,
    dir_name_toyadm_normal,
    dir_name_toyadm_anomaly,
    extraction_type="amplitude",
)

100%|██████████████████████████████████████████████████████████████████████████████| 3545/3545 [11:32<00:00,  5.12it/s]


In [9]:
# Train - test - val stratified split
(
    X_train_toy_ts,
    X_test_toy_ts,
    X_val_toy_ts,
    y_train_toy_ts,
    y_test_toy_ts,
    y_val_toy_ts,
) = preproc.mix_data([dataset_toy.data], [dataset_toy.labels])

In [10]:
X_train_toy_ts.shape, X_test_toy_ts.shape, X_val_toy_ts.shape, y_train_toy_ts.shape, y_test_toy_ts.shape, y_val_toy_ts.shape

((2871, 192000), (319, 192000), (355, 192000), (2871,), (319,), (355,))

In [11]:
# Anomaly data ratio
contamination_toy = np.round(y_test_toy_ts.sum() / y_test_toy_ts.shape, 2)
print(f"ToyAdmos anomaly ratio = {contamination_toy[0]}")

ToyAdmos anomaly ratio = 0.23


In [12]:
def anomaly_detection(model_name, X_train, X_test, y_train, y_test):

    clf = models[model_name]
    clf.fit(X_train)

    # get the prediction labels and outlier scores of the training data
    y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_test)  # outlier scores

    # metrics
    accuracy = metrics.accuracy_score(y_test, y_test_pred)
    precision = metrics.precision_score(y_test, y_test_pred)
    recall = metrics.recall_score(y_test, y_test_pred)
    f1_score = metrics.f1_score(y_test, y_test_pred)
    scores = pd.DataFrame(
        [
            {
                "Dataset": None,
                "Extraction_type": None,
                "Model_name": model_name,
                "Accuracy": accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1_score": f1_score,
            }
        ]
    )
    # evaluate and print the results
    report = metrics.classification_report(y_test, y_test_pred)
    # print(f'On Test Data:')
    evaluate_print(model_name, y_test, y_test_scores)
    # print(f'\n{report} \n')

    return scores

In [13]:
# Key algorithms, PyOD implementation
def PyOD_classification_report(
    X_train,
    X_test,
    y_train,
    y_test,
    dataset=None,
    extraction_type="aggregate_MFCC",
    contamination=0.1,
):
    """
    dataset : str
        'MIMII_DUE'
        'ToyAdm'
    extraction_type : str
        'aggregate_MFCC' - by default
        'amplitude' - original signal, amplitude values timeseries

    """
    models = {
        "IForest": IForest(
            behaviour="old",
            contamination=contamination,
            max_features=max(1, int(X_train.shape[1] // 2)),
            max_samples="auto",
            n_estimators=10,
            n_jobs=-1,
            random_state=42,
            verbose=0,
        ),
        "LOF": LOF(contamination=contamination),
        "KNN": KNN(contamination=contamination),
        "GMM": GMM(contamination=contamination),
        "OCSVM": OCSVM(contamination=0.1),
    }

    pyod_models = pd.DataFrame(
        columns={
            "Dataset",
            "Extraction_type",
            "Model_name",
            "Accuracy",
            "Precision",
            "Recall",
            "F1_score",
        }
    )
    for model_name, model in models.items():
        if extraction_type == "amplitude" and model_name in ["KNN", "GMM"]:
            continue

        clf = models[model_name]
        clf.fit(X_train)

        # get the prediction on the test data
        y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
        y_test_scores = clf.decision_function(X_test)  # outlier scores

        # metrics
        accuracy = metrics.accuracy_score(y_test, y_test_pred)
        precision = metrics.precision_score(y_test, y_test_pred)
        recall = metrics.recall_score(y_test, y_test_pred)
        f1_score = metrics.f1_score(y_test, y_test_pred)
        scores = pd.DataFrame(
            [
                {
                    "Dataset": None,
                    "Extraction_type": None,
                    "Model_name": model_name,
                    "Accuracy": accuracy,
                    "Precision": precision,
                    "Recall": recall,
                    "F1_score": f1_score,
                }
            ]
        )
        pyod_models = pyod_models.append(scores)
        pyod_models["Dataset"] = dataset
        pyod_models["Extraction_type"] = extraction_type

    return pyod_models

In [14]:
# MIMII_DUE_ts
pyod_mimii_ts = PyOD_classification_report(
    X_train_mimii_ts,
    X_test_mimii_ts,
    y_train_mimii_ts,
    y_test_mimii_ts,
    dataset="MIMII_DUE",
    extraction_type="amplitude",
    contamination=contamination_mimii,
)
pyod_mimii_ts

Unnamed: 0,Precision,F1_score,Model_name,Accuracy,Dataset,Recall,Extraction_type
0,0.564356,0.545455,IForest,0.534314,MIMII_DUE,0.527778,amplitude
0,0.608696,0.682927,LOF,0.617647,MIMII_DUE,0.777778,amplitude
0,0.583333,0.666667,OCSVM,0.588235,MIMII_DUE,0.777778,amplitude


In [15]:
# ToyAdmos_ts
pyod_toy_ts = PyOD_classification_report(
    X_train_toy_ts,
    X_test_toy_ts,
    y_train_toy_ts,
    y_test_toy_ts,
    dataset="ToyAdm",
    extraction_type="amplitude",
    contamination=contamination_toy,
)
pyod_toy_ts

Unnamed: 0,Precision,F1_score,Model_name,Accuracy,Dataset,Recall,Extraction_type
0,0.275362,0.269504,IForest,0.677116,ToyAdm,0.263889,amplitude
0,0.275362,0.269504,LOF,0.677116,ToyAdm,0.263889,amplitude
0,0.655172,0.376238,OCSVM,0.802508,ToyAdm,0.263889,amplitude


## aggregate_MFCC

In [16]:
# MIMII dataset  with aggregate_MFCC features
dataset_train_mimii_mfcc = Mimii_due(
    target_dir, section_name, dir_name_train, extraction_type="aggregate_MFCC"
)
dataset_test_mimii_mfcc = Mimii_due(
    target_dir, section_name, dir_name_test, extraction_type="aggregate_MFCC"
)

100%|██████████████████████████████████████████████████████████████████████████████| 1004/1004 [02:05<00:00,  8.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:25<00:00,  8.04it/s]


In [17]:
# MIMII train - test
X_train_mimii_mfcc, y_train_mimii_mfcc = (
    dataset_train_mimii_mfcc.data,
    dataset_train_mimii_mfcc.labels,
)
X_test_mimii_mfcc, y_test_mimii_mfcc = (
    dataset_test_mimii_mfcc.data,
    dataset_test_mimii_mfcc.labels,
)
X_train_mimii_mfcc.shape, X_test_mimii_mfcc.shape, y_train_mimii_mfcc.shape, y_test_mimii_mfcc.shape

((1004, 165), (204, 165), (1004,), (204,))

In [18]:
# ToyAdm dataset with aggregate_MFCC features
dataset_toy_mfcc = Toyadmos(
    target_dir_toyadm,
    dir_name_toyadm_normal,
    dir_name_toyadm_anomaly,
    extraction_type="aggregate_MFCC",
)

100%|██████████████████████████████████████████████████████████████████████████████| 3545/3545 [18:01<00:00,  3.28it/s]


In [19]:
# Train - test - val stratified split
(
    X_train_toy_mfcc,
    X_test_toy_mfcc,
    X_val_toy_mfcc,
    y_train_toy_mfcc,
    y_test_toy_mfcc,
    y_val_toy_mfcc,
) = preproc.mix_data([dataset_toy_mfcc.data], [dataset_toy_mfcc.labels])
X_train_toy_mfcc.shape, X_test_toy_mfcc.shape, X_val_toy_mfcc.shape, y_train_toy_mfcc.shape, y_test_toy_mfcc.shape, y_val_toy_mfcc.shape

((2871, 165), (319, 165), (355, 165), (2871,), (319,), (355,))

In [20]:
# MIMII_DUE_mfcc
pyod_mimii_mfcc = PyOD_classification_report(
    X_train_mimii_mfcc,
    X_test_mimii_mfcc,
    y_train_mimii_mfcc,
    y_test_mimii_mfcc,
    dataset="MIMII_DUE",
    extraction_type="aggregate_MFCC",
    contamination=contamination_mimii,
)
pyod_mimii_mfcc

Unnamed: 0,Precision,F1_score,Model_name,Accuracy,Dataset,Recall,Extraction_type
0,0.639535,0.56701,IForest,0.588235,MIMII_DUE,0.509259,aggregate_MFCC
0,0.663717,0.678733,LOF,0.651961,MIMII_DUE,0.694444,aggregate_MFCC
0,0.504274,0.524444,KNN,0.47549,MIMII_DUE,0.546296,aggregate_MFCC
0,0.680851,0.771084,GMM,0.720588,MIMII_DUE,0.888889,aggregate_MFCC
0,0.529412,0.692308,OCSVM,0.529412,MIMII_DUE,1.0,aggregate_MFCC


In [21]:
# ToyAdmos_mfcc
pyod_toy_mfcc = PyOD_classification_report(
    X_train_toy_mfcc,
    X_test_toy_mfcc,
    y_train_toy_mfcc,
    y_test_toy_mfcc,
    dataset="ToyAdm",
    extraction_type="aggregate_MFCC",
    contamination=contamination_toy,
)
pyod_toy_mfcc

Unnamed: 0,Precision,F1_score,Model_name,Accuracy,Dataset,Recall,Extraction_type
0,0.736842,0.756757,IForest,0.887147,ToyAdm,0.777778,aggregate_MFCC
0,0.4375,0.460526,LOF,0.742947,ToyAdm,0.486111,aggregate_MFCC
0,0.6125,0.644737,KNN,0.830721,ToyAdm,0.680556,aggregate_MFCC
0,0.833333,0.866667,GMM,0.937304,ToyAdm,0.902778,aggregate_MFCC
0,0.225705,0.368286,OCSVM,0.225705,ToyAdm,1.0,aggregate_MFCC
