# Imports

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import math

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import optuna

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [56]:
X_train = np.load('data/x_train.npy')
y_train = np.load('data/y_train.npy')

X_test = np.load('data/x_test.npy')
y_test = np.load('data/y_test.npy')

X_val = np.load('data/x_val.npy')
y_val = np.load('data/y_val.npy')

# DT Feature Selection

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score
import time

In [6]:
start_time = time.time()

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

print(f"Seconds: {time.time() - start_time}")

Seconds: 17.036347150802612


In [7]:
feature_importances = clf.feature_importances_
feature_importances = np.array(feature_importances)

feature_importances

array([4.25649439e-04, 2.27843747e-01, 5.67413500e-02, 6.01118228e-03,
       4.83521855e-03, 1.84897351e-02, 0.00000000e+00, 1.62580684e-02,
       1.94414533e-03, 1.00301575e-02, 1.15674132e-01, 1.05869682e-02,
       0.00000000e+00, 0.00000000e+00, 1.37696206e-02, 1.90166277e-02,
       3.01795020e-05, 5.64468352e-03, 3.68200803e-02, 1.22150200e-02,
       1.14292778e-03, 5.27178771e-03, 4.29308892e-03, 4.25429407e-04,
       2.16374501e-02, 3.65831527e-04, 1.16651471e-02, 1.73052354e-03,
       2.06840978e-02, 1.08066153e-03, 4.66241520e-03, 1.15952394e-03,
       3.86447118e-04, 5.47284838e-04, 1.09945242e-03, 5.23449920e-04,
       5.66000588e-04, 1.74863835e-03, 5.48326183e-04, 3.05622869e-04,
       3.60064823e-03, 5.41276029e-03, 1.20572715e-03, 6.35955422e-04,
       9.79870090e-03, 6.59661812e-04, 9.39793782e-03, 8.33772252e-03,
       2.19886657e-01, 7.66271495e-04, 1.39418605e-05, 9.75958288e-03,
       4.35042735e-02, 1.00931593e-04, 3.21843840e-04, 1.71627707e-04,
      

In [8]:
threshold = 0.001
selected_indices = np.where(feature_importances > threshold)[0]
print(f"Number of features selected: {len(selected_indices)}")

Number of features selected: 37


In [9]:
X_train_selected = X_train[:, selected_indices]
X_test_selected = X_test[:, selected_indices]
X_val_selected = X_val[:, selected_indices]

In [57]:
y_train = np.where(y_train == 11, 0, 1)
y_test = np.where(y_test == 11, 0, 1)
y_val = np.where(y_val == 11, 0, 1)

# PyDeepInsight

In [11]:
from pyDeepInsight import ImageTransformer

In [12]:
it = ImageTransformer(
    pixels=8,
    feature_extractor='tsne',
    discretization='lsa'
)

In [13]:
it.fit(X_train_selected)
X_train_images = it.transform(X_train_selected, 'pytorch')

X_test_images = it.transform(X_test_selected, 'pytorch')

X_val_images = it.transform(X_val_selected, 'pytorch')

# VAE

In [14]:
num_samples, channels, img_height, img_width = X_train_images.shape
latent_dim = 16

In [15]:
class Encoder(nn.Module):
    def __init__(self, img_channels=1, feature_dim=32, latent_dim=2):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(img_channels, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.relu = nn.ReLU()
        
        self.fc1 = nn.Linear(32 * 2 * 2, feature_dim)
        self.fc_mu = nn.Linear(feature_dim, latent_dim)
        self.fc_logvar = nn.Linear(feature_dim, latent_dim)

    def forward(self, x):
        x = self.relu(self.conv1(x))  # Output: (batch_size, 16, 8, 8)
        x = self.pool(x)              # Output: (batch_size, 16, 4, 4)
        x = self.relu(self.conv2(x))  # Output: (batch_size, 32, 4, 4)
        x = self.pool(x)              # Output: (batch_size, 32, 2, 2)
        x = x.view(x.size(0), -1)     # Flatten to (batch_size, 128)
        x = self.relu(self.fc1(x))
        mu = self.fc_mu(x)
        logvar = self.fc_logvar(x)
        return mu, logvar

In [16]:
class Decoder(nn.Module):
    def __init__(self, img_channels=1, feature_dim=32, latent_dim=2):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(latent_dim, feature_dim)
        self.fc2 = nn.Linear(feature_dim, 32 * 2 * 2)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        self.deconv1 = nn.ConvTranspose2d(32, 16, kernel_size=3, stride=1, padding=1)
        self.deconv2 = nn.ConvTranspose2d(16, img_channels, kernel_size=3, stride=1, padding=1)
        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')

    def forward(self, z):
        x = self.relu(self.fc1(z))           # Output: (batch_size, feature_dim)
        x = self.relu(self.fc2(x))           # Output: (batch_size, 128)
        x = x.view(x.size(0), 32, 2, 2)      # Reshape to (batch_size, 32, 2, 2)
        x = self.upsample(x)                 # Upsample to (batch_size, 32, 4, 4)
        x = self.relu(self.deconv1(x))       # Output: (batch_size, 16, 4, 4)
        x = self.upsample(x)                 # Upsample to (batch_size, 16, 8, 8)
        x = self.sigmoid(self.deconv2(x))    # Output: (batch_size, img_channels, 8, 8)
        return x

In [17]:
class VAE(nn.Module):
    def __init__(self, img_channels=1, feature_dim=32, latent_dim=2):
        super(VAE, self).__init__()
        self.encoder = Encoder(img_channels, feature_dim, latent_dim)
        self.decoder = Decoder(img_channels, feature_dim, latent_dim)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        mu, logvar = self.encoder(x)
        z = self.reparameterize(mu, logvar)
        reconstructed = self.decoder(z)
        return reconstructed, mu, logvar

In [18]:
def loss_function(reconstructed, original, mu, logvar):
    reconstruction_loss = F.mse_loss(reconstructed, original, reduction='sum')
    kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    total_loss = reconstruction_loss + kl_loss
    return total_loss, reconstruction_loss, kl_loss

In [19]:
model = VAE(img_channels=3, feature_dim=32, latent_dim=16).to(device)
model.load_state_dict(torch.load("deepinsight_vae_normal.pth"))

<All keys matched successfully>

# Extract

In [21]:
normal_indices = np.where(y_train == 0)[0]
X_train_normal = X_train_images[normal_indices]
y_train_normal = y_train[normal_indices]

X_train_tensor = torch.tensor(X_train_normal, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_images, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_images, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor)
test_dataset = TensorDataset(X_test_tensor, torch.tensor(y_test, dtype=torch.long))
val_dataset = TensorDataset(X_val_tensor, torch.tensor(y_val, dtype=torch.long))

batch_size = 32 
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

  X_train_tensor = torch.tensor(X_train_normal, dtype=torch.float32)
  X_test_tensor = torch.tensor(X_test_images, dtype=torch.float32)
  X_val_tensor = torch.tensor(X_val_images, dtype=torch.float32)


In [32]:
def extract_latent_features(model, data_loader, device):
    model.eval()  
    latent_features = []  

    with torch.no_grad(): 
        for batch in tqdm(data_loader, total=len(data_loader), desc="Extracting features"):
            if len(batch) == 2:
                data, _ = batch  
            else:
                (data,) = batch  
            
            data = data.to(device)
            mu, logvar = model.encoder(data)
            latent_features.append(mu.cpu().numpy())

    latent_features = np.concatenate(latent_features, axis=0)
    
    return latent_features


In [33]:
train_latent_features = extract_latent_features(model, train_loader, device)
val_latent_features = extract_latent_features(model, val_loader, device)
test_latent_features = extract_latent_features(model, test_loader, device)

Extracting features: 100%|██████████| 8124/8124 [00:04<00:00, 1724.54it/s]
Extracting features: 100%|██████████| 3506/3506 [00:02<00:00, 1504.62it/s]
Extracting features: 100%|██████████| 4383/4383 [00:02<00:00, 1577.30it/s]


# SGDOCSVM

In [39]:
from sklearn.linear_model import SGDOneClassSVM

In [67]:
def objective(trial):

    nu = trial.suggest_float('nu', 0.01, 0.5)  
    learning_rate = trial.suggest_categorical('learning_rate', ['constant', 'optimal', 'invscaling', 'adaptive'])
    eta0 = trial.suggest_float('eta0', 1e-6, 0.5)
    power_t = trial.suggest_float('power_t', -3, 3)

    sgdocsvm = SGDOneClassSVM(nu=nu, learning_rate=learning_rate, eta0=eta0, power_t=power_t)
    sgdocsvm.fit(train_latent_features)

    val_predictions = sgdocsvm.predict(val_latent_features)
    val_true_labels = np.where(y_val == 0, 1, -1)

    f1 = f1_score(val_true_labels, val_predictions, pos_label=1, average='binary')

    return f1 

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

[I 2024-09-24 20:32:59,173] A new study created in memory with name: no-name-6d0d4bd6-374e-46b7-9329-055049158174
[I 2024-09-24 20:32:59,502] Trial 0 finished with value: 0.6337881022615536 and parameters: {'nu': 0.4869103480425797, 'learning_rate': 'invscaling', 'eta0': 0.11575431238591075, 'power_t': -1.444059365680435}. Best is trial 0 with value: 0.6337881022615536.
[I 2024-09-24 20:32:59,811] Trial 1 finished with value: 0.7336362353605193 and parameters: {'nu': 0.3126020299445455, 'learning_rate': 'constant', 'eta0': 0.3994792075097269, 'power_t': 1.382520354843236}. Best is trial 1 with value: 0.7336362353605193.
[I 2024-09-24 20:33:00,102] Trial 2 finished with value: 0.62873982184452 and parameters: {'nu': 0.2956580371733146, 'learning_rate': 'constant', 'eta0': 0.2861430527014301, 'power_t': 2.7622718216807023}. Best is trial 1 with value: 0.7336362353605193.
[I 2024-09-24 20:33:00,358] Trial 3 finished with value: 0.7336362353605193 and parameters: {'nu': 0.02058251022843269

Best Hyperparameters: {'nu': 0.07594240984017661, 'learning_rate': 'adaptive', 'eta0': 0.33465579021024966, 'power_t': 2.3600427945890616}


In [68]:
best_sgdocsvm = SGDOneClassSVM(nu=best_params['nu'], 
                               learning_rate=best_params['learning_rate'], 
                               eta0=best_params['eta0'], 
                               random_state=42)

best_sgdocsvm.fit(train_latent_features)

test_predictions = best_sgdocsvm.predict(test_latent_features)
test_true_labels = np.where(y_test == 0, 1, -1)

test_accuracy = accuracy_score(test_true_labels, test_predictions)
test_precision = precision_score(test_true_labels, test_predictions, pos_label=1, average='binary')
test_recall = recall_score(test_true_labels, test_predictions, pos_label=1, average='binary')
test_f1 = f1_score(test_true_labels, test_predictions, pos_label=1, average='binary')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Test Accuracy: 0.6894
Test Precision: 0.6572
Test Recall: 0.9695
Test F1 Score: 0.7834


# LOF

In [69]:
from sklearn.neighbors import LocalOutlierFactor

In [70]:
def objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 5, 50)  
    leaf_size = trial.suggest_int('leaf_size', 20, 50) 
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'chebyshev', 'minkowski'])  

    lof = LocalOutlierFactor(n_neighbors=n_neighbors, leaf_size=leaf_size, metric=metric, novelty=True)
    lof.fit(train_latent_features)

    val_predictions = lof.predict(val_latent_features)
    val_true_labels = np.where(y_val == 0, 1, -1)

    f1 = f1_score(val_true_labels, val_predictions, pos_label=1, average='binary')

    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

[I 2024-09-24 20:36:41,234] A new study created in memory with name: no-name-57949ad1-f577-4b23-a497-0d8bde8d301a
[I 2024-09-24 20:38:45,565] Trial 0 finished with value: 0.8874083066999188 and parameters: {'n_neighbors': 17, 'leaf_size': 38, 'metric': 'manhattan'}. Best is trial 0 with value: 0.8874083066999188.
[I 2024-09-24 20:40:49,583] Trial 1 finished with value: 0.8901131016746062 and parameters: {'n_neighbors': 19, 'leaf_size': 33, 'metric': 'manhattan'}. Best is trial 1 with value: 0.8901131016746062.
[I 2024-09-24 20:43:18,906] Trial 2 finished with value: 0.8310746167982435 and parameters: {'n_neighbors': 19, 'leaf_size': 35, 'metric': 'chebyshev'}. Best is trial 1 with value: 0.8901131016746062.
[I 2024-09-24 20:43:47,317] Trial 3 finished with value: 0.8365329745038501 and parameters: {'n_neighbors': 11, 'leaf_size': 49, 'metric': 'minkowski'}. Best is trial 1 with value: 0.8901131016746062.
[I 2024-09-24 20:45:53,774] Trial 4 finished with value: 0.8580930498920191 and pa

Best Hyperparameters: {'n_neighbors': 19, 'leaf_size': 33, 'metric': 'manhattan'}


In [71]:
best_lof = LocalOutlierFactor(n_neighbors=best_params['n_neighbors'], 
                              leaf_size=best_params['leaf_size'], 
                              metric=best_params['metric'], 
                              novelty=True)

best_lof.fit(train_latent_features)

test_predictions = best_lof.predict(test_latent_features)
test_true_labels = np.where(y_test == 0, 1, -1)

test_accuracy = accuracy_score(test_true_labels, test_predictions)
test_precision = precision_score(test_true_labels, test_predictions, pos_label=1, average='binary')
test_recall = recall_score(test_true_labels, test_predictions, pos_label=1, average='binary')
test_f1 = f1_score(test_true_labels, test_predictions, pos_label=1, average='binary')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Test Accuracy: 0.8650
Test Precision: 0.8466
Test Recall: 0.9367
Test F1 Score: 0.8894


# IF

In [72]:
from sklearn.ensemble import IsolationForest

In [73]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)  
    max_samples = trial.suggest_float('max_samples', 0.5, 1.0)  
    contamination = trial.suggest_float('contamination', 0.01, 0.5) 
    max_features = trial.suggest_float('max_features', 0.5, 1.0)

    iso_forest = IsolationForest(n_estimators=n_estimators, 
                                 max_samples=max_samples, 
                                 contamination=contamination, 
                                 max_features=max_features, 
                                 random_state=42)

    iso_forest.fit(train_latent_features)
    
    val_predictions = iso_forest.predict(val_latent_features)
    val_true_labels = np.where(y_val == 0, 1, -1)

    f1 = f1_score(val_true_labels, val_predictions, pos_label=1, average='binary')
    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

[I 2024-09-24 22:12:51,381] A new study created in memory with name: no-name-4e67f4a4-3bf1-438f-991a-c4b40e18989b
[I 2024-09-24 22:12:57,333] Trial 0 finished with value: 0.8910027809586127 and parameters: {'n_estimators': 60, 'max_samples': 0.56979034153423, 'contamination': 0.16097507744919545, 'max_features': 0.7810057267045972}. Best is trial 0 with value: 0.8910027809586127.
[I 2024-09-24 22:13:28,885] Trial 1 finished with value: 0.8577970370496002 and parameters: {'n_estimators': 285, 'max_samples': 0.6776615220685762, 'contamination': 0.219816013832347, 'max_features': 0.9643304618166728}. Best is trial 0 with value: 0.8910027809586127.
[I 2024-09-24 22:13:55,127] Trial 2 finished with value: 0.8560993915351006 and parameters: {'n_estimators': 243, 'max_samples': 0.989553994537104, 'contamination': 0.015236160020729465, 'max_features': 0.5456240045906047}. Best is trial 0 with value: 0.8910027809586127.
[I 2024-09-24 22:14:14,352] Trial 3 finished with value: 0.8034812506715375

Best Hyperparameters: {'n_estimators': 299, 'max_samples': 0.5346634458297145, 'contamination': 0.0605371917615878, 'max_features': 0.8340395267584781}


In [74]:
best_iso_forest = IsolationForest(n_estimators=best_params['n_estimators'], 
                                  max_samples=best_params['max_samples'], 
                                  contamination=best_params['contamination'], 
                                  max_features=best_params['max_features'], 
                                  random_state=42)

best_iso_forest.fit(train_latent_features)
test_predictions = best_iso_forest.predict(test_latent_features)
test_true_labels = np.where(y_test == 0, 1, -1)

test_accuracy = accuracy_score(test_true_labels, test_predictions)
test_precision = precision_score(test_true_labels, test_predictions, pos_label=1, average='binary')
test_recall = recall_score(test_true_labels, test_predictions, pos_label=1, average='binary')
test_f1 = f1_score(test_true_labels, test_predictions, pos_label=1, average='binary')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Test Accuracy: 0.9264
Test Precision: 0.9351
Test Recall: 0.9381
Test F1 Score: 0.9366


# PCA Reconstruction

In [75]:
from sklearn.decomposition import PCA

In [78]:
def reconstruction_error(X, pca):
    X_pca = pca.transform(X)
    X_reconstructed = pca.inverse_transform(X_pca)
    return np.mean((X - X_reconstructed) ** 2, axis=1)
    
def objective(trial):
    n_components = trial.suggest_int('n_components', 2, min(train_latent_features.shape[1], 50))
    percentile = trial.suggest_float('percentile', 90.0, 99.9)  

    pca = PCA(n_components=n_components, whiten=True, svd_solver='auto')

    normal_data_pca = pca.fit_transform(train_latent_features)
    normal_data_reconstructed = pca.inverse_transform(normal_data_pca)

    reconstruction_errors = np.mean((train_latent_features - normal_data_reconstructed) ** 2, axis=1)
    threshold = np.percentile(reconstruction_errors, percentile)
    val_reconstruction_errors = reconstruction_error(val_latent_features, pca)
    val_predictions = np.where(val_reconstruction_errors > threshold, -1, 1)

    val_true_labels = np.where(y_val == 0, 1, -1)

    f1 = f1_score(val_true_labels, val_predictions, pos_label=1, average='binary')

    return f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
print(f"Best Hyperparameters: {best_params}")

[I 2024-09-24 22:36:02,134] A new study created in memory with name: no-name-538de325-e4fc-4302-b20b-217c0f38c11b
[I 2024-09-24 22:36:02,263] Trial 0 finished with value: 0.8453731919528977 and parameters: {'n_components': 8, 'percentile': 94.3571235697011}. Best is trial 0 with value: 0.8453731919528977.
[I 2024-09-24 22:36:02,392] Trial 1 finished with value: 0.840554868229163 and parameters: {'n_components': 5, 'percentile': 93.66298739103162}. Best is trial 0 with value: 0.8453731919528977.
[I 2024-09-24 22:36:02,533] Trial 2 finished with value: 0.7448973596700469 and parameters: {'n_components': 3, 'percentile': 97.5783813785262}. Best is trial 0 with value: 0.8453731919528977.
[I 2024-09-24 22:36:02,669] Trial 3 finished with value: 0.7335198086600995 and parameters: {'n_components': 16, 'percentile': 97.44901626986382}. Best is trial 0 with value: 0.8453731919528977.
[I 2024-09-24 22:36:02,803] Trial 4 finished with value: 0.817567009173301 and parameters: {'n_components': 7, '

Best Hyperparameters: {'n_components': 9, 'percentile': 91.25247022845221}


In [79]:
best_pca = PCA(n_components=best_params['n_components'], whiten=True, svd_solver='auto')

normal_data_pca = best_pca.fit_transform(train_latent_features)
normal_data_reconstructed = best_pca.inverse_transform(normal_data_pca)

reconstruction_errors = np.mean((train_latent_features - normal_data_reconstructed) ** 2, axis=1)

threshold = np.percentile(reconstruction_errors, best_params['percentile'])

test_pca = best_pca.transform(test_latent_features)
test_reconstructed = best_pca.inverse_transform(test_pca)
test_reconstruction_errors = np.mean((test_latent_features - test_reconstructed) ** 2, axis=1)

test_predictions = np.where(test_reconstruction_errors > threshold, -1, 1) 

test_true_labels = np.where(y_test == 0, 1, -1)

test_accuracy = accuracy_score(test_true_labels, test_predictions)
test_precision = precision_score(test_true_labels, test_predictions, pos_label=1, average='binary')
test_recall = recall_score(test_true_labels, test_predictions, pos_label=1, average='binary')
test_f1 = f1_score(test_true_labels, test_predictions, pos_label=1, average='binary')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Test Accuracy: 0.8808
Test Precision: 0.8853
Test Recall: 0.9124
Test F1 Score: 0.8987
