In [21]:
from VAE import VAE
import sys

sys.path.append("../../")
from utils import load_csv, dataloader, load_save_models
import torch
from torchvision.transforms import v2
from preprocessing.tiff_handling import handle_tiff
import plotly.express as px
from stats import evaluate_perf_utils
import numpy as np

In [22]:
batch_size = 436
input_dim = 4800
hidden_dim = 4000
latent_dim = 320

In [23]:
model = VAE(input_dim,hidden_dim,latent_dim)
VAE.load_state_dict(model,state_dict=torch.load('./vae.pth'))


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



<All keys matched successfully>

# load in data necessary for computing the threshold

In [24]:
set = load_csv.load_pandas()
train, val, test = load_csv.split_data(set)

print(f'outlier test validation: {any(val["classification"] != 1)}')


Columns (91,94,209,213) have mixed types. Specify dtype option on import or set low_memory=False.



outlier test validation: False


In [25]:
transform = v2.Compose(
    [
        v2.ToImage(),
        v2.ToDtype(torch.float32, scale=True),
        v2.Resize((60, 80)),
        v2.ToTensor(),
        v2.Lambda(
            lambda x: (x.view(-1) - torch.min(x)) / (torch.max(x) - torch.min(x))
        ),
    ]
)

val_set = dataloader.ImagePathDataset(val, transform)

val_dataloader = torch.utils.data.DataLoader(
    val_set, batch_size=batch_size, shuffle=True
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


The transform `ToTensor()` is deprecated and will be removed in a future release. Instead, please use `v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])`.Output is equivalent up to float precision.



VAE(
  (encoder): Sequential(
    (0): Linear(in_features=4800, out_features=4000, bias=True)
    (1): SiLU()
    (2): Linear(in_features=4000, out_features=2000, bias=True)
    (3): SiLU()
    (4): Linear(in_features=2000, out_features=1000, bias=True)
    (5): SiLU()
    (6): Linear(in_features=1000, out_features=500, bias=True)
    (7): SiLU()
    (8): Linear(in_features=500, out_features=640, bias=True)
  )
  (softplus): Softplus(beta=1.0, threshold=20.0)
  (decoder): Sequential(
    (0): Linear(in_features=320, out_features=500, bias=True)
    (1): SiLU()
    (2): Linear(in_features=500, out_features=1000, bias=True)
    (3): SiLU()
    (4): Linear(in_features=1000, out_features=2000, bias=True)
    (5): SiLU()
    (6): Linear(in_features=2000, out_features=4000, bias=True)
    (7): SiLU()
    (8): Linear(in_features=4000, out_features=4800, bias=True)
    (9): Sigmoid()
  )
)

In [26]:
test["image"] = test["file_names"].apply(lambda x: transform(handle_tiff(x)))
test

Unnamed: 0,classification,file_names,label,image
51,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.0816), tensor(0.0795), tensor(0.0970..."
6622,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1591), tensor(0.1353), tensor(0.1500..."
9400,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1166), tensor(0.1164), tensor(0.1190..."
3022,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1287), tensor(0.1243), tensor(0.1208..."
1224,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1595), tensor(0.1544), tensor(0.1538..."
...,...,...,...,...
153,-1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.0396), tensor(0.0453), tensor(0.0609..."
9812,2,/home/bendm/machine_learning/project/majority-...,Doublet,"[tensor(0.1147), tensor(0.1244), tensor(0.1273..."
7403,2,/home/bendm/machine_learning/project/majority-...,Doublet,"[tensor(0.1007), tensor(0.1204), tensor(0.1291..."
7724,-1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1171), tensor(0.0756), tensor(0.1007..."


In [27]:
# optimize percentile of error threshold
import optuna

truth = [1 if value == 1 else 0 for value in test["classification"]]


def objective(trial):
    perc = trial.suggest_int("percentile", 0, 100)
    model.extract_error_threshold(
        val_dataloader,
        method="percentile",
        value=perc,
    )
    output = model.predict(test)

    perf_metrics = evaluate_perf_utils.evaluate_performance(truth, output["prediction"])
    return perf_metrics["f1"]


study = optuna.create_study(direction="maximize")
#study.optimize(objective, n_trials=100)

[I 2024-12-11 00:20:19,120] A new study created in memory with name: no-name-934c0d9c-6f8b-4944-833f-6058b623ac80


In [28]:
#print(f"best percentile: {study.best_params['percentile']}")
#print(f"best f1: {study.best_value}")

## perform predictions on the test set

In [29]:
model.extract_error_threshold(val_dataloader, method="percentile", value=85)
output = model.predict(test)
output

Computing threshold: 100%|██████████| 4/4 [00:02<00:00,  1.41it/s]


Unnamed: 0,classification,file_names,label,image,prediction,reconstruction_error
51,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.0816), tensor(0.0795), tensor(0.0970...",1,0.190615
6622,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1591), tensor(0.1353), tensor(0.1500...",1,0.272947
9400,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1166), tensor(0.1164), tensor(0.1190...",0,0.315536
3022,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1287), tensor(0.1243), tensor(0.1208...",0,0.330162
1224,1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1595), tensor(0.1544), tensor(0.1538...",1,0.248079
...,...,...,...,...,...,...
153,-1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.0396), tensor(0.0453), tensor(0.0609...",1,0.297780
9812,2,/home/bendm/machine_learning/project/majority-...,Doublet,"[tensor(0.1147), tensor(0.1244), tensor(0.1273...",0,0.316925
7403,2,/home/bendm/machine_learning/project/majority-...,Doublet,"[tensor(0.1007), tensor(0.1204), tensor(0.1291...",1,0.307266
7724,-1,/home/bendm/machine_learning/project/majority-...,Singlet,"[tensor(0.1171), tensor(0.0756), tensor(0.1007...",0,0.365307


## plot out the metrics

In [30]:


truth = [ 0 if value ==1 else 1 for value in  output['classification']]
predictions = [0 if val ==1 else 1 for val in  output['prediction']]



perf_metrics = evaluate_perf_utils.evaluate_performance(truth, predictions)
tn = perf_metrics['tn']
fp = perf_metrics['fp']
fn = perf_metrics['fn']
tp = perf_metrics['tp']
precision = perf_metrics['precision']
recall = perf_metrics['recall']
f1 = perf_metrics['f1']


perf_metrics

{'tn': np.int64(1234),
 'fp': np.int64(223),
 'fn': np.int64(334),
 'tp': np.int64(1123),
 'precision': np.float64(0.8343239227340268),
 'recall': np.float64(0.7707618393960192),
 'f1': np.float64(0.8012843382090618)}

In [31]:
accuracy = (tp + tn) / (tp + tn + fp + fn)
print(f'accuracy: {accuracy}')

accuracy: 0.8088538091969801


In [32]:
predictions_partial_scans = output[output["classification"] > 1]["prediction"].tolist()
predictions_partial_scans = [0 if val == 1 else 1 for val in predictions_partial_scans]

partial_truths = np.ones(len(predictions_partial_scans))

parital_perf_metrics = evaluate_perf_utils.evaluate_performance(
    partial_truths, predictions_partial_scans
)
parital_perf_metrics

# print out percentages
print(parital_perf_metrics)
print(f"tp: {parital_perf_metrics['tp']/len(partial_truths)}")
print(f"fn: {parital_perf_metrics['fn']/len(partial_truths)}")

{'tn': np.int64(0), 'fp': np.int64(0), 'fn': np.int64(128), 'tp': np.int64(304), 'precision': np.float64(1.0), 'recall': np.float64(0.7037037037037037), 'f1': np.float64(0.8260869565217391)}
tp: 0.7037037037037037
fn: 0.2962962962962963


In [33]:
predictions_partial_scans = output[output["classification"] < 1]["prediction"].tolist()
predictions_partial_scans = [0 if val == 1 else 1 for val in predictions_partial_scans]

partial_truths = np.ones(len(predictions_partial_scans))

parital_perf_metrics = evaluate_perf_utils.evaluate_performance(
    partial_truths, predictions_partial_scans
)
parital_perf_metrics

# print out percentages
print(parital_perf_metrics)
print(f"tp: {parital_perf_metrics['tp']/len(partial_truths)}")
print(f"fn: {parital_perf_metrics['fn']/len(partial_truths)}")

{'tn': np.int64(0), 'fp': np.int64(0), 'fn': np.int64(206), 'tp': np.int64(819), 'precision': np.float64(1.0), 'recall': np.float64(0.7990243902439025), 'f1': np.float64(0.8882863340563991)}
tp: 0.7990243902439025
fn: 0.20097560975609757


## plot out the confusion matrix and the ROC curve

In [34]:
# Confusion matrix
import plotly.graph_objects as go
import numpy as np

cm = np.array([[tn, fp], [fn, tp]])

total = cm.sum()
cm_percentage = cm / total * 100

# Create a heatmap for the confusion matrix with percentages
fig = go.Figure(
    data=go.Heatmap(
        z=cm_percentage,
        x=["Pred: 0", "Pred: 1"],
        y=["True: 0", "True: 1"],
        colorscale="Peach",  # A more pleasant color palette
        colorbar=dict(title="Percentage", tickvals=[0, 100]),
        showscale=True,
    )
)

# Add titles, labels, and precision, recall, F1 score as annotation
fig.update_layout(
    title="Confusion Matrix (Percentage)",
    xaxis_title="Predicted Labels",
    yaxis_title="True Labels",
    width=500,  # Set the width of the plot
    height=400,  # Set the height of the plot
)

# Add text annotations to show percentage values in each cell
for i in range(cm_percentage.shape[0]):
    for j in range(cm_percentage.shape[1]):
        fig.add_annotation(
            x=j,
            y=i,
            text=f"{cm_percentage[i, j]:.2f}%",
            showarrow=False,
            font=dict(size=14, color="black"),
            align="center",
        )

# Show the plot
fig.show()

In [35]:
import torch.nn.functional as F

model(test["image"][9997])
px.imshow(test["image"][9997].view(60, 80))

reconstructed = (
    (model(test["image"][9997])["x_reconstructed"]).detach().numpy().reshape(60, 80)
)
px.imshow(reconstructed)
model.eval()
inlier = test["image"][9400]
outlier = test["image"][9997]
inlier_forwards = model(inlier)
outlier_forwards = model(outlier)
inlier_reconstructed = inlier_forwards["x_reconstructed"]
outlier_reconstructed = outlier_forwards["x_reconstructed"]

print(
    f"reconstruction error for inlier: {F.binary_cross_entropy(inlier_reconstructed, inlier,reduction='none').sum(dim=(-1))}"
)
print(
    f"reconstruction error for outlier: {F.binary_cross_entropy(outlier_reconstructed, outlier,reduction='none').sum(dim=(-1))}"
)
# show both images
fig = px.imshow(inlier.view(60, 80))
fig.show()
fig = px.imshow(inlier_reconstructed.detach().numpy().reshape(60, 80))
fig.show()
fig = px.imshow(outlier.view(60, 80))
fig.show()
fig = px.imshow(outlier_reconstructed.detach().numpy().reshape(60, 80))
fig.show()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed