<a href="https://colab.research.google.com/github/DrBlizzzz/Autoencoder/blob/main/Vae.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import torch

from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset

from torch.distributions import Normal

import pandas as pd
import numpy as np
import math

# import ray 
# from ray import tune
# from ray.tune.schedulers import ASHAScheduler
# from ray.tune import Stopper

import seaborn as sns
import matplotlib.pyplot as plt

import os

In [12]:
class DataCustomDataset():

  def __init__(self, path):

    self.dataset = pd.read_csv(path)

    self.lables = self.dataset['Label']
    self.data = self.dataset.drop('Label', axis=1)

    self.size = self.dataset.shape[1]

  def Split(self):

    X_train, X_test, y_train, y_test = train_test_split(self.data, self.lables, test_size=0.2)
    X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5)

    self.TrainData = (X_train, y_train)
    self.ValidationData = (X_valid, y_valid)
    self.TestData = (X_test, y_test)

  def FilterAnomaly(self):

    self.NormalTrainData = self.TrainData[0][self.TrainData[1] == 0]
    self.NormalValidationData = self.ValidationData[0][self.ValidationData[1] == 0]
  
  def CreateDataLoader(self, data, labels, batchSize):

    loader = 0

    if len(labels) != 0:

      tensor_X = torch.tensor(data.values)
      tensor_y = torch.tensor(labels.values)
      loader = DataLoader(TensorDataset(tensor_X, tensor_y), batchSize)

    else:

      tensor_X = torch.tensor(data.values)
      loader = DataLoader(TensorDataset(tensor_X), batchSize)

    return loader


In [13]:
batch_size = 512

dataset_creator = DataCustomDataset(path='/content/drive/MyDrive/Colab Notebooks/Philips.csv')

dataset_creator.Split()
dataset_creator.FilterAnomaly()

normal_train_loader_batched = dataset_creator.CreateDataLoader(data = dataset_creator.NormalTrainData, 
                                                       labels = list(), 
                                                       batchSize = batch_size)

validation_loader_batched = dataset_creator.CreateDataLoader(data = dataset_creator.ValidationData[0], 
                                                     labels = dataset_creator.ValidationData[1],
                                                     batchSize = batch_size)


validation_loader_unbatched = dataset_creator.CreateDataLoader(data = dataset_creator.ValidationData[0], 
                                                     labels = dataset_creator.ValidationData[1],
                                                     batchSize = 1)

normal_validation_loader_unbatched = dataset_creator.CreateDataLoader(data = dataset_creator.NormalValidationData, 
                                                            labels = list(),
                                                            batchSize = 1)

test_loader_unbatched = dataset_creator.CreateDataLoader(data = dataset_creator.TestData[0], 
                                               labels = dataset_creator.TestData[1],
                                               batchSize = 1)


In [14]:
class VaeEncoder(torch.nn.Module):

  def __init__(self, input_dense, latent_size):

    super().__init__()

    self.linear_1 = torch.nn.Linear(input_dense, 64)
    self.linear_2 = torch.nn.Linear(64, 32)

    self.linear_mu = torch.nn.Linear(32, latent_size)
    self.linear_z_log_var = torch.nn.Linear(32, latent_size)

    self.linear_z_log_var.weight.data.fill_(0)
    self.linear_z_log_var.bias.data.fill_(0)

    self.KL_loss = 0

    self.decoder = torch.nn.Sequential(
        torch.nn.Linear(latent_size, 32),
        torch.nn.ReLU(),
        torch.nn.Linear(32, 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, input_dense),
    )

  def forward(self, x):

    x = torch.nn.functional.relu(self.linear_1(x))
    x = torch.nn.functional.relu(self.linear_2(x))

    # print(x.shape)

    mu = self.linear_mu(x)
    z_log_var = self.linear_z_log_var(x)

    eps = Normal(0, 1).sample(mu.shape)
    z_sampled = mu + eps * torch.exp(z_log_var / 2)

    self.KL_loss = -0.5 * torch.sum( ( 1 + z_log_var - torch.square(mu) - torch.exp(z_log_var) ), dim = 1 )

    return self.KL_loss, self.decoder(z_sampled)

In [15]:
epochs = 100

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = VaeEncoder(dataset_creator.size - 1, 25)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

loss_function = torch.nn.MSELoss()

In [16]:
# print(model.forward(torch.empty(122).normal_(mean = 0, std = 1)))

for epoch in range(epochs):

  print('Now is epoch:', epoch)

  for sample in normal_train_loader_batched:

    optimizer.zero_grad()

    cuda_sample0 = sample[0].to(device).float()

    reconstructed = torch.FloatTensor().to(device)
    kl_loss, reconstructed = model(cuda_sample0)

    loss = torch.mean( torch.sum( (reconstructed - cuda_sample0) ** 2, dim = 1 ) + kl_loss, dim = 0)

    loss.backward()

    optimizer.step()
  
  sum_loss = 0

  for sample in validation_loader_batched:

    with torch.no_grad():

      cuda_sample0 = sample[0].to(device).float()

      reconstructed = torch.FloatTensor().to(device)
      kl_loss, reconstructed = model(cuda_sample0)

      loss = torch.mean( torch.sum( (reconstructed - cuda_sample0) ** 2, dim = 1 ) + kl_loss, dim = 0)

      sum_loss += loss.item()

  print(sum_loss)


Now is epoch: 0
1486.6947827339172
Now is epoch: 1
1395.2543601989746
Now is epoch: 2
1374.0878367424011
Now is epoch: 3
1363.5685963630676
Now is epoch: 4
1357.315547466278
Now is epoch: 5
1352.6551718711853
Now is epoch: 6
1349.7180948257446
Now is epoch: 7
1347.8791427612305
Now is epoch: 8
1346.431297302246
Now is epoch: 9
1345.522705078125
Now is epoch: 10
1344.9059238433838
Now is epoch: 11
1344.692228794098
Now is epoch: 12
1344.3714604377747
Now is epoch: 13
1344.2296843528748
Now is epoch: 14
1344.2418026924133
Now is epoch: 15
1344.3159432411194
Now is epoch: 16
1344.593457698822
Now is epoch: 17
1344.8472418785095
Now is epoch: 18
1344.7420892715454
Now is epoch: 19
1344.7617530822754
Now is epoch: 20
1344.4707193374634
Now is epoch: 21
1344.5034313201904
Now is epoch: 22
1344.5917553901672
Now is epoch: 23
1344.7780394554138
Now is epoch: 24
1344.7024273872375
Now is epoch: 25
1344.9907007217407
Now is epoch: 26
1344.9057130813599
Now is epoch: 27
1344.8480553627014
Now is 

In [17]:
# далее необходимо найти мат ожидание и дисперсию нормальных сэмплов на валидации
# тк мы не обучаемся градиенты не считаем

normal_validation_losses = np.array([])

for sample in normal_validation_loader_unbatched:

  with torch.no_grad():

    cuda_sample0 = sample[0].to(device).float()

    reconstructed = torch.FloatTensor().to(device)
    kl_loss, reconstructed = model(cuda_sample0)

    loss = torch.mean( torch.sum( (reconstructed - cuda_sample0) ** 2, dim = 1 ) + kl_loss, dim = 0)

    normal_validation_losses = np.append(normal_validation_losses, loss.item() )

mean = normal_validation_losses.mean()
variance = normal_validation_losses.var()

print(mean, variance)

# в этой части рассчитываем z-оценки для каждого обьекта из валидации
# z-оценка показывает в скольких стандартных отклонениях находится отмасштабированный обьект от мат ож

validation_losses = np.array([])

for sample in validation_loader_unbatched:

  with torch.no_grad():

    cuda_sample0 = sample[0].to(device).float()

    reconstructed = torch.FloatTensor().to(device)
    kl_loss, reconstructed = model(cuda_sample0)

    loss = torch.mean( torch.sum( (reconstructed - cuda_sample0) ** 2, dim = 1 ) + kl_loss, dim = 0)
  
    validation_losses = np.append(validation_losses, (loss.item() - mean) / (variance ** (0.5) ) )

# на основании полученных z-оценок ищем границу разделения классов

scores = list()

sorted_dimension = list(zip(validation_losses, dataset_creator.ValidationData[1]))
sorted_dimension.sort()

for threshold in np.linspace(-4, 4, 800):

  answers = list()
  true_answers = list()

  for pair in sorted_dimension:

    if pair[0] < threshold:

      answers.append(0)

      true_answers.append(pair[1])

    else:

      answers.append(1)

      true_answers.append(pair[1])
  
  tn, fp, fn, tp = confusion_matrix(true_answers, answers).ravel()

  positive_precision = tp / (tp + fp)
  positive_recall = tp / (tp + fn)
  positive_f1_score = (positive_precision * positive_recall) / (positive_precision + positive_recall)

  negative_precision = tn / (tn + fn)
  negative_recall = tn / (tn + fp)
  negative_f1_score = (negative_precision * negative_recall) / (negative_precision + negative_recall)

  scores.append( (threshold, positive_f1_score, negative_f1_score) )

print(scores)

# максимизируя сумму f1-score по каждому из класссов подбираем THRESHOLD
# THRESHOLD - точка разбиения наших классов

max_point = (0, 0, 0)

for score in scores:

  if sum(score[1:]) > sum(max_point[1:]):

    max_point = score

THRESHOLD = max_point[0]

results = list()

# здесь проверка качества

for test_sample in test_loader_unbatched:

  with torch.no_grad():

    cuda_sample0 = test_sample[0].to(device).float()

    reconstructed = torch.FloatTensor().to(device)
    kl_loss, reconstructed = model(cuda_sample0)

    loss = torch.mean( torch.sum( (reconstructed - cuda_sample0) ** 2, dim = 1 ) + kl_loss, dim = 0)

    if ((loss.item() - mean) / (variance ** (0.5))) <  THRESHOLD:

      results.append(0)

    else:

      results.append(1)

print(classification_report(list(dataset_creator.TestData[1]), results))

tn, fp, fn, tp = confusion_matrix(list(dataset_creator.TestData[1]), results).ravel()

print('Ошибок 1-ого рода:', fp)
print('Ошибок 2-ого рода:', fn)



0.4775984148796561 8.264177335486448




[(-4.0, 0.45662893119086834, nan), (-3.9899874843554444, 0.45662893119086834, nan), (-3.9799749687108887, 0.45662893119086834, nan), (-3.969962453066333, 0.45662893119086834, nan), (-3.959949937421777, 0.45662893119086834, nan), (-3.9499374217772214, 0.45662893119086834, nan), (-3.9399249061326658, 0.45662893119086834, nan), (-3.92991239048811, 0.45662893119086834, nan), (-3.9198998748435545, 0.45662893119086834, nan), (-3.909887359198999, 0.45662893119086834, nan), (-3.8998748435544432, 0.45662893119086834, nan), (-3.889862327909887, 0.45662893119086834, nan), (-3.8798498122653315, 0.45662893119086834, nan), (-3.869837296620776, 0.45662893119086834, nan), (-3.8598247809762203, 0.45662893119086834, nan), (-3.8498122653316647, 0.45662893119086834, nan), (-3.839799749687109, 0.45662893119086834, nan), (-3.8297872340425534, 0.45662893119086834, nan), (-3.8197747183979973, 0.45662893119086834, nan), (-3.8097622027534417, 0.45662893119086834, nan), (-3.799749687108886, 0.45662893119086834, 