<a href="https://colab.research.google.com/github/Meguazy/project_CSD/blob/main/notebook_models/graph_autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://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@github.com/Meguazy/project_CSD.git

Cloning into 'project_CSD'...
remote: Enumerating objects: 641, done.[K
remote: Counting objects: 100% (469/469), done.[K
remote: Compressing objects: 100% (365/365), done.[K
remote: Total 641 (delta 155), reused 366 (delta 91), pack-reused 172[K
Receiving objects: 100% (641/641), 48.34 MiB | 23.24 MiB/s, done.
Resolving deltas: 100% (178/178), done.


In [None]:
%cd project_CSD/

/content/project_CSD


In [None]:
#Usare ogni volta che si inizia a lavorare per accertarsi che non ci siano
#cambiamenti non sincronizzati

!git pull

Already up to date.


In [None]:
from google.colab import auth
auth.authenticate_user()

import requests
gcloud_token = !gcloud auth print-access-token
gcloud_tokeninfo = requests.get('https://www.googleapis.com/oauth2/v3/tokeninfo?access_token=' + gcloud_token[0]).json()
EMAIL = str(gcloud_tokeninfo['email'])

!echo $EMAIL

#Usare per fare commit atomici e frequenti.
#Ricordiamoci di usare mettere sempre dei messaggi di commit chiari in modo da
#poter rollbackare o cherry-pickare in caso di bisogno.

!git config --global user.email $EMAIL

!git add .
!git commit -m ""
!git push

## Graph autoencoder using spektral, which is the tensorflow library for graph neural networks
Sample

In [None]:
# imports
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Function to generate a sample graph with anomalies
def generate_sample_graph(num_nodes=10, anomalous_node=None):
    adjacency_matrix = np.random.randint(2, size=(num_nodes, num_nodes))
    adjacency_matrix = np.triu(adjacency_matrix, k=1) + np.triu(adjacency_matrix, k=1).T
    np.fill_diagonal(adjacency_matrix, 0)

    # Introduce anomaly by changing connections of a specific node
    if anomalous_node is not None:
        adjacency_matrix[anomalous_node] = np.random.randint(2, size=num_nodes)

    return adjacency_matrix.astype(np.float32)

In [None]:
# Function to generate a dataset with normal and anomalous graphs
def generate_dataset(num_samples, num_nodes=10, anomalous_node=None):
    X = [generate_sample_graph(num_nodes, anomalous_node) for _ in range(num_samples)]
    return np.array(X)

In [None]:
# Graph Autoencoder Model with TensorFlow's Dense layers
def create_graph_autoencoder(input_dim):
    encoder_inputs = tf.keras.Input(shape=(input_dim, input_dim))
    x = layers.Flatten()(encoder_inputs)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(16, activation='relu')(x)
    encoder_outputs = layers.Dense(8, activation='relu')(x)

    decoder_inputs = tf.keras.Input(shape=(8,))
    x = layers.Dense(16, activation='relu')(decoder_inputs)
    x = layers.Dense(32, activation='relu')(x)
    decoder_outputs = layers.Reshape((input_dim, input_dim))(layers.Dense(input_dim * input_dim, activation='sigmoid')(x))

    encoder = tf.keras.Model(encoder_inputs, encoder_outputs, name='encoder')
    decoder = tf.keras.Model(decoder_inputs, decoder_outputs, name='decoder')

    autoencoder_inputs = tf.keras.Input(shape=(input_dim, input_dim))
    autoencoder_outputs = decoder(encoder(autoencoder_inputs))

    autoencoder = tf.keras.Model(autoencoder_inputs, autoencoder_outputs, name='autoencoder')
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

    return autoencoder

In [None]:
# Parameters
num_nodes = 10
num_samples = 1000
anomalous_node = 5  # Introduce anomaly in node 5

In [None]:
# Generate dataset
X = generate_dataset(num_samples, num_nodes, anomalous_node)

In [None]:
# Create graph autoencoder
autoencoder = create_graph_autoencoder(num_nodes)

In [None]:
# Custom Training Loop
epochs = 50
batch_size = 32

for epoch in range(epochs):
    np.random.shuffle(X)

    for i in range(0, len(X), batch_size):
        batch = X[i:i+batch_size]
        autoencoder.train_on_batch(batch, batch)

In [None]:
# Evaluate on normal and anomalous samples
normal_sample = generate_sample_graph(num_nodes)
anomalous_sample = generate_sample_graph(num_nodes, anomalous_node)

In [None]:
normal_sample = np.expand_dims(normal_sample, axis=0)
anomalous_sample = np.expand_dims(anomalous_sample, axis=0)

In [None]:
# Predict on normal sample
reconstructed_normal = autoencoder.predict(normal_sample)

# Predict on anomalous sample
reconstructed_anomalous = autoencoder.predict(anomalous_sample)

# Calculate reconstruction errors
error_normal = np.mean(np.abs(normal_sample - reconstructed_normal))
error_anomalous = np.mean(np.abs(anomalous_sample - reconstructed_anomalous))

print("Reconstruction error on normal sample:", error_normal)
print("Reconstruction error on anomalous sample:", error_anomalous)

Reconstruction error on normal sample: 0.41946134
Reconstruction error on anomalous sample: 0.35970485


## Graph autoencoder using a Formal Approach
https://www.youtube.com/watch?v=qA6U4nIK62E

In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0


In [5]:
import argparse
import os.path as osp
import time

import torch

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GAE, VGAE, GCNConv

parser = argparse.ArgumentParser()
parser.add_argument('--variational', action='store_true')
parser.add_argument('--linear', action='store_true')
parser.add_argument('--dataset', type=str, default='Cora',
                    choices=['Cora', 'CiteSeer', 'PubMed'])
parser.add_argument('--epochs', type=int, default=400)
parser.add_argument('--file', '-f', type=str)
args = parser.parse_args()

if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
                      split_labels=True, add_negative_train_samples=False),
])
# path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Planetoid')
dataset = Planetoid('./sample_data', args.dataset, transform=transform)
train_data, val_data, test_data = dataset[0]

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [6]:
class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)


class VariationalGCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logstd = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)


class LinearEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)


class VariationalLinearEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_mu = GCNConv(in_channels, out_channels)
        self.conv_logstd = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)

In [7]:
in_channels, out_channels = dataset.num_features, 16

if not args.variational and not args.linear:
    model = GAE(GCNEncoder(in_channels, out_channels))
elif not args.variational and args.linear:
    model = GAE(LinearEncoder(in_channels, out_channels))
elif args.variational and not args.linear:
    model = VGAE(VariationalGCNEncoder(in_channels, out_channels))
elif args.variational and args.linear:
    model = VGAE(VariationalLinearEncoder(in_channels, out_channels))

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)
    loss = model.recon_loss(z, train_data.pos_edge_label_index)
    if args.variational:
        loss = loss + (1 / train_data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    return model.test(z, data.pos_edge_label_index, data.neg_edge_label_index)


times = []
for epoch in range(1, args.epochs + 1):
    start = time.time()
    loss = train()
    auc, ap = test(test_data)
    print(f'Epoch: {epoch:03d}, AUC: {auc:.4f}, AP: {ap:.4f}')
    times.append(time.time() - start)
print(f"Median time per epoch: {torch.tensor(times).median():.4f}s")

Epoch: 001, AUC: 0.6657, AP: 0.6997
Epoch: 002, AUC: 0.6576, AP: 0.6955
Epoch: 003, AUC: 0.6539, AP: 0.6937
Epoch: 004, AUC: 0.6525, AP: 0.6935
Epoch: 005, AUC: 0.6524, AP: 0.6954
Epoch: 006, AUC: 0.6534, AP: 0.6980
Epoch: 007, AUC: 0.6575, AP: 0.7032
Epoch: 008, AUC: 0.6622, AP: 0.7076
Epoch: 009, AUC: 0.6656, AP: 0.7106
Epoch: 010, AUC: 0.6666, AP: 0.7116
Epoch: 011, AUC: 0.6674, AP: 0.7124
Epoch: 012, AUC: 0.6685, AP: 0.7135
Epoch: 013, AUC: 0.6714, AP: 0.7156
Epoch: 014, AUC: 0.6760, AP: 0.7175
Epoch: 015, AUC: 0.6868, AP: 0.7226
Epoch: 016, AUC: 0.7055, AP: 0.7303
Epoch: 017, AUC: 0.7222, AP: 0.7385
Epoch: 018, AUC: 0.7286, AP: 0.7421
Epoch: 019, AUC: 0.7340, AP: 0.7453
Epoch: 020, AUC: 0.7465, AP: 0.7533
Epoch: 021, AUC: 0.7642, AP: 0.7672
Epoch: 022, AUC: 0.7782, AP: 0.7811
Epoch: 023, AUC: 0.7832, AP: 0.7862
Epoch: 024, AUC: 0.7824, AP: 0.7858
Epoch: 025, AUC: 0.7833, AP: 0.7859
Epoch: 026, AUC: 0.7827, AP: 0.7855
Epoch: 027, AUC: 0.7769, AP: 0.7807
Epoch: 028, AUC: 0.7719, AP:

## Graph autoencoder using RGraph from PyOD library

In [None]:
!pip install pyod

In [None]:
from re import X
import numpy as np
import pandas as pd
from pyod.models.vae import VAE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X1 = pd.read_csv("data/tsne_data/tsne_orizzontale/time_series_tsne1.csv")

X2 = pd.read_csv("data/tsne_data/tsne_orizzontale/time_series_tsne2.csv")

X_1 = X1.loc[:, X1.columns != 'Acquisition Number']
X_2 = X2.loc[:, X2.columns != 'Acquisition Number']

X_train = X_1.iloc[: , :1000]
X_test = X_2.iloc[: , :1000]

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

X_train = X_train.sample(frac = 1)
X_test = X_test.sample(frac = 1)

X_train, X_validate = train_test_split(X_train, test_size=0.2)

X_train.shape, X_validate.shape, X_test.shape

((169, 1000), (43, 1000), (198, 1000))

In [None]:
from pyod.models.lunar import LUNAR
clf = LUNAR(
    model_type = 'WEIGHT',
    n_epochs = 70,
    verbose = 1
)

clf.fit(X_train)

Epoch 0 	 Train Score 0.263461 	 Val Score 0.716263
Epoch 1 	 Train Score 0.585093 	 Val Score 0.712803
Epoch 2 	 Train Score 0.615002 	 Val Score 0.712803
Epoch 3 	 Train Score 0.622706 	 Val Score 0.712803
Epoch 4 	 Train Score 0.665859 	 Val Score 0.712803
Epoch 5 	 Train Score 0.691482 	 Val Score 0.716263
Epoch 6 	 Train Score 0.698927 	 Val Score 0.716263
Epoch 7 	 Train Score 0.70148 	 Val Score 0.716263
Epoch 8 	 Train Score 0.702519 	 Val Score 0.716263
Epoch 9 	 Train Score 0.702909 	 Val Score 0.716263
Epoch 10 	 Train Score 0.703341 	 Val Score 0.716263
Epoch 11 	 Train Score 0.703428 	 Val Score 0.716263
Epoch 12 	 Train Score 0.703471 	 Val Score 0.716263
Epoch 13 	 Train Score 0.703688 	 Val Score 0.716263
Epoch 14 	 Train Score 0.703731 	 Val Score 0.716263
Epoch 15 	 Train Score 0.703861 	 Val Score 0.716263
Epoch 16 	 Train Score 0.703731 	 Val Score 0.716263
Epoch 17 	 Train Score 0.704034 	 Val Score 0.716263
Epoch 18 	 Train Score 0.704034 	 Val Score 0.719723
Epoc

LUNAR(contamination=0.1, epsilon=0.1, lr=0.001, model_type='WEIGHT',
   n_epochs=70, n_neighbours=5, negative_sampling='MIXED', proportion=1.0,
   scaler=MinMaxScaler(), val_size=0.1, verbose=1, wd=0.1)

### Validation section

In [None]:
b = clf.predict(X_test) # (0: inliers, 1: outliers)

TN = np.count_nonzero(b == 1)
FP = np.count_nonzero(b == 0)
print(f"---------CASO 2----------")
print(f"Percentuale di time series anomale: {TN/len(b)*100}%")

c = clf.predict(X_validate)
FN = np.count_nonzero(c == 1)
TP = np.count_nonzero(c == 0)
print(f"---------CASO 1 (validate)----------")
print(f"Percentuale di time series anomale: {np.count_nonzero(c == 1)/len(c)*100}%")

---------CASO 2----------
Percentuale di time series anomale: 46.96969696969697%
---------CASO 1 (validate)----------
Percentuale di time series anomale: 18.6046511627907%


In [None]:
precision = TP/(TP + FP)
TPR = TP/(TP + FN)
TNR = TN/(TN + FP)
FPR = 1 - TNR
F1 = (2*precision*TPR)/(precision+TPR)

print(f"Precision: {str(round(precision*100, 2))}%")
print(f"Recall (True Positive Rate): {str(round(TPR*100, 2))}%")
print(f"Specificity (True Negative Rate): {str(round(TNR*100, 2))}%")
print(f"FPR (False Positive Rate): {str(round(FPR*100, 2))}%")
print(f"F1 score: {str(round(F1*100, 2))}%")

Precision: 25.0%
Recall (True Positive Rate): 81.4%
Specificity (True Negative Rate): 46.97%
FPR (False Positive Rate): 53.03%
F1 score: 38.25%
