In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!nvidia-smi

Fri Nov 17 14:12:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install faiss-gpu
!pip install torch_geometric
# python test.py --dataset MNIST --samples MIXED --k 100 --seed 42 --train_new_model --models 1 --plot
# PENDIGITS  MUSK  MAMMOGRAPHY  LYMPHO  SHUTTLE  THYROID  SATELLITE  MNIST
# 2, 10, 50, 100, 150, 200, 300

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0


# Paramaters

In [None]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

n_epochs = 300
learning_rate = 1e-5


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os
import torch
from torch_geometric.data import Data
from scipy.io import loadmat
import faiss

########################################### NEGATIVE SAMPLE FUNCTIONS################################################
def negative_samples(train_x, train_y, val_x, val_y, test_x, test_y, k, sample_type, proportion, epsilon):

    # training set negative samples
    neg_train_x, neg_train_y = generate_negative_samples(train_x, sample_type, proportion, epsilon)
    # validation set negative samples
    neg_val_x, neg_val_y = generate_negative_samples(val_x, sample_type, proportion, epsilon)

    # concat data
    x = np.vstack((train_x,neg_train_x,val_x,neg_val_x,test_x))
    y = np.hstack((train_y,neg_train_y,val_y,neg_val_y,test_y))

    # all training set
    train_mask = np.hstack((np.ones(len(train_x)),np.ones(len(neg_train_x)),
                            np.zeros(len(val_x)),np.zeros(len(neg_val_x)),
                            np.zeros(len(test_x))))
    # all validation set
    val_mask = np.hstack((np.zeros(len(train_x)),np.zeros(len(neg_train_x)),
                          np.ones(len(val_x)),np.ones(len(neg_val_x)),
                          np.zeros(len(test_x))))
    # all test set
    test_mask = np.hstack((np.zeros(len(train_x)),np.zeros(len(neg_train_x)),
                           np.zeros(len(val_x)),np.zeros(len(neg_val_x)),
                           np.ones(len(test_x))))
    # normal training points
    neighbor_mask = np.hstack((np.ones(len(train_x)), np.zeros(len(neg_train_x)),
                               np.zeros(len(val_y)), np.zeros(len(neg_val_x)),
                               np.zeros(len(test_y))))

    # find k nearest neighbours (idx) and their distances (dist) to each points in x within neighbour_mask==1
    dist, idx = find_neighbors(x, y, neighbor_mask, k)

    return x.astype('float32'), y.astype('float32'), neighbor_mask.astype('float32'), train_mask.astype('float32'), val_mask.astype('float32'), test_mask.astype('float32'), dist, idx

# loading negative samples
def generate_negative_samples(x, sample_type, proportion, epsilon):

    n_samples = int(proportion*(len(x)))
    n_dim = x.shape[-1]

    #M
    randmat = np.random.rand(n_samples,n_dim) < 0.3
    # uniform samples
    rand_unif = (epsilon* (1-2*np.random.rand(n_samples,n_dim)))
    #  subspace perturbation samples
    rand_sub = np.tile(x, (proportion,1)) + randmat*(epsilon*np.random.randn(n_samples,n_dim))

    if sample_type == 'UNIFORM':
        neg_x = rand_unif
    if sample_type == 'SUBSPACE':
        neg_x = rand_sub
    if sample_type == 'MIXED':
        # randomly sample from uniform and gaussian negative samples
        neg_x = np.concatenate((rand_unif, rand_sub),0)
        neg_x = neg_x[np.random.choice(np.arange(len(neg_x)), size = n_samples)]

    neg_y = np.ones(len(neg_x))

    return neg_x.astype('float32'), neg_y.astype('float32')


################################### GRAPH FUNCTIONS ###############################################
# find the k nearest neighbours of all x points out of the neighbour candidates
def find_neighbors(x, y, neighbor_mask, k):

    # nearest neighbour object
    index = faiss.IndexFlatL2(x.shape[-1])
    # add nearest neighbour candidates
    index.add(x[neighbor_mask==1])

    # distances and idx of neighbour points for the neighbour candidates (k+1 as the first one will be the point itself)
    dist_train, idx_train = index.search(x[neighbor_mask==1], k = k+1)
    # remove 1st nearest neighbours to remove self loops
    dist_train, idx_train = dist_train[:,1:], idx_train[:,1:]
    # distances and idx of neighbour points for the non-neighbour candidates
    dist_test, idx_test = index.search(x[neighbor_mask==0], k = k)
    #concat
    dist = np.vstack((dist_train, dist_test))
    idx = np.vstack((idx_train, idx_test))

    return dist, idx

# create graph object out of x, y, distances and indices of neighbours
def build_graph(x, y, dist, idx):

    # array like [0,0,0,0,0,1,1,1,1,1,...,n,n,n,n,n] for k = 5 (i.e. edges sources)
    idx_source = np.repeat(np.arange(len(x)),dist.shape[-1]).astype('int32')
    idx_source = np.expand_dims(idx_source,axis=0)

    # edge targets, i.e. the nearest k neighbours of point 0, 1,..., n
    idx_target = idx.flatten()
    idx_target = np.expand_dims(idx_target,axis=0).astype('int32')

    #stack source and target indices
    idx = np.vstack((idx_source, idx_target))

    # edge weights
    attr = dist.flatten()
    attr = np.sqrt(attr)
    attr = np.expand_dims(attr, axis=1)

    # into tensors
    x = torch.tensor(x, dtype = torch.float32)
    y = torch.tensor(y,dtype = torch.float32)
    idx = torch.tensor(idx, dtype = torch.long)
    attr = torch.tensor(attr, dtype = torch.float32)

    #build PyTorch geometric Data object
    data = Data(x = x, edge_index = idx, edge_attr = attr, y = y)

    return data

########################################## DATASET FUNCTIONS ####################################
#
# split training data into train set and validation set
def split_data(seed, all_train_x, all_train_y, all_test_x, all_test_y):
    np.random.seed(seed)

    val_idx = np.random.choice(np.arange(len(all_train_x)),size = int(0.15*len(all_train_x)), replace = False)
    val_mask = np.zeros(len(all_train_x))
    val_mask[val_idx] = 1
    val_x = all_train_x[val_mask == 1]; val_y = all_train_y[val_mask == 1]
    train_x = all_train_x[val_mask == 0]; train_y = all_train_y[val_mask == 0]

    scaler = MinMaxScaler()
    scaler.fit(train_x[train_y == 0])
    train_x = scaler.transform(train_x)
    val_x = scaler.transform(val_x)

    if all_test_x is None:
        test_x = val_x
        test_y = val_y

    test_x = scaler.transform(all_test_x)
    test_y = all_test_y

    return train_x.astype('float32'), train_y.astype('float32'), val_x.astype('float32'), val_y.astype('float32'),  test_x.astype('float32'), test_y.astype('float32')


#load data
def load_dataset(dataset,seed):
    np.random.seed(seed)

    if dataset == 'MI-V':
        df = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/MI/experiment_01.csv")
        for i in ['02','03','11','12','13','14','15','17','18']:
            data = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/MI/experiment_%s.csv" %i)
            df = df.append(data, ignore_index = True)
        normal_idx = np.ones(len(df))
        for i in ['06','08','09','10']:
            data = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/MI/experiment_%s.csv" %i)
            df = df.append(data, ignore_index = True)
            normal_idx = np.append(normal_idx,np.zeros(len(data)))
        machining_process_one_hot = pd.get_dummies(df['Machining_Process'])
        df = pd.concat([df.drop(['Machining_Process'],axis=1),machining_process_one_hot],axis=1)
        data = df.to_numpy()
        idx = np.unique(data,axis=0, return_index = True)[1]
        data = data[idx]
        normal_idx = normal_idx[idx]
        normal_data = data[normal_idx == 1]
        anomaly_data = data[normal_idx == 0]
        test_idx = np.random.choice(np.arange(0,len(normal_data)), len(anomaly_data), replace = False)
        train_idx = np.setdiff1d(np.arange(0,len(normal_data)), test_idx)
        train_x = normal_data[train_idx]
        train_y = np.zeros(len(train_x))
        test_x = np.concatenate((anomaly_data,normal_data[test_idx]))
        test_y  = np.concatenate((np.ones(len(anomaly_data)),np.zeros(len(test_idx))))

    elif dataset == 'MI-F':
        df = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/mi/experiment_01.csv")
        for i in ['02','03','06','08','09','10','11','12','13','14','15','17','18']:
            data = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/mi/experiment_%s.csv" %i)
            df = df.append(data, ignore_index = True)
        normal_idx = np.ones(len(df))
        for i in ['04', '05', '07', '16']:
            data = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/mi/experiment_%s.csv" %i)
            df = df.append(data, ignore_index = True)
            normal_idx = np.append(normal_idx,np.zeros(len(data)))
        machining_process_one_hot = pd.get_dummies(df['Machining_Process'])
        df = pd.concat([df.drop(['Machining_Process'],axis=1),machining_process_one_hot],axis=1)
        data = df.to_numpy()
        idx = np.unique(data,axis=0, return_index = True)[1]
        data = data[idx]
        normal_idx = normal_idx[idx]
        normal_data = data[normal_idx == 1]
        anomaly_data = data[normal_idx == 0]
        test_idx = np.random.choice(np.arange(0,len(normal_data)), len(anomaly_data), replace = False)
        train_idx = np.setdiff1d(np.arange(0,len(normal_data)), test_idx)
        train_x = normal_data[train_idx]
        train_y = np.zeros(len(train_x))
        test_x = np.concatenate((anomaly_data,normal_data[test_idx]))
        test_y  = np.concatenate((np.ones(len(anomaly_data)),np.zeros(len(test_idx))))

    elif dataset in ['OPTDIGITS', 'PENDIGITS','SHUTTLE']:
        if dataset == 'SHUTTLE':
            data = loadmat("/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/SHUTTLE/shuttle.mat")
        elif dataset == 'OPTDIGITS':
            data = loadmat("/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/optdigits/optdigits.mat")
        elif dataset == 'PENDIGITS':
            data = loadmat('/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/PENDIGITS/pendigits.mat')
        label = data['y'].astype('float32').squeeze()
        data = data['X'].astype('float32')
        normal_data= data[label == 0]
        normal_label = label[label==0]
        anom_data = data[label == 1]
        anom_label = label[label ==1]
        test_idx = np.random.choice(np.arange(0,len(normal_data)), len(anom_data), replace = False)
        train_idx = np.setdiff1d(np.arange(0,len(normal_data)), test_idx)
        train_x = normal_data[train_idx]
        train_y = normal_label[train_idx]
        test_x = np.concatenate((normal_data[test_idx],anom_data))
        test_y = np.concatenate((normal_label[test_idx],anom_label))

    elif dataset in ['THYROID','HRSS']:
        if dataset == 'THYROID':
            data = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/THYROID/annthyroid_21feat_normalised.csv').to_numpy()
        if dataset == 'HRSS':
            data = pd.read_csv('data/HRSS/HRSS.csv').to_numpy()
        label = data[:,-1].astype('float32').squeeze()
        data = data[:,:-1].astype('float32')
        normal_data= data[label == 0]
        normal_label = label[label==0]
        anom_data = data[label == 1]
        anom_label = label[label ==1]
        test_idx = np.random.choice(np.arange(0,len(normal_data)), len(anom_data), replace = False)
        train_idx = np.setdiff1d(np.arange(0,len(normal_data)), test_idx)
        train_x = normal_data[train_idx]
        train_y = normal_label[train_idx]
        test_x = np.concatenate((normal_data[test_idx],anom_data))
        test_y = np.concatenate((normal_label[test_idx],anom_label))

    elif dataset == 'SATELLITE':
        data = loadmat('/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/SATELLITE/satellite.mat')
        label = data['y'].astype('float32').squeeze()
        data = data['X'].astype('float32')
        normal_data = data[label == 0]
        normal_label = label[label ==0]
        anom_data = data[label == 1]
        anom_label = label[label ==1]
        train_idx = np.random.choice(np.arange(0,len(normal_data)), 4000, replace = False)
        test_idx = np.setdiff1d(np.arange(0,len(normal_data)), train_idx)
        train_x = normal_data[train_idx]
        train_y = normal_label[train_idx]
        test_x = normal_data[test_idx]
        test_y = normal_label[test_idx]
        test_idx = np.random.choice(np.arange(0,len(anom_data)), int(len(test_x)), replace = False)
        test_x = np.concatenate((test_x,anom_data[test_idx]))
        test_y = np.concatenate((test_y, anom_label[test_idx]))

    train_x, train_y, val_x, val_y, test_x, test_y = split_data(seed, all_train_x = train_x, all_train_y = train_y, all_test_x = test_x, all_test_y = test_y)

    return train_x, train_y, val_x, val_y, test_x, test_y


In [None]:
import scipy.io as sio
import os
import numpy as np
import pandas as pd
import torch
import faiss
from sklearn.preprocessing import MinMaxScaler
from torch_geometric.data import Data

def split_data(random_state, all_train_x, all_train_y, all_test_x, all_test_y):
    val_idx = np.random.choice(np.arange(len(all_train_x)),size = int(0.15*len(all_train_x)), replace = False)
    val_mask = np.zeros(len(all_train_x))
    val_mask[val_idx] = 1
    val_x = all_train_x[val_mask == 1]; val_y = all_train_y[val_mask == 1]
    train_x = all_train_x[val_mask == 0]; train_y = all_train_y[val_mask == 0]

    scaler = MinMaxScaler()
    scaler.fit(train_x[train_y == 0])
    train_x = scaler.transform(train_x)
    val_x = scaler.transform(val_x)

    if all_test_x is None:
        test_x = val_x
        test_y = val_y

    test_x = scaler.transform(all_test_x)
    test_y = all_test_y

    return train_x.astype('float32'), train_y.astype('float32'), val_x.astype('float32'), val_y.astype('float32'),  test_x.astype('float32'), test_y.astype('float32')


def load_data(data_type, random_state):
    # path = os.path.abspath(".")
    path = "/content/drive/MyDrive/ColabNotebooks/ECE570/LUNAR/data/"
    if data_type in ["LYMPHO","MAMMOGRAPHY","MNIST","MUSK","PENDIGITS","SHUTTLE"]:
      data = sio.loadmat(path + data_type + "/" + data_type.lower() + ".mat")
      label = data['y'].astype('float32').squeeze()
      data = data['X'].astype('float32')
    elif data_type == "HRSS":
      data = pd.read_csv(path + data_type + "/" + data_type + ".csv").to_numpy()
      label = data[:,-1].astype('float32').squeeze()
      data = data[:,:-1].astype('float32')

    normal_data, normal_label = data[label == 0], label[label == 0]
    abnormal_data, abnormal_label = data[label == 1], label[label == 1]

    test_idx = np.random.choice(np.arange(0,len(normal_data)), len(abnormal_data), replace = False)
    train_idx = np.setdiff1d(np.arange(0,len(normal_data)), test_idx)

    train_x, train_y = normal_data[train_idx], normal_label[train_idx]
    test_x = np.concatenate((normal_data[test_idx], abnormal_data))
    test_y = np.concatenate((normal_label[test_idx], abnormal_label))

    train_x, train_y, val_x, val_y, test_x, test_y = split_data(random_state, train_x, train_y, test_x, test_y)
    return train_x, train_y, val_x, val_y, test_x, test_y

seed = np.random.seed(42)
load_data("PENDIGITS", seed)


(array([[4.6977001e-01, 1.0000000e+00, 2.7046305e-01, ..., 9.0100753e-01,
         4.0243921e-01, 9.8279488e-01],
        [0.0000000e+00, 5.6840950e-01, 3.2176843e-01, ..., 2.5583255e-01,
         1.6274789e-01, 0.0000000e+00],
        [0.0000000e+00, 1.0000000e+00, 7.0717312e-02, ..., 2.3112993e-01,
         6.6002822e-01, 0.0000000e+00],
        ...,
        [1.0000000e+00, 9.7696328e-01, 6.1103398e-01, ..., 0.0000000e+00,
         0.0000000e+00, 4.3749042e-02],
        [5.8592129e-01, 6.5025252e-01, 9.0870064e-01, ..., 1.6379336e-02,
         1.0000000e+00, 3.0695484e-16],
        [0.0000000e+00, 7.8092998e-01, 2.9444221e-01, ..., 3.5654092e-01,
         1.0000000e+00, 4.0091050e-01]], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([[0.        , 0.88715565, 0.27589116, ..., 0.01907241, 1.        ,
         0.0574176 ],
        [1.        , 1.        , 0.8806075 , ..., 0.1618272 , 0.19688374,
         0.2       ],
        [0.7300159 , 0.86952573, 0.30432

In [None]:
load_dataset("PENDIGITS", 42)

(array([[4.6977001e-01, 1.0000000e+00, 2.7046305e-01, ..., 9.0100753e-01,
         4.0243921e-01, 9.8279488e-01],
        [0.0000000e+00, 8.8715565e-01, 2.7589116e-01, ..., 1.9072406e-02,
         1.0000000e+00, 5.7417598e-02],
        [0.0000000e+00, 5.6840950e-01, 3.2176843e-01, ..., 2.5583255e-01,
         1.6274789e-01, 0.0000000e+00],
        ...,
        [1.0000000e+00, 9.7696328e-01, 6.1103398e-01, ..., 0.0000000e+00,
         0.0000000e+00, 4.3749042e-02],
        [5.8592129e-01, 6.5025252e-01, 9.0870064e-01, ..., 1.6379336e-02,
         1.0000000e+00, 3.0695484e-16],
        [0.0000000e+00, 7.8092998e-01, 2.9444221e-01, ..., 3.5654092e-01,
         1.0000000e+00, 4.0091050e-01]], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([[0.7300159 , 0.86952573, 0.30432102, ..., 0.        , 0.17747433,
         0.0083223 ],
        [0.49544212, 0.8327737 , 0.655236  , ..., 0.        , 0.        ,
         0.0702076 ],
        [0.        , 0.7242836 , 0.19623