In [1]:
from ast import literal_eval
from csv import reader
from os import listdir, makedirs, path
from pickle import dump
import pickle
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from model import Seq2Seq, RecurrentAutoencoder, DNNAE, SalTransformer, SalAE, SalSCINet, SalGATSCINet, SalGATSCINetV2, SalGATConvLSTM, SalGATConvGRU, ConvGRU, SalGATConvGRUwoSal, SalConvGRUwoALL


import os, random
import torch
from torch.nn import TransformerAE
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
os.environ['CUDA_VISIBLE_DEVICES'] = '4'

random_seed = 42
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)
random.seed(random_seed)
# from args import get_parser


def load_and_save(category, filename, dataset, dataset_folder, output_folder):
    temp = np.genfromtxt(
        path.join(dataset_folder, category, filename),
        dtype=np.float32,
        delimiter=",",
    )
    print(dataset, category, filename, temp.shape)
    with open(path.join(output_folder, dataset + "_" + category + ".pkl"), "wb") as file:
        dump(temp, file)


def load_data(dataset):
    """ Method from OmniAnomaly (https://github.com/NetManAIOps/OmniAnomaly) """

    if dataset == "SMD":
        dataset_folder = "/home/sangyup/saliency_anomaly_detection/dataset/ServerMachineDataset"
        output_folder = "/home/sangyup/saliency_anomaly_detection/dataset/ServerMachineDataset/processed"
        makedirs(output_folder, exist_ok=True)
        file_list = listdir(path.join(dataset_folder, "train"))
        for filename in file_list:
            if filename.endswith(".txt"):
                load_and_save(
                    "train",
                    filename,
                    filename.strip(".txt"),
                    dataset_folder,
                    output_folder,
                )
                load_and_save(
                    "test_label",
                    filename,
                    filename.strip(".txt"),
                    dataset_folder,
                    output_folder,
                )
                load_and_save(
                    "test",
                    filename,
                    filename.strip(".txt"),
                    dataset_folder,
                    output_folder,
                )

    elif dataset == "SMAP" or dataset == "MSL":
        dataset_folder = "/home/sangyup/saliency_anomaly_detection/dataset/SMAPMSL/data"
        output_folder = "/home/sangyup/saliency_anomaly_detection/dataset/SMAPMSL/data/processed"
        makedirs(output_folder, exist_ok=True)
        with open(path.join(dataset_folder, "labeled_anomalies.csv"), "r") as file:
            csv_reader = reader(file, delimiter=",")
            res = [row for row in csv_reader][1:]
        res = sorted(res, key=lambda k: k[0])
        data_info = [row for row in res if row[1] == dataset and row[0] != "P-2"]
        labels = []
        for row in data_info:
            anomalies = literal_eval(row[2])
            length = int(row[-1])
            label = np.zeros([length], dtype=np.bool_)
            for anomaly in anomalies:
                label[anomaly[0] : anomaly[1] + 1] = True
            labels.extend(label)

        labels = np.asarray(labels)
        print(dataset, "test_label", labels.shape)

        with open(path.join(output_folder, dataset + "_" + "test_label" + ".pkl"), "wb") as file:
            dump(labels, file)

        def concatenate_and_save(category):
            data = []
            for row in data_info:
                filename = row[0]
                temp = np.load(path.join(dataset_folder, category, filename + ".npy"))
                data.extend(temp)
            data = np.asarray(data)
            print(dataset, category, data.shape)
            with open(path.join(output_folder, dataset + "_" + category + ".pkl"), "wb") as file:
                dump(data, file)

        for c in ["train", "test"]:
            concatenate_and_save(c)

def normalize_data(data, scaler=None):
    data = np.asarray(data, dtype=np.float32)
    if np.any(sum(np.isnan(data))):
        data = np.nan_to_num(data)

    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(data)
    data = scaler.transform(data)
    print("Data normalized")

    return data, scaler

def get_data_dim(dataset):
    """
    :param dataset: Name of dataset
    :return: Number of dimensions in data
    """
    if dataset == "SMAP":
        return 25
    elif dataset == "MSL":
        return 55
    elif str(dataset).startswith("machine"):
        return 38
    else:
        raise ValueError("unknown dataset " + str(dataset))

        
def get_data(dataset, max_train_size=None, max_test_size=None,
             normalize=False, spec_res=False, train_start=0, test_start=0):
    """
    Get data from pkl files

    return shape: (([train_size, x_dim], [train_size] or None), ([test_size, x_dim], [test_size]))
    Method from OmniAnomaly (https://github.com/NetManAIOps/OmniAnomaly)
    """
    prefix = "/home/sangyup/saliency_anomaly_detection/dataset"
    if str(dataset).startswith("machine"):
        prefix += "/ServerMachineDataset/processed"
    elif dataset in ["MSL", "SMAP"]:
        prefix += "/SMAPMSL/data/processed"
    if max_train_size is None:
        train_end = None
    else:
        train_end = train_start + max_train_size
    if max_test_size is None:
        test_end = None
    else:
        test_end = test_start + max_test_size
    print("load data of:", dataset)
    print("train: ", train_start, train_end)
    print("test: ", test_start, test_end)
    x_dim = get_data_dim(dataset)
    f = open(os.path.join(prefix, dataset + "_train.pkl"), "rb")
    train_data = pickle.load(f).reshape((-1, x_dim))[train_start:train_end, :]
    f.close()
    try:
        f = open(os.path.join(prefix, dataset + "_test.pkl"), "rb")
        test_data = pickle.load(f).reshape((-1, x_dim))[test_start:test_end, :]
        f.close()
    except (KeyError, FileNotFoundError):
        test_data = None
    try:
        f = open(os.path.join(prefix, dataset + "_test_label.pkl"), "rb")
        test_label = pickle.load(f).reshape((-1))[test_start:test_end]
        f.close()
    except (KeyError, FileNotFoundError):
        test_label = None

    if normalize:
        train_data, scaler = normalize_data(train_data, scaler=None)
        test_data, _ = normalize_data(test_data, scaler=scaler)

    print("train set shape: ", train_data.shape)
    print("test set shape: ", test_data.shape)
    print("test set label shape: ", None if test_label is None else test_label.shape)
    return (train_data, None), (test_data, test_label)

In [2]:
ds = 'MSL'
load_data(ds.upper())

MSL test_label (73729,)
MSL train (58317, 55)
MSL test (73729, 55)


In [3]:
# SMD 1-1, 1-2, 1-3
# (x_train, _), (x_test, y_test) = get_data('machine-1-1', normalize=True)

# MSL
# (x_train, _), (x_test, y_test) = get_data('MSL', normalize=True)

# SMAP
(x_train, _), (x_test, y_test) = get_data('SMAP', normalize=True)

load data of: MSL
train:  0 None
test:  0 None
Data normalized
Data normalized
train set shape:  (58317, 55)
test set shape:  (73729, 55)
test set label shape:  (73729,)


In [4]:
def split_series(series, n_past, n_future):
    '''

    :param series: input time series
    :param n_past: number of past observations
    :param n_future: number of future series
    :return: X, y(label)
    '''
    X, y = list(), list()
    for window_start in range(len(series)):
        past_end = window_start + n_past
        future_end = past_end + n_future
        if future_end > len(series):
            break
        # slicing the past and future parts of the window
        past, future = series[window_start:past_end, :], series[past_end:future_end, :]
        X.append(past)
        y.append(future)

    return X, y

x_test_windowed, _ = split_series(x_test, 100, 1)
np.array(x_test_windowed).shape

(73629, 100, 55)

In [5]:
from torch.utils.data import Dataset, DataLoader
class ReconDataset(Dataset):
    def __init__(self, data, window, target_cols):
        self.data = torch.Tensor(data)
        self.window = window
        self.target_cols = target_cols
        self.shape = self.__getshape__()
        self.size = self.__getsize__()

    def __getitem__(self, index):
        x = self.data[index:index+self.window]
        y = self.data[index:index+self.window]
        return x, y

    def __len__(self):
        return len(self.data) -  self.window 
    
    def __getshape__(self):
        return (self.__len__(), *self.__getitem__(0)[0].shape)

    def __getsize__(self):
        return (self.__len__())

class ForecastDataset(Dataset):
    def __init__(self, data, window, target_cols):
        self.data = torch.Tensor(data)
        self.window = window
        self.target_cols = target_cols
        self.shape = self.__getshape__()
        self.size = self.__getsize__()

    def __getitem__(self, index):
        x = self.data[index:index+self.window]
        y = self.data[index+self.window,0:self.target_cols]
        return x, y

    def __len__(self):
        return len(self.data) -  self.window 
    
    def __getshape__(self):
        return (self.__len__(), *self.__getitem__(0)[0].shape)
    
    def __getsize__(self):
        return (self.__len__())

class ReconForecastDataset(Dataset):
    def __init__(self, data, window, horizon):
        self.data = torch.Tensor(data)
        self.window = window
        self.horizon = horizon
        self.shape = self.__getshape__()
        self.size = self.__getsize__()

    def __getitem__(self, index):
        x = self.data[index:index+self.window]
        y_recon = self.data[index:index+self.window]
        y_fore = self.data[index+self.window:index+self.window+self.horizon]
        return x, y_recon, y_fore

    def __len__(self):
        return len(self.data) -  self.window 
    
    def __getshape__(self):
        return (self.__len__(), *self.__getitem__(0)[0].shape), (self.__len__(), *self.__getitem__(0)[1].shape), (self.__len__(), *self.__getitem__(0)[2].shape)
    
    def __getsize__(self):
        return (self.__len__())


def create_data_loaders(train_dataset, batch_size, val_split=0.1, shuffle=False, test_dataset=None):
    train_loader, val_loader, test_loader = None, None, None
    if val_split == 0.0:
        print(f"train_size: {len(train_dataset)}")
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, drop_last=True)

    else:
        dataset_size = len(train_dataset)
        indices = list(range(dataset_size))
        split = int(np.floor(val_split * dataset_size))
        if shuffle:
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]

        train_sampler = SubsetRandomSampler(train_indices)
        valid_sampler = SubsetRandomSampler(val_indices)

        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, sampler=train_sampler, drop_last=True)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, sampler=valid_sampler, drop_last=True)

        print(f"train_size: {len(train_indices)}")
        print(f"validation_size: {len(val_indices)}")

    if test_dataset is not None:
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
        print(f"test_size: {len(test_dataset)}")

    return train_loader, val_loader, test_loader


# From Informer

class StandardScaler():
    def __init__(self):
        self.mean = 0.
        self.std = 1.
    
    def fit(self, data):
        self.mean = data.mean(0)
        self.std = data.std(0)

    def transform(self, data):
        mean = torch.from_numpy(self.mean).type_as(data).to(data.device) if torch.is_tensor(data) else self.mean
        std = torch.from_numpy(self.std).type_as(data).to(data.device) if torch.is_tensor(data) else self.std
        return (data - mean) / std

    def inverse_transform(self, data):
        mean = torch.from_numpy(self.mean).type_as(data).to(data.device) if torch.is_tensor(data) else self.mean
        std = torch.from_numpy(self.std).type_as(data).to(data.device) if torch.is_tensor(data) else self.std
        return (data * std) + mean

# 시간 특징을 freq에 따라 추출
def time_features(dates, freq='h'):
    dates['month'] = dates.date.apply(lambda row:row.month,1)
    dates['day'] = dates.date.apply(lambda row:row.day,1)
    dates['weekday'] = dates.date.apply(lambda row:row.weekday(),1)
    dates['hour'] = dates.date.apply(lambda row:row.hour,1)
    dates['minute'] = dates.date.apply(lambda row:row.minute,1)
    dates['minute'] = dates.minute.map(lambda x:x//15)
    freq_map = {
        'y':[],'m':['month'],'w':['month'],'d':['month','day','weekday'],
        'b':['month','day','weekday'],'h':['month','day','weekday','hour'],
        't':['month','day','weekday','hour','minute'],
    }
    return dates[freq_map[freq.lower()]].values

# 한번의 batch를 실행하는 코드
def _process_one_batch(batch_x, batch_y, batch_x_mark, batch_y_mark):
    batch_x = batch_x.float().to(device)
    batch_y = batch_y.float()
    batch_x_mark = batch_x_mark.float().to(device)
    batch_y_mark = batch_y_mark.float().to(device)
    dec_inp = torch.zeros([batch_y.shape[0], pred_len, batch_y.shape[-1]]).float()
    dec_inp = torch.cat([batch_y[:,:label_len,:], dec_inp], dim=1).float().to(device)
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    batch_y = batch_y[:,-pred_len:,0:].to(device)
    return outputs, batch_y


class Dataset_Pred(Dataset):
    def __init__(self, dataframe, size=None, scale=True):
        self.seq_len = size[0]
        self.label_len = size[1]
        self.pred_len = size[2]
        self.dataframe = dataframe
        
        self.scale = scale
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = self.dataframe
        df_raw["date"] = pd.to_datetime(df_raw["date"])

        delta = df_raw["date"].iloc[1] - df_raw["date"].iloc[0]
        if delta>=timedelta(hours=1):
            self.freq='h'
        else:
            self.freq='t'

        border1 = 0
        border2 = len(df_raw)
        cols_data = df_raw.columns[1:]
        df_data = df_raw[cols_data]


        if self.scale:
            self.scaler.fit(df_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values
            
        tmp_stamp = df_raw[['date']][border1:border2]
        tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date)
        pred_dates = pd.date_range(tmp_stamp.date.values[-1], periods=self.pred_len+1, freq=self.freq)
        
        df_stamp = pd.DataFrame(columns = ['date'])
        df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:])
        data_stamp = time_features(df_stamp, freq=self.freq)

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp
    
    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]
        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        return len(self.data_x) - self.seq_len- self.pred_len + 1

In [6]:
window_size = 100
batch_size = 512
horizon = 10

train_dataset = ReconForecastDataset(x_train, window_size, horizon)
indices = torch.arange(len(train_dataset)-horizon)
train_dataset = torch.utils.data.Subset(train_dataset, indices)
test_dataset = ReconForecastDataset(x_test, window_size, horizon)

train_loader, val_loader, test_loader = create_data_loaders(train_dataset, batch_size, val_split=0.3, shuffle=False, test_dataset=test_dataset)

train_size: 40745
validation_size: 17462
test_size: 73629


In [8]:
model_name = 'SalGATConvGRUwoAll_saliency_MSL'
with open("result/model_" + model_name + "_best.pt", "rb") as f:
    SAVED_MODEL = torch.load(f)

print(SAVED_MODEL['best_epoch'])

MODEL = SalGATConvGRU(seq_len=window_size, output_len=10, n_features=x_train.shape[1], out_n_features=x_train.shape[1], embedding_dim=int(x_train.shape[1]/2), kernel_size=3, cell='gru')
# MODEL = SalGATConvGRUwoSal(seq_len=window_size, output_len=10, n_features=x_train.shape[1], out_n_features=x_train.shape[1], embedding_dim=int(x_train.shape[1]/2), kernel_size=3, cell='gru')
# MODEL = SalConvGRUwoALL(seq_len=window_size, output_len=10, n_features=x_train.shape[1], out_n_features=x_train.shape[1], embedding_dim=int(x_train.shape[1]/2), kernel_size=3, cell='gru')

MODEL.cuda()    
MODEL.load_state_dict(SAVED_MODEL["state"])


21


<All keys matched successfully>

In [10]:

def inference_SAL(dataloader, model, batch_size, TF_alpha):
    dist, dist_sal, fin_dist1, fin_dist2, guess, sal_list = [], [], [], [], [], []
    mse = torch.nn.MSELoss()
    model.eval()
    with torch.no_grad():
        for i, (x,y_recon,y_fore) in enumerate(dataloader):
            x = x.cuda()
            y_recon = y_recon.cuda()
            y_fore = y_fore.cuda()
            
            sal, dec_x, out = model(x)
            
            for y_r, y_f, d, o in zip(y_recon, y_fore, dec_x, out):
                d_s = torch.sum(torch.square(y_r - d)).cpu().numpy()
                d_o = torch.sum(torch.square(y_f - o)).cpu().numpy()
                # d_s = torch.mean(torch.square(y_r - d)).cpu().numpy()
                # d_o = torch.mean(torch.square(y_f - o)).cpu().numpy()
                
                dist_sal.append(d_s)
                dist.append(d_o)

                # inference_score1 = (d_s + d_o)/(1+TF_alpha)
                inference_score1 = ((1-TF_alpha)*d_s + TF_alpha*d_o)/(1+TF_alpha)
                inference_score2 = (d_s + TF_alpha*d_o)/(1+TF_alpha)
                fin_dist1.append(inference_score1)
                fin_dist2.append(inference_score2)
                # fin_dist.append((1-TF_alpha)*mse(y_s,d).item() + TF_alpha*mse(y_s, o).item())
                # fin_dist.append((TF_alpha*torch.cdist(y_s, o, p=2.0) + (1-TF_alpha)*torch.cdist(y_s, d, p=2.0)).cpu().numpy())

            guess.append(out.cpu().numpy())
            sal_list.append(sal.cpu().numpy())
        
            
    return (
        dist,
        dist_sal,
        fin_dist1,
        fin_dist2,
        np.concatenate(guess),
        np.concatenate(sal_list)
    )

### Inference

In [11]:
%%time
from sklearn import metrics

MODEL.eval()
alpha = 1.0
DIST, DIST_SAL, FIN_DIST1, FIN_DIST2, GUESS, SAL_LIST = inference_SAL(test_loader, MODEL, batch_size, TF_alpha=alpha)
DIST_train, DIST_SAL_train, FIN_DIST1_train, FIN_DIST2_train, GUESS_train, SAL_LIST_train = inference_SAL(train_loader, MODEL, batch_size, TF_alpha=alpha)

# anomaly score of test dataset
res = SAL_LIST[:,-1:,:]
res = res.reshape(res.shape[0], res.shape[2])
res_mean = np.mean(res, axis=1)
FD1wSAL = np.array(FIN_DIST1)*res_mean

# anomaly score of train dataset --> to calculate the threshold
res_train = SAL_LIST_train[:,-1:,:]
res_train = res_train.reshape(res_train.shape[0], res_train.shape[2])
res_mean_train = np.mean(res_train, axis=1)
FD1wSAL_train = np.array(FIN_DIST1_train)*res_mean_train


fpr, tpr, thresholds = metrics.roc_curve(y_test[-len(FD1wSAL):].astype(int), FD1wSAL, pos_label=1)
metrics.auc(fpr, tpr)

CPU times: user 15min 1s, sys: 27.2 s, total: 15min 28s
Wall time: 23.2 s


0.6021214114845479