In [1]:
import tqdm
import torch
import argparse
import warnings
from models.metric import *
from models.utils import load_data
import time
from models.adapt.networks import Net
from models.adapt import util
from torch_geometric.data import DataLoader
from models.adapt.inj_cora_dataset import InjCoraDataset
from models.adapt.OTC_dataset import BitcoinOTC
import torch.nn.functional as F
import numpy as np
import pandas as pd
import networkx as nx
from torch_geometric.utils.convert import (from_networkx)
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    Dataset,
    download_url,
    extract_gz,
)
def minmaxscaler(data):
    min = torch.min(data)
    max = torch.max(data)
    return (data - min)/(max-min)

In [2]:
# Cora dataset

auc, rec = [], []
# load pretrained model
# pretrain structure reconstruction networks
model1 = torch.load('str_pretrain.pth')
# pretrain feature reconstruction networks
model2 = torch.load('feat_pretrain.pth')

# load specific dataset
data = load_data('inj_cora')
k_all = sum(data.y)
# load sampled subgraph with 30 nodes
dataset = InjCoraDataset(root='./pygod/pretrain/data/inj_cora/eval_30')

# Inference with guided diffusion
# load subgraph classifier and related parameters
parser = util.parser
args = parser.parse_args(args=[])
args.num_classes = 2
args.num_features = 1433
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:0'

loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
classifier_model = Net(args).to(args.device)
classifier_model.load_state_dict(torch.load('./pygod/pretrain/latest_cora.pth'))
classifier_model.eval()

correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []

model1.eval()
model2.eval()

# calculate the prediction by subgraph classifier
for data_pre in loader:
    data_pre = data_pre.to(args.device)
    
    s_recover = model1.forward_di(data_pre)
    f_recover = model2.forward_di(data_pre)
    
    for t in range(0,3):      
        out, prop = classifier_model(model1.toden(s_recover,f_recover))
        pred = out.max(dim=1)[1]
        prop_s = prop[:, 0]
        prop_f = prop[:, 1]
        correct += pred.eq(data_pre.y).sum().item()
        loss += F.nll_loss(out, data_pre.y, reduction='sum').item()
        s_recover = model1.conditional_di(s_recover,prop_s)
        f_recover = model2.conditional_di(f_recover,prop_f)

    ss = model1.loss_func(data_pre , s_recover)
    sf = model2.loss_func(data_pre.x, f_recover)
    anomaly_label_graph.append(data_pre.label)
    anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()

y_label = np.array(y_label_list)
pred_t = np.array(pred_list)
data.y = data.y.bool().int()
fs = torch.zeros_like(ss)
# 0-1 scale
ss = minmaxscaler(F.normalize(ss, p=2, dim=-1))
sf = minmaxscaler(F.normalize(sf, p=2, dim=-1))

for i, y in enumerate(data.y):
    fs[i] += args.lamba * pred_t[i] *ss[i] + (1-args.lamba)*(1-pred_t[i]) * sf[i]
auc.append(eval_roc_auc(data.y, fs))
rec.append(eval_recall_at_k(data.y, fs, k_all))


import numpy as np
print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))



AUC: 0.8631	Recall: 0.5217


In [3]:
# Amazon dataset

auc, rec = [], []
# load pretrained model
# pretrain structure reconstruction networks
model1 = torch.load('str_pretrain.pth')
# pretrain feature reconstruction networks
model2 = torch.load('feat_pretrain.pth')

# load specific dataset
data = load_data('inj_amazon')
k_all = sum(data.y)
# load sampled subgraph with 30 nodes
dataset = InjCoraDataset(root='./pygod/pretrain/data/inj_amazon/eval_30')

# Inference with guided diffusion
# load subgraph classifier and related parameters
parser = util.parser
args = parser.parse_args(args=[])
args.num_classes = 2
args.num_features = 767
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:1' # distributed training and inference

loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
classifier_model = Net(args).to(args.device)
classifier_model.load_state_dict(torch.load('./pygod/pretrain/latest_ama.pth'))
classifier_model.eval()

correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []

model1.eval()
model2.eval()

# calculate the prediction by subgraph classifier
for data_pre in loader:
    data_pre = data_pre.to(args.device)
    
    s_recover = model1.forward_di(data_pre)
    f_recover = model2.forward_di(data_pre)
    
    for t in range(0,3):      
        out, prop = classifier_model(model1.toden(s_recover,f_recover))
        pred = out.max(dim=1)[1]
        prop_s = prop[:, 0]
        prop_f = prop[:, 1]
        correct += pred.eq(data_pre.y).sum().item()
        loss += F.nll_loss(out, data_pre.y, reduction='sum').item()
        s_recover = model1.conditional_di(s_recover,prop_s)
        f_recover = model2.conditional_di(f_recover,prop_f)

    ss = model1.loss_func(data_pre , s_recover)
    sf = model2.loss_func(data_pre.x, f_recover)
    anomaly_label_graph.append(data_pre.label)
    anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()

y_label = np.array(y_label_list)
pred_t = np.array(pred_list)
data.y = data.y.bool().int()
fs = torch.zeros_like(ss)
# 0-1 scale
ss = minmaxscaler(F.normalize(ss, p=2, dim=-1))
sf = minmaxscaler(F.normalize(sf, p=2, dim=-1))

for i, y in enumerate(data.y):
    fs[i] += args.lamba * pred_t[i] *ss[i] + (1-args.lamba)*(1-pred_t[i]) * sf[i]
auc.append(eval_roc_auc(data.y, fs))
rec.append(eval_recall_at_k(data.y, fs, k_all))


import numpy as np
print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))



AUC: 0.9231	Recall: 0.8012


In [4]:
# Flickr dataset

auc, rec = [], []
# load pretrained model
# pretrain structure reconstruction networks
model1 = torch.load('str_pretrain.pth')
# pretrain feature reconstruction networks
model2 = torch.load('feat_pretrain.pth')

# load specific dataset
data = load_data('inj_flickr')
k_all = sum(data.y)
# load sampled subgraph with 30 nodes
dataset = InjCoraDataset(root='./pygod/pretrain/data/inj_flickr/eval_30')

# Inference with guided diffusion
# load subgraph classifier and related parameters
parser = util.parser
args = parser.parse_args(args=[])
args.num_classes = 2
args.num_features = 500
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:2' # distributed training and inference

loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
classifier_model = Net(args).to(args.device)
classifier_model.load_state_dict(torch.load('./pygod/pretrain/latest_fli.pth'))
classifier_model.eval()

correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []

model1.eval()
model2.eval()

# calculate the prediction by subgraph classifier
for data_pre in loader:
    data_pre = data_pre.to(args.device)
    
    s_recover = model1.forward_di(data_pre)
    f_recover = model2.forward_di(data_pre)
    
    for t in range(0,3):      
        out, prop = classifier_model(model1.toden(s_recover,f_recover))
        pred = out.max(dim=1)[1]
        prop_s = prop[:, 0]
        prop_f = prop[:, 1]
        correct += pred.eq(data_pre.y).sum().item()
        loss += F.nll_loss(out, data_pre.y, reduction='sum').item()
        s_recover = model1.conditional_di(s_recover,prop_s)
        f_recover = model2.conditional_di(f_recover,prop_f)

    ss = model1.loss_func(data_pre , s_recover)
    sf = model2.loss_func(data_pre.x, f_recover)
    anomaly_label_graph.append(data_pre.label)
    anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()

y_label = np.array(y_label_list)
pred_t = np.array(pred_list)
data.y = data.y.bool().int()
fs = torch.zeros_like(ss)
# 0-1 scale
ss = minmaxscaler(F.normalize(ss, p=2, dim=-1))
sf = minmaxscaler(F.normalize(sf, p=2, dim=-1))

for i, y in enumerate(data.y):
    fs[i] += args.lamba * pred_t[i] *ss[i] + (1-args.lamba)*(1-pred_t[i]) * sf[i]
auc.append(eval_roc_auc(data.y, fs))
rec.append(eval_recall_at_k(data.y, fs, k_all))


import numpy as np
print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))



AUC: 0.9349	Recall: 0.5931


In [5]:
# Weibo dataset

auc, rec = [], []
# load pretrained model
# pretrain structure reconstruction networks
model1 = torch.load('str_pretrain.pth')
# pretrain feature reconstruction networks
model2 = torch.load('feat_pretrain.pth')

# load specific dataset
data = load_data('weibo')
k_all = sum(data.y)
# load sampled subgraph with 30 nodes
dataset = InjCoraDataset(root='./pygod/pretrain/data/weibo/eval_30')

# Inference with guided diffusion
# load subgraph classifier and related parameters
parser = util.parser
args = parser.parse_args(args=[])
args.num_classes = 2
args.num_features = 500
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:3' # distributed training and inference

loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
classifier_model = Net(args).to(args.device)
classifier_model.load_state_dict(torch.load('./pygod/pretrain/latest_weibo.pth'))
classifier_model.eval()

correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []

model1.eval()
model2.eval()

# calculate the prediction by subgraph classifier
for data_pre in loader:
    data_pre = data_pre.to(args.device)
    
    s_recover = model1.forward_di(data_pre)
    f_recover = model2.forward_di(data_pre)
    
    for t in range(0,3):      
        out, prop = classifier_model(model1.toden(s_recover,f_recover))
        pred = out.max(dim=1)[1]
        prop_s = prop[:, 0]
        prop_f = prop[:, 1]
        correct += pred.eq(data_pre.y).sum().item()
        loss += F.nll_loss(out, data_pre.y, reduction='sum').item()
        s_recover = model1.conditional_di(s_recover,prop_s)
        f_recover = model2.conditional_di(f_recover,prop_f)

    ss = model1.loss_func(data_pre , s_recover)
    sf = model2.loss_func(data_pre.x, f_recover)
    anomaly_label_graph.append(data_pre.label)
    anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()

y_label = np.array(y_label_list)
pred_t = np.array(pred_list)
data.y = data.y.bool().int()
fs = torch.zeros_like(ss)
# 0-1 scale
ss = minmaxscaler(F.normalize(ss, p=2, dim=-1))
sf = minmaxscaler(F.normalize(sf, p=2, dim=-1))

for i, y in enumerate(data.y):
    fs[i] += args.lamba * pred_t[i] *ss[i] + (1-args.lamba)*(1-pred_t[i]) * sf[i]
auc.append(eval_roc_auc(data.y, fs))
rec.append(eval_recall_at_k(data.y, fs, k_all))


import numpy as np
print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))

AUC: 0.9495	Recall: 0.7800


In [6]:
# OTC dataset

auc, rec = [], []
# load pretrained model
# pretrain structure reconstruction networks
model1 = torch.load('str_pretrain.pth')
# pretrain feature reconstruction networks
model2 = torch.load('feat_pretrain.pth')

# preprocess as reference [3]
df = pd.read_csv('/export/data/lixujia/bond/data/soc-sign-bitcoinotc.csv', header=None,
                 names=["source", "target", "r", "t"])
benign_label = df.groupby('target')['r'].mean().to_frame()
benign_label["benign"] = benign_label['r'].apply(lambda x: 1 if x >= 0.5 else 0)
benign_label.drop(axis=1, columns='r', inplace=True)
df2 = df.join(benign_label, on="source")
df3 = df2.drop(index=df2[(df2['benign'] == 0)].index.tolist())
graph_label = df3.groupby('target')['r'].mean().to_frame()
graph_label["fraud"] = graph_label['r'].apply(lambda x: 1 if x <= -0.5 else 0)
graph_label.drop(axis=1, columns='r', inplace=True)
df_final = df.join(graph_label, on="target")
center_nodes = set(df_final['target'].tolist())
t_avg = df.groupby('target')['r'].mean()
s_avg = df.groupby('source')['r'].mean()
t_deg = df.groupby('target')['r'].count()
s_deg = df.groupby('source')['r'].count()
t_attr = pd.concat([t_avg, t_deg], axis=1, ignore_index=False)
s_attr = pd.concat([s_avg, s_deg], axis=1, ignore_index=False)
t_attr.columns = ['0', '1']
s_attr.columns = ['0', '1']

G = nx.from_pandas_edgelist(df_final, 'source', 'target', ['r', 't'], create_using=nx.DiGraph())
for node in G.nodes:
    G.nodes[node]['fraud'] = 0
    G.nodes[node]['t_avg'] = 0
    G.nodes[node]['s_avg'] = 0
    G.nodes[node]['t_deg'] = 0
    G.nodes[node]['s_deg'] = 0
for index, row in graph_label.iterrows():
    G.nodes[index]['fraud'] = row['fraud']
for index, row in t_attr.iterrows():
    G.nodes[index]['t_avg'] = row['0']
    G.nodes[index]['t_deg'] = row['1']
for index, row in s_attr.iterrows():
    G.nodes[index]['s_avg'] = row['0']
    G.nodes[index]['s_deg'] = row['1']

temp = from_networkx(G)

x = torch.stack((temp.t_avg, temp.s_avg, temp.t_deg, temp.s_deg), 1).to(torch.float32)
edge_index = temp.edge_index
y = temp.fraud

data = Data(x=x, edge_index=edge_index, y=y)
k_all = sum(data.y)

parser = util.parser
args = parser.parse_args(args=[])

dataset = BitcoinOTC(root='./pygod/pretrain/data/OTC')
args.num_classes = 2
args.num_features = 4
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:4'

loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

classifier_model = Net(args).to(args.device)
classifier_model.load_state_dict(torch.load('./pygod/pretrain/latest_OTC.pth'))
classifier_model.eval()

correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []

model1.eval()
model2.eval()

# calculate the prediction by subgraph classifier
for data_pre in loader:
    data_pre = data_pre.to(args.device)
    
    s_recover = model1.forward_di(data_pre)
    f_recover = model2.forward_di(data_pre)
    
    for t in range(0,3):      
        out, prop = classifier_model(model1.toden(s_recover,f_recover))
        pred = out.max(dim=1)[1]
        prop_s = prop[:, 0]
        prop_f = prop[:, 1]
        correct += pred.eq(data_pre.y).sum().item()
        loss += F.nll_loss(out, data_pre.y, reduction='sum').item()
        s_recover = model1.conditional_di(s_recover,prop_s)
        f_recover = model2.conditional_di(f_recover,prop_f)

    ss = model1.loss_func(data_pre , s_recover)
    sf = model2.loss_func(data_pre.x, f_recover)
    anomaly_label_graph.append(data_pre.label)
    anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()

y_label = np.array(y_label_list)
pred_t = np.array(pred_list)
data.y = data.y.bool().int()
fs = torch.zeros_like(ss)
# 0-1 scale
ss = minmaxscaler(F.normalize(ss, p=2, dim=-1))
sf = minmaxscaler(F.normalize(sf, p=2, dim=-1))

for i, y in enumerate(data.y):
    fs[i] += args.lamba * pred_t[i] *ss[i] + (1-args.lamba)*(1-pred_t[i]) * sf[i]
auc.append(eval_roc_auc(data.y, fs))
rec.append(eval_recall_at_k(data.y, fs, k_all))


import numpy as np
print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))



AUC: 0.8788	Recall: 0.5042
