In [1]:
import tqdm
import torch
import argparse
import warnings
from pygod.metric import *
from pygod.utils import load_data
import time
from pygod.pretrain.networks import Net
from pygod.pretrain import util
from torch_geometric.data import DataLoader
from pygod.pretrain.inj_cora_dataset import InjCoraDataset
from pygod.pretrain.OTC_dataset import BitcoinOTC
import torch.nn.functional as F
import numpy as np
import pandas as pd
import networkx as nx
from torch_geometric.utils.convert import (from_networkx)
from torch_geometric.data import (
    Data,
    InMemoryDataset,
    Dataset,
    download_url,
    extract_gz,
)
def minmaxscaler(data):
    """Linearly rescale a tensor so that its values span [0, 1].

    Args:
        data: a non-empty ``torch.Tensor``; the minimum and maximum are taken
            over all of its elements.

    Returns:
        A tensor of the same shape as ``data`` holding
        ``(data - min) / (max - min)``. When every element is equal (zero
        range) the result is all zeros rather than NaN.
    """
    # Local names avoid shadowing the builtins ``min`` and ``max``.
    lo = torch.min(data)
    hi = torch.max(data)
    span = hi - lo
    # Constant input: divide by 1 so the (all-zero) numerator stays zero
    # instead of producing 0/0 = NaN, which would poison downstream metrics.
    span = torch.where(span == 0, torch.ones_like(span), span)
    return (data - lo) / span


In [2]:
# Cora dataset
#
# Per-node ensemble of two pretrained detectors: the subgraph classifier
# decides, for every node, whether the structure-reconstruction score (model1)
# or the feature-reconstruction score (model2) is used.

auc, rec = [], []
# load pretrained model
# NOTE(review): these checkpoints are whole pickled objects; torch.load can
# execute arbitrary code while unpickling, so only load trusted files.
# pretrain structure reconstruction networks
model1 = torch.load('domi_cora.pth')
# pretrain feature reconstruction networks
model2 = torch.load('gae_cora.pth')

# load specific dataset
data = load_data('inj_cora')
# NOTE(review): k_all is computed BEFORE the labels are binarized further
# down. If data.y holds multi-valued outlier codes, this is a sum of codes and
# not the number of anomalies - confirm that this is the intended k.
k_all = sum(data.y)
dataset = InjCoraDataset(root='./pygod/pretrain/data/inj_cora/eval_30')

# load subgraph classifier and related parameters
parser = util.parser
args = parser.parse_args(args=[])
args.num_classes = 2
args.num_features = 1433  # feature dimension used for this dataset
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:0'
else:
    args.device = 'cpu'
loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
classifier_model = Net(args).to(args.device)
# map_location lets a checkpoint saved on GPU load on the CPU fallback above.
classifier_model.load_state_dict(
    torch.load('./pygod/pretrain/latest_cora20.pth', map_location=args.device))
classifier_model.eval()
correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []

# calculate the prediction by subgraph classifier
# Pure inference: no_grad avoids building autograd graphs for every batch.
with torch.no_grad():
    for data_pre in loader:
        data_pre = data_pre.to(args.device)
        out, prop = classifier_model(data_pre)
        pred = out.max(dim=1)[1]

        prop = prop[:, 1]
        correct += pred.eq(data_pre.ys).sum().item()
        loss += F.nll_loss(out, data_pre.ys, reduction='sum').item()

        anomaly_label_graph.append(data_pre.label)
        anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()
y_label = np.array(y_label_list)
pred_label = np.array(pred_list)

# Collapse the labels to binary {0, 1}.
data.y = data.y.bool().int()


score = model1.decision_score_
score2 = model2.decision_score_
# Recorded result: AUC: 0.9079±0.0000 (0.9079) AP: 0.9553±0.0000 (0.9553)      Recall: 0.8415±0.0000 (0.8415)
score = minmaxscaler(score)
# Cap the scaled structure score at 0.65.
# Recorded result (50% 0.5): AUC: 0.8821±0.0000 (0.8821) AP: 0.9054±0.0000 (0.9054)      Recall: 0.8417±0.0000 (0.8417)
score = torch.clamp(score, max=0.65)


score = F.normalize(score, p=2, dim=-1)
score2 = F.normalize(score2, p=2, dim=-1)

score1 = minmaxscaler(score)
score2 = minmaxscaler(score2)


# Nodes the classifier predicts as class 1 take the structure score, all other
# nodes take the feature score. Only the first len(data.y) predictions are
# consulted, matching a per-node loop over data.y.
use_structure = torch.tensor(pred_list[:len(data.y)], device=score1.device) == 1
emsemble = torch.where(use_structure, score1, score2.to(score1))


auc.append(eval_roc_auc(data.y, emsemble))
rec.append(eval_recall_at_k(data.y, emsemble, k_all))


print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))



AUC: 0.8631	Recall: 0.5217


In [3]:
# Amazon dataset
#
# Per-node ensemble of two pretrained detectors: the subgraph classifier
# decides, for every node, whether the structure-reconstruction score (model1)
# or the feature-reconstruction score (model2) is used.

auc, rec = [], []
# load pretrained model
# NOTE(review): these checkpoints are whole pickled objects; torch.load can
# execute arbitrary code while unpickling, so only load trusted files.
# pretrain structure reconstruction networks
model1 = torch.load('domi_ama.pth')
# pretrain feature reconstruction networks
model2 = torch.load('gae_ama.pth')

# load specific dataset
data = load_data('inj_amazon')
# NOTE(review): k_all is computed BEFORE the labels are binarized further
# down. If data.y holds multi-valued outlier codes, this is a sum of codes and
# not the number of anomalies - confirm that this is the intended k.
k_all = sum(data.y)
dataset = InjCoraDataset(root='./pygod/pretrain/data/inj_amazon/eval_30')

# load subgraph classifier and related parameters
parser = util.parser
args = parser.parse_args(args=[])
args.num_classes = 2
args.num_features = 767  # feature dimension used for this dataset
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:0'
else:
    args.device = 'cpu'
loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
classifier_model = Net(args).to(args.device)
# map_location lets a checkpoint saved on GPU load on the CPU fallback above.
classifier_model.load_state_dict(
    torch.load('./pygod/pretrain/latest_ama.pth', map_location=args.device))
classifier_model.eval()
correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []

# calculate the prediction by subgraph classifier
# Pure inference: no_grad avoids building autograd graphs for every batch.
with torch.no_grad():
    for data_pre in loader:
        data_pre = data_pre.to(args.device)
        out, prop = classifier_model(data_pre)
        pred = out.max(dim=1)[1]

        prop = prop[:, 1]
        correct += pred.eq(data_pre.ys).sum().item()
        loss += F.nll_loss(out, data_pre.ys, reduction='sum').item()

        anomaly_label_graph.append(data_pre.label)
        anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()
y_label = np.array(y_label_list)
pred_label = np.array(pred_list)

# Collapse the labels to binary {0, 1}.
data.y = data.y.bool().int()


score = model1.decision_score_
score2 = model2.decision_score_


score = F.normalize(score, p=2, dim=-1)
score2 = F.normalize(score2, p=2, dim=-1)

score1 = minmaxscaler(score)
score2 = minmaxscaler(score2)


# Nodes the classifier predicts as class 1 take the structure score, all other
# nodes take the feature score. Only the first len(data.y) predictions are
# consulted, matching a per-node loop over data.y.
use_structure = torch.tensor(pred_list[:len(data.y)], device=score1.device) == 1
emsemble = torch.where(use_structure, score1, score2.to(score1))


auc.append(eval_roc_auc(data.y, emsemble))
rec.append(eval_recall_at_k(data.y, emsemble, k_all))


print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))



AUC: 0.9231	Recall: 0.8012


In [4]:
# Flickr dataset
#
# Per-node ensemble of two pretrained detectors: the subgraph classifier
# decides, for every node, whether the structure-reconstruction score (model1)
# or the feature-reconstruction score (model2) is used.

auc, rec = [], []
# load pretrained model
# NOTE(review): these checkpoints are whole pickled objects; torch.load can
# execute arbitrary code while unpickling, so only load trusted files.
# pretrain structure reconstruction networks
model1 = torch.load('domi_fli.pth')
# pretrain feature reconstruction networks
model2 = torch.load('gae_fli.pth')

# load specific dataset
data = load_data('inj_flickr')
# NOTE(review): k_all is computed BEFORE the labels are binarized further
# down. If data.y holds multi-valued outlier codes, this is a sum of codes and
# not the number of anomalies - confirm that this is the intended k.
k_all = sum(data.y)
dataset = InjCoraDataset(root='./pygod/pretrain/data/inj_flickr/eval_10')

# load subgraph classifier and related parameters
parser = util.parser
args = parser.parse_args(args=[])
args.num_classes = 2
args.num_features = 500  # feature dimension used for this dataset
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:0'
else:
    args.device = 'cpu'
loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
classifier_model = Net(args).to(args.device)
# map_location lets a checkpoint saved on GPU load on the CPU fallback above.
classifier_model.load_state_dict(
    torch.load('./pygod/pretrain/latest_flickr1.pth', map_location=args.device))
classifier_model.eval()
correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []

# calculate the prediction by subgraph classifier
# Pure inference: no_grad avoids building autograd graphs for every batch.
with torch.no_grad():
    for data_pre in loader:
        data_pre = data_pre.to(args.device)
        out, prop = classifier_model(data_pre)
        pred = out.max(dim=1)[1]

        prop = prop[:, 1]
        correct += pred.eq(data_pre.ys).sum().item()
        loss += F.nll_loss(out, data_pre.ys, reduction='sum').item()

        anomaly_label_graph.append(data_pre.label)
        anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()
y_label = np.array(y_label_list)
pred_label = np.array(pred_list)

# Collapse the labels to binary {0, 1}.
data.y = data.y.bool().int()

score = model1.decision_score_
score2 = model2.decision_score_


score = F.normalize(score, p=2, dim=-1)
score2 = F.normalize(score2, p=2, dim=-1)

score1 = minmaxscaler(score)
score2 = minmaxscaler(score2)


# Nodes the classifier predicts as class 1 take the structure score, all other
# nodes take the feature score. Only the first len(data.y) predictions are
# consulted, matching a per-node loop over data.y.
use_structure = torch.tensor(pred_list[:len(data.y)], device=score1.device) == 1
emsemble = torch.where(use_structure, score1, score2.to(score1))


auc.append(eval_roc_auc(data.y, emsemble))
rec.append(eval_recall_at_k(data.y, emsemble, k_all))


print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))



AUC: 0.9349	Recall: 0.5931


In [5]:
# Weibo dataset
#
# No subgraph classifier is run for this dataset: the reported score is the
# rescaled decision score of a single pretrained detector.

auc, rec = [], []
# load pretrained model
# NOTE(review): both detector slots were loaded from 'gae_weibo.pth', although
# the first one was labelled "structure reconstruction" - confirm that no
# separate structure checkpoint was intended for Weibo.
# NOTE(review): the checkpoint is a whole pickled object; torch.load can
# execute arbitrary code while unpickling, so only load trusted files.
model1 = torch.load('gae_weibo.pth')
# Same checkpoint file, so reuse the loaded object instead of unpickling twice.
model2 = model1

# load specific dataset
data = load_data('weibo')
# NOTE(review): k_all is computed before the labels are binarized below; if
# data.y is not already {0, 1} this is not the number of anomalies - confirm.
k_all = sum(data.y)

# Collapse the labels to binary {0, 1}.
data.y = data.y.bool().int()

score = model1.decision_score_

score = F.normalize(score, p=2, dim=-1)

score1 = minmaxscaler(score)
# Both slots come from the same checkpoint and go through the same
# transformation, so the second score is the first one.
score2 = score1

# Every node takes score1 (the alternative branch was unreachable), so the
# ensemble is simply a copy of it.
emsemble = score1.clone()


auc.append(eval_roc_auc(data.y, emsemble))
rec.append(eval_recall_at_k(data.y, emsemble, k_all))


print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))

AUC: 0.9495	Recall: 0.7800


In [6]:
# OTC dataset
#
# Builds a node-level anomaly-detection graph from the Bitcoin-OTC signed
# rating edge list: derives a per-node "fraud" label, four node features, and
# packs everything into a torch_geometric `Data` object named `data`.

auc, rec = [], []
# NOTE(review): absolute, machine-specific path - this cell only runs where
# that file exists; consider moving it to a configurable data directory.
# Columns are named source, target, r, t (presumably rating and timestamp -
# confirm against the dataset description).
df = pd.read_csv('/export/data/lixujia/bond/data/soc-sign-bitcoinotc.csv', header=None,
                 names=["source", "target", "r", "t"])
# Mean incoming `r` per target node.
benign_label = df.groupby('target')['r'].mean().to_frame()
benign_label["benign"] = benign_label['r'].apply(lambda x: 1 if x >= 0.5 else 0)  # mark the node as a confirmed benign user
# Merge the benign flag back onto the original df (keyed by the edge's source)
benign_label.drop(axis=1, columns='r', inplace=True)
df2 = df.join(benign_label, on="source")
# Only rows whose flag equals 0 are dropped; rows whose source received no flag
# from the join are presumably kept (NaN != 0) - verify this is intended.
df3 = df2.drop(index=df2[(df2['benign'] == 0)].index.tolist())  # keep only edges whose source is benign
# Find the anomalous nodes: mean `r` over the retained edges is <= -0.5
graph_label = df3.groupby('target')['r'].mean().to_frame()
graph_label["fraud"] = graph_label['r'].apply(lambda x: 1 if x <= -0.5 else 0)
graph_label.drop(axis=1, columns='r', inplace=True)
df_final = df.join(graph_label, on="target")
# Set of all target node ids; not used again in this cell.
center_nodes = set(df_final['target'].tolist())

# Add node features
# mean incoming rating
t_avg = df.groupby('target')['r'].mean()
# mean outgoing rating
s_avg = df.groupby('source')['r'].mean()
# in-degree
t_deg = df.groupby('target')['r'].count()
# out-degree
s_deg = df.groupby('source')['r'].count()
# Column '0' holds the mean and column '1' holds the count for each frame.
t_attr = pd.concat([t_avg, t_deg], axis=1, ignore_index=False)
s_attr = pd.concat([s_avg, s_deg], axis=1, ignore_index=False)
t_attr.columns = ['0', '1']
s_attr.columns = ['0', '1']

# Build the full directed graph, carrying `r` and `t` as edge attributes
G = nx.from_pandas_edgelist(df_final, 'source', 'target', ['r', 't'], create_using=nx.DiGraph())
# Give every node all five attributes with a default of 0 first, so nodes that
# are missing from the per-target / per-source tables below still have values.
for node in G.nodes:
    G.nodes[node]['fraud'] = 0
    G.nodes[node]['t_avg'] = 0
    G.nodes[node]['s_avg'] = 0
    G.nodes[node]['t_deg'] = 0
    G.nodes[node]['s_deg'] = 0
# Overwrite the defaults with the computed label and features.
for index, row in graph_label.iterrows():
    G.nodes[index]['fraud'] = row['fraud']
for index, row in t_attr.iterrows():
    G.nodes[index]['t_avg'] = row['0']
    G.nodes[index]['t_deg'] = row['1']
for index, row in s_attr.iterrows():
    G.nodes[index]['s_avg'] = row['0']
    G.nodes[index]['s_deg'] = row['1']

# Convert to a torch_geometric object; node attributes become fields of `temp`.
temp = from_networkx(G)

# Stack the four features along dim 1, in the order t_avg, s_avg, t_deg, s_deg.
x = torch.stack((temp.t_avg, temp.s_avg, temp.t_deg, temp.s_deg), 1).to(torch.float32)  # node attributes
edge_index = temp.edge_index
# y = torch.stack((temp.fraud), 1).to(torch.float32)  # node attributes
y = temp.fraud

data = Data(x=x, edge_index=edge_index, y=y)



# Score the OTC graph: per-node ensemble of a structure detector and a feature
# detector, gated by the pretrained subgraph classifier.

# NOTE(review): these checkpoints are whole pickled objects; torch.load can
# execute arbitrary code while unpickling, so only load trusted files.
model = torch.load('domi_OTC.pth')
score = model.decision_score_

# NOTE(review): k_all is computed before the labels are binarized below; if
# data.y is not already {0, 1} this is not the number of anomalies - confirm.
k_all = sum(data.y)

model2 = torch.load('gae_OTC.pth')
score2 = model2.decision_score_


score = minmaxscaler(score)

score = F.normalize(score, p=2, dim=-1)
score2 = F.normalize(score2, p=2, dim=-1)

score1 = minmaxscaler(score)
score2 = minmaxscaler(score2)


# Load the pretrained subgraph classifier
parser = util.parser
args = parser.parse_args(args=[])

dataset = BitcoinOTC(root='./pygod/pretrain/data/OTC')
args.num_classes = 2
args.num_features = 4  # matches the four node features stacked into data.x
if torch.cuda.is_available():
    torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:0'
else:
    args.device = 'cpu'

loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)


pretrain_model = Net(args).to(args.device)
# map_location lets a checkpoint saved on GPU load on the CPU fallback above.
pretrain_model.load_state_dict(
    torch.load('./pygod/pretrain/latest_OTC20.pth', map_location=args.device))
pretrain_model.eval()
correct = 0.
loss = 0.
anomaly_label_graph = []
anomaly_pred_graph = []


# Pure inference: no_grad avoids building autograd graphs for every batch.
with torch.no_grad():
    for data_pre in loader:
        data_pre = data_pre.to(args.device)
        out, prop = pretrain_model(data_pre)
        pred = out.max(dim=1)[1]

        prop = prop[:, 1]
        correct += pred.eq(data_pre.y).sum().item()
        loss += F.nll_loss(out, data_pre.y, reduction='sum').item()

        anomaly_label_graph.append(data_pre.y)
        anomaly_pred_graph.append(pred)

y_label_list = torch.cat(anomaly_label_graph, 0).view(-1).tolist()
pred_list = torch.cat(anomaly_pred_graph, 0).view(-1).tolist()

# Collapse the labels to binary {0, 1}.
data.y = data.y.bool().int()

num_nodes = len(data.y)
# The classifier dataset must list nodes in the same order, with the same
# labels, as the graph built above; otherwise the gating below is misaligned.
assert data.y.tolist() == y_label_list[:num_nodes], \
    "classifier dataset labels do not match data.y"

# Nodes the classifier predicts as class 1 take the structure score, all other
# nodes take the feature score.
use_structure = torch.tensor(pred_list[:num_nodes], device=score1.device) == 1
emsemble = torch.where(use_structure, score1, score2.to(score1))


auc.append(eval_roc_auc(data.y, emsemble))
rec.append(eval_recall_at_k(data.y, emsemble, k_all))


print("AUC: {:.4f}\t"
      "Recall: {:.4f}"
      .format(np.mean(auc),
              np.mean(rec)))



AUC: 0.8788	Recall: 0.5042
