In [1]:
import pandas as pd

# Load the clustering result (one row per interaction).
cluster = pd.read_csv('./labels/JODIE/wikipedia/cluster_labels.csv')

# Row label of the latest interaction of every source node, and those rows keyed by node id.
last_indices_cluster = cluster.groupby('src_node_ids')['timestamps'].idxmax()
last_labels_cluster = cluster.loc[last_indices_cluster].set_index('src_node_ids')

# Load the ground-truth labels.
true_labels = pd.read_csv('./processed_data/wikipedia/ml_wikipedia.csv')

# Ground-truth row of every node's latest interaction, keyed by node id.
last_indices_true = true_labels.groupby('u')['ts'].idxmax()
true_last_labels = true_labels.loc[last_indices_true].set_index('u')

# Replace ONLY the latest row of each node with its ground-truth label.
# The rows are addressed through their original row labels: selecting by node id
# (after set_index('src_node_ids')) hits every row of a node because the ids repeat,
# which would overwrite all cluster labels instead of just the last one.
last_node_ids = cluster.loc[last_indices_cluster, 'src_node_ids']
replacement = last_node_ids.map(true_last_labels['label'])
# Nodes without a ground-truth entry keep their cluster label instead of becoming NaN.
replacement = replacement.dropna().astype(true_last_labels['label'].dtype)
cluster.loc[replacement.index, 'labels'] = replacement.to_numpy()

# Keep 'src_node_ids' as the first column so the saved CSV has the same layout that a
# set_index/reset_index round trip on that column would produce.
cluster = cluster[['src_node_ids'] + [col for col in cluster.columns if col != 'src_node_ids']]

# Save the modified clustering result.
cluster.to_csv('./labels/JODIE/wikipedia/cluster_labels_with_true_last_labels.csv', index=False)

# Sanity check of the replacement: the labels compared here were just overwritten with
# the ground truth, so this verifies the write and is NOT a measure of clustering quality.
last_labels_check = cluster.loc[last_indices_cluster].set_index('src_node_ids')

# Align the ground truth on the node ids before comparing.
expected_last_labels = true_last_labels['label'].reindex(last_labels_check.index)
correct_predictions = (expected_last_labels == last_labels_check['labels']).sum()
accuracy = correct_predictions / len(true_last_labels)

print(f"Accuracy: {accuracy:.2%}")

Accuracy: 100.00%


In [95]:
# Load the processed rt_wiki interaction table and display the row at position 87.
ml_wiki = pd.read_csv('./processed_data/rt_wiki/ml_rt_wiki.csv')
ml_wiki.iloc[87]

Unnamed: 0.1        87.0
Unnamed: 0          87.0
u                   33.0
i                 8260.0
ts                2053.0
label                0.0
idx                 88.0
last_timestamp    2053.0
Name: 87, dtype: float64

In [96]:
# Row-by-row agreement between the source node ids of ml_wiki and of cluster,
# expressed as a fraction of the rows of ml_wiki.
node_ids_match = ml_wiki['u'] == cluster['src_node_ids']
correct_predictions = node_ids_match.sum()
accuracy = correct_predictions / len(ml_wiki)
accuracy

1.0

In [97]:
# Copy the cluster labels into ml_wiki as floats (vectorised cast instead of a
# per-element Python call). The assignment aligns on the row index, so it relies on
# both tables having the same rows in the same order.
ml_wiki['label'] = cluster['labels'].astype(float)

In [98]:
ml_wiki.iloc[87]

Unnamed: 0.1        87.0
Unnamed: 0          87.0
u                   33.0
i                 8260.0
ts                2053.0
label                1.0
idx                 88.0
last_timestamp    2053.0
Name: 87, dtype: float64

In [99]:
# Write the updated table back to the file it was read from. index=False keeps the row
# index out of the CSV; without it every read/write round trip adds another
# 'Unnamed: 0*' column to the file.
ml_wiki.to_csv('./processed_data/rt_wiki/ml_rt_wiki.csv', index=False)

In [3]:
import torch
torch.cuda.is_available()

True

In [4]:
# Allocate an uninitialised 10000-element tensor and move it to the device 'cuda:2'.
torch.empty(10000).to('cuda:2')

tensor([ 3.3631e-44,  6.8589e+22, -5.3770e-17,  ...,  0.0000e+00,
         0.0000e+00,  0.0000e+00], device='cuda:2')

## Check dataset split

In [19]:
import numpy as np
import pandas as pd
import random
dataset_name = 'reddit'
val_ratio = 0.15
test_ratio = 0.15

# Node and edge features are zero-padded up to this common width.
NODE_FEAT_DIM = EDGE_FEAT_DIM = 172

graph_df = pd.read_csv('./processed_data/{}/ml_{}.csv'.format(dataset_name, dataset_name))
edge_raw_features = np.load('./processed_data/{}/ml_{}.npy'.format(dataset_name, dataset_name))
node_raw_features = np.load('./processed_data/{}/ml_{}_node.npy'.format(dataset_name, dataset_name))
print(f'node_raw_features.shape: {node_raw_features.shape}')
print(
    "The feature matrix is entirely zero."
    if np.all(node_raw_features == 0)
    else "The feature matrix is not entirely zero."
)

# Number of columns each feature matrix is short of the target width (must not be negative).
node_pad_width = NODE_FEAT_DIM - node_raw_features.shape[1]
edge_pad_width = EDGE_FEAT_DIM - edge_raw_features.shape[1]
assert node_pad_width >= 0, f'Node feature dimension in dataset {dataset_name} is bigger than {NODE_FEAT_DIM}!'
assert edge_pad_width >= 0, f'Edge feature dimension in dataset {dataset_name} is bigger than {EDGE_FEAT_DIM}!'

# Append zero columns on the right so both matrices reach the target width.
if node_pad_width > 0:
    node_zero_padding = np.zeros((node_raw_features.shape[0], node_pad_width))
    node_raw_features = np.hstack((node_raw_features, node_zero_padding))
if edge_pad_width > 0:
    edge_zero_padding = np.zeros((edge_raw_features.shape[0], edge_pad_width))
    edge_raw_features = np.hstack((edge_raw_features, edge_zero_padding))

assert (node_raw_features.shape[1], edge_raw_features.shape[1]) == (NODE_FEAT_DIM, EDGE_FEAT_DIM), 'Unaligned feature dimensions after feature padding!'

# Timestamps at which the validation and the test portions begin (quantiles of ts).
split_quantiles = [1 - val_ratio - test_ratio, 1 - test_ratio]
val_time, test_time = np.quantile(graph_df['ts'], split_quantiles)

src_node_ids = graph_df['u'].values.astype(np.longlong)
dst_node_ids = graph_df['i'].values.astype(np.longlong)
node_interact_times = graph_df['ts'].values.astype(np.float64)
edge_ids = graph_df['idx'].values.astype(np.longlong)
if dataset_name in ('bot', 'bot22'):
    # These datasets carry two label columns; labels_time is not defined for them.
    label1 = graph_df['label1'].values
    label2 = graph_df['label2'].values
    labels = [label1, label2]
else:
    labels = graph_df['label'].values
    labels_time = graph_df['last_timestamp'].values
# The setting of seed follows previous works
random.seed(2020)

# Chronological split: train up to val_time, validation in (val_time, test_time], test afterwards.
train_mask = node_interact_times <= val_time
val_mask = (node_interact_times > val_time) & (node_interact_times <= test_time)
test_mask = node_interact_times > test_time

node_raw_features.shape: (10985, 172)
The feature matrix is entirely zero.


In [20]:
# Labels of the rows whose interaction time equals that row's last_timestamp value.
final_labels = labels[node_interact_times == labels_time]

In [21]:
# Training rows whose interaction time equals their last_timestamp value.
train_gt = labels[train_mask & (node_interact_times == labels_time)]

In [22]:
# Validation rows whose interaction time equals their last_timestamp value.
valid_gt = labels[val_mask & (node_interact_times == labels_time)]

In [23]:
# Test rows whose interaction time equals their last_timestamp value.
test_gt = labels[test_mask & (node_interact_times == labels_time)]

In [24]:
# For each split: number of interactions, number of ground-truth rows, their share of
# the split, and the sum of the ground-truth labels divided by their count.
train_total = len(node_interact_times[train_mask])
print(f'trian 共包含: {train_total}')
print(f'trian 共包含gt: {len(train_gt)}')
print(f'train 中 gt 的占比: {len(train_gt)/train_total}')
print(f'train gt中class 1 的占比: {train_gt.sum()/len(train_gt)}\n')

valid_total = len(node_interact_times[val_mask])
print(f'valid 共包含: {valid_total}')
print(f'valid 共包含gt: {len(valid_gt)}')
print(f'valid 中 gt 的占比: {len(valid_gt)/valid_total}')
print(f'valid gt中class 1 的占比:{valid_gt.sum()/len(valid_gt)} \n')

test_total = len(node_interact_times[test_mask])
print(f'test 共包含: {test_total}')
print(f'test 共包含gt: {len(test_gt)}')
print(f'test 中 gt 的占比: {len(test_gt)/test_total}')
print(f'test gt中class 1 的占比:{test_gt.sum()/len(test_gt)}')

trian 共包含: 470713
trian 共包含gt: 657
train 中 gt 的占比: 0.001395754950468757
train gt中class 1 的占比: 0.289193302891933

valid 共包含: 100867
valid 共包含gt: 687
valid 中 gt 的占比: 0.006810949071549665
valid gt中class 1 的占比:0.11935953420669577 

test 共包含: 100867
test 共包含gt: 8656
test 中 gt 的占比: 0.0858159754924802
test gt中class 1 的占比:0.010859519408502773


In [18]:
node_raw_features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
import os
import re
import torch

def load_src_embeddings(embedding_folder):
    """
    Load the source-node embeddings stored as ``embeddings_<number>.pt`` files and
    concatenate them along the first axis.

    Files are processed in ascending order of the number found in their name. ``.pt``
    files whose name does not contain ``embeddings_<number>`` are skipped. Every loaded
    file must be a mapping with the keys 'src_node_embeddings' (a tensor), 'timestamps'
    and 'src_node_ids'.

    :param embedding_folder: str, directory that contains the ``.pt`` files
    :return: tuple (src_embeddings, timestamps, src_node_ids) of numpy arrays
    :raises ValueError: if the folder contains no matching embedding file
    """
    name_pattern = re.compile(r'embeddings_(\d+\.?\d*)')

    # Pair every matching file with its numeric key so the sort is numeric, not lexical.
    keyed_files = []
    for file_name in os.listdir(embedding_folder):
        if not file_name.endswith('.pt'):
            continue
        match = name_pattern.search(file_name)
        if match is None:
            # Not an embedding dump (e.g. an unrelated checkpoint); ignore it.
            continue
        keyed_files.append((float(match.group(1)), file_name))

    if not keyed_files:
        raise ValueError(f'No embeddings_<number>.pt files found in {embedding_folder!r}')

    # Ties on the numeric key fall back to the file name, which keeps the order deterministic.
    keyed_files.sort()

    src_embeddings = []
    timestamps = []
    src_node_ids = []

    for _, file_name in keyed_files:
        file_path = os.path.join(embedding_folder, file_name)
        # NOTE: torch.load unpickles the file; only load files from a trusted source.
        # map_location='cpu' lets files that were saved from a GPU be converted to numpy.
        data = torch.load(file_path, map_location='cpu')
        src_embeddings.append(data['src_node_embeddings'].detach().cpu().numpy())
        timestamps.append(np.asarray(data['timestamps']))
        src_node_ids.append(np.asarray(data['src_node_ids']))

    src_embeddings = np.concatenate(src_embeddings, axis=0)
    timestamps = np.concatenate(timestamps, axis=0)
    src_node_ids = np.concatenate(src_node_ids, axis=0)

    return src_embeddings, timestamps, src_node_ids

embedding_folder = './embeddings/JODIE/wikipedia/'
# Not used in this cell; presumably where the cluster labels are written — TODO confirm.
save_path = './labels/JODIE/wikipedia/cluster_labels.csv'

# Load the data. NOTE: this rebinds the notebook-level name src_node_ids, which the
# dataset-split cell also assigns.
src_embeddings, timestamps, src_node_ids = load_src_embeddings(embedding_folder)

In [29]:
# Slice the embeddings with the boolean masks built in the dataset-split cell.
# NOTE(review): that cell sets dataset_name = 'reddit' while the embeddings are loaded
# from the wikipedia folder — confirm the masks were built for the same dataset.
train_src_embeddings = src_embeddings[train_mask]
valid_src_embeddings = src_embeddings[val_mask]
test_src_embeddings = src_embeddings[test_mask]

In [31]:
def print_statistics(name, embeddings):
    """
    Print summary statistics of an embedding matrix.

    Mean, standard deviation, minimum and maximum are each computed per column
    (axis 0) and the per-column values are then summed into one number.

    :param name: str, title printed before the statistics
    :param embeddings: ndarray, the matrix to summarise
    :return: None
    """
    print(f"{name} Statistics:")
    total_mean, total_std, total_min, total_max = [
        reducer(embeddings, axis=0).sum()
        for reducer in (np.mean, np.std, np.min, np.max)
    ]
    print(f"Mean: {total_mean}")
    print(f"Std: {total_std}")
    print(f"Min: {total_min}")
    print(f"Max: {total_max}")
    print(f"Shape: {embeddings.shape}")
    print()

# Print the statistics of the train, validation and test embeddings
print_statistics("Train Embeddings", train_src_embeddings)
print_statistics("Validation Embeddings", valid_src_embeddings)
print_statistics("Test Embeddings", test_src_embeddings)

Train Embeddings Statistics:
Mean: -3.03743839263916
Std: 129.05203247070312
Min: -630.0359497070312
Max: 628.6611328125
Shape: (110232, 172)

Validation Embeddings Statistics:
Mean: -5.34822416305542
Std: 149.9070587158203
Min: -727.5516357421875
Max: 724.584228515625
Shape: (23621, 172)

Test Embeddings Statistics:
Mean: -5.715201377868652
Std: 161.79066467285156
Min: -832.5928955078125
Max: 846.3356323242188
Shape: (23621, 172)



In [9]:
import torch
# Load the saved validation/test labels and predictions.
# NOTE: torch.load unpickles the files; only load files from a trusted source.
val_labels = torch.load('./some_data/val_labels.pt')
val_preds = torch.load('./some_data/val_predicts.pt')
test_labels = torch.load('./some_data/test_labels.pt')
test_preds = torch.load('./some_data/test_predicts.pt')

In [10]:
from sklearn.metrics import accuracy_score
# Turn the validation predictions into 0/1 with a 0.5 threshold, then score them
# against the validation labels.
binary_predicts = (val_preds.cpu().numpy() >= 0.5).astype(int)  
accuracy = accuracy_score(y_true=val_labels.cpu().numpy(), y_pred=binary_predicts)
accuracy

0.9836065573770492

In [14]:
binary_predicts.sum()

3

In [17]:
# Same 0.5 threshold on the test predictions; this rebinds binary_predicts, which
# previously held the validation result.
binary_predicts = (test_preds.cpu().numpy() >= 0.5).astype(int) 

In [18]:
binary_predicts.sum()

17

In [1]:
import numpy as np

In [1]:
import torch
class inst:
    """Minimal holder object exposing one tensor attribute, ``val``."""

    def __init__(self):
        # Integer tensor holding 0, 1, 2.
        self.val = torch.arange(3)
a = inst()

In [2]:
a.val

tensor([0, 1, 2])

In [5]:
def func(item):
    """
    Show that rebinding a local name does not change the attribute it was read from.

    :param item: object with a ``val`` attribute
    :return: None, ``item.val`` is printed
    """
    # The local name refers to the same object as item.val ...
    model = item.val
    # ... and is then rebound to a new tensor; nothing is assigned to item.val here.
    model = torch.tensor([2,3,4])
    print(item.val)
func(a)
a.val

tensor([0, 1, 2])


tensor([0, 1, 2])

In [1]:
import random
import torch
import torch.nn as nn
import numpy as np

from utils.DataLoader import Data


def set_random_seed(seed: int = 0):
    """
    seed the Python, NumPy and PyTorch random number generators and set the cuDNN flags
    :param seed: int, value handed to every seeding function
    :return:
    """
    # CPU-side generators share the same seed.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # CUDA generators are only seeded when CUDA is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    # cuDNN flags are set regardless of CUDA availability: benchmark off, deterministic on.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(0)
a = torch.randn(1, 3)
print(a)

tensor([[ 1.5410, -0.2934, -2.1788]])
