# 0. Data Preview

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import json
import os
import random
import sys
import torch
import json
import hashlib
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
from torch_geometric.utils import coalesce, to_undirected

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the raw edge splits of the random_bluesky dataset.
path = '../dataset/random_bluesky/edge_sets_random.json'
# A context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open(path, 'r') as f:
    bluesky_data = json.load(f)

# Summarise every top-level entry: its type, plus its length (lists)
# or its keys (dicts).
for key, value in bluesky_data.items():
    print("============")
    print(key, type(value))
    if isinstance(value, list):
        print(len(value))
    elif isinstance(value, dict):
        print(value.keys())

train_edges <class 'list'>
17409
valid_edges <class 'list'>
1088
test_positive_edges <class 'list'>
3265
test_negative_edges <class 'list'>
15013


# 1. Split Data

In [4]:
def save(file_name, data_name, edge):
    """Write an edge tensor to ``../dataset/<data_name>/<file_name>.txt``.

    One line is written per column of ``edge``: the entry from row 0, a tab,
    the entry from row 1, then a newline. An existing file is overwritten.

    Args:
        file_name: Base name of the output file; ``.txt`` is appended.
        data_name: Sub-directory of ``../dataset/`` that receives the file.
        edge: Tensor whose rows 0 and 1 hold the two endpoints of each edge.
    """
    # Convert each row to a Python list once instead of calling .item()
    # on every element inside the loop.
    sources = edge[0].tolist()
    targets = edge[1].tolist()
    with open('../dataset/' + data_name + '/' + file_name + '.txt', 'w') as f:
        # No per-line flush: leaving the `with` block flushes and closes.
        f.writelines(str(s) + '\t' + str(t) + '\n' for s, t in zip(sources, targets))

def load_json_data(file_path):
    """Open ``file_path`` for reading and return its parsed JSON content."""
    with open(file_path, 'r') as handle:
        return json.load(handle)

In [5]:
data_name = 'random_bluesky'

# Edge splits as lists of (src, dst) tuples, still using the original node ids.
train_pos, valid_pos, test_pos, test_neg = [], [], [], []
# Every node id that appears in any split, test negatives included.
node_set = set()

data = load_json_data(path)
# One data-driven pass replaces four copy-pasted loops: each JSON key is
# paired with the list it fills.
for split_key, split_edges in (
    ('train_edges', train_pos),
    ('valid_edges', valid_pos),
    ('test_positive_edges', test_pos),
    # Original note: this split needs to be disabled if the test data is shuffled.
    ('test_negative_edges', test_neg),
):
    for edge in data[split_key]:
        split_edges.append((edge[0], edge[1]))
        node_set.add(edge[0])
        node_set.add(edge[1])

num_nodes = len(node_set)
# The reported edge count covers the positive splits only (train + valid + test).
print('Number of nodes: {}, Number of edges: {}'.format(num_nodes, len(train_pos) + len(valid_pos) + len(test_pos)))
print(len(test_pos), len(test_neg))


Number of nodes: 52488, Number of edges: 21762
3265 15013


In [6]:
# Re-index the nodes: original ids, in sorted order, are mapped to the
# consecutive indices 0, 1, 2, ...
node_list = sorted(node_set)
node_map = {node: idx for idx, node in enumerate(node_list)}

# NOTE: the split lists are rebound to their re-indexed versions, so running
# this cell a second time without re-running the previous one would re-map
# ids that have already been mapped.
train_pos, valid_pos, test_pos, test_neg = (
    [(node_map[u], node_map[v]) for u, v in split]
    for split in (train_pos, valid_pos, test_pos, test_neg)
)


In [7]:
# Select the feature rows of the nodes in the original node_set and store
# them under the new, compact node indices.
node_features = torch.load('../dataset/bluesky' + '/gnn_feature.pt')
print(node_features.size())

# Row i of the new matrix is the feature row of original node id node_list[i].
# A single index gather replaces the per-row Python loop; copying into the
# preallocated buffer keeps the dtype and device chosen by torch.zeros.
new_node_features = torch.zeros(num_nodes, node_features.size(1))
new_node_features.copy_(node_features[torch.tensor(node_list, dtype=torch.long)])
node_features = new_node_features
torch.save(node_features, '../dataset/' + data_name + '/gnn_feature.pt')

print('Node feature shape:', node_features.size())

torch.Size([120090, 1536])
Node feature shape: torch.Size([52488, 1536])


In [8]:
# Turn each split (a list of (src, dst) pairs) into a tensor holding one
# edge per column.
train_pos_tensor = torch.tensor(train_pos).transpose(0, 1)
valid_pos_tensor = torch.tensor(valid_pos).transpose(0, 1)
test_pos_tensor = torch.tensor(test_pos).transpose(0, 1)
test_neg_tensor = torch.tensor(test_neg).transpose(0, 1)

# Graph connectivity is built from the training edges only: each edge is
# appended in reversed direction, then the result is symmetrised and coalesced.
# NOTE(review): to_undirected presumably already adds reverse edges, which
# would make the explicit cat/coalesce redundant - confirm before simplifying.
edge_index = torch.cat([train_pos_tensor, train_pos_tensor[[1, 0]]], dim=1)
edge_index = coalesce(to_undirected(edge_index))

nodenum = num_nodes

# Persist every split as a tab-separated edge list.
for split_name, split_tensor in (
    ('train_pos', train_pos_tensor),
    ('valid_pos', valid_pos_tensor),
    ('test_pos', test_pos_tensor),
    ('test_neg', test_neg_tensor),
):
    save(split_name, data_name, split_tensor)

In [9]:
def generate_negatives(pos_tensor, edge_dict, node_indices, rng=None):
    """Sample one random non-edge for every column of ``pos_tensor``.

    A candidate ``(src, dst)`` is drawn uniformly from ``node_indices`` and
    rejected when it is a self-loop or when either direction is recorded in
    ``edge_dict``. Duplicate negatives are not filtered out.

    Args:
        pos_tensor: Tensor whose second dimension gives the number of
            negatives to draw; its values are not read.
        edge_dict: Mapping from node index to the set of its neighbours.
        node_indices: Sequence of node indices to sample endpoints from.
        rng: Optional ``random.Random``-like object providing ``choice``.
            Defaults to the global ``random`` module, as before.

    Returns:
        Tensor with one sampled negative edge per column (row 0 holds the
        sources, row 1 the destinations). For zero requested samples an empty
        ``(2, 0)`` long tensor is returned so callers can still use ``size(1)``.

    Raises:
        ValueError: If negatives are requested but fewer than two candidate
            nodes are available, so no non-self-loop pair exists.

    Note:
        Rejection sampling does not terminate if every remaining pair of
        distinct nodes is already an edge.
    """
    sampler = random if rng is None else rng
    num_samples = pos_tensor.size(1)
    if num_samples == 0:
        # torch.tensor([]).t() would be 1-D, which breaks size(1) downstream.
        return torch.empty((2, 0), dtype=torch.long)
    if len(node_indices) < 2:
        raise ValueError('need at least two nodes to sample a negative edge')

    neg = []
    for _ in range(num_samples):
        while True:
            src = sampler.choice(node_indices)
            dst = sampler.choice(node_indices)
            # A self-loop is never a meaningful negative link.
            if src == dst:
                continue
            # Reject the pair if it is a known edge in either direction.
            if dst in edge_dict[src] or src in edge_dict[dst]:
                continue
            neg.append([src, dst])
            break
    return torch.tensor(neg).t()
    
# Undirected adjacency over every known positive edge (train + valid + test),
# so a sampled negative never coincides with a true edge from any split.
edge_dict = {i: set() for i in node_map.values()}
for u, v in train_pos + valid_pos + test_pos:
    edge_dict[u].add(v)
    edge_dict[v].add(u)
node_indices = list(node_map.values())

# Seed the sampler right before use so valid_neg.txt is identical across
# kernel restarts instead of changing on every run.
NEG_SAMPLING_SEED = 0
random.seed(NEG_SAMPLING_SEED)
valid_neg_tensor = generate_negatives(valid_pos_tensor, edge_dict, node_indices)
# Test negatives are read from the dataset file (test_negative_edges);
# uncomment the next line to resample them instead.
# test_neg_tensor = generate_negatives(test_pos_tensor, edge_dict, node_indices)

save('valid_neg', data_name, valid_neg_tensor)
