# Extract Node Embeddings for users (v1)

In [18]:
import pickle
import numpy as np
import torch
from torch_geometric.data import Data

## Prepare Data

### Turn user feature vectors to Data.x

In [19]:

with open('../dataset/v1/bio_embeddings.pkl', 'rb') as f:
    bio_embeddings = pickle.load(f)

# turn bio_embeddings values to single numpy.ndarray
bio_embeddings = np.array(list(bio_embeddings.values()))
bio_embeddings.shape

(2736, 384)

In [20]:
# compose x and edge_index
x = torch.tensor(bio_embeddings, dtype=torch.float)

### Compose edge relationship

In [21]:
with open('../dataset/v1/users_followings_dict.pkl', 'rb') as f:
    users_following_dict = pickle.load(f)

In [22]:
users_following_dict

{'MDQ6VXNlcjI0MTEzOA==': [],
 'MDQ6VXNlcjEyNzk2MDk=': [],
 'MDQ6VXNlcjM5NzE3MDk5': [],
 'MDQ6VXNlcjU1NTA4NTA=': ['MDQ6VXNlcjQ2MzIzMA==',
  'MDQ6VXNlcjEzNjU4ODE=',
  'MDQ6VXNlcjE3NjAxMw=='],
 'MDQ6VXNlcjQ5OTU1MA==': ['MDQ6VXNlcjkxMjYzOTg=',
  'MDQ6VXNlcjE5MzQz',
  'MDQ6VXNlcjg3ODQ3MTI=',
  'MDQ6VXNlcjU4ODQ3Mw==',
  'MDQ6VXNlcjIwNjg0OA==',
  'MDQ6VXNlcjUwODM4',
  'MDQ6VXNlcjU3NTA=',
  'MDQ6VXNlcjE0NDQyNw==',
  'MDQ6VXNlcjYxNzQ4MQ==',
  'MDQ6VXNlcjI3OTg1',
  'MDQ6VXNlcjExMTk1MQ==',
  'MDQ6VXNlcjgxOTQy',
  'MDQ6VXNlcjM5MTkx',
  'MDQ6VXNlcjEzNzAw',
  'MDQ6VXNlcjE3MDI3MA==',
  'MDQ6VXNlcjExMDk1Mw==',
  'MDQ6VXNlcjUwMTMw'],
 'MDQ6VXNlcjIyNTQ3MzE=': ['MDQ6VXNlcjE1MDM4NTUz',
  'MDQ6VXNlcjU5NjAzNzY4',
  'MDQ6VXNlcjQ5MDMwODA0',
  'MDQ6VXNlcjgwNzkyNDQy',
  'MDQ6VXNlcjExNDk4NDU=',
  'MDQ6VXNlcjUwNjMzNTk5',
  'MDQ6VXNlcjM3NzI1MTk3',
  'MDQ6VXNlcjIwNDI0MTk3',
  'MDQ6VXNlcjM5MzQ1MjQ3',
  'MDQ6VXNlcjY2NDMxMjI=',
  'MDQ6VXNlcjgxMDQzOA=='],
 'MDQ6VXNlcjUwMDM5MDM=': [],
 'MDQ6VXNlcjgxMDQzOA==': ['MDQ6VXNl

In [23]:
# number nodes start from 0
id2idx = {id: idx for idx, id in enumerate(users_following_dict.keys())}
edge_index = []
for id, followings in users_following_dict.items():
    for following in followings:
        edge_index.append([id2idx[id], id2idx[following]])
        edge_index.append([id2idx[following], id2idx[id]])

edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

#### Remove isolated nodes

In [24]:
# from torch_geometric.utils import remove_isolated_nodes
# edge_index, _ , mask = remove_isolated_nodes(edge_index)
# print(f"isolated node num: {len([i for i in mask if i == False])}")

### Compose Data and check if the data is acceptable

In [25]:
data = Data(x=x, edge_index=edge_index)

In [26]:
print(f"the data after running validation: {data.validate(raise_on_error=True)}")

the data after running validation: True


In [27]:
print(f"number of nodes: {data.num_nodes}")
print(f"number of edges: {data.num_edges}")
print(f"number of node features: {data.num_node_features}")
print(f"is there any isloated nodes: {data.has_isolated_nodes()}")
print(f"does the graph has self-loops: {data.has_self_loops()}")
print(f"does the graph is undirected: {data.is_undirected()}")

number of nodes: 2736
number of edges: 21486
number of node features: 384
is there any isloated nodes: True
does the graph has self-loops: False
does the graph is undirected: True
