# Extract Node Embeddings for users (v1)

In [1]:
import pickle
import numpy as np
import torch
from torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


## Prepare Data

### Turn user feature vectors into `Data.x`

In [2]:
# Load a precomputed embedding matrix from disk (the file name suggests SBERT
# CLS embeddings — TODO confirm).
# NOTE(review): repo_embeddings is not referenced again in the visible cells.
repo_embeddings = np.load('../dataset/v1/sbert_embeddings_cls.npy')
# Notebook display of the array dimensions as a quick sanity check.
repo_embeddings.shape

(94409, 384)

In [19]:

# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('../dataset/v1/bio_embeddings.pkl', 'rb') as bio_file:
    bio_lookup = pickle.load(bio_file)

# Collapse the mapping's values, in the mapping's iteration order, into a single
# numpy.ndarray (one row per entry).
# NOTE(review): the node indices built later follow the key order of
# users_following_dict; presumably both pickles share the same key order —
# confirm, otherwise feature rows and node indices are misaligned.
bio_vectors = list(bio_lookup.values())
bio_embeddings = np.array(bio_vectors)
bio_embeddings.shape

(2736, 384)

In [3]:
# Node feature matrix: one row per entry of bio_embeddings, converted to a
# float32 tensor. (edge_index is composed separately, in a later cell.)
x = torch.tensor(bio_embeddings, dtype=torch.float)

### Compose edge relationship

In [4]:
# Load the follow relationships; later cells iterate this object with .items()
# as (user id -> iterable of followed user ids).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open('../dataset/v1/users_followings_dict.pkl', 'rb') as f:
    users_following_dict = pickle.load(f)

In [23]:
# Map every user id (a key of users_following_dict) to a 0-based node index,
# following the dict's iteration order.
id2idx = {user_id: idx for idx, user_id in enumerate(users_following_dict)}

# Emit each "follows" relation in both directions so the graph is symmetric.
# NOTE(review): a followed id that is not itself a key raises KeyError, and a
# mutual follow yields duplicated edge columns — confirm both are acceptable.
edge_index = []
for user_id, followings in users_following_dict.items():
    src = id2idx[user_id]
    for following in followings:
        dst = id2idx[following]
        edge_index.extend(([src, dst], [dst, src]))

# reshape(-1, 2) is a no-op for a non-empty list of pairs and keeps an empty
# edge list well-formed, so the result always has shape (2, num_edges).
edge_index = (
    torch.tensor(edge_index, dtype=torch.long)
    .reshape(-1, 2)
    .t()
    .contiguous()
)

#### Remove isolated nodes

In [7]:
# from torch_geometric.utils import remove_isolated_nodes
# edge_index, _ , mask = remove_isolated_nodes(edge_index)
# print(f"isolated node num: {len([i for i in mask if i == False])}")

### Compose Data and check if the data is acceptable

In [8]:
# Assemble the graph object consumed by the rest of the notebook.
# - pos_edge_label_index reuses every edge as a positive example.
# - neg_edge_label_index is left as None; test() samples negatives itself.
# - No feature-count keyword is passed: data.num_features and
#   data.num_node_features are derived from x, so an explicit (and previously
#   misspelled) keyword only stored an unused extra attribute.
data = Data(
    x=x,
    edge_index=edge_index,
    pos_edge_label_index=edge_index,
    neg_edge_label_index=None,
    num_nodes=x.shape[0],
)

In [9]:
# Sanity-check the assembled graph; with raise_on_error=True a failed check is
# expected to raise rather than return False (see the PyG Data.validate docs).
print(f"the data after running validation: {data.validate(raise_on_error=True)}")

the data after running validation: True


In [10]:
# Print basic statistics of the graph: size, feature width, and structural
# properties (isolated nodes, self-loops, undirectedness).
print(f"number of nodes: {data.num_nodes}")
print(f"number of edges: {data.num_edges}")
print(f"number of node features: {data.num_node_features}")
print(f"is there any isloated nodes: {data.has_isolated_nodes()}")
print(f"does the graph has self-loops: {data.has_self_loops()}")
print(f"does the graph is undirected: {data.is_undirected()}")

number of nodes: 2736
number of edges: 21486
number of node features: 384
is there any isloated nodes: True
does the graph has self-loops: False
does the graph is undirected: True


### Run GAE

In [11]:
import os.path as osp

import torch

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GAE, VGAE, GCNConv
from torch_geometric.utils import negative_sampling

In [12]:
# Run configuration.
# NOTE(review): 'dataset' is not read by any visible cell.
args = dict(
    variational=True,
    linear=False,
    dataset='Cora',
    epochs=400,
)

# Prefer the GPU when one is available, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Preprocessing pipeline: feature normalisation, device placement, and a link
# split with 5% validation / 10% test edges.
# NOTE(review): no visible cell applies `transform` to `data`.
pipeline_steps = [
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(
        num_val=0.05,
        num_test=0.1,
        is_undirected=True,
        split_labels=True,
        add_negative_train_samples=False,
    ),
]
transform = T.Compose(pipeline_steps)

#### Define Model

In [13]:
class GCNEncoder(torch.nn.Module):
    """Two-layer GCN encoder: in_channels -> 2 * out_channels -> out_channels."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 followed by ReLU, then return the output of conv2."""
        hidden = torch.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


class VariationalGCNEncoder(torch.nn.Module):
    """GCN encoder with a shared first layer and two output heads.

    The heads are ``conv_mu`` and ``conv_logstd``; ``forward`` returns their
    outputs as a 2-tuple, in that order.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv_mu = GCNConv(hidden_channels, out_channels)
        self.conv_logstd = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return ``(conv_mu(h), conv_logstd(h))`` where ``h = relu(conv1(x))``."""
        hidden = torch.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(hidden, edge_index)
        logstd = self.conv_logstd(hidden, edge_index)
        return mu, logstd


class LinearEncoder(torch.nn.Module):
    """Single-layer encoder: one GCNConv with no non-linearity."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the output of the single convolution."""
        encoded = self.conv(x, edge_index)
        return encoded


class VariationalLinearEncoder(torch.nn.Module):
    """Single-layer variational encoder with separate mu / logstd convolutions."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_mu = GCNConv(in_channels, out_channels)
        self.conv_logstd = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Return ``(conv_mu(x), conv_logstd(x))`` as a 2-tuple."""
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd


# Input width comes from the graph's node features; 16 is the embedding size.
in_channels, out_channels = data.num_features, 16

# (variational, linear) -> (autoencoder wrapper, encoder class).
# A lookup table covers all four combinations, so `model` is always bound.
model_choices = {
    (False, False): (GAE, GCNEncoder),
    (False, True): (GAE, LinearEncoder),
    (True, False): (VGAE, VariationalGCNEncoder),
    (True, True): (VGAE, VariationalLinearEncoder),
}
wrapper_cls, encoder_cls = model_choices[
    (bool(args['variational']), bool(args['linear']))
]
model = wrapper_cls(encoder_cls(in_channels, out_channels))

model = model.to(device)
# Keep the graph on the same device as the model: train()/test()/inference()
# feed data.x and data.edge_index straight into the model, and nothing else in
# the visible cells moves `data` off the CPU. This is a no-op on a CPU-only run.
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [14]:
def train():
    """Run one optimisation step on the global ``data`` graph.

    Uses the module-level ``model``, ``optimizer``, ``data`` and ``args``.
    The loss is the reconstruction loss on ``data.pos_edge_label_index``; when
    ``args['variational']`` is truthy, the KL term scaled by ``1 / num_nodes``
    is added.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    latent = model.encode(data.x, data.edge_index)
    total_loss = model.recon_loss(latent, data.pos_edge_label_index)
    if args['variational']:
        kl_weight = 1 / data.num_nodes
        total_loss = total_loss + kl_weight * model.kl_loss()

    total_loss.backward()
    optimizer.step()
    return float(total_loss)


@torch.no_grad()
def test(data):
    """Evaluate link prediction on ``data`` with freshly sampled negatives.

    Encodes the graph with the module-level ``model`` in eval mode, samples
    negative edges against ``data.pos_edge_label_index``, and returns whatever
    ``model.test`` returns (the training loop unpacks it as ``auc, ap``).
    """
    model.eval()
    latent = model.encode(data.x, data.edge_index)
    positives = data.pos_edge_label_index
    sampled_negatives = negative_sampling(positives, latent.size(0))
    return model.test(latent, positives, sampled_negatives)

@torch.no_grad()
def inference(data):
    """Return the encoder output for ``data`` with the model in eval mode.

    Gradients are disabled; the module-level ``model`` is used.
    """
    model.eval()
    return model.encode(data.x, data.edge_index)
    

In [15]:
# Train for the configured number of epochs, reporting AUC / AP after each one.
# NOTE(review): the loss returned by train() is kept but not printed, and the
# metrics are computed on the same `data` object that is used for training.
total_epochs = args['epochs']
for epoch in range(1, 1 + total_epochs):
    loss = train()
    auc, ap = test(data)
    report = f'Epoch: {epoch:03d}, AUC(Area under ROC curve): {auc:.4f}, AP(average precision): {ap:.4f}'
    print(report)

Epoch: 001, AUC(Area under ROC curve): 0.7349, AP(average precision): 0.7088
Epoch: 002, AUC(Area under ROC curve): 0.7462, AP(average precision): 0.7249
Epoch: 003, AUC(Area under ROC curve): 0.7553, AP(average precision): 0.7452
Epoch: 004, AUC(Area under ROC curve): 0.7801, AP(average precision): 0.7817
Epoch: 005, AUC(Area under ROC curve): 0.8034, AP(average precision): 0.8128
Epoch: 006, AUC(Area under ROC curve): 0.8312, AP(average precision): 0.8420
Epoch: 007, AUC(Area under ROC curve): 0.8586, AP(average precision): 0.8626
Epoch: 008, AUC(Area under ROC curve): 0.8802, AP(average precision): 0.8724
Epoch: 009, AUC(Area under ROC curve): 0.8786, AP(average precision): 0.8629
Epoch: 010, AUC(Area under ROC curve): 0.8782, AP(average precision): 0.8605
Epoch: 011, AUC(Area under ROC curve): 0.8770, AP(average precision): 0.8577
Epoch: 012, AUC(Area under ROC curve): 0.8715, AP(average precision): 0.8514
Epoch: 013, AUC(Area under ROC curve): 0.8590, AP(average precision): 0.8361

In [16]:
# Encode the full graph once with the trained model to obtain the node embeddings.
embeddings = inference(data)

In [17]:
# Notebook display of the embedding dimensions (one row per node, one column
# per embedding channel).
embeddings.shape

torch.Size([2736, 16])