# Extract User Node Embeddings (v2)

In [1]:
import pickle
import torch
from torch_geometric.data import Data

with open('../dataset/postprocess-data/date_range.pickle', 'rb') as f:
    date_range = pickle.load(f)

with open('../dataset/postprocess-data/collab_month_edges.pickle', 'rb') as f:
    collab_month_edges = pickle.load(f)

with open('../dataset/postprocess-data/user_to_user_id.pickle', 'rb') as f:
    user_to_id = pickle.load(f)

with open('../dataset/postprocess-data/bio_embeddings_v2.pkl', 'rb') as f:
    bio_embeddings = pickle.load(f)

In [2]:
import numpy as np

# read npy file
with open('../dataset/v2/user_star_repo_feat_v2.npy', 'rb') as f:
    star_feature = np.load(f)

with open('../dataset/v2/user_contrib_repo_feat_v2.npy', 'rb') as f:
    contribute_feature = np.load(f)

In [3]:
repo_features = np.concatenate((star_feature, contribute_feature), axis=1)
user_embedding = {}
for user in user_to_id:
    user_embedding[user] = np.concatenate((bio_embeddings[user], repo_features[user_to_id[user]]))

In [4]:
user_embedding_tensor = torch.Tensor(list(user_embedding.values()))

  user_embedding_tensor = torch.Tensor(list(user_embedding.values()))


Here's where data is defined, `x` is the user input features  
`edge_index` is a matrix that indicates which two nodes has edge between them with shape `(2, <number of edges>)`

In [5]:
data = {}
for date in date_range:
    data[date] = Data(
        x=user_embedding_tensor, 
        edge_index=torch.tensor(collab_month_edges[date], dtype=torch.int).transpose(0, 1),
        pos_edge_label_index=torch.tensor(collab_month_edges[date], dtype=torch.int).transpose(0, 1), 
        neg_edge_label_index=None,
        num_nodes=user_embedding_tensor.shape[0],
        num_feaetures=user_embedding_tensor.shape[1]
    )

## Graph AutoEncoder

In [6]:
import os.path as osp

import torch

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GAE, VGAE, GCNConv
from torch_geometric.utils import negative_sampling

In [7]:
args = {
    'variational': True,
    'linear': False,
    'epochs': 100,
    'lr': 0.01,
    'out_channel': 16,
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
                      split_labels=True, add_negative_train_samples=False),
])

In [8]:
class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)


class VariationalGCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logstd = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)


class LinearEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)


class VariationalLinearEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_mu = GCNConv(in_channels, out_channels)
        self.conv_logstd = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)

in_channels, out_channels = data['2013-03'].num_features, args['out_channel']

if not args['variational'] and not args['linear']:
    model = GAE(GCNEncoder(in_channels, out_channels))
elif not args['variational'] and args['linear']:
    model = GAE(LinearEncoder(in_channels, out_channels))
elif args['variational'] and not args['linear']:
    model = VGAE(VariationalGCNEncoder(in_channels, out_channels))
elif args['variational'] and args['linear']:
    model = VGAE(VariationalLinearEncoder(in_channels, out_channels))

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])

In [9]:
def train(data):
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.edge_index)
    loss = model.recon_loss(z, data.pos_edge_label_index)
    if args['variational']:
        loss = loss + (1 / data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    neg_edge_index = negative_sampling(data.pos_edge_label_index, num_nodes=z.size(0), num_neg_samples=data.edge_index.size(1))
    return model.test(z, data.pos_edge_label_index, neg_edge_index)

@torch.no_grad()
def inference(data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    return z

In [10]:
# Some arguments
args['epochs'] = 50
args['lr'] = 0.001
args['out_channel'] = 16

In [11]:
output = {}
for date in date_range:
    print(data[date].num_nodes, data[date].num_edges)
    for epoch in range(1, args['epochs'] + 1):
        loss = train(data[date])
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
        # auc, ap = test(data[date])
        # print(f'Epoch: {epoch:03d}, AUC(Area under ROC curve): {auc:.4f}, AP(average precision): {ap:.4f}')
    output[date] = inference(data[date])
    # break

5393 40
Epoch: 001, Loss: 3.6616
Epoch: 002, Loss: 12.5694
Epoch: 003, Loss: 6.6810
Epoch: 004, Loss: 6.6266
Epoch: 005, Loss: 3.7103
Epoch: 006, Loss: 4.0764
Epoch: 007, Loss: 4.6273
Epoch: 008, Loss: 3.9972
Epoch: 009, Loss: 2.2424
Epoch: 010, Loss: 1.7440
Epoch: 011, Loss: 1.5903
Epoch: 012, Loss: 2.2454
Epoch: 013, Loss: 1.7828
Epoch: 014, Loss: 1.9428
Epoch: 015, Loss: 1.9975
Epoch: 016, Loss: 1.8399
Epoch: 017, Loss: 1.9742
Epoch: 018, Loss: 1.2775
Epoch: 019, Loss: 1.7618
Epoch: 020, Loss: 1.5424
Epoch: 021, Loss: 1.4333
Epoch: 022, Loss: 1.4533
Epoch: 023, Loss: 1.0488
Epoch: 024, Loss: 0.9576
Epoch: 025, Loss: 1.1154
Epoch: 026, Loss: 1.3821
Epoch: 027, Loss: 1.1503
Epoch: 028, Loss: 1.1678
Epoch: 029, Loss: 1.1299
Epoch: 030, Loss: 1.1624
Epoch: 031, Loss: 1.0305
Epoch: 032, Loss: 1.0699
Epoch: 033, Loss: 1.1197
Epoch: 034, Loss: 1.0055
Epoch: 035, Loss: 0.9465
Epoch: 036, Loss: 1.0034
Epoch: 037, Loss: 1.0166
Epoch: 038, Loss: 0.9584
Epoch: 039, Loss: 0.8650
Epoch: 040, Loss

In [12]:
with open('../dataset/postprocess-data/user_embedding_v2.pkl', 'wb') as f:
    pickle.dump(output, f)