# Extract User Node Embeddings (v2)

In [1]:
import pickle
import torch
from torch_geometric.data import Data

with open('../dataset/postprocess-data/date_range.pickle', 'rb') as f:
    date_range = pickle.load(f)

with open('../dataset/postprocess-data/collab_month_edges.pickle', 'rb') as f:
    collab_month_edges = pickle.load(f)

with open('../dataset/postprocess-data/user_to_user_id.pickle', 'rb') as f:
    user_to_id = pickle.load(f)

with open('../dataset/postprocess-data/bio_embeddings_v2.pkl', 'rb') as f:
    bio_embeddings = pickle.load(f)

In [2]:
import numpy as np

# read npy file
with open('../dataset/v2/user_star_repo_feat_v2.npy', 'rb') as f:
    star_feature = np.load(f)

with open('../dataset/v2/user_contrib_repo_feat_v2.npy', 'rb') as f:
    contribute_feature = np.load(f)

In [3]:
repo_features = np.concatenate((star_feature, contribute_feature), axis=1)
user_embedding = {}
for user in user_to_id:
    user_embedding[user] = np.concatenate((bio_embeddings[user], repo_features[user_to_id[user]]))

In [4]:
user_embedding_tensor = torch.Tensor(list(user_embedding.values()))

  user_embedding_tensor = torch.Tensor(list(user_embedding.values()))


Here's where data is defined, `x` is the user input features  
`edge_index` is a matrix that indicates which two nodes has edge between them with shape `(2, <number of edges>)`

In [5]:
collab_month_edges

{'2013-03': [[390, 2055],
  [2055, 390],
  [390, 249],
  [249, 390],
  [2055, 249],
  [249, 2055],
  [1691, 1832],
  [1832, 1691],
  [495, 1749],
  [1749, 495],
  [1213, 2243],
  [2243, 1213],
  [868, 5201],
  [5201, 868],
  [868, 5201],
  [5201, 868],
  [495, 1749],
  [1749, 495],
  [495, 1749],
  [1749, 495],
  [495, 1749],
  [1749, 495],
  [495, 1749],
  [1749, 495],
  [495, 1749],
  [1749, 495],
  [3639, 1711],
  [1711, 3639],
  [1691, 1832],
  [1832, 1691],
  [1691, 1832],
  [1832, 1691],
  [4607, 1252],
  [1252, 4607],
  [1691, 1832],
  [1832, 1691],
  [1691, 1832],
  [1832, 1691],
  [3049, 4510],
  [4510, 3049]],
 '2013-04': [[4985, 2055],
  [2055, 4985],
  [4985, 1329],
  [1329, 4985],
  [4985, 390],
  [390, 4985],
  [4985, 249],
  [249, 4985],
  [2055, 1329],
  [1329, 2055],
  [2055, 390],
  [390, 2055],
  [2055, 249],
  [249, 2055],
  [1329, 390],
  [390, 1329],
  [1329, 249],
  [249, 1329],
  [390, 249],
  [249, 390],
  [1691, 1832],
  [1832, 1691],
  [495, 1749],
  [1749, 4

In [6]:
# Aggregate edge label by month
edge_label = {}
# Consider the last 3 years
for i in range(84, len(date_range)):
    print(i)
    adjacency = torch.zeros(user_embedding_tensor.shape[0], user_embedding_tensor.shape[0])
    for edge in collab_month_edges[date_range[i]]:
        adjacency[edge[0], edge[1]] = 1
    
    if i % 3 == 2:
        edge_label[date_range[i-2]] = []
        for j in range(len(adjacency)):
            for k in range(j, len(adjacency)):
                if adjacency[j][k] == 1:
                    edge_label[date_range[i-2]].append([j, k])
                    edge_label[date_range[i-2]].append([k, j])


84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122


In [7]:
with open('../dataset/postprocess-data/3months_3year_edge.pkl', 'wb') as f:
    pickle.dump(edge_label, f)

In [8]:
edge_label['2020-03']

[[0, 51],
 [51, 0],
 [0, 77],
 [77, 0],
 [0, 89],
 [89, 0],
 [0, 102],
 [102, 0],
 [0, 117],
 [117, 0],
 [0, 749],
 [749, 0],
 [0, 827],
 [827, 0],
 [0, 1172],
 [1172, 0],
 [0, 1298],
 [1298, 0],
 [0, 1308],
 [1308, 0],
 [0, 1762],
 [1762, 0],
 [0, 1808],
 [1808, 0],
 [0, 2320],
 [2320, 0],
 [0, 2780],
 [2780, 0],
 [5, 91],
 [91, 5],
 [5, 100],
 [100, 5],
 [5, 112],
 [112, 5],
 [5, 289],
 [289, 5],
 [5, 1734],
 [1734, 5],
 [6, 644],
 [644, 6],
 [7, 91],
 [91, 7],
 [7, 913],
 [913, 7],
 [7, 963],
 [963, 7],
 [7, 1137],
 [1137, 7],
 [7, 1256],
 [1256, 7],
 [7, 1371],
 [1371, 7],
 [7, 1622],
 [1622, 7],
 [7, 1765],
 [1765, 7],
 [7, 1906],
 [1906, 7],
 [7, 1944],
 [1944, 7],
 [7, 3508],
 [3508, 7],
 [28, 39],
 [39, 28],
 [28, 48],
 [48, 28],
 [28, 54],
 [54, 28],
 [28, 65],
 [65, 28],
 [28, 72],
 [72, 28],
 [28, 101],
 [101, 28],
 [28, 123],
 [123, 28],
 [28, 787],
 [787, 28],
 [28, 1452],
 [1452, 28],
 [28, 1466],
 [1466, 28],
 [28, 1959],
 [1959, 28],
 [31, 1344],
 [1344, 31],
 [31, 1903

In [9]:
date_range = date_range[84:-2:3]

In [10]:
date_range

['2020-03',
 '2020-06',
 '2020-09',
 '2020-12',
 '2021-03',
 '2021-06',
 '2021-09',
 '2021-12',
 '2022-03',
 '2022-06',
 '2022-09',
 '2022-12',
 '2023-03']

In [11]:
data = {}
for date in date_range:
    data[date] = Data(
        x=user_embedding_tensor, 
        edge_index=torch.tensor(collab_month_edges[date], dtype=torch.int).transpose(0, 1),
        pos_edge_label_index=torch.tensor(collab_month_edges[date], dtype=torch.int).transpose(0, 1), 
        neg_edge_label_index=None,
        num_nodes=user_embedding_tensor.shape[0],
        num_feaetures=user_embedding_tensor.shape[1]
    )

## Graph AutoEncoder

In [12]:
import os.path as osp

import torch

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GAE, VGAE, GCNConv
from torch_geometric.utils import negative_sampling

In [13]:
args = {
    'variational': True,
    'linear': False,
    'epochs': 100,
    'lr': 0.01,
    'out_channel': 16,
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
                      split_labels=True, add_negative_train_samples=False),
])

In [14]:
class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)


class VariationalGCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv_mu = GCNConv(2 * out_channels, out_channels)
        self.conv_logstd = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)


class LinearEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv(x, edge_index)


class VariationalLinearEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_mu = GCNConv(in_channels, out_channels)
        self.conv_logstd = GCNConv(in_channels, out_channels)

    def forward(self, x, edge_index):
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)

in_channels, out_channels = data['2023-03'].num_features, args['out_channel']

if not args['variational'] and not args['linear']:
    model = GAE(GCNEncoder(in_channels, out_channels))
elif not args['variational'] and args['linear']:
    model = GAE(LinearEncoder(in_channels, out_channels))
elif args['variational'] and not args['linear']:
    model = VGAE(VariationalGCNEncoder(in_channels, out_channels))
elif args['variational'] and args['linear']:
    model = VGAE(VariationalLinearEncoder(in_channels, out_channels))

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])

In [15]:
def train(data):
    model.train()
    optimizer.zero_grad()
    z = model.encode(data.x, data.edge_index)
    loss = model.recon_loss(z, data.pos_edge_label_index)
    if args['variational']:
        loss = loss + (1 / data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    neg_edge_index = negative_sampling(data.pos_edge_label_index, num_nodes=z.size(0), num_neg_samples=data.edge_index.size(1))
    return model.test(z, data.pos_edge_label_index, neg_edge_index)

@torch.no_grad()
def inference(data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    return z

In [16]:
# Some arguments
args['epochs'] = 50
args['lr'] = 0.001
args['out_channel'] = 16

In [17]:
output = {}
for epoch in range(1, args['epochs'] + 1):
    for date in date_range:
        loss = train(data[date])
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')
        # auc, ap = test(data[date])
        # print(f'Epoch: {epoch:03d}, AUC(Area under ROC curve): {auc:.4f}, AP(average precision): {ap:.4f}')
    
for date in date_range:
    output[date] = inference(data[date])
    # print(output[date].shape)
    # break

Epoch: 001, Loss: 4.1111
Epoch: 001, Loss: 4.6997
Epoch: 001, Loss: 5.4801
Epoch: 001, Loss: 3.0265
Epoch: 001, Loss: 2.0540
Epoch: 001, Loss: 2.1160
Epoch: 001, Loss: 2.5505
Epoch: 001, Loss: 2.3878
Epoch: 001, Loss: 1.9879
Epoch: 001, Loss: 1.7531
Epoch: 001, Loss: 1.6651
Epoch: 001, Loss: 1.6434
Epoch: 001, Loss: 1.6398
Epoch: 002, Loss: 1.6907
Epoch: 002, Loss: 1.6272
Epoch: 002, Loss: 1.5395
Epoch: 002, Loss: 1.4876
Epoch: 002, Loss: 1.4684
Epoch: 002, Loss: 1.4455
Epoch: 002, Loss: 1.4146
Epoch: 002, Loss: 1.4052
Epoch: 002, Loss: 1.3950
Epoch: 002, Loss: 1.3665
Epoch: 002, Loss: 1.3577
Epoch: 002, Loss: 1.3419
Epoch: 002, Loss: 1.3133
Epoch: 003, Loss: 1.2999
Epoch: 003, Loss: 1.2967
Epoch: 003, Loss: 1.2806
Epoch: 003, Loss: 1.2681
Epoch: 003, Loss: 1.2484
Epoch: 003, Loss: 1.2170
Epoch: 003, Loss: 1.1974
Epoch: 003, Loss: 1.1826
Epoch: 003, Loss: 1.1581
Epoch: 003, Loss: 1.1543
Epoch: 003, Loss: 1.1454
Epoch: 003, Loss: 1.1444
Epoch: 003, Loss: 1.1310
Epoch: 004, Loss: 1.1072


In [18]:
with open('../dataset/v2/user_embedding_v3.pkl', 'wb') as f:
    pickle.dump(output, f)