In [1]:
# Dimensionality constants for this notebook.

# Size of the Word2Vec-encoded feature vector of each node.
FEATURE_DIM = 8

# Size of the embedding of each node of the graph.
EMBEDDING_DIM = 2

# The following code reads musae_git_edges.csv and creates the corresponding adjacency matrix

import csv
import numpy as np
import torch

# Read the edge list into a list of (u, v) integer tuples, one per edge.
# newline='' is what the csv module documentation asks for when a file object
# is handed to csv.reader, so line breaks inside quoted fields parse correctly.
with open('musae_git_edges.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    # csv.reader yields an empty list for a blank line (e.g. a trailing
    # newline at the end of the file); skip those instead of failing on row[0].
    data = [(int(row[0]), int(row[1])) for row in reader if row]
    
# # for some applications it is convenient to have edge_index as a matrix
# # each column is 2x1 denoting the two nodes forming the edge

# edge_index = torch.tensor(data).T
# edge_inv = edge_index[[1,0]]

# edge_index = torch.cat((edge_index, edge_inv),dim=1)



# Build a dense, symmetric adjacency matrix from the edge list.
# Node ids are used directly as indices, so the matrix has (largest id + 1) rows.
n_nodes = 1 + max(node for edge in data for node in edge)
adjacency_matrix = np.zeros((n_nodes, n_nodes))

# Mark every edge in both directions (the graph is treated as undirected).
edge_array = np.array(data)
sources, targets = edge_array[:, 0], edge_array[:, 1]
adjacency_matrix[sources, targets] = 1
adjacency_matrix[targets, sources] = 1

# Clear the diagonal to drop self-loops (optional).
np.fill_diagonal(adjacency_matrix, 0)



In [2]:
# The following code reads musae_git_features.json and creates a list of the original features

import json
import numpy as np

# Parse the feature file into a dict; its keys are looked up below as the
# string form of each node index.
with open('musae_git_features.json', 'r') as f:
    features_dict = json.load(f)

len_data = len(features_dict)

# Flatten the dict into a list so that features_ori[i] holds the original
# features stored under key str(i).
features_ori = [features_dict[str(node_id)] for node_id in range(len_data)]


In [3]:
# Read the true label of every node from musae_git_target.csv.
# newline='' follows the csv module's guidance for file objects passed to
# csv.reader, so line breaks inside quoted fields are parsed correctly.
with open('musae_git_target.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    # The label is taken from the third column. Blank lines (which csv.reader
    # yields as empty lists) are skipped instead of failing on row[2].
    # NOTE(review): true_label[i] corresponds to node i only if the rows are
    # ordered by node id -- confirm against the file.
    true_label = [int(row[2]) for row in reader if row]

In [4]:
# The following code uses Word2Vec to convert the original features (each feature being a set of integers)
# into encoded features, each being a vector of dimension FEATURE_DIM

from gensim.models import Word2Vec
import numpy as np

# Train a Word2Vec model, treating each node's original feature list as one sentence.
# NOTE(review): gensim documents that a run is only fully reproducible with
# workers=1 (plus a fixed PYTHONHASHSEED); with workers=4 the learned vectors
# may differ slightly between runs -- confirm whether that matters here.
model = Word2Vec(features_ori, vector_size=FEATURE_DIM, window=5, min_count=1, workers=4)

# Create the len_data x FEATURE_DIM matrix: row i is the encoded feature of
# node i, computed as the mean of the Word2Vec vectors of its original features.
features = np.zeros((len_data, FEATURE_DIM))
for i in range(len_data):
    tokens = features_ori[i]
    # A node without any original feature has nothing to average (indexing
    # model.wv with an empty list raises), so its row stays all zeros.
    if len(tokens) > 0:
        features[i] = np.mean(model.wv[tokens], axis=0)
