In [98]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sp
import numpy as np
from sklearn.metrics import roc_auc_score
from gensim.models import KeyedVectors

In [99]:
# Step 1: read the pre-trained node embeddings, stored as a plain-text
# word2vec-format file, into a gensim KeyedVectors object.
model = KeyedVectors.load_word2vec_format(fname='POS_4.0_1.0.txt', binary=False)
print(model)

INFO - 2023-11-30 22:11:56,300: loading projection weights from POS_4.0_1.0.txt


INFO - 2023-11-30 22:11:56,519: KeyedVectors lifecycle event {'msg': 'loaded (3890, 128) matrix of type float32 from POS_4.0_1.0.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-11-30T22:11:56.519911', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-88-generic-x86_64-with-glibc2.31', 'event': 'load_word2vec_format'}


KeyedVectors<vector_size=128, 3890 keys>


In [100]:
# Copy the embedding vectors into a dense array whose row k holds the vector
# stored under the string key str(k).
num_rows, num_dims = model.vectors.shape
node_embeddings = np.zeros((num_rows, num_dims))

for row in range(num_rows):
    # Keys in the KeyedVectors object are strings, hence the str() lookup.
    node_embeddings[row] = model[str(row)]

# Sanity check: overall shape, then the first 10 dimensions of the first 2 rows.
print(node_embeddings.shape)
print(node_embeddings[:2, :10])

(3890, 128)
[[-0.39515445 -0.14002295 -0.16805811  0.72960186 -0.04901455  0.13445862
   0.11357475  0.08609206 -0.6915254  -0.17039596]
 [-0.53287774  0.07854514  0.26818833  0.43647176  0.01400182  0.82921994
  -0.31643945 -0.50358909  0.74191505  0.08680608]]


In [101]:
def get_edge_embeddings(edge_list, embeddings=None):
    """Build edge embeddings as the Hadamard product of endpoint embeddings.

    The embedding of an edge (v1, v2) is the element-wise product of the node
    embeddings of v1 and v2 (the edge-feature construction used in the
    node2vec paper).

    Parameters
    ----------
    edge_list : iterable
        Each item is indexed with ``[0]`` and ``[1]`` to obtain the two
        endpoint ids, which are used as row indices into the embedding matrix.
        Any further entries of an item are ignored.
    embeddings : array-like, optional
        Matrix with one row per node. When omitted, the notebook-level
        ``node_embeddings`` array is used, so existing one-argument calls
        behave as before.

    Returns
    -------
    numpy.ndarray
        One row per edge, in the order of ``edge_list``. For an empty
        ``edge_list`` the result has shape ``(0, dim)`` rather than ``(0,)``
        so it can still be concatenated with non-empty edge-embedding arrays.
    """
    if embeddings is None:
        embeddings = node_embeddings
    embeddings = np.asarray(embeddings)

    # Materialise the endpoints once so any iterable (list, array, generator)
    # is accepted and extra per-edge fields are dropped.
    endpoints = [(edge[0], edge[1]) for edge in edge_list]
    if not endpoints:
        return np.empty((0,) + embeddings.shape[1:], dtype=embeddings.dtype)

    endpoints = np.asarray(endpoints)
    # Integer-array indexing gathers all source / target rows at once; the
    # element-wise product is then computed for every edge in one operation.
    return embeddings[endpoints[:, 0]] * embeddings[endpoints[:, 1]]

In [102]:
# NOTE(review): the six edge lists used below (train / val / test, real and
# false) are not defined in any cell visible here -- confirm that an earlier
# cell creates them before this one runs.

def _edge_features_and_labels(real_edges, false_edges):
    """Embed one split's real and false edges.

    Returns a 4-tuple ``(pos_embs, neg_embs, stacked_embs, labels)`` where
    ``stacked_embs`` is the real-edge rows followed by the false-edge rows and
    ``labels`` follows the same order: 1 = real edge, 0 = false edge.
    """
    pos_embs = get_edge_embeddings(real_edges)
    neg_embs = get_edge_embeddings(false_edges)
    stacked_embs = np.concatenate([pos_embs, neg_embs])
    labels = np.concatenate([np.ones(len(real_edges)), np.zeros(len(false_edges))])
    return pos_embs, neg_embs, stacked_embs, labels


# Train-set edge embeddings and labels
(pos_train_edge_embs, neg_train_edge_embs,
 train_edge_embs, train_edge_labels) = _edge_features_and_labels(train_edges, train_edges_false)

# Val-set edge embeddings and labels
(pos_val_edge_embs, neg_val_edge_embs,
 val_edge_embs, val_edge_labels) = _edge_features_and_labels(val_edges, val_edges_false)

# Test-set edge embeddings and labels
(pos_test_edge_embs, neg_test_edge_embs,
 test_edge_embs, test_edge_labels) = _edge_features_and_labels(test_edges, test_edges_false)

In [103]:
# Fit a logistic-regression link classifier on the train-set edge embeddings
# (features) and their real/false labels (targets). random_state is pinned to 0.
from sklearn.linear_model import LogisticRegression
edge_classifier = LogisticRegression(random_state=0)
edge_classifier.fit(X=train_edge_embs, y=train_edge_labels)

In [104]:
# Validation scores: take column 1 of predict_proba, i.e. the predicted
# probability of class "1" (real edge), then compute ROC AUC against the labels.
val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
val_roc = roc_auc_score(y_true=val_edge_labels, y_score=val_preds)

In [105]:
# Test scores: take column 1 of predict_proba, i.e. the predicted probability
# of class "1" (real edge), then compute ROC AUC against the labels.
test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]
test_roc = roc_auc_score(y_true=test_edge_labels, y_score=test_preds)

In [106]:
# Report the ROC AUC on the validation and test splits.
# print() stringifies its arguments itself, so no explicit str() is needed.
print('node2vec Validation ROC score: ', val_roc)
print('node2vec Test ROC score: ', test_roc)

node2vec Validation ROC score:  0.7918141898234023
node2vec Test ROC score:  0.7836331892884454
