In [125]:
import scipy.io as sio
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import networkx as nx
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score
from gensim.models import Word2Vec, KeyedVectors
import multiprocessing
import pandas as pd
import sys
import datetime
from sklearn.metrics import roc_auc_score
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", level=logging.INFO)

cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(cores)

8


In [126]:
# load a .mat file into a numpy array
def load_mat(filename):
    data = sio.loadmat(filename)
    # return data['data']
    # print(type(data))
    return data

In [127]:
# Step 1: load the embeddings from word_2_vec format using keyedvectors
model = KeyedVectors.load_word2vec_format('../embeddings/blogcatalog_n2v.txt', binary=False)
print(model)

INFO - 2023-11-28 00:06:05,924: loading projection weights from ../embeddings/blogcatalog_n2v.txt
INFO - 2023-11-28 00:06:06,479: KeyedVectors lifecycle event {'msg': 'loaded (10312, 128) matrix of type float32 from ../embeddings/blogcatalog_n2v.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-11-28T00:06:06.479635', 'gensim': '4.3.2', 'python': '3.11.4 (main, Jul  5 2023, 13:45:01) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-88-generic-x86_64-with-glibc2.31', 'event': 'load_word2vec_format'}


KeyedVectors<vector_size=128, 10312 keys>


In [128]:
# populate the node embeddings in a numpy array
node_embeddings = np.zeros((model.vectors.shape[0], model.vectors.shape[1]))

for i in range(model.vectors.shape[0]):
    node_embeddings[i] = model[str(i)]

In [129]:
# Next step will be to get the edge list
# - load a networkx graph from the .mat file
# - get the edge list from the graph

sparse_mat = load_mat('../datasets/blogcatalog.mat')['network']
print(type(sparse_mat))
G = nx.from_scipy_sparse_array(sparse_mat)
print(G.number_of_nodes())
print(G.number_of_edges())

edge_list = list(G.edges())

<class 'scipy.sparse._csc.csc_matrix'>
10312
333983


In [130]:
# Initialize a list to store edge embeddings and information
edge_data = []

# Iterate through the edges and construct edge embeddings
for edge in G.edges():
    node1, node2 = edge
    embedding1 = node_embeddings[node1]
    embedding2 = node_embeddings[node2]
    concatenated_embedding = np.multiply(embedding1, embedding2)  # Concatenate embeddings
    
    edge_dict = {
        'node1': node1,
        'node2': node2,
        'edge_embedding': concatenated_embedding
    }
    
    edge_data.append(edge_dict)

# Convert the list of edge dictionaries into a DataFrame
edge_df = pd.DataFrame(edge_data)

# Print the resulting DataFrame
print(edge_df)

        node1  node2                                     edge_embedding
0           0    175  [-0.010408797394451996, -0.00572047360224065, ...
1           0    232  [0.013740495769570482, 0.005690894386413126, -...
2           0    282  [0.03690906314616882, -0.02001184203271278, -0...
3           0    370  [0.13434024211433382, 0.009192588322365636, 0....
4           0    393  [0.009477176926681374, 0.014309214989318697, 0...
...       ...    ...                                                ...
333978  10301  10309  [0.19505496597020588, 0.037573853164098114, -0...
333979  10302  10309  [0.03149758006600334, -0.0018957392103583215, ...
333980  10304  10310  [0.266293744464452, 0.32625420589451437, -0.00...
333981  10306  10309  [0.21423885739163673, 0.10656689676583131, 0.0...
333982  10307  10309  [0.0054022755788777355, 0.07144771219926094, 0...

[333983 rows x 3 columns]


In [131]:
# Split the edge DataFrame into training and test sets
train_df, test_df = train_test_split(edge_df, test_size=0.2, random_state=42)

In [132]:
# generate negative edges for training and test sets -> completed by Co-pilot
def get_negative_edges(graph, num_edges_to_gen):
    '''
    Function to generate negative edges for a graph.
    '''
    edge_list = list(graph.edges())
    nodes = list(graph.nodes())
    num_nodes = len(nodes)
    
    edges_to_gen = num_edges_to_gen
    num_edges = len(edge_list)
    num_edges_gen = 0
    
    negative_edges = []
    
    while num_edges_gen < edges_to_gen:
        # generate random edge
        edge = np.random.choice(nodes, size=2, replace=False)
        
        # check if edge exists in graph
        if not graph.has_edge(*edge):
            negative_edges.append(tuple(edge))
            num_edges_gen += 1
    
    return negative_edges

In [133]:
# Generate negative edges for training and test sets
num_neg_train = len(train_df)
num_neg_test = len(test_df)

train_negatives = get_negative_edges(G, num_neg_train)
test_negatives = get_negative_edges(G, num_neg_test)

In [134]:
total_train = np.concatenate((train_df[['node1', 'node2']].values, np.array(train_negatives)), axis=0)
total_test = np.concatenate((test_df[['node1', 'node2']].values, np.array(test_negatives)), axis=0)

# Create labels for training and test sets
train_labels = np.concatenate((np.ones(num_neg_train), np.zeros(num_neg_train)))
test_labels = np.concatenate((np.ones(num_neg_test), np.zeros(num_neg_test)))

# Create a logistic regression model to use for edge classification
lr = LogisticRegression()

# Train the model on the training data
lr.fit(total_train, train_labels)

# Evaluate the model on the test data
lr_predictions = lr.predict(total_test)

# compute AUC score for the model
lr_auc = roc_auc_score(test_labels, lr_predictions)
print(lr_auc)

0.641855173136518
