# The method follows the original one here:
https://github.com/xiangyue9607/BioNEV/tree/master/src/bionev
which follows this paper https://arxiv.org/pdf/1906.05017.pdf

In [1]:
import copy
import itertools
import random
import json
import numpy as np
from pathlib import Path
import networkx as nx
import csv
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [11]:
# Maybe Change the testing_ratio can help get a better result, I haven't tried any other ration yet
def split_train_test_graph(input_edgelist, seed, testing_ratio=0.2, weighted=False): 
    if (weighted):
        G = nx.read_weighted_edgelist(input_edgelist)
    else:
        G = nx.read_gml(input_edgelist)
    node_num1, edge_num1 = len(G.nodes), len(G.edges)
    print('Original Graph: nodes:', node_num1, 'edges:', edge_num1)
    testing_edges_num = int(len(G.edges) * testing_ratio)
    random.seed(seed)
    testing_pos_edges = random.sample(G.edges, testing_edges_num)
    G_train = copy.deepcopy(G)
    for edge in testing_pos_edges:
        node_u, node_v = edge
        if (G_train.degree(node_u) > 1 and G_train.degree(node_v) > 1):
            G_train.remove_edge(node_u, node_v)

    G_train.remove_nodes_from(nx.isolates(G_train))
    node_num2, edge_num2 = len(G_train.nodes), len(G_train.edges)
    assert node_num1 == node_num2
    train_graph_filename = 'graph_train.edgelist'
    if weighted:
        nx.write_edgelist(G_train, train_graph_filename, data=['weight'])
    else:
        nx.write_edgelist(G_train, train_graph_filename, data=False)

    node_num1, edge_num1 = len(G_train.nodes), len(G_train.edges)
    print('Training Graph: nodes:', node_num1, 'edges:', edge_num1)
    return G, G_train, testing_pos_edges

def generate_neg_edges(original_graph, testing_edges_num, seed):
    L = list(original_graph.nodes())

    # create a complete graph
    G = nx.Graph()
    G.add_nodes_from(L)
    G.add_edges_from(itertools.combinations(L, 2))
    # remove original edges
    G.remove_edges_from(original_graph.edges())
    random.seed(seed)
    neg_edges = random.sample(G.edges, testing_edges_num)
    return neg_edges

def get_y_pred(y_test, y_pred_prob):
    y_pred = np.zeros(y_pred_prob.shape)
    sort_index = np.flip(np.argsort(y_pred_prob, axis=1), 1)
    for i in range(y_test.shape[0]):
        num = np.sum(y_test[i])
        for j in range(num):
            y_pred[i][sort_index[i][j]] = 1
    return y_pred

In [9]:
# Read Original Graph
facebook_gml_file = Path() / "preprocessed_data/facebook_network.gml"
graph = nx.read_gml(facebook_gml_file)

In [12]:
# Split train and test set, I set the seed to be 200, maybe it also can be changed to improve
G, G_train, testing_pos_edges = split_train_test_graph(facebook_gml_file,200,0.2,False)

Original Graph: nodes: 4039 edges: 88234


since Python 3.9 and will be removed in a subsequent version.
  testing_pos_edges = random.sample(G.edges, testing_edges_num)


Training Graph: nodes: 4039 edges: 70612


In [14]:
def LinkPrediction(embedding_look_up, original_graph, train_graph, test_pos_edges, seed):
    random.seed(seed)

    train_neg_edges = generate_neg_edges(original_graph, len(train_graph.edges()), seed)

    # create a auxiliary graph to ensure that testing negative edges will not used in training
    G_aux = copy.deepcopy(original_graph)
    G_aux.add_edges_from(train_neg_edges)
    test_neg_edges = generate_neg_edges(G_aux, len(test_pos_edges), seed)

    # construct X_train, y_train, X_test, y_test
    X_train = []
    y_train = []
    for edge in train_graph.edges():
        node_u_emb = embedding_look_up.iloc[int(edge[0])]
        node_v_emb = embedding_look_up.iloc[int(edge[1])]
        feature_vector = np.append(node_u_emb, node_v_emb)
        X_train.append(feature_vector)
        y_train.append(1)
    for edge in train_neg_edges:
        node_u_emb = embedding_look_up.iloc[int(edge[0])]
        node_v_emb = embedding_look_up.iloc[int(edge[1])]
        feature_vector = np.append(node_u_emb, node_v_emb)
        X_train.append(feature_vector)
        y_train.append(0)

    X_test = []
    y_test = []
    for edge in test_pos_edges:
        node_u_emb = embedding_look_up.iloc[int(edge[0])]
        node_v_emb = embedding_look_up.iloc[int(edge[1])]
        feature_vector = np.append(node_u_emb, node_v_emb)
        X_test.append(feature_vector)
        y_test.append(1)
    for edge in test_neg_edges:
        node_u_emb = embedding_look_up.iloc[int(edge[0])]
        node_v_emb = embedding_look_up.iloc[int(edge[1])]
        feature_vector = np.append(node_u_emb, node_v_emb)
        X_test.append(feature_vector)
        y_test.append(0)

    # shuffle for training and testing
    c = list(zip(X_train, y_train))
    random.shuffle(c)
    X_train, y_train = zip(*c)

    c = list(zip(X_test, y_test))
    random.shuffle(c)
    X_test, y_test = zip(*c)

    X_train = np.array(X_train)
    y_train = np.array(y_train)

    X_test = np.array(X_test)
    y_test = np.array(y_test)

    clf1 = LogisticRegression(random_state=seed, solver='lbfgs')
    clf1.fit(X_train, y_train)
    y_pred_proba = clf1.predict_proba(X_test)[:, 1]
    y_pred = clf1.predict(X_test)
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    auc_pr = average_precision_score(y_test, y_pred_proba)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print('#' * 9 + ' Link Prediction Performance ' + '#' * 9)
    print(f'AUC-ROC: {auc_roc:.3f}, AUC-PR: {auc_pr:.3f}, Accuracy: {accuracy:.3f}, F1: {f1:.3f}')
    print('#' * 50)
    return auc_roc, auc_pr, accuracy, f1

In [15]:
# Read and Calculate for node2vec
node2vec = pd.read_csv('embedded_data/node2vec.csv')
auc_roc, auc_pr, accuracy, f1 = LinkPrediction(node2vec,G,G_train,testing_pos_edges,200)

since Python 3.9 and will be removed in a subsequent version.
  neg_edges = random.sample(G.edges, testing_edges_num)


######### Link Prediction Performance #########
AUC-ROC: 0.910, AUC-PR: 0.857, Accuracy: 0.851, F1: 0.857
##################################################


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Read Splitter and change it to dataframe
node_embeddings = []
with open('Splitter/output/facebook_embedding_Splitter.csv', 'r') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # skip the header row
    for row in csv_reader:
        embeddings = np.array(row, dtype=np.float32)  
        node_embeddings.append(embeddings)
node_embeddings_df = pd.DataFrame(node_embeddings)
node_embeddings_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,119,120,121,122,123,124,125,126,127,128
0,0.0,-0.140779,-0.094674,0.260234,-0.170755,0.262383,-0.068288,-0.060825,0.150190,-0.092990,...,0.134296,-0.088870,0.138199,-0.300697,-0.100388,0.072272,0.222875,-0.225467,-0.088320,0.027933
1,1.0,-1.312702,-2.252581,4.609657,-1.634007,2.631526,1.051310,-1.228687,-0.371454,2.055938,...,3.139706,-6.602293,10.241209,2.874973,2.876019,7.494203,-6.689292,9.019254,-2.382051,2.410625
2,2.0,-3.974691,1.647938,2.325118,-4.137644,7.872515,-2.123207,-4.169729,-2.892547,-1.368232,...,0.286680,-3.173656,0.675563,4.027215,-1.807431,0.635117,2.779734,-0.353243,-4.559447,-1.585459
3,3.0,-4.350463,4.098444,2.770425,-4.972549,-3.513717,5.433891,-7.877466,4.577151,-1.081996,...,-1.111069,4.649827,-3.707739,2.530500,1.806943,-4.027565,4.453427,4.845517,2.510613,2.243322
4,4.0,0.509271,1.150627,3.138192,9.268862,1.306738,-8.536861,0.147767,-6.803989,5.134263,...,1.630434,-9.946416,1.995768,4.090886,5.937874,-0.726479,-5.560363,0.196782,3.139832,-1.745120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4133,4133.0,4.176734,-8.421040,4.366193,-4.065439,0.388216,-4.477820,3.064137,0.148565,-1.075947,...,-3.911112,-9.297780,-9.379425,0.261620,2.806340,3.394747,5.439377,-3.001965,-0.718352,-2.582008
4134,4134.0,-4.802187,1.922007,-1.099977,-1.798039,-1.223162,-6.003423,-3.817633,4.578604,-6.096305,...,-5.549270,0.898616,-3.682824,-3.347621,0.440157,6.891018,2.686739,-4.552588,4.067808,-0.108759
4135,4135.0,6.289729,-8.096797,4.513383,-3.815911,-0.411856,-5.306495,4.179504,-3.224790,-1.207437,...,-4.232800,-11.906752,-9.188235,2.296703,3.532612,3.187505,4.148688,-1.652968,-0.178185,-2.513448
4136,4136.0,5.147783,-9.147342,5.589265,-2.880635,-0.402792,-4.898163,4.684113,-0.525643,-1.256946,...,-5.136655,-11.011087,-9.922793,0.283366,3.059024,2.251458,4.297604,-1.741698,0.449481,-3.414312


In [18]:
# Add a column "node "to the Splitter Dataframe to indicate the embedding at each row respresent which node
with open('Splitter/output/facebook_personas_splitter.json', 'r') as file:
    connections = json.load(file)
    
L = []
for i in node_embeddings_df.index:
    L.append(connections[str(i)])
node_embeddings_df.drop(columns=[0], inplace=True)
node_embeddings_df['node'] = L

In [21]:
# The number of node embeddings for each node is not a fix number, unlike the node2vec(every node can only have one embedding)
# The non-fixed number of each node embeddings make it hard to train the data cause the size of each feature is not the same
# Thus I think maybe we can aggregate embeddings from same node, and use the mean for each column as the feature we have, 
# so we won't features with non-fixed number or high-dimensional features
node_embeddings_df_new = pd.DataFrame()
for i in range(0, len(graph.nodes())):
    df = node_embeddings_df[node_embeddings_df['node'] == i]
    mean_values = df.mean(axis=0)

    # 创建一个新的DataFrame，将平均值作为一行数据
    mean_df = pd.DataFrame(mean_values).transpose()
    node_embeddings_df_new = node_embeddings_df_new.append(mean_df)
node_embeddings_df_new = node_embeddings_df_new.reset_index()
node_embeddings_df_new.drop(columns=['index','node'], inplace=True)
node_embeddings_df_new

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,119,120,121,122,123,124,125,126,127,128
0,-1.063562,0.197238,3.623455,1.273041,0.579444,-1.308125,-0.929160,0.022117,0.375102,-1.003756,...,-1.613999,-2.196432,1.347018,0.909478,1.350324,1.606158,-0.494290,2.761420,1.812767,2.256130
1,-5.526092,1.115980,8.550709,-8.518214,4.127553,-2.630500,-1.143222,5.955279,-7.193746,-1.328159,...,5.053708,-1.605578,4.122857,-9.616627,-8.326284,4.230447,10.035099,-10.024920,-5.303612,1.437987
2,-2.226171,-2.089637,13.211426,-6.382279,6.480599,-3.551500,-2.101265,6.511686,-5.977349,-1.685926,...,2.645203,-4.858116,5.983568,-8.145611,-4.348152,5.017794,8.228811,-6.137849,-2.969920,2.463503
3,-4.379409,1.831318,10.590300,-8.569770,5.947935,-1.697540,-3.762246,7.331847,-6.600364,-2.416258,...,6.048513,-0.981798,2.793098,-10.548617,-9.793402,4.008341,10.495137,-8.437451,-5.262221,1.434732
4,-3.234488,-3.124572,8.543689,-4.457748,6.479342,-4.832870,-1.811378,4.788585,-5.316233,-0.808784,...,4.730561,-4.893484,3.723854,-7.451002,-3.135221,2.461663,5.949787,-6.022098,-1.368739,-0.245067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,4.176734,-8.421040,4.366193,-4.065439,0.388216,-4.477820,3.064137,0.148565,-1.075947,2.218400,...,-3.911112,-9.297780,-9.379425,0.261620,2.806340,3.394747,5.439377,-3.001965,-0.718352,-2.582008
4035,-4.802187,1.922007,-1.099977,-1.798039,-1.223162,-6.003423,-3.817633,4.578604,-6.096305,-1.761178,...,-5.549270,0.898616,-3.682824,-3.347621,0.440157,6.891018,2.686739,-4.552588,4.067808,-0.108759
4036,6.289729,-8.096797,4.513383,-3.815911,-0.411856,-5.306495,4.179504,-3.224790,-1.207437,3.719649,...,-4.232800,-11.906752,-9.188235,2.296703,3.532612,3.187505,4.148688,-1.652968,-0.178185,-2.513448
4037,5.147783,-9.147342,5.589265,-2.880635,-0.402792,-4.898163,4.684113,-0.525643,-1.256946,3.629960,...,-5.136655,-11.011087,-9.922793,0.283366,3.059024,2.251458,4.297604,-1.741698,0.449481,-3.414312


In [22]:
auc_roc_, auc_pr_, accuracy_, f1_ = LinkPrediction(node_embeddings_df_new,G, G_train, testing_pos_edges,200)

since Python 3.9 and will be removed in a subsequent version.
  neg_edges = random.sample(G.edges, testing_edges_num)


######### Link Prediction Performance #########
AUC-ROC: 0.931, AUC-PR: 0.895, Accuracy: 0.877, F1: 0.882
##################################################


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
