In [1]:
import networkx as nx
import pandas as pd
from tqdm import tqdm
import numpy as np
import networkx as nx
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import matplotlib.pyplot as plt
import scipy
from random import sample
import csv
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import seaborn as sns
import pickle

# Data loading and processing

In [2]:
edges = pd.read_csv('edge.csv.gz', compression='gzip', header=0, sep=',', quotechar='"', error_bad_lines=False, names=['node1','node2'])
G = nx.from_edgelist(zip(edges['node1'],edges['node2']), create_using=nx.Graph())

In [3]:
G_complement = nx.complement(G)

In [4]:
negative_edges = sample(list(G_complement.edges()),len(edges)) 

In [5]:
neg_edge_df = pd.DataFrame()
node_1 = []
node_2 = []

for edge in tqdm(negative_edges):
    node_1.append(int(edge[0]))
    node_2.append(int(edge[1]))
    
neg_edge_df['node1'] = node_1
neg_edge_df['node2'] = node_2

100%|██████████| 1067910/1067910 [00:00<00:00, 1082851.96it/s]


In [6]:
all_data = pd.concat([edges,neg_edge_df])
all_data['Y'] = [1] * len(edges) + [0] * len(neg_edge_df)

In [7]:
n2v_embeddings = np.load('n2v_emb.npy')

In [8]:
n2v_embeddings.shape

(4267, 128)

# Train-test

In [9]:
node1_concat_node2_emb = []

for index, row in tqdm(all_data.iterrows()):
    node1_concat_node2_emb.append(list(n2v_embeddings[row['node1']]) + list(n2v_embeddings[row['node2']]))
    
all_data['edge_embed'] = node1_concat_node2_emb

2135820it [05:08, 6920.93it/s]


In [10]:
train, test = train_test_split(all_data, test_size=0.2)

In [11]:
clf = MLPClassifier(hidden_layer_sizes=(3,256),
                    random_state=5,
                    verbose=True,
                    learning_rate_init=0.01,
                    max_iter=100)

In [12]:
X_train = train['edge_embed'].tolist()
y_train = train['Y'].tolist()
clf.fit(X_train,y_train)

Iteration 1, loss = 0.28071150
Iteration 2, loss = 0.27519270
Iteration 3, loss = 0.27449319
Iteration 4, loss = 0.27417687
Iteration 5, loss = 0.27408407
Iteration 6, loss = 0.27401680
Iteration 7, loss = 0.27399647
Iteration 8, loss = 0.27389697
Iteration 9, loss = 0.27381315
Iteration 10, loss = 0.27383164
Iteration 11, loss = 0.27365181
Iteration 12, loss = 0.27365840
Iteration 13, loss = 0.27357902
Iteration 14, loss = 0.27356754
Iteration 15, loss = 0.27352407
Iteration 16, loss = 0.27357576
Iteration 17, loss = 0.27348729
Iteration 18, loss = 0.27341171
Iteration 19, loss = 0.27345101
Iteration 20, loss = 0.27342852
Iteration 21, loss = 0.27348650
Iteration 22, loss = 0.27351601
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


In [13]:
X_test = test['edge_embed'].tolist()
ypred = clf.predict(X_test)

In [14]:
test['pred'] = ypred

# Degree binning perfromance

In [15]:
G_degree = dict(G.degree())
degree_list = list(G_degree.values())

In [16]:
print(min(degree_list),max(degree_list))

1 2234


In [17]:
bin_1_nodes = []
bin_2_nodes = []

for key in tqdm(G_degree.keys()):
    if G_degree[key] < 100:
        bin_1_nodes.append(key)
    else:
        bin_2_nodes.append(key)

100%|██████████| 4267/4267 [00:00<00:00, 1889872.77it/s]


In [18]:
all_test_bin_1 = test[test['node1'].isin(bin_1_nodes) & test['node2'].isin(bin_1_nodes)]
print(len(all_test_bin_1))

13970


In [19]:
print('AUROC: ', roc_auc_score(all_test_bin_1['Y'].tolist(),all_test_bin_1['pred'].tolist()))
print('AUPRC: ', average_precision_score(all_test_bin_1['Y'].tolist(),all_test_bin_1['pred'].tolist()))

AUROC:  0.5
AUPRC:  0.00987831066571224


In [20]:
all_test_bin_2 = test[test['node1'].isin(bin_2_nodes) & test['node2'].isin(bin_2_nodes)]
print(len(all_test_bin_2))

320419


In [21]:
print('AUROC: ', roc_auc_score(all_test_bin_2['Y'].tolist(),all_test_bin_2['pred'].tolist()))
print('AUPRC: ', average_precision_score(all_test_bin_2['Y'].tolist(),all_test_bin_2['pred'].tolist()))

AUROC:  0.8427959647584465
AUPRC:  0.8620929453150907
