In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

In [2]:
# Load the three precomputed feature matrices from .npy files in the working directory.
# Later cells concatenate them along axis=1 and compare their cluster labels element-wise,
# so all three must have the same number of rows (presumably one row per node, in the same
# order — TODO confirm against the script that produced these files).
ppa_node2vec = np.load('ogbl_ppa_node2vec.npy')

In [3]:
ppa_original_metadata = np.load('ogbl_ppa_original_metadata.npy')

In [4]:
ppa_protvec_metadata = np.load('protvec_embeddings.npy')

## With k=5

In [64]:
# Number of clusters used by every KMeans fit below until k_clusters is reassigned.
k_clusters = 5

In [65]:
# Cluster the node2vec embeddings; random_state=0 pins the centroid initialisation.
ppa_node2vec_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_node2vec)

In [66]:
# Cluster index assigned to each row of ppa_node2vec.
ppa_node2vec_kmeans.labels_

array([3, 3, 2, ..., 0, 2, 3])

In [67]:
# Davies-Bouldin index of this partition (lower means more compact, better separated clusters).
davies_bouldin_score(ppa_node2vec,ppa_node2vec_kmeans.labels_)

3.649718137613847

In [68]:
# Same KMeans configuration, applied to the original metadata features.
ppa_original_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_original_metadata)

In [69]:
# Cluster index assigned to each row of ppa_original_metadata.
ppa_original_metadata_kmeans.labels_

array([0, 0, 0, ..., 3, 0, 0])

In [70]:
# Davies-Bouldin index of the original-metadata partition (lower is better).
davies_bouldin_score(ppa_original_metadata,ppa_original_metadata_kmeans.labels_)

0.9690113280193648

In [71]:
# Same KMeans configuration, applied to the ProtVec embeddings.
ppa_protvec_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_protvec_metadata)

In [72]:
# Cluster index assigned to each row of ppa_protvec_metadata.
ppa_protvec_metadata_kmeans.labels_

array([2, 2, 2, ..., 2, 0, 2])

In [73]:
# Davies-Bouldin index of the ProtVec partition (lower is better).
davies_bouldin_score(ppa_protvec_metadata,ppa_protvec_metadata_kmeans.labels_)

0.8529700419440669

In [74]:
# Pairwise agreement between the three partitions, measured with adjusted mutual information
# (chance-corrected: about 0 for unrelated labelings, 1 for identical ones).
# node2vec vs original metadata:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_original_metadata_kmeans.labels_)

0.14049682188952967

In [75]:
# node2vec vs ProtVec:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_protvec_metadata_kmeans.labels_)

0.01710647542915478

In [76]:
# original metadata vs ProtVec:
adjusted_mutual_info_score(ppa_original_metadata_kmeans.labels_, ppa_protvec_metadata_kmeans.labels_)

0.0038632195775763433

### Randomized metadata (control baselines)

In [77]:
def crazyshuffle(arr, seed=None):
    """Return a copy of ``arr`` in which every row is permuted independently.

    Each row receives its own random column order, so a row keeps exactly the
    same values while any alignment between columns across rows is destroyed.
    The input array itself is not modified.

    Parameters
    ----------
    arr : array-like, 2-D
        Matrix whose rows are shuffled.
    seed : int, optional
        When given, a private ``np.random.RandomState(seed)`` drives the
        shuffle, so the result is repeatable. When omitted, the global
        ``np.random`` state is used, which is the behaviour of the original
        one-argument call.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``arr``.

    Raises
    ------
    ValueError
        If ``arr`` is not two-dimensional.
    """
    arr = np.asanyarray(arr)
    if arr.ndim != 2:
        raise ValueError(
            "crazyshuffle expects a 2-D array, got {} dimension(s)".format(arr.ndim)
        )

    n_rows, n_cols = arr.shape
    if seed is None:
        permute = np.random.permutation
    else:
        permute = np.random.RandomState(seed).permutation

    # One permutation of the column indices per row, stored in a preallocated
    # integer array so the index dtype is well defined even when n_rows == 0.
    col_idx = np.empty((n_rows, n_cols), dtype=np.intp)
    for row in range(n_rows):
        col_idx[row] = permute(n_cols)

    # A column vector of row numbers broadcasts against col_idx; this avoids
    # materialising the two full (n_rows, n_cols) grids that np.indices builds.
    row_idx = np.arange(n_rows)[:, None]
    return arr[row_idx, col_idx]

In [78]:
ppa_protvec_metadata = crazyshuffle(ppa_protvec_metadata)

In [79]:
ppa_protvec_metadata_shuffled_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_protvec_metadata)

In [80]:
davies_bouldin_score(ppa_protvec_metadata,ppa_protvec_metadata_shuffled_kmeans.labels_)

0.8977396292943505

In [81]:
# Control: agreement between the node2vec clusters and the clusters found on the
# row-shuffled ProtVec features (adjusted mutual information, about 0 means unrelated).
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_protvec_metadata_shuffled_kmeans.labels_)

3.572398341756564e-06

In [60]:
ppa_random_metadata = np.random.rand(ppa_protvec_metadata.shape[0],ppa_protvec_metadata.shape[1])

In [61]:
ppa_random_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_random_metadata)

In [62]:
davies_bouldin_score(ppa_random_metadata,ppa_random_metadata_kmeans.labels_)

9.51475701561646

In [17]:
# Control: adjusted mutual information between each real partition and the partition
# of the random features; values near 0 are the chance-level reference.
# node2vec vs random:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_random_metadata_kmeans.labels_)

-4.6991952654937574e-06

In [18]:
# original metadata vs random:
adjusted_mutual_info_score(ppa_original_metadata_kmeans.labels_, ppa_random_metadata_kmeans.labels_)

8.310153538095236e-07

In [19]:
# ProtVec vs random:
adjusted_mutual_info_score(ppa_protvec_metadata_kmeans.labels_, ppa_random_metadata_kmeans.labels_)

-3.528295029461585e-06

### Concatenating node2vec and metadata

In [20]:
# Feature-wise concatenation: each row becomes [node2vec | original metadata].
# NOTE(review): the two blocks are joined without rescaling; if their numeric ranges
# differ, the larger one will dominate KMeans' Euclidean distances — confirm scales.
ppa_node2vec_original_metadata = np.concatenate((ppa_node2vec,ppa_original_metadata,), axis=1)

In [21]:
# Cluster the combined features with the same KMeans configuration.
ppa_node2vec_original_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_node2vec_original_metadata)

In [22]:
# Davies-Bouldin index of the combined-feature partition (lower is better).
davies_bouldin_score(ppa_node2vec_original_metadata,ppa_node2vec_original_metadata_kmeans.labels_)

3.6141679822873014

In [23]:
# Feature-wise concatenation: each row becomes [node2vec | ProtVec].
# NOTE(review): this reads whatever `ppa_protvec_metadata` is bound to when the cell runs;
# verify it is the unshuffled embeddings at this point (any earlier cell that rebinds the
# name to shuffled data would leak into this result).
# NOTE(review): no rescaling is applied before clustering — confirm the two blocks have
# comparable numeric ranges.
ppa_node2vec_protvec_metadata = np.concatenate((ppa_node2vec,ppa_protvec_metadata,), axis=1)

In [24]:
# Cluster the combined features with the same KMeans configuration.
ppa_node2vec_protvec_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_node2vec_protvec_metadata)

In [25]:
# Davies-Bouldin index of the combined-feature partition (lower is better).
davies_bouldin_score(ppa_node2vec_protvec_metadata,ppa_node2vec_protvec_metadata_kmeans.labels_)

0.858171037219836

In [26]:
# How much of the node2vec-only partition survives once metadata columns are appended
# (adjusted mutual information; 1 would mean the partition is unchanged).
# node2vec vs node2vec + original metadata:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_node2vec_original_metadata_kmeans.labels_)

0.6676345677995844

In [27]:
# node2vec vs node2vec + ProtVec:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_node2vec_protvec_metadata_kmeans.labels_)

0.017107493718901082

## With k=10

In [5]:
# Repeat the analysis with ten clusters; every KMeans fit below picks up this value.
k_clusters = 10

In [6]:
# Rebinds ppa_node2vec_kmeans, replacing the model fitted in the k=5 section.
ppa_node2vec_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_node2vec)

In [7]:
# Cluster index assigned to each row of ppa_node2vec.
ppa_node2vec_kmeans.labels_

array([7, 7, 3, ..., 4, 2, 7])

In [8]:
# Davies-Bouldin index of this partition (lower is better).
davies_bouldin_score(ppa_node2vec,ppa_node2vec_kmeans.labels_)

3.717250642809043

In [9]:
# Rebinds ppa_original_metadata_kmeans with the k=10 fit on the original metadata.
ppa_original_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_original_metadata)

In [10]:
# Cluster index assigned to each row of ppa_original_metadata.
ppa_original_metadata_kmeans.labels_

array([0, 0, 0, ..., 5, 0, 0])

In [11]:
# Davies-Bouldin index of the original-metadata partition (lower is better).
davies_bouldin_score(ppa_original_metadata,ppa_original_metadata_kmeans.labels_)

0.9650278894658235

In [12]:
# Rebinds ppa_protvec_metadata_kmeans with the k=10 fit on the ProtVec features.
# NOTE(review): this fits on whatever `ppa_protvec_metadata` currently holds; verify the
# name still refers to the unshuffled embeddings when the notebook is run top to bottom.
ppa_protvec_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_protvec_metadata)

In [13]:
# Cluster index assigned to each row of ppa_protvec_metadata.
ppa_protvec_metadata_kmeans.labels_

array([0, 6, 6, ..., 6, 6, 0])

In [14]:
# Davies-Bouldin index of the ProtVec partition (lower is better).
davies_bouldin_score(ppa_protvec_metadata,ppa_protvec_metadata_kmeans.labels_)

0.9926465049178139

In [15]:
# Pairwise agreement between the three k=10 partitions (adjusted mutual information:
# about 0 for unrelated labelings, 1 for identical ones).
# node2vec vs original metadata:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_original_metadata_kmeans.labels_)

0.2576290117378349

In [16]:
# node2vec vs ProtVec:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_protvec_metadata_kmeans.labels_)

0.018312739384359334

In [17]:
# original metadata vs ProtVec:
adjusted_mutual_info_score(ppa_original_metadata_kmeans.labels_, ppa_protvec_metadata_kmeans.labels_)

0.005506942929683204

### Randomized metadata (control baselines)

In [18]:
def crazyshuffle(arr, seed=None):
    """Return a copy of ``arr`` in which every row is permuted independently.

    Each row receives its own random column order, so a row keeps exactly the
    same values while any alignment between columns across rows is destroyed.
    The input array itself is not modified.

    Parameters
    ----------
    arr : array-like, 2-D
        Matrix whose rows are shuffled.
    seed : int, optional
        When given, a private ``np.random.RandomState(seed)`` drives the
        shuffle, so the result is repeatable. When omitted, the global
        ``np.random`` state is used, which is the behaviour of the original
        one-argument call.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``arr``.

    Raises
    ------
    ValueError
        If ``arr`` is not two-dimensional.

    Notes
    -----
    NOTE(review): a function with this name is also defined in the k=5
    section of this notebook; this cell redefines it. One definition near the
    top of the notebook would be enough.
    """
    arr = np.asanyarray(arr)
    if arr.ndim != 2:
        raise ValueError(
            "crazyshuffle expects a 2-D array, got {} dimension(s)".format(arr.ndim)
        )

    n_rows, n_cols = arr.shape
    if seed is None:
        permute = np.random.permutation
    else:
        permute = np.random.RandomState(seed).permutation

    # One permutation of the column indices per row, stored in a preallocated
    # integer array so the index dtype is well defined even when n_rows == 0.
    col_idx = np.empty((n_rows, n_cols), dtype=np.intp)
    for row in range(n_rows):
        col_idx[row] = permute(n_cols)

    # A column vector of row numbers broadcasts against col_idx; this avoids
    # materialising the two full (n_rows, n_cols) grids that np.indices builds.
    row_idx = np.arange(n_rows)[:, None]
    return arr[row_idx, col_idx]

In [19]:
ppa_protvec_metadata = crazyshuffle(ppa_protvec_metadata)

In [20]:
ppa_protvec_metadata_shuffled_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_protvec_metadata)

In [21]:
davies_bouldin_score(ppa_protvec_metadata,ppa_protvec_metadata_shuffled_kmeans.labels_)

2.481861704679909

In [22]:
# Control: agreement between the node2vec clusters and the clusters found on the
# row-shuffled ProtVec features (adjusted mutual information, about 0 means unrelated).
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_protvec_metadata_shuffled_kmeans.labels_)

0.006904369059461411

In [23]:
ppa_random_metadata = np.random.rand(ppa_protvec_metadata.shape[0],ppa_protvec_metadata.shape[1])

In [24]:
ppa_random_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_random_metadata)

In [25]:
davies_bouldin_score(ppa_random_metadata,ppa_random_metadata_kmeans.labels_)

8.250999830547885

In [26]:
# Control: adjusted mutual information between each real k=10 partition and the
# partition of the random features; values near 0 are the chance-level reference.
# node2vec vs random:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_random_metadata_kmeans.labels_)

2.4273210815149187e-06

In [27]:
# original metadata vs random:
adjusted_mutual_info_score(ppa_original_metadata_kmeans.labels_, ppa_random_metadata_kmeans.labels_)

1.2843163078976508e-05

In [28]:
# ProtVec vs random:
adjusted_mutual_info_score(ppa_protvec_metadata_kmeans.labels_, ppa_random_metadata_kmeans.labels_)

3.8014054598343136e-06

### Concatenating node2vec and metadata

In [29]:
# Feature-wise concatenation: each row becomes [node2vec | original metadata].
# NOTE(review): the two blocks are joined without rescaling; if their numeric ranges
# differ, the larger one will dominate KMeans' Euclidean distances — confirm scales.
ppa_node2vec_original_metadata = np.concatenate((ppa_node2vec,ppa_original_metadata,), axis=1)

In [30]:
# Cluster the combined features with the k=10 KMeans configuration.
ppa_node2vec_original_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_node2vec_original_metadata)

In [31]:
# Davies-Bouldin index of the combined-feature partition (lower is better).
davies_bouldin_score(ppa_node2vec_original_metadata,ppa_node2vec_original_metadata_kmeans.labels_)

3.259108029356588

In [32]:
# Feature-wise concatenation: each row becomes [node2vec | ProtVec].
# NOTE(review): this reads whatever `ppa_protvec_metadata` is bound to when the cell runs;
# verify it is the unshuffled embeddings at this point (any earlier cell that rebinds the
# name to shuffled data would leak into this result).
# NOTE(review): no rescaling is applied before clustering — confirm the two blocks have
# comparable numeric ranges.
ppa_node2vec_protvec_metadata = np.concatenate((ppa_node2vec,ppa_protvec_metadata,), axis=1)

In [33]:
# Cluster the combined features with the k=10 KMeans configuration.
ppa_node2vec_protvec_metadata_kmeans = KMeans(n_clusters=k_clusters, random_state=0).fit(ppa_node2vec_protvec_metadata)

In [34]:
# Davies-Bouldin index of the combined-feature partition (lower is better).
davies_bouldin_score(ppa_node2vec_protvec_metadata,ppa_node2vec_protvec_metadata_kmeans.labels_)

1.9583005424339603

In [35]:
# How much of the node2vec-only k=10 partition survives once metadata columns are
# appended (adjusted mutual information; 1 would mean the partition is unchanged).
# node2vec vs node2vec + original metadata:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_node2vec_original_metadata_kmeans.labels_)

0.5878673030599538

In [36]:
# node2vec vs node2vec + ProtVec:
adjusted_mutual_info_score(ppa_node2vec_kmeans.labels_, ppa_node2vec_protvec_metadata_kmeans.labels_)

0.006783895637977446