In [None]:
# Install a pinned SciPy release (1.8.1) into the Colab runtime, then import it.
# NOTE(review): the reason for this exact pin is not visible in the notebook — confirm it is still needed.
!pip install scipy==1.8.1
import scipy

# clear_output() wipes the installer log from the cell; later setup cells reuse this import.
from IPython.display import clear_output
clear_output()

In [None]:
# Install graph-tool from the skewed.de apt repository (Ubuntu "jammy" packages).
# Note: the first line APPENDS to /etc/apt/sources.list, so re-running this cell in the
# same runtime adds the repository entry again.
!echo "deb http://downloads.skewed.de/apt jammy main" >> /etc/apt/sources.list
# Fetch the repository signing key, refresh the package index, then install.
!apt-key adv --keyserver keyserver.ubuntu.com --recv-key 612DEFB798507F25
!apt-get update
!apt-get install python3-graph-tool python3-matplotlib python3-cairo

# Relies on `clear_output` imported by the first setup cell.
clear_output()

In [None]:
# Swap the distro-packaged python3-cairo for a pip-built pycairo:
# remove the apt package, install the headers needed to compile, then force a reinstall.
!apt purge python3-cairo
!apt install libcairo2-dev pkg-config python3-dev
!pip install --force-reinstall pycairo
# Additional packages installed unpinned.
# NOTE(review): they are not imported in the visible cells — presumably needed by graph-tool or the project modules; confirm.
!pip install zstandard
!pip install rustworkx

# Relies on `clear_output` imported by the first setup cell.
clear_output()

In [None]:
# Third-party Python packages, installed without version pins (whatever pip resolves at run time).
# NOTE(review): none of these is imported directly in the visible cells — presumably they are
# dependencies of the project modules loaded from Drive; confirm and consider pinning versions.
!pip install infomap
!pip install igraph
!pip install pygsp
!pip install graphlearning
!pip install ucimlrepo

# Relies on `clear_output` imported by the first setup cell.
clear_output()

In [None]:
# Mount Google Drive at /content/drive (Colab only); the project files are read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Put the project folder on the import path so the .py modules stored on Drive
# (metrics, datasets, clustering, graph_builder, ...) can be imported by later cells.
import sys
sys.path.append('/content/drive/MyDrive/GSMB')

In [None]:
# Sanity check: list the project folder (path is relative to the current working directory).
!ls drive/MyDrive/GSMB

amazon.txt			 D_Amazon	   helper_plots.py     Results
B_Amazon			 data		   knn_data	       S_Amazon
clustering.py			 Datasets	   metric_backbone.py  spectral_clustering
community_experiments_plots.py	 datasets.py	   metrics.py	       T_Amazon
community_experiments_tables.py  graph_builder.py  __pycache__	       TSC.py


In [None]:
def get_Amazon_meta(path="drive/MyDrive/GSMB/amazon.txt"):
    """Load the Amazon ground-truth labels as a ``{node index: label}`` dict.

    Expected file layout: the first line holds the number of records ``n``; each of
    the next ``n`` lines holds ``<index> <label>`` separated by whitespace. Both
    values may be written as floats (e.g. ``3.0``) and are truncated to ``int``.
    Any lines after the first ``n`` records are ignored.

    Parameters
    ----------
    path : str, optional
        Location of the label file. Defaults to the project copy on the mounted Drive
        (relative to the current working directory).

    Returns
    -------
    dict[int, int]
        Mapping from node index to label.

    Raises
    ------
    ValueError
        If the header or a record cannot be parsed, or the file ends before ``n``
        records were read.
    """
    idxToLabel = {}
    # `with` guarantees the handle is closed even when a malformed line raises.
    with open(path, 'r') as f:
        n = int(f.readline())
        print("n", n)

        for record_number in range(1, n + 1):
            # split() with no argument tolerates tabs, repeated spaces and the trailing newline.
            fields = f.readline().split()
            if len(fields) < 2:
                raise ValueError(
                    "%s: record %d of %d is missing or incomplete" % (path, record_number, n)
                )
            idxToLabel[int(float(fields[0]))] = int(float(fields[1]))

    return idxToLabel


In [None]:
import graph_tool.all as gt
import numpy as np
import matplotlib
import time

def get_Bayesian_partition(D, weight='proximity'):
    """Fit a weighted stochastic block model to ``D`` with graph-tool and return its blocks.

    The edges of ``D`` are copied into an undirected graph-tool graph, the edge
    attribute ``weight`` is attached as a "real-exponential" edge covariate, and
    ``gt.minimize_blockmodel_dl`` is run on it.

    Parameters
    ----------
    D : networkx graph
        Graph whose edges carry the attribute named by ``weight``.
        Node labels are used directly as graph-tool vertex indices and as keys of the
        returned partition, so they are assumed to be the integers ``0..n-1``
        — TODO confirm against the dataset loaders.
    weight : str, optional
        Name of the edge attribute used as the edge covariate.

    Returns
    -------
    dict
        ``{'Bayesian': (partition, num_clusters)}`` where ``partition`` maps vertex
        index to block id and ``num_clusters`` is the number of distinct blocks found.
    """
    start = time.time()

    # Walk the edges once so that endpoints and weights are guaranteed to stay aligned.
    weighted_edges = [(u, v, attrs[weight]) for u, v, attrs in D.edges(data=True)]
    edges = np.array([(u, v) for u, v, _ in weighted_edges])
    Dweights = np.array([w for _, _, w in weighted_edges])

    g = gt.Graph(directed=False)
    g.add_edge_list(edges)
    ew = g.new_edge_property("double")
    ew.a = Dweights
    g.ep['weight'] = ew

    state = gt.minimize_blockmodel_dl(g, state_args=dict(recs=[g.ep.weight],
                                                            rec_types=["real-exponential"]))

    blocks = state.get_blocks().get_array()
    partition = {i: blocks[i] for i in range(len(blocks))}
    clusterTypes = set(partition.values())

    # graph-tool only creates vertices up to the largest index that appears in an edge,
    # so isolated vertices with a higher index received no block; give each a random one.
    # np.random is used (rather than the stdlib `random`, which this notebook never
    # imports explicitly) so the draw is governed by the notebook's np.random.seed call.
    choices = sorted(clusterTypes)
    for i in range(len(blocks), D.number_of_nodes()):
        partition[i] = np.random.choice(choices)

    end = time.time()
    print("Bayesian Fit executed in %.3f s" % (end-start))
    print("Num partitions =", len(clusterTypes))

    partitions = {}
    partitions['Bayesian'] = (partition, len(clusterTypes))
    return partitions

In [None]:
from metrics import *
from datasets import *
from clustering import *
from graph_builder import *
import networkx as nx

# Absolute path of the project folder on the mounted Drive.
# NOTE(review): not referenced by any of the visible cells, which use relative
# "drive/MyDrive/GSMB/..." paths instead — it may be consumed by the star-imported modules; confirm.
initialPath = '/content/drive/MyDrive/GSMB/'

def get_partitions_similarity_ARI(p1, p2):
    """Return the Adjusted Rand Index between two partitions.

    Both partitions are mappings from node to cluster label. Labels are compared
    over the keys of ``p1``, so every key of ``p1`` must also be present in ``p2``.
    When either partition is ``None`` the sentinel ``-1`` is returned instead.

    `sklearn` is not imported in this cell; it is presumably brought into scope by
    the star imports above — verify.
    """
    for candidate in (p1, p2):
        if candidate is None:
            return -1

    nodes = list(p1.keys())
    labels_first = [p1[node] for node in nodes]
    labels_second = [p2[node] for node in nodes]
    return sklearn.metrics.adjusted_rand_score(labels_first, labels_second)

def get_similarities(f, type, partitions, partitions_D, partition_Meta=None):
    """Write ARI scores of ``partitions`` against two references to ``f``.

    For every algorithm key in ``partitions`` the partition is compared with the
    partition stored under the same key in ``partitions_D`` and with
    ``partition_Meta``. Values of both dicts are ``(partition, cluster_count)``
    tuples; only the partition is used here.

    Output format, one line per algorithm: all ``"<type> Meta <algo> <score>"``
    lines first (skipped entirely when ``partition_Meta`` is ``None``), followed by
    all ``"<type> Original <algo> <score>"`` lines.
    """
    ari_vs_original = {}
    ari_vs_meta = {}
    for algo, (candidate, _n_clusters) in partitions.items():
        reference, _n_reference = partitions_D[algo]
        ari_vs_original[algo] = get_partitions_similarity_ARI(reference, candidate)
        ari_vs_meta[algo] = get_partitions_similarity_ARI(partition_Meta, candidate)

    # Collect (tag, algo, score) rows in output order, then emit them.
    rows = []
    if partition_Meta is not None:
        rows.extend((" Meta ", algo, score) for algo, score in ari_vs_meta.items())
    rows.extend((" Original ", algo, score) for algo, score in ari_vs_original.items())

    for tag, algo, score in rows:
        f.write(type + tag + algo + " " + str(score) + "\n")


def _rename_weight_to_proximity(G):
    """Rename the ``weight`` attribute of every edge of ``G`` to ``proximity``, in place."""
    for _, _, data in G.edges(data=True):
        data["proximity"] = data.pop("weight")


def _load_Amazon_graphs():
    """Read the pre-built Amazon edge lists from Drive and return them as ``[D, B, T, S]``."""
    graphs = []
    for prefix in ("D", "B", "T", "S"):
        G = nx.read_weighted_edgelist("drive/MyDrive/GSMB/" + prefix + "_Amazon", nodetype=int)
        # The rest of the pipeline reads the edge attribute under the name "proximity".
        _rename_weight_to_proximity(G)
        graphs.append(G)
    return graphs


def compute_similarities_real_datasets_Bayesian(titles=("US_Airport500", "Open_Flights"),
                                                output_path="./Similarities.txt"):
    """Fit Bayesian block models on each dataset's graphs and write ARI scores to a file.

    For every selected dataset, four graphs are obtained (D, B, T, S — labelled
    "Original", "Backbone", "Threshold" and "Spielman" in the output), each is
    partitioned with ``get_Bayesian_partition``, and ``get_similarities`` writes the
    ARI of each partition against the partition of D and, when the dataset has
    ground-truth labels, against those labels.

    Parameters
    ----------
    titles : iterable of str or str, optional
        Datasets to process, in order. Valid names: "High_School", "Primary_School",
        "DBLP", "Amazon", "US_Airport500", "Open_Flights". The default reproduces the
        selection that was previously hard-coded in this function.
    output_path : str, optional
        File that receives the results; it is overwritten.

    Raises
    ------
    ValueError
        If ``titles`` contains a name that is not a known dataset.
    """
    # title -> (dataset loader, whether the dataset provides ground-truth labels)
    available = {
        "High_School": (get_high_school_dataset, True),
        "Primary_School": (get_primary_school_dataset, True),
        "DBLP": (get_DBLP_dataset, True),
        "Amazon": (get_Amazon_dataset, True),
        "US_Airport500": (get_USairport500_dataset, False),
        "Open_Flights": (get_OpenFlights_dataset, False),
    }

    if isinstance(titles, str):
        titles = (titles,)
    titles = list(titles)
    # Validate before opening the output file so a typo does not truncate old results.
    unknown = [title for title in titles if title not in available]
    if unknown:
        raise ValueError("Unknown dataset title(s): " + ", ".join(map(str, unknown)))

    # `with` closes the file even if a (long-running) fit raises part-way through.
    with open(output_path, 'w', encoding="utf-8") as f:
        for title in titles:
            get_dataset, has_meta = available[title]
            f.write("\n" + title + '\n')

            partition_Meta = None
            if title == "Amazon":
                # Amazon graphs are loaded from pre-built edge lists instead of get_graphs.
                D, B, T, S = _load_Amazon_graphs()
                partition_Meta = get_Amazon_meta()
            elif has_meta:
                D, _, partition_Meta, B, _, T, _, S, _ = get_graphs(get_dataset, has_meta, True)
            else:
                D, _, B, _, T, _, S, _ = get_graphs(get_dataset, has_meta, True)

            # Fit in the order D, B, T, S; the partition of D is the reference for all four.
            partitions_D = get_Bayesian_partition(D)
            fitted = [
                ("Original", partitions_D),
                ("Backbone", get_Bayesian_partition(B)),
                ("Threshold", get_Bayesian_partition(T)),
                ("Spielman", get_Bayesian_partition(S)),
            ]
            for label, partitions in fitted:
                get_similarities(f, label, partitions, partitions_D, partition_Meta)

# Seed every generator the experiment draws from: NumPy's global RNG and graph-tool's
# own RNG (used by minimize_blockmodel_dl), which np.random.seed does not affect.
np.random.seed(11)
gt.seed_rng(11)
compute_similarities_real_datasets_Bayesian()


Built the distance graph in 2.559 s
Built the metric backbone graph in 0.401 s
Spielman sparisfier ran in 0.473 s
Built the Spielman graph in 0.583 s
Built the threshold graph in 0.034 s
Bayesian Fit executed in 4.741 s
Num partitions = 16
Bayesian Fit executed in 0.940 s
Num partitions = 10
Bayesian Fit executed in 1.928 s
Num partitions = 4
Bayesian Fit executed in 0.811 s
Num partitions = 6
HI
Built the distance graph in 7.762 s
Built the metric backbone graph in 3.272 s
Spielman sparisfier ran in 31.034 s
Built the Spielman graph in 31.380 s
Built the threshold graph in 0.074 s
Bayesian Fit executed in 11.553 s
Num partitions = 32
Bayesian Fit executed in 2.902 s
Num partitions = 24
Bayesian Fit executed in 3.924 s
Num partitions = 21
Bayesian Fit executed in 3.152 s
Num partitions = 19
