In [1]:
import os
import numpy as np

In [2]:
# Name of the dataset to process; its files live in <datasets_folder>/<dataset_name>/
dataset_name = "ENZYMES"

# Root folder of all datasets (created if it does not exist yet)
datasets_folder = "datasets/"
os.makedirs(datasets_folder, exist_ok = True)

# File paths of the edge list and of the per-node graph ID list
edges_file = os.path.join(datasets_folder, dataset_name, f"{dataset_name}.edges")
graph_idx_file = os.path.join(datasets_folder, dataset_name, f"{dataset_name}.graph_idx")

# Load the edge list: one comma-separated pair of node IDs per line.
# ndmin=2 keeps the array two-dimensional even when the file holds a single
# edge; without it loadtxt squeezes the result to 1-D and iterating over it
# as (row, col) pairs fails.
edges = np.loadtxt(edges_file, dtype=int, delimiter=",", ndmin=2)

# Load the graph ID of every node: one integer per line.
# ndmin=1 keeps the array one-dimensional even when the file holds a single
# node, so it can still be sliced and enumerated.
graph_idx = np.loadtxt(graph_idx_file, dtype=int, ndmin=1)

In [3]:
from graph_tool import all as gt
from math import comb

In [4]:
# Build one undirected graph per graph ID and store, for each of them, the
# number of triangles divided by C(n, 3), where n is its number of nodes.
#
# Both inputs are consumed sequentially: the nodes of a graph are expected to
# occupy consecutive rows of graph_idx (the node ID is the 1-based row number)
# and its edges consecutive rows of edges, following the same graph order.

# Row cursors into graph_idx and edges marking where the next graph starts
nodes_file_row = 0
edges_file_row = 0

# Extract graph ids
unique_graph_ids = np.unique(graph_idx)

# One value per graph, in the order of unique_graph_ids
clustering_coefficients = []

for current_graph_id in unique_graph_ids:

    # Extract node IDs of current graph: read rows until another graph ID shows up
    current_graph_node_ids = []
    for idx, graph_id in enumerate(graph_idx[nodes_file_row:], start = nodes_file_row):
        if graph_id != current_graph_id:
            break
        current_graph_node_ids.append(idx + 1)

    # Move the node cursor past the rows just consumed
    nodes_file_row = nodes_file_row + len(current_graph_node_ids)

    # Map node IDs to range starting from 0 (needed in graphs that are not the first).
    # The dict also provides constant-time membership tests while reading edges,
    # instead of scanning the node ID list once per edge endpoint.
    node_ids_map = {node_id : shifted_node_id for shifted_node_id, node_id in enumerate(current_graph_node_ids)}

    # Undirected edges already stored for the current graph, as sorted pairs
    seen_edges = set()
    current_graph_edges = []

    # Extract edges of current graph: stop at the first edge that leaves it
    for row, col in edges[edges_file_row:]:
        if row not in node_ids_map or col not in node_ids_map:
            break

        # Count at which row we are reading the file
        edges_file_row = edges_file_row + 1

        # (a, b) and (b, a) are the same undirected edge: keep the first occurrence only
        edge = tuple(sorted((row, col)))
        if edge not in seen_edges:
            seen_edges.add(edge)
            current_graph_edges.append((row, col))

    mapped_edges = [(node_ids_map[row], node_ids_map[col]) for row, col in current_graph_edges]

    # Current graph initialization
    g = gt.Graph(directed = False)

    # Add one vertex per node, so that isolated nodes are part of the graph too
    for _ in current_graph_node_ids:
        g.add_vertex()

    # Add edge list to graph (nothing to add for a graph without edges)
    if mapped_edges:
        g.add_edge_list(mapped_edges)

    # Compute combination n choose 3 (n is the number of nodes in current graph)
    n_choose_three = comb(g.num_vertices(), 3)

    # With ret_counts = True graph-tool returns the coefficient and its standard
    # deviation followed by the number of triangles and the number of triples.
    # The triangle count is therefore the second-to-last entry; index 1 would
    # select the standard deviation in the flat tuple layout.
    # NOTE(review): confirm the tuple layout against the installed graph-tool version.
    clustering_coefficient = gt.global_clustering(g, ret_counts = True)
    num_triangles = clustering_coefficient[-2]

    if n_choose_three > 0:
        clustering_coefficients.append(num_triangles / n_choose_three)
    else:
        # Fewer than three nodes: no triangle can exist and C(n, 3) is zero
        clustering_coefficients.append(0.0)

In [5]:
# Destination of the results: one value per line, one line per graph
output_file = os.path.join(datasets_folder, dataset_name, f"{dataset_name}.global_cc")

# Dump every stored coefficient, in computation order, each followed by a newline
with open(output_file, "w") as out_handle:
    out_handle.writelines(f"{value}\n" for value in clustering_coefficients)

print(f"Clustering coefficients have been written in {output_file} successfully")

Clustering coefficients have been written in datasets/ENZYMES/ENZYMES.global_cc successfully
