In [1]:
# Install matplotlib (usually comes with Anaconda/Jupyter, but good to ensure)
!pip install matplotlib --quiet

# Install numpy (usually comes with Anaconda/Jupyter, but good to ensure)
!pip install numpy --quiet

# Install igraph
!pip install igraph --quiet

In [2]:
import multiprocessing
import os
import queue
import random
import time

import igraph as ig
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np

In [3]:
def load_graph_from_gml_file(graph_file: str, weight_attribute_name: str = "weight"):
    """Read a GML file into an igraph graph and make sure it has an edge-weight attribute.

    Args:
        graph_file: Path of the GML file to read.
        weight_attribute_name: Name of the edge attribute expected to hold the
            weights. When the loaded graph has no such attribute, a warning is
            printed and every edge is given the value 1 under that name.

    Returns:
        The loaded graph, or None when the file does not exist or when
        reading/processing it raises (a message is printed in both cases).
    """
    file_is_missing = not os.path.exists(graph_file)
    if file_is_missing:
        not_found_messages = (
            f"Error: Graph file '{graph_file}' not found.",
            f"Please ensure '{graph_file}' is a valid path to your 'lesmis.gml' file.",
            "You can typically find this file by searching for 'lesmis.gml network dataset'.",
        )
        for message in not_found_messages:
            print(message)
        return None

    try:
        # Read_GML also brings in whatever edge attributes the file defines.
        loaded_graph = ig.Graph.Read_GML(graph_file)

        known_edge_attributes = loaded_graph.edge_attributes()
        if weight_attribute_name not in known_edge_attributes:
            print(f"Warning: Graph '{graph_file}' does not have a '{weight_attribute_name}' attribute. "
                  "Community detection will proceed without explicit weights, or if the algorithm "
                  "expects them, it might use default uniform weights.")
            # No such attribute in the file: give every edge the same weight.
            loaded_graph.es[weight_attribute_name] = 1
    except Exception as e:
        print(f"An error occurred while loading or processing the graph: {e}")
        return None
    else:
        return loaded_graph

def community_detection(graph: ig.Graph, community_detection_method: str = "multilevel", weight_attribute_name: str = "weight"):
    """Partition `graph` into communities with the requested algorithm.

    Args:
        graph: Graph to cluster.
        community_detection_method: Either "multilevel" or "fastgreedy".
        weight_attribute_name: Edge attribute to pass as weights. When the
            graph has no edge attribute with that name (this includes passing
            None), the algorithm is called with ``weights=None``.

    Returns:
        The result of ``graph.community_multilevel(...)`` for "multilevel", or
        of ``graph.community_fastgreedy(...).as_clustering()`` for "fastgreedy".

    Raises:
        ValueError: If `community_detection_method` is not one of the two
            supported names. Previously this case silently returned None.
    """
    supported_methods = ("multilevel", "fastgreedy")
    if community_detection_method not in supported_methods:
        raise ValueError(
            f"Invalid community_detection_method '{community_detection_method}'. "
            "Choose 'multilevel' or 'fastgreedy'."
        )

    # Only hand the attribute name to igraph when the graph actually has it.
    weights = weight_attribute_name if weight_attribute_name in graph.edge_attributes() else None

    if community_detection_method == "multilevel":
        return graph.community_multilevel(weights=weights)
    # "fastgreedy": community_fastgreedy's result is converted with as_clustering().
    return graph.community_fastgreedy(weights=weights).as_clustering()


# Functions useful to test community structure

In [4]:

def get_modularity_on_clustering(graph: ig.Graph, community_detection_method: str = "multilevel"):
    """Cluster `graph` with the given method and return the partition's modularity.

    The clustering is requested with ``weight_attribute_name=None``, i.e. no
    edge-weight attribute is named for the community detection call.
    """
    clustering = community_detection(
        graph,
        community_detection_method,
        weight_attribute_name=None,
    )
    modularity_score = clustering.modularity
    return modularity_score

def rewire(graph: ig.Graph, num_randomizations: int = 500, swaps_per_edge: int = 10,
           community_detection_method: str = "multilevel"):
    """Build a null distribution of modularity scores from rewired copies of `graph`.

    Each randomized network is an independent copy of `graph` that is shuffled
    with ``Graph.rewire`` and then clustered; its modularity is recorded.

    Args:
        graph: Original graph. It is never modified; every trial works on a copy.
        num_randomizations: Number of randomized networks to generate.
        swaps_per_edge: Rewiring trials per edge; the total passed to
            ``Graph.rewire`` is ``graph.ecount() * swaps_per_edge``.
        community_detection_method: Method forwarded to
            `get_modularity_on_clustering`, so the null distribution is computed
            with the same algorithm as the score it is compared against.

    Returns:
        A list with one modularity value per randomized network.
    """
    # Several trials per edge are used so that most edges get shuffled.
    # NOTE(review): the factor 10 is a heuristic carried over from the original
    # code, not derived here; raise `swaps_per_edge` if the null distribution
    # looks under-mixed.
    num_swaps_for_randomization = graph.ecount() * swaps_per_edge

    modularity_random_networks = []
    for _ in range(num_randomizations):
        # Graph.rewire() modifies the graph in-place, so we must work on a copy.
        graph_random = graph.copy()
        graph_random.rewire(n=num_swaps_for_randomization)
        modularity_random_networks.append(
            get_modularity_on_clustering(graph_random, community_detection_method)
        )

    return modularity_random_networks

def plot_histogram(modularity_original: float, modularity_random_networks: list[float], graph_name: str="Karate Club Network"):
    """Plot the modularity of the randomized networks against the original's.

    Args:
        modularity_original: Modularity of the original network, drawn as a
            dashed vertical line.
        modularity_random_networks: Modularity values of the randomized
            networks, drawn as a histogram.
        graph_name: Name of the network, used in the figure title.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(modularity_random_networks, bins=30, alpha=0.7, color='lightgreen',
             edgecolor='black', label='Modularity of Randomized Networks')

    # Plot a vertical line for the original network's modularity
    plt.axvline(modularity_original, color='red', linestyle='dashed', linewidth=2,
                label=f'Original Network Modularity ({modularity_original:.4f})')

    plt.title(f'Modularity of Original vs. Randomized {graph_name} (igraph)')
    plt.xlabel('Modularity Score')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(axis='y', alpha=0.75)
    plt.tight_layout()
    plt.show()


def test_community_structure(graph: ig.Graph, graph_name: str = "Karate Club Network", community_detection_method: str = "multilevel"):
    """Compare the graph's modularity with that of rewired versions and plot both.

    Args:
        graph: Graph whose community structure is being tested.
        graph_name: Name of the network, used in the figure title.
        community_detection_method: Algorithm used for the original graph AND
            for every randomized network, so the two are comparable.
    """
    modularity_orig = get_modularity_on_clustering(graph, community_detection_method)
    modularity_random_networks = rewire(graph, community_detection_method=community_detection_method)
    plot_histogram(modularity_orig, modularity_random_networks, graph_name)

In [None]:
# Helpers to run a target function with a timeout using multiprocessing
def _timeout_worker(result_queue, func, args, kwargs):
    """Child-process entry point: run `func` and put (result, exception) on the queue."""
    try:
        result_queue.put((func(*args, **kwargs), None))  # result and no exception
    except Exception as e:
        result_queue.put((None, e))  # no result and the exception


def _run_with_timeout(func, args=(), kwargs=None, timeout_seconds=60):
    """
    Runs a function in a separate process with a timeout.

    Args:
        func: Callable to run in the child process.
        args: Positional arguments for `func`.
        kwargs: Keyword arguments for `func` (None means no keyword arguments).
        timeout_seconds: How long to wait for a result before terminating the child.

    Returns:
        A 3-tuple ``(result, success, exception)``:
        ``(result, True, None)`` when `func` returned in time, and
        ``(None, False, None)`` when it timed out or the child ended without
        reporting anything.

    Raises:
        Exception: Whatever exception `func` raised in the child is re-raised here.

    NOTE(review): with the 'spawn'/'forkserver' start methods `func`, its
    arguments and its result must be picklable and importable by the child;
    confirm this on the target platform.
    """
    call_kwargs = {} if kwargs is None else kwargs
    func_name = getattr(func, "__name__", repr(func))

    # Use a multiprocessing.Queue to get the result from the child process
    result_queue = multiprocessing.Queue()
    process = multiprocessing.Process(
        target=_timeout_worker, args=(result_queue, func, args, call_kwargs)
    )
    process.start()

    # Read from the queue BEFORE joining: a child that has put a large item on
    # a queue does not exit until the item is consumed, so joining first can
    # turn a finished computation into a spurious timeout.
    poll_interval_seconds = 0.05
    deadline = time.monotonic() + timeout_seconds
    outcome = None
    while outcome is None:
        try:
            outcome = result_queue.get(timeout=poll_interval_seconds)
        except queue.Empty:
            if not process.is_alive():
                # The child is gone; make one last attempt in case it reported
                # its result right before exiting.
                try:
                    outcome = result_queue.get(timeout=poll_interval_seconds)
                except queue.Empty:
                    pass
                break
            if time.monotonic() >= deadline:
                break

    if outcome is None:
        if process.is_alive():
            # Still running past the deadline: this is a timeout.
            print(f"Warning: Function '{func_name}' timed out after {timeout_seconds} seconds. Terminating process.")
            process.terminate()  # Forcefully terminate the process
            process.join()  # Wait for termination
        else:
            # The child terminated unexpectedly without putting anything.
            process.join()
            print(f"Warning: Process for '{func_name}' finished but no result was put in queue.")
        return None, False, None

    process.join()
    res, exception = outcome
    if exception is not None:
        raise exception  # Re-raise any exception caught in the process
    return res, True, None


def _fixed_seed_louvain_partition(graph, seed=42):
    """Run graph.community_multilevel() once under a fixed `random` seed.

    The global `random` state is saved beforehand and restored afterwards, so
    the fixed seed does not leak into (or wipe out) the caller's RNG state.
    """
    saved_state = random.getstate()
    random.seed(seed)  # Fix seed for a reproducible reference
    try:
        return graph.community_multilevel()
    finally:
        random.setstate(saved_state)


def generate_reference_partition(graph: ig.Graph, optimal_timeout_seconds: int, use_optimal_as_reference: bool = True):
    """Produce the reference partition that stochastic runs are compared against.

    Args:
        graph: Graph to partition.
        optimal_timeout_seconds: Time budget for the optimal-modularity attempt.
        use_optimal_as_reference: When True, try
            ``graph.community_optimal_modularity`` (via `_run_with_timeout`)
            and fall back to a fixed-seed Louvain partition if it times out,
            returns nothing, or raises. When False, use the fixed-seed Louvain
            partition directly.

    Returns:
        The reference partition.

    Raises:
        RuntimeError: If no reference partition could be established.
    """
    reference_partition = None
    if use_optimal_as_reference:
        try:
            ref_partition_result, success, exception = _run_with_timeout(
                graph.community_optimal_modularity,
                timeout_seconds=optimal_timeout_seconds
            )
        except Exception as e:
            # _run_with_timeout re-raises failures of the wrapped call; treat
            # them like any other unsuccessful attempt so the fallback runs.
            ref_partition_result, success, exception = None, False, e

        if success and ref_partition_result is not None:
            reference_partition = ref_partition_result
            print(f"Optimal partition found with modularity: {reference_partition.modularity:.4f}")
        else:
            if exception is not None:
                print(f"Optimal partition calculation failed with error: {exception}")
            print("Falling back to a fixed-seed Louvain partition as reference.")
    else:
        # Option B: Louvain with fixed seed as reference (for larger graphs)
        print("\nUsing a fixed-seed Louvain partition as reference.")

    if reference_partition is None:
        # Used directly, or as fallback when the optimal attempt fails/times out.
        reference_partition = _fixed_seed_louvain_partition(graph)
        if reference_partition is not None:
            print(f"Reference Louvain partition (fixed seed) found with modularity: {reference_partition.modularity:.4f}")

    if reference_partition is None:
        raise RuntimeError("Could not establish a reference partition")

    return reference_partition

def run_stochastic_community_detection(graph, reference_partition: ig.clustering.VertexClustering, num_runs: int, community_detection_method: str = "multilevel"):
    """Run a stochastic community detection repeatedly and score each run against a reference.

    Args:
        graph: Graph to cluster; must provide ``community_multilevel`` /
            ``community_leiden``.
        reference_partition: Partition every run is compared with.
        num_runs: Number of detection runs.
        community_detection_method: "multilevel" or "leiden". Leiden is called
            with ``objective_function="modularity"`` and
            ``resolution_parameter=1.0``.

    Returns:
        A list with the NMI between the reference and each run's partition.

    Raises:
        ValueError: If `community_detection_method` is not supported.
    """
    if community_detection_method == "multilevel":
        detect = graph.community_multilevel
        params = {}
    elif community_detection_method == "leiden":
        detect = graph.community_leiden
        params = {
            "objective_function": "modularity",
            "resolution_parameter": 1.0
        }
    else:
        raise ValueError("Invalid community_detection_method. Choose 'multilevel' or 'leiden'.")

    print(f"\nRunning {community_detection_method} community detection {num_runs} times and calculating NMI...")

    # Report progress roughly every 10% of the runs (every run when fewer than 10).
    progress_step = max(1, num_runs // 10)

    nmi_values = []
    for i in range(num_runs):
        # Re-seed the global `random` module from a fresh source for every run.
        random.seed()
        current_partition = detect(**params)

        # Calculate NMI between the current partition and the reference partition
        # (method='nmi' selects Normalized Mutual Information).
        nmi = ig.compare_communities(reference_partition, current_partition, method='nmi')
        nmi_values.append(nmi)

        if (i + 1) % progress_step == 0:
            print(f"  Processed {i + 1}/{num_runs} runs.")
    return nmi_values

def plot_nmi_histogram(graph, nmi_values):
    """Show a histogram of NMI scores with a dashed marker at their mean.

    Args:
        graph: Graph the scores were computed on; only its vertex count is
            used, for the figure title.
        nmi_values: NMI scores to plot.
    """
    histogram_style = dict(bins=60, edgecolor='black', alpha=0.7, color='lightcoral')

    plt.figure(figsize=(10, 6))
    plt.hist(nmi_values, **histogram_style)
    plt.title(
        f'Histogram of NMI Values (Louvain vs. Reference Partition) for {graph.vcount()} nodes'
    )
    plt.xlabel('Normalized Mutual Information (NMI) Score')
    plt.ylabel('Frequency')
    plt.grid(axis='y', alpha=0.75)

    # Mark the average score with a dashed vertical line.
    mean_nmi = np.mean(nmi_values)
    mean_label = f'Mean NMI: {mean_nmi:.4f}'
    plt.axvline(mean_nmi, color='blue', linestyle='dashed', linewidth=2, label=mean_label)

    plt.legend()
    plt.tight_layout()
    plt.show()
