In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from collections import defaultdict

# Preprocessed edge-list CSVs to analyse: one file per network, all in DATA_DIR
DATA_DIR = 'datacleaning'
NETWORK_NAMES = (
    'aishihik_lake',
    'AkatoreA',
    'AkatoreB',
    'cold_lake',
    'lake_of_the_woods',
    'mcgregor_river',
    'parsnip_river',
    'sbay_lake_huron',
    'smallwood_reservoir',
    'Venlaw',
)
file_paths = [f'{DATA_DIR}/{name}_preprocessed.csv' for name in NETWORK_NAMES]

def _host_projection_clustering(adj_matrix, hosts):
    """
    Clustering coefficients on the one-mode projection of the 'source' nodes.

    Two sources are linked in the projection when they share at least one
    target. For each source, the coefficient is the fraction of pairs of its
    projection neighbours that are themselves linked.

    Parameters
    ----------
    adj_matrix : 2-D numpy array, rows = sources, columns = targets,
        non-zero where an edge exists.
    hosts : sequence of row labels, in the same order as the matrix rows.

    Returns
    -------
    (host_coefficients, global_cc)
        host_coefficients maps each label to its coefficient (0 when the
        node has fewer than two projection neighbours); global_cc is the
        total linked neighbour pairs divided by the total possible pairs
        (0 when there are no possible pairs).
    """
    linked = (adj_matrix @ adj_matrix.T) > 0
    # The diagonal of A.A^T is each source's own degree, not a link to
    # another source. Leaving it in makes every node its own neighbour and
    # inflates both the neighbour count and the triangle count.
    np.fill_diagonal(linked, False)

    host_coefficients = {}
    total_triangles = 0
    total_possible = 0

    for i, host in enumerate(hosts):
        neighbors = np.flatnonzero(linked[i])
        k = len(neighbors)
        if k < 2:
            host_coefficients[host] = 0
            continue

        possible_triangles = k * (k - 1) / 2
        # `linked` is symmetric with an empty diagonal, so summing the
        # neighbour sub-matrix counts every linked pair exactly twice.
        actual_triangles = int(linked[np.ix_(neighbors, neighbors)].sum()) // 2

        total_triangles += actual_triangles
        total_possible += possible_triangles
        host_coefficients[host] = actual_triangles / possible_triangles

    global_cc = total_triangles / total_possible if total_possible > 0 else 0
    return host_coefficients, global_cc


def calculate_network_metrics(file_path):
    """
    Calculate comprehensive network metrics including bipartite clustering coefficients

    Parameters
    ----------
    file_path : path to a CSV edge list with 'source' and 'target' columns.

    Returns
    -------
    dict with keys:
        'filename'               last '/'-separated component of file_path
        'num_nodes', 'num_edges' size of the undirected NetworkX graph
        'mean_degree'            sum of degrees / number of nodes
        'clustering_coeff_nx'    nx.average_clustering of the graph
        'mean_geodesic_distance' average shortest path length, or None when
                                 the graph is disconnected
        'global_bipartite_cc'    pooled clustering of the source projection
        'host_coefficients'      per-source clustering in that projection
        'degree_distribution'    list of node degrees

    Raises
    ------
    ValueError
        If the edge list yields a graph with no nodes.
    """
    # Read the edge list
    edge_list = pd.read_csv(file_path)

    # Create NetworkX graph
    G = nx.from_pandas_edgelist(edge_list, source='source', target='target')

    # Basic network metrics
    num_nodes = G.number_of_nodes()
    if num_nodes == 0:
        # Without this guard the mean-degree division below fails with an
        # uninformative ZeroDivisionError.
        raise ValueError(f"{file_path} contains no edges; cannot compute network metrics")

    num_edges = G.number_of_edges()
    degree_distribution = [degree for _, degree in G.degree()]
    mean_degree = sum(degree_distribution) / num_nodes
    clustering_coeff_nx = nx.average_clustering(G)

    # Mean geodesic distance is only defined for a connected graph
    if nx.is_connected(G):
        mean_geodesic_distance = nx.average_shortest_path_length(G)
    else:
        mean_geodesic_distance = None

    # Build the source x target incidence matrix
    unique_sources = edge_list['source'].unique()
    unique_targets = edge_list['target'].unique()
    source_to_idx = {source: idx for idx, source in enumerate(unique_sources)}
    target_to_idx = {target: idx for idx, target in enumerate(unique_targets)}

    row_idx = np.asarray([source_to_idx[s] for s in edge_list['source']], dtype=np.intp)
    col_idx = np.asarray([target_to_idx[t] for t in edge_list['target']], dtype=np.intp)
    adj_matrix = np.zeros((len(unique_sources), len(unique_targets)))
    adj_matrix[row_idx, col_idx] = 1

    # Clustering on the projection of the source nodes
    host_coefficients, global_bipartite_cc = _host_projection_clustering(
        adj_matrix, unique_sources
    )

    return {
        'filename': file_path.split('/')[-1],
        'num_nodes': num_nodes,
        'num_edges': num_edges,
        'mean_degree': mean_degree,
        'clustering_coeff_nx': clustering_coeff_nx,
        'mean_geodesic_distance': mean_geodesic_distance,
        'global_bipartite_cc': global_bipartite_cc,
        'host_coefficients': host_coefficients,
        'degree_distribution': degree_distribution
    }

# Compute metrics for each network in turn and print a per-network report.
# A failure on one file is reported and does not stop the remaining files.
results = []
separator = "-" * 50
for file_path in file_paths:
    try:
        metrics = calculate_network_metrics(file_path)
        results.append(metrics)

        # Headline figures for this network
        print(f"\nResults for {metrics['filename']}:")
        print(separator)
        print(f"Number of nodes: {metrics['num_nodes']}")
        print(f"Number of edges: {metrics['num_edges']}")
        print(f"Mean degree: {metrics['mean_degree']:.3f}")
        print(f"NetworkX clustering coefficient: {metrics['clustering_coeff_nx']:.3f}")
        print(f"Bipartite clustering coefficient: {metrics['global_bipartite_cc']:.3f}")

        # The function returns None for this metric when the graph is disconnected
        if metrics['mean_geodesic_distance'] is None:
            print("Mean geodesic distance: Not applicable (disconnected graph)")
        else:
            print(f"Mean geodesic distance: {metrics['mean_geodesic_distance']:.3f}")

        # Five highest per-node coefficients, largest first
        print("\nTop 5 nodes by clustering coefficient:")
        ranked_hosts = sorted(
            metrics['host_coefficients'].items(),
            key=lambda item: item[1],
            reverse=True,
        )
        sorted_hosts = dict(ranked_hosts[:5])
        for host, cc in sorted_hosts.items():
            print(f"{host}: {cc:.3f}")

        print(separator)

    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

# Summary statistics across every network that was processed successfully
print("\nSummary Statistics Across All Networks:")
print("-" * 50)

if results:
    print(f"Average number of nodes: {np.mean([r['num_nodes'] for r in results]):.1f}")
    print(f"Average number of edges: {np.mean([r['num_edges'] for r in results]):.1f}")
    print(f"Average clustering coefficient (NetworkX): {np.mean([r['clustering_coeff_nx'] for r in results]):.3f}")
    print(f"Average bipartite clustering coefficient: {np.mean([r['global_bipartite_cc'] for r in results]):.3f}")

    # Mean geodesic distance is None for disconnected networks, so average
    # only over the networks where it was computed.
    connected_mgds = [r['mean_geodesic_distance'] for r in results if r['mean_geodesic_distance'] is not None]
    if connected_mgds:
        print(f"Average mean geodesic distance (connected networks only): {np.mean(connected_mgds):.3f}")
else:
    # With no results, np.mean([]) would print "nan" for every line and
    # emit a RuntimeWarning; say what actually happened instead.
    connected_mgds = []
    print("No networks were processed successfully; no summary statistics available.")


Results for aishihik_lake_preprocessed.csv:
--------------------------------------------------
Number of nodes: 39
Number of edges: 248
Mean degree: 12.718
NetworkX clustering coefficient: 0.000
Bipartite clustering coefficient: 1.000
Mean geodesic distance: 1.665

Top 5 nodes by clustering coefficient:
Host species: 1.000
Catostomus catostomus: 1.000
Coregonus clupeaformis: 1.000
Cottus cognatus: 1.000
Esox lucius: 1.000
--------------------------------------------------

Results for AkatoreA_preprocessed.csv:
--------------------------------------------------
Number of nodes: 126
Number of edges: 6279
Mean degree: 99.667
NetworkX clustering coefficient: 0.777
Bipartite clustering coefficient: 1.000
Mean geodesic distance: 1.208

Top 5 nodes by clustering coefficient:
Unidentified detritus: 1.000
Terrestrial invertebrates: 1.000
Plant materials: 1.000
Meiofauna: 1.000
Achnanthes inflata: 1.000
--------------------------------------------------

Results for AkatoreB_preprocessed.csv:
