In [1]:
import networkx as nx
import pandas as pd
from community import community_louvain  # Install this using pip if not already installed

# Preprocessed CSV inputs; every file lives in datacleaning/ and ends in _preprocessed.csv
file_paths = [
    f'datacleaning/{dataset}_preprocessed.csv'
    for dataset in (
        'aishihik_lake',
        'AkatoreA',
        'AkatoreB',
        'cold_lake',
        'lake_of_the_woods',
        'mcgregor_river',
        'parsnip_river',
        'sbay_lake_huron',
        'smallwood_reservoir',
        'Venlaw',
    )
]

# Function to calculate network properties, link prediction, and community detection
def calculate_network_properties(file_path, random_state=None):
    """
    Build an undirected graph from an edge-list CSV and summarise it as text.

    The report covers node/edge counts, the degree of every node, mean degree,
    average clustering coefficient, mean geodesic distance (connected graphs
    only), the five highest common-neighbour scores among unconnected node
    pairs, and a Louvain community partition.

    Parameters
    ----------
    file_path : str
        Path to a CSV file with 'source' and 'target' columns.
    random_state : optional
        When given, forwarded unchanged to ``community_louvain.best_partition``
        so the community assignment is repeatable across runs. When None
        (default) Louvain is called exactly as before, without the argument.

    Returns
    -------
    str
        The formatted multi-line report.

    NOTE(review): the recorded run of this cell lists 'Host species' and
    'Unnamed: 1' among the nodes, which look like spreadsheet header cells
    rather than species -- confirm the preprocessing step strips them.
    """
    # Read the CSV into a pandas DataFrame
    edge_list = pd.read_csv(file_path)

    # Create a graph from the edge list (assuming the CSV has columns 'source' and 'target')
    G = nx.from_pandas_edgelist(edge_list, source='source', target='target')

    # Calculate network properties
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    degree_distribution = [degree for _, degree in G.degree()]

    if num_nodes:
        mean_degree = sum(degree_distribution) / num_nodes
        clustering_coeff = nx.average_clustering(G)
        # Mean geodesic distance is only defined when every node can reach every other
        if nx.is_connected(G):
            mean_geodesic_distance = nx.average_shortest_path_length(G)
        else:
            mean_geodesic_distance = None
    else:
        # An empty edge list yields a graph with no nodes; the averages above would
        # divide by zero and the connectivity check is undefined, so report zeros.
        mean_degree = 0.0
        clustering_coeff = 0.0
        mean_geodesic_distance = None

    # Link prediction: score every unconnected pair by its number of shared neighbours
    link_pred_scores = [
        (u, v, sum(1 for _ in nx.common_neighbors(G, u, v)))
        for u, v in nx.non_edges(G)
    ]

    # Highest scores first; the sort is stable, so ties keep the nx.non_edges order
    link_pred_scores_sorted = sorted(link_pred_scores, key=lambda item: item[2], reverse=True)

    # Community detection using the Louvain method. The keyword is only passed when
    # requested so the default call is identical to the unseeded original.
    louvain_kwargs = {} if random_state is None else {'random_state': random_state}
    partition = community_louvain.best_partition(G, **louvain_kwargs)
    num_communities = len(set(partition.values()))

    # Create a mapping of nodes to their communities
    community_mapping = {node: f"Community {comm}" for node, comm in partition.items()}

    # Assemble the report line by line. Building it from a list avoids the trailing
    # indentation a triple-quoted template leaves behind, which used to push the
    # MGD line four columns too far to the right.
    header_lines = [
        "",
        f"    Results for {file_path}:",
        "    ------------------------------",
        f"      Number of nodes (species): {num_nodes}",
        f"      Number of edges (interactions): {num_edges}",
        f"      Degree distribution: {degree_distribution}",
        f"      Mean degree: {mean_degree}",
        f"      Clustering coefficient: {clustering_coeff}",
    ]
    results = "\n".join(header_lines) + "\n"

    if mean_geodesic_distance is not None:
        results += f"      Mean geodesic distance (MGD): {mean_geodesic_distance}\n"
    else:
        results += "      Graph is not connected, MGD cannot be computed.\n"

    # Display the top 5 predicted links
    results += "\nTop 5 Predicted Links (based on Common Neighbors):\n"
    for u, v, score in link_pred_scores_sorted[:5]:
        results += f"      ({u}, {v}) - Prediction Score: {score}\n"

    # Display community detection results
    results += f"\nNumber of communities detected: {num_communities}\n"
    results += f"Node to Community Mapping (first 10 nodes): {list(community_mapping.items())[:10]}\n"

    results += "-" * 50 + "\n"

    return results

# Loop through all files and calculate properties, printing the results.
# Each report is printed as soon as its file has been analysed, so partial output is
# still visible if a later file fails. `file_path` and `result` stay bound at module
# level to the last dataset processed once the loop finishes.
for file_path in file_paths:
    result = calculate_network_properties(file_path)
    print(result)



    Results for datacleaning/aishihik_lake_preprocessed.csv:
    ------------------------------
      Number of nodes (species): 39
      Number of edges (interactions): 248
      Degree distribution: [31, 8, 31, 31, 31, 31, 31, 31, 31, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
      Mean degree: 12.717948717948717
      Clustering coefficient: 0.0
          Mean geodesic distance (MGD): 1.6653171390013495

Top 5 Predicted Links (based on Common Neighbors):
      (Thymallus arcticus, Catostomus catostomus) - Prediction Score: 31
      (Thymallus arcticus, Esox lucius) - Prediction Score: 31
      (Thymallus arcticus, Prosopium cylindraceum) - Prediction Score: 31
      (Thymallus arcticus, Coregonus clupeaformis) - Prediction Score: 31
      (Thymallus arcticus, Cottus cognatus) - Prediction Score: 31

Number of communities detected: 1
Node to Community Mapping (first 10 nodes): [('Host species', 'Community 0'), ('Unnamed: 1', 'Community

In [2]:
import networkx as nx
import pandas as pd
import numpy as np
from itertools import permutations
import random
from multiprocessing import Pool, cpu_count
from functools import partial

def sample_motifs(G, sample_size=1000):
    """
    Sample three-node motifs from a directed graph using random node triplets.

    For each sampled ordered triplet (a, b, c):
      * a feed-forward loop (FFL) is recorded when a->b, b->c and a->c exist;
      * a feedback loop (FBL) is recorded when a->b, b->c and c->a exist.
    Each hit is stored as the sorted tuple of its three nodes, so the same node
    set is reported at most once per motif type.

    Parameters
    ----------
    G : directed graph
        Must provide ``nodes()`` and ``successors(node)``.
    sample_size : int, optional
        Number of random triplets to draw (default 1000). Must be positive.

    Returns
    -------
    (ffls, fbls, scaling_factor) : (list, list, float)
        The distinct motifs found and ``n*(n-1)*(n-2) / sample_size``, the ratio
        of all ordered triplets to the number sampled. For graphs with fewer
        than three nodes the result is ``([], [], 0.0)``.

    Raises
    ------
    ValueError
        If ``sample_size`` is not positive.

    NOTE(review): because hits are de-duplicated by node set,
    ``len(motifs) * scaling_factor`` is only an approximate count estimate
    (e.g. it saturates once sample_size approaches the number of triplets).
    """
    if sample_size <= 0:
        raise ValueError("sample_size must be a positive integer")

    node_list = list(G.nodes())
    n = len(node_list)
    if n < 3:
        # No triplets exist. Keep the three-value shape every caller unpacks;
        # the scaling factor is n*(n-1)*(n-2)/sample_size == 0 here.
        return [], [], 0.0

    # Pre-compute adjacency for faster lookup
    adj = {node: set(G.successors(node)) for node in node_list}

    ffls = set()
    fbls = set()

    # Sample random node triplets instead of checking all combinations
    for _ in range(sample_size):
        triplet = random.sample(node_list, 3)
        a, b, c = triplet

        # Both motifs require the directed path a -> b -> c
        if b not in adj[a] or c not in adj[b]:
            continue

        # Feed-forward loop: the path is shortcut by a -> c
        if c in adj[a]:
            ffls.add(tuple(sorted(triplet)))

        # Feedback loop: the path is closed by c -> a
        if a in adj[c]:
            fbls.add(tuple(sorted(triplet)))

    # Extrapolate counts based on sampling ratio
    total_combinations = n * (n - 1) * (n - 2)
    scaling_factor = total_combinations / sample_size

    return list(ffls), list(fbls), scaling_factor

def generate_random_network(args):
    """
    Generate a single random network (for parallel processing)

    Parameters
    ----------
    args : tuple (G, seed)
        ``G`` is the directed graph whose in/out degree sequences are used;
        ``seed`` seeds the random generator for this one network. A single
        tuple is taken so the function can be used directly with ``Pool.map``.

    Returns
    -------
    networkx.DiGraph
        A directed configuration-model graph built from G's degree sequences,
        collapsed to a simple DiGraph with self-loops removed. If networkx
        rejects the degree sequences, a G(n, m) random directed graph with the
        same node and edge counts is returned instead.

    NOTE(review): collapsing parallel edges and dropping self-loops means the
    result may have fewer edges than G, so degrees are only approximately kept.
    """
    G, seed = args
    in_degrees = [d for _, d in G.in_degree()]
    out_degrees = [d for _, d in G.out_degree()]

    # The seed is handed to networkx directly instead of reseeding the global
    # `random` module, so calling this function does not disturb other random draws.
    try:
        RG = nx.directed_configuration_model(in_degrees, out_degrees, seed=seed)
    except nx.NetworkXError:
        # Only networkx's own rejection triggers the fallback; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        return nx.gnm_random_graph(
            G.number_of_nodes(), G.number_of_edges(), seed=seed, directed=True
        )

    RG = nx.DiGraph(RG)  # collapse parallel edges of the multigraph
    # Materialise the self-loop list first so the graph is not edited while iterated
    RG.remove_edges_from(list(nx.selfloop_edges(RG)))
    return RG

def _motif_stats(observed, random_counts):
    """Compare one observed motif estimate with the estimates from the random networks."""
    mean_random = np.mean(random_counts)
    std_random = np.std(random_counts)
    return {
        'count': observed,
        'mean_random': mean_random,
        'std_random': std_random,
        # The small epsilon keeps the z-score finite when all random counts are equal
        'z_score': (observed - mean_random) / (std_random + 1e-10),
        # Fraction of random networks whose estimate is at least the observed one
        'p_value': sum(count >= observed for count in random_counts) / len(random_counts),
    }


def _motif_report_lines(stats):
    """Format the four statistic lines shown under each motif heading."""
    return [
        f"        Estimated Count: {stats['count']:.1f}",
        f"        Random Mean: {stats['mean_random']:.1f} ± {stats['std_random']:.1f}",
        f"        Z-score: {stats['z_score']:.4f}",
        f"        P-value: {stats['p_value']:.4f}",
    ]


def calculate_network_properties(file_path, n_random=100, sample_size=1000, seed=None):
    """
    Calculate network properties with optimized motif analysis

    Reads an edge list into a directed graph, estimates feed-forward and
    feedback loop counts by triplet sampling, and compares them with the same
    estimates on randomised networks generated in a process pool.

    NOTE: this definition reuses the name of the function defined in the
    earlier notebook cell and replaces it once this cell has run.

    Parameters
    ----------
    file_path : str
        Path to a CSV file with 'source' and 'target' columns.
    n_random : int, optional
        Number of random networks in the null model (default 100). They are
        generated with seeds 0 .. n_random-1.
    sample_size : int, optional
        Number of node triplets sampled per network (default 1000).
    seed : int or None, optional
        When given, the global ``random`` module is seeded before any triplet
        sampling so the reported statistics are repeatable. None (default)
        leaves the generator untouched.

    Returns
    -------
    str
        The formatted multi-line report.

    Raises
    ------
    ValueError
        If the graph has fewer than three nodes or ``n_random`` is below 1.
    """
    # Read and create network
    edge_list = pd.read_csv(file_path)
    G = nx.from_pandas_edgelist(edge_list, source='source', target='target', create_using=nx.DiGraph())

    # Basic network metrics
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()

    # Fail early with a clear message instead of a division by zero or an
    # unpacking error further down: no three-node motif can exist here.
    if n_nodes < 3:
        raise ValueError(
            f"{file_path}: motif analysis needs at least 3 nodes, found {n_nodes}"
        )
    if n_random < 1:
        raise ValueError("n_random must be at least 1")

    mean_degree = 2 * n_edges / n_nodes
    clustering_coeff = nx.average_clustering(G.to_undirected())

    # Triplet sampling below runs in this process and draws from the global RNG
    if seed is not None:
        random.seed(seed)

    # Motif sampling
    ffls, fbls, scaling_factor = sample_motifs(G, sample_size=sample_size)

    # Parallel random network generation
    with Pool(processes=cpu_count()) as pool:
        random_networks = pool.map(
            generate_random_network,
            [(G, i) for i in range(n_random)]
        )

    # Calculate motif statistics for random networks
    random_ffl_counts = []
    random_fbl_counts = []

    for RG in random_networks:
        rand_ffls, rand_fbls, rand_scale = sample_motifs(RG, sample_size=sample_size)
        random_ffl_counts.append(len(rand_ffls) * rand_scale)
        random_fbl_counts.append(len(rand_fbls) * rand_scale)

    # Calculate statistics
    ffl_stats = _motif_stats(len(ffls) * scaling_factor, random_ffl_counts)
    fbl_stats = _motif_stats(len(fbls) * scaling_factor, random_fbl_counts)

    # The whitespace-only entries reproduce the spacer lines of the report layout
    report_lines = [
        "",
        f"    Results for {file_path}:",
        "    ------------------------------",
        "    Network Properties:",
        f"      Nodes: {n_nodes}",
        f"      Edges: {n_edges}",
        f"      Mean Degree: {mean_degree:.2f}",
        f"      Clustering Coefficient: {clustering_coeff:.4f}",
        "    ",
        "    Motif Analysis (Estimated):",
        "      Feed-Forward Loops (FFL):",
        *_motif_report_lines(ffl_stats),
        "        ",
        "      Feedback Loops (FBL):",
        *_motif_report_lines(fbl_stats),
        "    ------------------------------",
        "    ",
    ]
    results = "\n".join(report_lines)

    return results

# File paths: each dataset name is expanded to datacleaning/<name>_preprocessed.csv
file_paths = list(map(
    'datacleaning/{}_preprocessed.csv'.format,
    [
        'aishihik_lake',
        'AkatoreA',
        'AkatoreB',
        'cold_lake',
        'lake_of_the_woods',
        'mcgregor_river',
        'parsnip_river',
        'sbay_lake_huron',
        'smallwood_reservoir',
        'Venlaw',
    ],
))

# Process all files
# calculate_network_properties starts a multiprocessing.Pool. Under the "spawn" start
# method each worker re-imports the main module, so the driver loop is guarded to keep
# workers from re-running the whole analysis (and spawning pools of their own) when
# this code is executed as a script. Inside a notebook __name__ is "__main__", so the
# loop runs exactly as before.
if __name__ == "__main__":
    for file_path in file_paths:
        result = calculate_network_properties(file_path)
        print(result)


    Results for datacleaning/aishihik_lake_preprocessed.csv:
    ------------------------------
    Network Properties:
      Nodes: 39
      Edges: 248
      Mean Degree: 12.72
      Clustering Coefficient: 0.0000
    
    Motif Analysis (Estimated):
      Feed-Forward Loops (FFL):
        Estimated Count: 0.0
        Random Mean: 0.0 ± 0.0
        Z-score: 0.0000
        P-value: 1.0000
        
      Feedback Loops (FBL):
        Estimated Count: 0.0
        Random Mean: 0.0 ± 0.0
        Z-score: 0.0000
        P-value: 1.0000
    ------------------------------
    

    Results for datacleaning/AkatoreA_preprocessed.csv:
    ------------------------------
    Network Properties:
      Nodes: 126
      Edges: 7225
      Mean Degree: 114.68
      Clustering Coefficient: 0.7766
    
    Motif Analysis (Estimated):
      Feed-Forward Loops (FFL):
        Estimated Count: 277326.0
        Random Mean: 79369.9 ± 12733.6
        Z-score: 15.5460
        P-value: 0.0000
        
      Fe