In [None]:
import pandas as pd
import networkx as nx

import pickle
import random

from pathlib import Path

In [None]:
def graph_from_edge_csv(
    path,
    from_col="from",
    to_col="to",
    directed=False,
    edge_attr=None,
    has_header=True,
    separator=','
):
    """
    Create a NetworkX graph from a CSV edge list.

    Parameters
    ----------
    path : str or path-like
        Path to the CSV file.
    from_col : str
        Name of the column containing the source node.
    to_col : str
        Name of the column containing the target node.
    directed : bool
        If True, create a DiGraph, else an undirected Graph.
    edge_attr : str or list of str or None
        Column name(s) to use as edge attributes.
    has_header : bool
        If True, the first row contains the column names. If False, columns
        are assigned by position: the first is named ``from_col``, the second
        ``to_col``, and the following ones take the name(s) given in
        ``edge_attr`` (when it is a string or a list/tuple). Any further
        columns in the file are ignored.
    separator : str
        CSV delimiter.

    Returns
    -------
    G : networkx.Graph or networkx.DiGraph
    """
    if has_header:
        df = pd.read_csv(path, sep=separator)
    else:
        # Only explicit names can be mapped onto positional columns; any other
        # edge_attr value (None, True) contributes no extra column names.
        if isinstance(edge_attr, str):
            attr_cols = [edge_attr]
        elif isinstance(edge_attr, (list, tuple)):
            attr_cols = list(edge_attr)
        else:
            attr_cols = []
        names = [from_col, to_col, *attr_cols]

        # Select columns by position. Without usecols, a file with more
        # columns than names makes pandas use the leading column(s) as the
        # index, which silently shifts source/target one column to the right.
        df = pd.read_csv(
            path,
            header=None,
            names=names,
            usecols=list(range(len(names))),
            sep=separator,
        )

    graph_type = nx.DiGraph() if directed else nx.Graph()

    G = nx.from_pandas_edgelist(
        df,
        source=from_col,
        target=to_col,
        edge_attr=edge_attr,
        create_using=graph_type,
    )
    return G


In [None]:
def build_graphs_from_folder(
    input_folder,
    output_folder,
    from_col="from",
    to_col="to",
    directed=False,
    edge_attr=None,
    has_header=True,
    separator=',',
    pattern="*.csv"
):
    """
    Read all CSV edge files from input_folder, build NetworkX graphs,
    and save them to output_folder using standard pickle.

    Each matching file ``<name>.csv`` is written to ``<output_folder>/<name>.pkl``.
    Files are processed in sorted path order so runs are reproducible.

    Parameters
    ----------
    input_folder : str or Path
        Folder containing CSV edge files.
    output_folder : str or Path
        Folder to save .pkl files. Created (including missing parent
        folders) if it does not exist.
    from_col, to_col, directed, edge_attr, has_header, separator : see graph_from_edge_csv
    pattern : str
        Glob pattern for CSV files.

    Returns
    -------
    dict
        {CSV path as str: graph} mapping, in processing order.
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    # parents=True so a not-yet-existing nested output folder does not raise.
    output_path.mkdir(parents=True, exist_ok=True)

    # glob() yields entries in filesystem order; sort for a stable order and
    # skip directories that happen to match the pattern.
    csv_files = sorted(p for p in input_path.glob(pattern) if p.is_file())
    graphs = {}

    for csv_file in csv_files:
        basename = csv_file.stem
        print(f"Processing {csv_file.name}...")

        G = graph_from_edge_csv(
            str(csv_file),
            from_col=from_col,
            to_col=to_col,
            directed=directed,
            edge_attr=edge_attr,
            has_header=has_header,
            separator=separator,
        )

        output_file = output_path / f"{basename}.pkl"
        with open(output_file, 'wb') as f:
            pickle.dump(G, f)

        graphs[str(csv_file)] = G
        print(f"Saved {output_file}")

    print(f"Processed {len(graphs)} graphs.")
    return graphs


In [None]:
# Machine-specific absolute paths: edit both before running on another machine.
# Folder that is searched for the CSV edge lists.
csvs_folder = '/media/xander/HDD/Repos/data/graphs/csv'
# Folder that receives the pickled graphs; the seed-generation cell at the end
# of the notebook is pointed at this same folder.
output_folder = '/media/xander/HDD/Repos/data/graphs/'

In [None]:
# Build one graph per CSV file in csvs_folder and save each one to
# output_folder as a standard-pickle .pkl file (not .gpickle).
# has_header=False: the files are read without a header row.
# The returned dict maps each CSV path (as a string) to its graph.
graphs = build_graphs_from_folder(
    input_folder=csvs_folder,
    output_folder=output_folder,
    has_header=False,
    pattern="*.csv"
)

In [None]:
def load_seeds(seeds_file):
    """
    Read a text file of comma-separated seeds and return them as a list.

    Surrounding whitespace is removed from every entry and empty entries
    are dropped. Entries are returned as strings; no type conversion is done.
    """
    with open(seeds_file, 'r') as handle:
        raw_text = handle.read()

    tokens = (piece.strip() for piece in raw_text.strip().split(','))
    return [token for token in tokens if token]

def generate_experimental_seeds(graph_folder, pct=0.10, num_seeds=10, seed=None):
    """
    Generate a random seed-node file for every pickled graph in a folder.

    For each ``<name>.pkl`` in graph_folder, a random sample of the graph's
    nodes is written to ``<name>.seeds.txt`` as a comma-separated list.
    Files are processed in sorted path order, so with a fixed ``seed`` every
    graph gets the same sample regardless of filesystem listing order.

    Parameters
    ----------
    graph_folder : str/Path
        Folder with .pkl graphs.
    pct : float
        Fraction of nodes to sample (default 0.10). Only used when
        ``num_seeds <= 0``; at least one node is sampled for a non-empty
        graph and the count is capped at the graph size.
    num_seeds : int
        Fixed number of seeds per graph (capped at the graph size). Takes
        precedence over ``pct`` whenever it is greater than 0, which
        includes the default of 10. Pass 0 to sample by ``pct`` instead.
    seed : int
        Random seed for reproducibility. When given, a private generator is
        used and the global ``random`` state is left untouched. When None,
        the global ``random`` module state is used.
    """
    # A private generator yields the same sequence as random.seed(seed) would,
    # without resetting the global RNG for the rest of the notebook.
    rng = random.Random(seed) if seed is not None else random

    graph_path = Path(graph_folder)
    # Sorted: one generator is shared across files, so processing order
    # determines which random draws each graph receives.
    for pkl_file in sorted(graph_path.glob("*.pkl")):
        # NOTE: unpickling can execute arbitrary code; only use this on
        # folders containing files you created yourself.
        with open(pkl_file, 'rb') as f:
            G = pickle.load(f)

        nodes = list(G.nodes())
        n = len(nodes)

        if num_seeds > 0:
            k = min(num_seeds, n)  # Cap at graph size
        else:
            # Cap at n so an empty graph (n == 0) or pct > 1 cannot request
            # more nodes than exist, which random.sample rejects.
            k = min(n, max(1, int(n * pct)))

        seeds = rng.sample(nodes, k)
        try:
            seeds.sort()  # Deterministic order
        except TypeError:
            # Node labels of mixed, non-comparable types: order by text form.
            seeds.sort(key=str)

        seeds_file = pkl_file.with_suffix('.seeds.txt')
        with open(seeds_file, 'w') as f:
            f.write(','.join(map(str, seeds)))

        print(f"{pkl_file.name}: {n} nodes → {k} seeds → {seeds_file.name}")


In [None]:
# Usage: write a <name>.seeds.txt next to every .pkl graph in output_folder.
# NOTE(review): num_seeds is left at its default (10), and
# generate_experimental_seeds only falls back to pct when num_seeds <= 0, so
# pct=0.10 has no effect on this call. Pass num_seeds=0 if 10% of each graph's
# nodes is what is wanted — confirm intent.
generate_experimental_seeds(output_folder, pct=0.10, seed=2026)