In [1]:
import networkx as nx

from pmotifs.config import config

In [2]:
import os

ppi_networks_dir = config.DATASET_DIRECTORY / "some_PPI_networks"

In [3]:
def parse_ppi_tsv(tsv):
    return nx.read_edgelist(
        tsv,
        delimiter="\t",
        data=[
            ("node1_string_id", str),
            ("node2_string_id", str),
            ("neighborhood_on_chromosome", float),
            ("gene_fusion", float),
            ("phylogenetic_cooccurrence", float),
            ("homology", float),
            ("coexpression", float),
            ("experimentally_determined_interaction", float),
            ("database_annotated", float),
            ("automated_textmining", float),
            ("combined_score", float),
        ],
    )

In [4]:
networks = {
    n: parse_ppi_tsv(ppi_networks_dir / n)
    for n in os.listdir(ppi_networks_dir)
}

In [5]:
def remove_edges_to_fit(original_g, max_edges, score_field="combined_score"):
    if max_edges < 0:
        raise ValueError("Can not reduce graph to less than 0 edges!")
    
    g = original_g.copy()
    del original_g

    scores_to_edges = {}
    for edge, value in nx.get_edge_attributes(g, name=score_field).items():
        scores_to_edges[value] = scores_to_edges.get(value, []) + [edge]

    total_edge_count = len(g.edges)
    unique_scores_sorted = sorted(scores_to_edges.keys())

    removed = 0
    current_cutoff_index = -1
    while total_edge_count - removed > max_edges:
        current_cutoff_index += 1
        cutoff_value = unique_scores_sorted[current_cutoff_index]
        
        values_to_remove = [v for v in unique_scores_sorted if v <= cutoff_value]
        edges_to_remove = [edge for v in values_to_remove for edge in scores_to_edges[v] ]
        removed = sum([len(scores_to_edges[v]) for v in values_to_remove])

    for edge in edges_to_remove:
        g.remove_edge(*edge)

    return g, cutoff_value

In [7]:
from pmotifs.gtrieScanner.graph_io import write_shifted_edgelist

max_edges = 2000

for n, g in networks.items():
    print(n, len(g.nodes), len(g.edges))
    preproc = False
    if len(g.edges) > max_edges:
        print("\tToo many edges, pre-processing by finding optimal cutoff-value")
        preproc_g, cutoff_value = remove_edges_to_fit(g, max_edges)
        
        print("\tCutoff Chosen at", cutoff_value)
        print(f"\tRemoved {round(1 - len(preproc_g.edges) / len(g.edges), 2) * 100}% of all edges")
        print("\tNew graph: ", len(preproc_g.nodes), len(preproc_g.edges))
        preproc = True
    
    # Write new graphs
    old_postfix = "_string_interactions_short.tsv"
    if preproc:
        new_postfix = f"_cutoff_{cutoff_value}.edgelist"
        out_graph = preproc_g
    else:
        new_postfix = f"_cutoff_None.edgelist"
        out_graph = g
    
    outpath = config.DATASET_DIRECTORY / n.replace(old_postfix, new_postfix)
    write_shifted_edgelist(out_graph, outpath, reindex=True)

human_memory_all_string_interactions_short.tsv 106 522
ecoli_metabolism_macromol_string_interactions_short.tsv 805 10414
	Too many edges, pre-processing by finding optimal cutoff-value
	Cutoff Chosen at 0.965
	Removed 81.0% of all edges
	New graph:  805 1988
human_cancer_string_interactions_short.tsv 862 19730
	Too many edges, pre-processing by finding optimal cutoff-value
	Cutoff Chosen at 0.936
	Removed 90.0% of all edges
	New graph:  862 1997
human_celldeath_string_interactions_short.tsv 1068 15758
	Too many edges, pre-processing by finding optimal cutoff-value
	Cutoff Chosen at 0.94
	Removed 87.0% of all edges
	New graph:  1068 1993
human_cellcycleGO_string_interactions_short.tsv 1293 37751
	Too many edges, pre-processing by finding optimal cutoff-value
	Cutoff Chosen at 0.989
	Removed 95.0% of all edges
	New graph:  1293 1978
human_brain_development_string_interactions_short.tsv 733 8824
	Too many edges, pre-processing by finding optimal cutoff-value
	Cutoff Chosen at 0.775
	Remov