In [2]:
from pathlib import Path
import random
import time

import pandas as pd
import networkx as nx

from ddh.io_handlers import load_csv_data
from ddh.config import CsvFilesConfig
from ddh.graphs import build_ppi_graph
from ddh.scoring import build_drug_target_map, build_disease_gene_map


In [3]:
base = Path("../data/real")

cfg = CsvFilesConfig(
    drugs_csv=base / "drugs.csv",
    genes_csv=base / "genes.csv",
    diseases_csv=base / "diseases.csv",
    drug_targets_csv=base / "drug_targets_filtered.csv",
    gene_disease_csv=base / "gene_disease_filtered.csv",
    ppi_csv=base / "ppi.csv",
)

drugs, genes, diseases, dts, gds, ppis = load_csv_data(cfg)

len(drugs), len(genes), len(diseases), len(dts), len(gds), len(ppis)

(1079727, 20997, 22778, 325311, 933568, 1016148)

In [4]:
drug_to_genes = build_drug_target_map(dts)
disease_to_genes = build_disease_gene_map(gds)

G = build_ppi_graph(ppis)

print(f"# drugs with targets: {len(drug_to_genes)}")
print(f"# diseases with genes: {len(disease_to_genes)}")
print(f"PPI graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# Total possible pairs if you ever consider all combinations
n_pairs_all = len(drug_to_genes) * len(disease_to_genes)
print(f"Max drug–disease pairs (cartesian): {n_pairs_all:,}")


# drugs with targets: 244203
# diseases with genes: 19783
PPI graph: 19497 nodes, 1016148 edges
Max drug–disease pairs (cartesian): 4,831,067,949


In [5]:
def mean_shortest_distance_between_sets(G, genes_a, genes_b, cutoff=None):
    """
    Compute the mean shortest path length between all gene pairs
    in genes_a × genes_b that are connected in G.

    cutoff: optional max distance to search; beyond that, treated as no path.
    """
    # Convert to sets, remove genes not in graph
    genes_a = [g for g in set(genes_a) if g in G]
    genes_b = [g for g in set(genes_b) if g in G]

    if not genes_a or not genes_b:
        return float("inf")

    dists = []

    # For each source gene, get distances to all nodes (BFS / Dijkstra)
    for ga in genes_a:
        # single-source shortest paths
        sp = nx.single_source_shortest_path_length(G, ga, cutoff=cutoff)
        for gb in genes_b:
            if gb in sp:
                dists.append(sp[gb])

    if not dists:
        return float("inf")

    return sum(dists) / len(dists)


In [6]:
# Build list of all possible pairs (or just overlapping ones if you want)
drug_ids = list(drug_to_genes.keys())
disease_ids = list(disease_to_genes.keys())

# If this is too big, just think of it as conceptual; we only sample
n_pairs_all = len(drug_ids) * len(disease_ids)
print(f"Max pairs: {n_pairs_all:,}")

# Sample a manageable number of random pairs
n_sample = 200  # you can adjust
pairs_sample = []
for _ in range(n_sample):
    d = random.choice(drug_ids)
    dis = random.choice(disease_ids)
    pairs_sample.append((d, dis))

start = time.perf_counter()
results = []

for d, dis in pairs_sample:
    mg = mean_shortest_distance_between_sets(
        G,
        drug_to_genes[d],
        disease_to_genes[dis],
        cutoff=5,   # optional: limit radius to speed up
    )
    results.append(mg)

elapsed = time.perf_counter() - start
print(f"Computed {n_sample} pairs in {elapsed:.2f} s")
print(f"Average time per pair: {elapsed / n_sample:.4f} s")

Max pairs: 4,831,067,949
Computed 200 pairs in 41.14 s
Average time per pair: 0.2057 s


In [7]:
time_per_pair = elapsed / n_sample

print(f"Estimated time per pair: {time_per_pair:.4f} s")

est_full_all_pairs_sec = time_per_pair * n_pairs_all
est_full_all_pairs_hr = est_full_all_pairs_sec / 3600

print(f"If we did ALL {n_pairs_all:,} pairs:")
print(f"  ~{est_full_all_pairs_sec:,.0f} seconds (~{est_full_all_pairs_hr:.2f} hours)")


Estimated time per pair: 0.2057 s
If we did ALL 4,831,067,949 pairs:
  ~993,843,060 seconds (~276067.52 hours)


In [8]:
#python -m ddh.cli csv   --drugs-csv data/real/drugs.csv   --genes-csv data/real/genes.csv   --diseases-csv data/real/diseases.csv   --drug-targets-csv data/real/drug_targets_filtered.csv   --gene-disease-csv data/real/gene_disease_filtered.csv   --ppi-csv data/real/ppi.csv   --top-k 20  --output-csv outputs/real_results.csv

In [10]:
from ddh.scoring import compute_overlap_table_fast
from ddh.io_handlers import load_csv_data
from ddh.config import CsvFilesConfig
from pathlib import Path

base = Path("../data/real")
cfg = CsvFilesConfig(
    drugs_csv=base / "drugs.csv",
    genes_csv=base / "genes.csv",
    diseases_csv=base / "diseases.csv",
    drug_targets_csv=base / "drug_targets_filtered.csv",
    gene_disease_csv=base / "gene_disease_filtered.csv",
    ppi_csv=base / "ppi.csv",
)

drugs, genes, diseases, dts, gds, ppis = load_csv_data(cfg)
overlap_df = compute_overlap_table_fast(dts, gds)
len(overlap_df)

40175481

In [14]:
import pandas as pd

dt = pd.read_csv("../data/real/drug_targets_filtered.csv")
len(dt)  # total edges after filter
dt["drug_id"].nunique()  # should be 694
dt.groupby("drug_id")["gene_id"].nunique().describe()  # targets per drug


count    244203.000000
mean          1.332134
std           1.104534
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max         198.000000
Name: gene_id, dtype: float64

In [None]:
python scripts/annotate_pairs_with_ppi_from_matrix.py \
  --pairs-csv outputs/real_results_overlap.csv \
  --drug-targets-csv data/real/drug_targets_filtered.csv \
  --gene-disease-csv data/real/gene_disease_filtered.csv \
  --gene-index data/real/gene_index.csv \
  --dist-matrix data/real/gene_distances.uint16.dat \
  --max-pairs 100000000 \
  --chunk-size 10000 \
  --alpha 1.0 \
  --beta 1.0 \
  --output outputs/real_results_with_ppi.csv

In [24]:
IGNORE = {'.git', '__pycache__', '.idea', '.ipynb_checkpoints', 'ddh_env' , '.pytest_cache'}

In [25]:
import os

def print_tree(start_path: str, prefix: str = ""):
    """Recursively print a tree structure starting at start_path."""
    # Get entries in the directory, sorted for consistency
    try:
        entries = [e for e in os.listdir(start_path) if e not in IGNORE]
    except PermissionError:
        print(prefix + "└── [Permission Denied]")
        return

    # Build full paths and separate files/dirs to list dirs first
    entries_full = [os.path.join(start_path, e) for e in entries]
    entries = list(zip(entries, entries_full))
    dirs = [e for e in entries if os.path.isdir(e[1])]
    files = [e for e in entries if not os.path.isdir(e[1])]
    entries_ordered = dirs + files

    for index, (name, full_path) in enumerate(entries_ordered):
        is_last = index == len(entries_ordered) - 1
        connector = "└── " if is_last else "├── "

        print(prefix + connector + name)

        if os.path.isdir(full_path):
            extension = "    " if is_last else "│   "
            print_tree(full_path, prefix + extension)


if __name__ == "__main__":
    root = "."  # change this to any path you want
    print(root)
    print_tree(root)

.
├── src
│   ├── ddh
│   │   ├── data_models.py
│   │   ├── __init__.py
│   │   ├── io_handlers.py
│   │   ├── scoring.py
│   │   ├── graphs.py
│   │   ├── pipeline.py
│   │   ├── cli.py
│   │   └── config.py
│   └── ddh.egg-info
│       ├── requires.txt
│       ├── PKG-INFO
│       ├── dependency_links.txt
│       ├── top_level.txt
│       └── SOURCES.txt
├── data
│   ├── toy
│   │   ├── toy_diseases.csv
│   │   ├── toy_drugs.csv
│   │   ├── toy_genes.csv
│   │   ├── toy_drug_targets.csv
│   │   ├── toy_gene_disease.csv
│   │   └── toy_ppi.csv
│   ├── raw
│   │   ├── association_overall_direct
│   │   │   ├── part-00001-115d4937-47d3-4de3-a70b-eb4fc9f78010-c000.snappy.parquet
│   │   │   ├── part-00004-115d4937-47d3-4de3-a70b-eb4fc9f78010-c000.snappy.parquet
│   │   │   ├── part-00005-115d4937-47d3-4de3-a70b-eb4fc9f78010-c000.snappy.parquet
│   │   │   ├── part-00003-115d4937-47d3-4de3-a70b-eb4fc9f78010-c000.snappy.parquet
│   │   │   ├── part-00002-115d4937-47d3-4de3-a70b-eb4fc9f780