In [5]:
import requests

# Assuming 'my_genes.txt' contains your list of genes
with open('genes.txt', 'r') as f:
    genes = [line.strip() for line in f.readlines()]

genes_string = '\r'.join(genes)
value = requests.get(f'https://string-db.org/api/tsv/network?identifiers={genes_string}')

'<html>\r\n<head><title>414 Request-URI Too Large</title></head>\r\n<body>\r\n<center><h1>414 Request-URI Too Large</h1></center>\r\n<hr><center>cloudflare</center>\r\n</body>\r\n</html>\r\n'

In [14]:
from multiprocessing import Pool
from tqdm import tqdm
from io import StringIO
import pandas as pd
import concurrent.futures

def get_gene_network(gene):
    value = requests.get(f'https://string-db.org/api/tsv/network?identifiers=' + gene)
    df = pd.read_csv(StringIO(value.text), delimiter='\t')
    return gene, df

# Create a ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Use a list comprehension to collect the results
    results = list(tqdm(executor.map(get_gene_network, genes), total=len(genes)))

# Convert the results into a dictionary
gene_to_df = {gene: df for gene, df in results}

100%|██████████| 18211/18211 [34:29<00:00,  8.80it/s] 


In [16]:
import pickle

with open('gene_to_df.pkl', 'wb') as f:
    pickle.dump(gene_to_df, f)

In [20]:
gene_set = set(genes)

In [22]:
gene_to_id = {}
for i, gene in enumerate(genes):
    gene_to_id[gene] = i

In [49]:
edge_from_list_a = []
edge_from_list_b = []
for gene_a in tqdm(genes):
    df = gene_to_df[gene_a]
    if 'Error' in df:
        continue
    for row in df.itertuples():
        row = row._asdict()
        gene_b = row['preferredName_B']
        if gene_b not in gene_set:
            continue
        score = row['score']
        if score >= 0.1:
            # Edge from gene a to gene b
            a = gene_to_id[gene_a]
            b = gene_to_id[gene_b]
            edge_from_list_a.append(a)
            edge_from_list_b.append(b)

100%|██████████| 18211/18211 [00:06<00:00, 2749.61it/s]


In [50]:
len(edge_from_list_a), len(edge_from_list_b)

(784134, 784134)

In [51]:
import torch

edge_index = torch.tensor([edge_from_list_a,
                           edge_from_list_b], dtype=torch.long)

torch.save(edge_index, 'correlation_edge_index.pt')

In [53]:
from torch_geometric.utils import degree
node_degrees = degree(edge_index[0], num_nodes=18212)
# Average node degree
average_degree = node_degrees.mean().item()
print("Average node degree:", average_degree)

# Amount of nodes with neighbors
# (nodes with degree > 0)
nodes_with_neighbors = (node_degrees > 0).sum().item()
print("Amount of nodes with neighbors:", nodes_with_neighbors)

# Highest degree node
highest_degree_node = node_degrees.argmax().item()
highest_degree = node_degrees[highest_degree_node].item()
print("Highest degree node:", highest_degree_node, "with degree:", highest_degree)

genes[6]

Average node degree: 43.0558967590332
Amount of nodes with neighbors: 13001
Highest degree node: 6 with degree: 110.0


'AAAS'

In [54]:
len(edge_index[0])

784134