purpose: to download human tissue-specific networks, to see if the tissue specific networks have higher network overlap for those that are relevant to the phenotype than those that are not

Full Network: the network is fully connected

    [entrez gene id 1][entrez gene id 2][posterior prob., with known edges set to 1][posterior prob.]

In [1]:
import pandas as pd
import os
import networkx as nx
import gzip
import ndex2
import sys
import pickle
import gzip
import numpy as np
import scipy.sparse as sp
from collections import defaultdict

from netcoloc import netprop_zscore
from netcoloc import netprop
from netcoloc import network_colocalization


In [2]:
def process_network_in_chunks(file_path, chunk_size=100000):
    node_set = set()
    edges = []
    node_index = {}
    
    print('First pass: determine unique nodes and create an index')
    with gzip.open(file_path, 'rt') as file:
        while True:
            lines = [file.readline() for _ in range(chunk_size)]
            if not lines or lines[0] == '':
                break  # Stop at EOF
            
            for line in lines:
                parts = line.strip().split()
                if len(parts) < 2:
                    continue  # Skip malformed lines
                
                node1, node2 = (parts[0]), (parts[1])
                
                node_set.add(node1)
                node_set.add(node2)
                edges.append((node1, node2))
                edges.append((node2, node1))  # Ensure symmetry
    
    print('Create node index mapping')
    node_list = sorted(node_set)  # Ensure consistent order- MUST REPLACE WHEN FIXED
    node_index = {node: idx for idx, node in enumerate(node_list)}
    
    print('Second pass: construct sparse adjacency matrix')
    row_idx, col_idx = [], []
    degree = defaultdict(int)
    
    with gzip.open(file_path, 'rt') as file:
        while True:
            lines = [file.readline() for _ in range(chunk_size)]
            if not lines or lines[0] == '':
                break  # Stop at EOF
            
            for line in lines:
                parts = line.strip().split()
                if len(parts) < 2:
                    continue  # Skip malformed lines
                
                node1, node2 = (parts[0]), (parts[1])
    
                i, j = node_index[node1], node_index[node2]
                
                # Store indices (all weights are 1)
                row_idx.append(i)
                col_idx.append(j)
                row_idx.append(j)
                col_idx.append(i)  # Ensure symmetric adjacency matrix
                
                # Update node degree
                degree[node1] += 1
                degree[node2] += 1

    n = len(node_list)
    adj_matrix = sp.csr_matrix((np.ones(len(row_idx)), (row_idx, col_idx)), shape=(n, n))
    return adj_matrix, node_index, degree, node_list


In [3]:
def normalize_adjacency_matrix(adj_matrix, node_index, degree, node_list, conserve_heat=True):
    print('normalize adjacency matrix')
    row_idx, col_idx = adj_matrix.nonzero()
    weight_vals = adj_matrix.data.copy()  # Copy original weights (all 1s)
    
    if conserve_heat:
        for k in range(len(weight_vals)):
            i, j = row_idx[k], col_idx[k]
            weight_vals[k] = 1/degree[node_list[i]]  # Normalize by the degree of the destination node
    
    else:
        for k in range(len(weight_vals)):
            i, j = row_idx[k], col_idx[k]
            weight_vals[k] = 1/np.sqrt(degree[node_list[i]] * degree[node_list[j]])
    
    x=sp.csr_matrix((weight_vals, (row_idx, col_idx)), shape=adj_matrix.shape)
    return x

In [20]:
#tissue=sys.argv[1]
#python format_tissue_specific_network.py

In [4]:
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [5]:
tissue='amygdala'

In [6]:
file_path=f'tissue_networks/{tissue}.gz'

In [7]:
outdir='tissue_networks/intermediate/'

In [8]:
os.path.exists(file_path) #check if input data exists

True

In [9]:
adj_matrix, node_index, degree, node_list = process_network_in_chunks(file_path)

First pass: determine unique nodes and create an index
Create node index mapping
Second pass: construct sparse adjacency matrix


In [10]:
wp = normalize_adjacency_matrix(adj_matrix, node_index, degree, node_list, conserve_heat=True)

normalize adjacency matrix


In [12]:
# write the node list and adjacency matrix to file
#outdir='tissue_networks/intermediate/'

# Save adjacency matrix as a sparse file
sp.save_npz(f'{outdir}normalized_adjacency_{tissue}.npz', wp)

# Save the node list in the same order as adjacency matrix
with open(f'{outdir}node_list_{tissue}.txt', 'w') as f:
    for node in node_list:
        f.write(f"{node}\n")

In [None]:
node_list

In [16]:
pd.DataFrame.from_dict(degree,orient='index').to_csv(f'{outdir}degree_{tissue}.csv',header=None)

In [None]:
# calculate w_double_prime and write it to file
wdp = netprop.get_individual_heats_matrix(wp, .5)
np.save(f'{outdir}w_double_prime_{tissue}',wdp)

# check which tissues are missing

# old code

In [41]:
tissue='PCNet2.0'

In [42]:
outdir='tissue_networks/intermediate/'
# Save adjacency matrix as a sparse file
sp.save_npz(f'{outdir}normalized_adjacency_{tissue}.npz', norm_adj_matrix)

# Save the node list in the same order as adjacency matrix
with open(f'{outdir}node_list_{tissue}.txt', 'w') as f:
    for node in node_list:
        f.write(f"{node}\n")

In [43]:
w_double_prime = netprop.get_individual_heats_matrix(norm_adj_matrix, .5)
w_double_prime.tofile(f'{outdir}w_double_prime_{tissue}.csv', sep=',')

In [None]:
adj_matrix

In [12]:
# Normalize the adjacency matrix
norm_adj_matrix = normalize_adjacency_matrix(adj_matrix, node_index, degree, node_list, conserve_heat=True)

sp.save_npz(f'tissue_networks/intermediate/normalized_adjacency_{tissue}.npz', norm_adj_matrix)


In [21]:
w_double_prime.tofile(f'tissue_networks/intermediate/w_double_prime_{tissue}.csv', sep=',')

In [None]:
# File path to your gzipped network
file_path = file

# Normalize the adjacency matrix
norm_adj_matrix = normalize_adjacency_matrix(adj_matrix, node_index, degree, conserve_heat=True)

# Save as sparse matrix
#sp.save_npz(f'normalized_adjacency_{tissue}.npz', norm_adj_matrix)


In [14]:
# Initialize an empty graph
G = nx.Graph()

# Read the file and process each line
i=0
with gzip.open(file,'rt') as f:
    for line in f:    
        i=i+1
        parts = line.split()
        
        if len(parts) == 4:
            node1, node2, _, weight = parts
        elif len(parts) == 3:
            node1, node2, weight = parts
        else:
            print('line skipped:')
            print(parts)
            continue  # Skip malformed lines
        
        # Convert types and add edge
        G.add_edge(int(node1), int(node2), weight=float(weight))
print('import complete')

FileNotFoundError: [Errno 2] No such file or directory: 'tissue_networks/amydala.gz'

In [10]:
print(f'Nedges {len(G.edges())}')
print(f'Nedges-len file={len(G.edges())-i}')

Nedges333452400
Nedges-len file=0


In [11]:
print(f'Nnodes={len(G.nodes())}')

Nnodes=25825


In [15]:
# pre calculate the matricies used for network propagation
print('\ncalculating w_prime')
w_prime = netprop.get_normalized_adjacency_matrix(G, conserve_heat=True)

print('\ncalculating w_double_prime')
w_double_prime = netprop.get_individual_heats_matrix(w_prime, .5)


calculating w_prime

calculating w_double_prime


In [19]:
w_prime.shape

(25825, 25825)

In [None]:
pickle.dump(G, open(f'tissue_networks/{tissue}.pickle', 'wb'))
#G = pickle.load(open('filename.pickle', 'rb'))

In [12]:
G = pickle.load(open(f'tissue_networks/{tissue}.pickle', 'rb'))

In [None]:
#nx.write_gml(G,f'tissue_networks/{tissue}.gml')

In [21]:
sys.getsizeof(G)

48

In [16]:
HGNC=pd.read_csv('HGNC_ncbi_gene_id.txt',sep='\t')
t=HGNC[['NCBI gene ID','Approved symbol']].dropna()
t.index=t['NCBI gene ID'].astype(int)
symbol_dict=t['Approved symbol'].to_dict()

In [18]:
print('genes with mapped symbols')
retain=set(G.nodes()).intersection(t.index)
len(retain)/len(set(G.nodes()))

genes with mapped symbols


0.9167086156824782

In [19]:
print('genes without mapped symbols')
len(set(G.nodes()).difference(t.index))/len(set(G.nodes()))

genes without mapped symbols


0.08329138431752178

In [20]:
Gsub=G.subgraph(retain)

In [22]:
del(G)

In [23]:
g=nx.relabel_nodes(Gsub, symbol_dict, copy=True)
print ('relabelling completed')

relabelling completed


In [35]:
sys.getsizeof(g)

48

In [None]:
g.nodes()

In [1]:
pickle.dump(g, open(f'tissue_networks/{tissue}_relabelled_nodes.pickle', 'wb'))

NameError: name 'pickle' is not defined

In [7]:
graph = pickle.load(open(f'tissue_networks/{tissue}_relabelled_nodes.pickle', 'rb'))

UnpicklingError: invalid load key, '\x00'.

In [16]:
len(g.nodes())

23674

In [36]:
len(set(g.nodes()))

23674

In [None]:
net_cx = ndex2.create_nice_cx_from_networkx(g)
net_cx.set_name(f'human_base_{tissue}_full_network_mapped_to_symbol')
#network_uuid = net_cx.upload_to(ndex_server, user, password)

In [None]:
net_cx = ndex2.create_nice_cx_from_networkx(G)

In [None]:
net_cx.set_name(f'human_base_{tissue}_full_network')

In [None]:
network_uuid = net_cx.upload_to(ndex_server, user, password)

In [None]:
net_cx = ndex2.create_nice_cx_from_networkx(G)