Purpose: run network propagation for a given dataset; here it is used for the locomotor activity and externalizing traits.

In [1]:
import os
import pandas as pd
import ndex2
import networkx as nx
from netcoloc import netprop_zscore
from netcoloc import netprop
from netcoloc import network_colocalization
import sys
import random
#os.chdir('/tscc/projects/ps-palmer/brittany/rare_common_alcohol/rare_common_alcohol_comparison/notebooks/')
#from rca_functions import *
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/scripts')
from network_functions import *
from network_validation_functions import *
from plotting_functions import *
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [2]:
# random.seed() returns None, so keep the integer seed in its own variable:
# later cells pass random_seed on to netprop_zscore.calculate_heat_zscores.
random_seed = 211
random.seed(random_seed)

In [3]:
# When True, the propagation cells write their z-scores under network_scores/.
save_file = True

# functions

In [4]:
# adapted from the rat BMI notebooks (not part of the netcoloc package)
def calculate_heat_zscores_with_sampling(data, nodes, individual_heats, G_PC, trait="BMI", max_genes=500, num_samples=100,
                                        nominal_sig=0.05, num_reps=1000, out_path="", minimum_bin_size=10,outfile='sample.tsv'):
    """Sample seed genes repeatedly and compute propagation z-scores for each sample.

    The summary statistics are restricted to genes present in ``nodes`` and to genes
    passing a Bonferroni-corrected p-value threshold. For every sampling round,
    ``max_genes`` seeds are drawn with ``random.choices`` (i.e. with replacement, so a
    draw may contain repeated genes), weighted by -log10(p). Each draw is passed to
    ``netprop_zscore.calculate_heat_zscores`` and the first element of its result is
    kept as one column (``z<i>``) of the output.

    After every round the results accumulated so far are written to ``outfile`` so a
    partially completed run can still be inspected.

    Args:
        data (pd.DataFrame): Gene level summary statistics; the columns ``gene_symbol``
            and ``pvalue`` are read.
        nodes (list): list of nodes in the interaction network, used to filter ``data``
        individual_heats (np.array): Heat matrix calculated by `netprop_zscore.get_individual_heats_matrix()`
        G_PC (nx.Graph): molecular interaction network; its nodes and degrees are passed to the z-score calculation
        trait (str, optional): name of trait being investigated. Currently unused (kept for
            backward compatibility). Defaults to "BMI".
        max_genes (int, optional): Number of seed genes drawn in each sample (maximum=500). Defaults to 500.
        num_samples (int, optional): Number of times to perform sampling. Defaults to 100.
        nominal_sig (float, optional): Significance cutoff for keeping genes in data (Note: this value is divided
            by the number of genes remaining after the network filter). Defaults to 0.05.
        num_reps (int, optional): Number of repetitions of randomization for generating null distribution for z_scores. Defaults to 1000.
        out_path (str, optional): Currently unused (kept for backward compatibility). Defaults to "".
        minimum_bin_size (int, optional): minimum number of genes that should be in each degree matching bin. Defaults to 10.
        outfile (str, optional): Path of the tab-separated file that receives the running results.
            Defaults to 'sample.tsv'.

    Returns:
        pd.DataFrame: Gene x sampling run dataframe of sampled z-scores
    """
    assert max_genes <= 500, "NetColoc is only valid for sets of 500 or less genes so maximum number of genes for sampling must be <= 500"
    data = data.loc[data.gene_symbol.isin(nodes)]  # subset to genes in interaction network
    all_seeds = data.loc[data.pvalue <= nominal_sig / len(data)]  # Bonferroni correction
    all_seeds = all_seeds.assign(log10p=-1 * np.log10(all_seeds.pvalue))  # get -log10p for weighted sampling
    # The network description is identical for every sample, so build it once.
    network_nodes = list(G_PC.nodes)
    network_degrees = dict(G_PC.degree)
    sampling_results = []
    for i in tqdm(range(num_samples)):
        # perform propagation for sample; the loop index doubles as the per-sample random seed
        sample_seeds = random.choices(population=all_seeds.gene_symbol.values, weights=all_seeds.log10p.values, k=max_genes)
        sample_results = netprop_zscore.calculate_heat_zscores(individual_heats, nodes=network_nodes, degrees=network_degrees,
                                                seed_genes=sample_seeds, num_reps=num_reps,
                                                minimum_bin_size=minimum_bin_size, random_seed=i)[0]
        sample_z = pd.DataFrame(sample_results, columns=["z" + str(i)])
        sampling_results.append(sample_z)
        # Save running results straight from memory instead of re-reading and
        # re-joining the file written by the previous iteration.
        pd.concat(sampling_results, axis=1).to_csv(outfile, sep="\t")
    return pd.concat(sampling_results, axis=1)

# Interactome Set-up

PCNet2 versions
(from the Wright et al. 2024 preprint):
PCNet 2.0 = best-performing ranked composite (top 15 interactomes, 3.85M interactions)
PCNet 2.1 = top 8 interactomes, 1.75M interactions
PCNet 2.2 = top 10 co-citation-free interactomes, 3.32M interactions

In [5]:
# False: import the named interactome from NDEx; True: load a tissue-specific pickled network.
tissue_net = False

In [6]:
# Show the interactome names available in UUIDs.
# NOTE(review): UUIDs is not defined in the visible cells; presumably it comes from one of the star imports — confirm.
UUIDs.keys()

dict_keys(['PCNet2.0', 'PCNet2.1', 'PCNet2.2', 'signor_rat', 'signor_human', 'signor_mouse'])

In [7]:
# Load the interaction network: either the named interactome from NDEx or a
# tissue-specific network stored as a pickle under tissue_networks/.
interactome_name='PCNet2.0'
tissue='amygdala'
if not tissue_net:
    print(f'importing {interactome_name} from ndex')
    graph=import_interactome(UUIDs=UUIDs,interactome_name=interactome_name)
else:
    import pickle  # imported here because the notebook never imports it explicitly

    print(f'importing tissue specific network of {tissue}')
    # NOTE: unpickling can execute arbitrary code; only load network files from a trusted source.
    # The context manager closes the file handle once the network is loaded.
    with open(f'tissue_networks/{tissue}.pickle', 'rb') as pickle_file:
        graph = pickle.load(pickle_file)
    # the interactome name is used in output file names, so tag tissue networks distinctly
    interactome_name=f'hb_{tissue}'

importing PCNet2.0 from ndex
PCNet2.0
Name: PCNet 2.0
Nodes: 19267
Edges: 3852119
Node Attributes: 19267
Edge Attributes: 19260595

number of nodes:
19267

number of edges:
3852119


In [11]:
# Materialise the network's edges and nodes as plain lists; all_nodes is used by
# later cells to restrict seed genes to genes present in the network.
edges=list(graph.edges())
all_nodes=list(graph.nodes())

In [25]:
# Pre-calculate the matrices used for network propagation; w_double_prime is the
# heat matrix passed to netprop_zscore.calculate_heat_zscores in the cells below.
print('\ncalculating w_prime')
w_prime = netprop.get_normalized_adjacency_matrix(graph, conserve_heat=True)
print('\ncalculating w_double_prime')
# NOTE(review): the second argument (.5) is presumably the propagation constant alpha — confirm against netcoloc.
w_double_prime = netprop.get_individual_heats_matrix(w_prime, .5)


calculating w_prime

calculating w_double_prime


# calculate gwas NPS

In [28]:
# Build the dictionary of seed-gene sets with the project helper import_seed_dict,
# then show which seed sets were created.
# NOTE(review): mag_dir and file_dict are only assigned in a later ("old version") cell, and
# bonf_dict / gene_col_dict are not defined in the visible cells — presumably they come from
# the star imports; confirm before running top to bottom.
seed_dict=import_seed_dict(mag_dir,file_dict,bonf_dict,gene_col_dict,all_nodes)
seed_dict.keys()

NameError: name 'all_nodes' is not defined

In [4]:
#dictionary of human control traits
ctrl_dict={}
ctrl_traits=['facial_hair', 'age_smkinit', 'antisoc', 'friend_sat', 'hr', 'infant_bw', 'LDL', 'maternal_smok', 'townsend', 'age_menarche', 'neurot','addict-rf']
for t in ctrl_traits:
    ctrl_dict[t]=pd.read_csv('gwas_ctrl_hm/magma/seed_genes/'+t+'_annot.tsv',sep='\t')
for t in ctrl_traits:
    seed_dict[t+'_FDR']=(set(ctrl_dict[t][ctrl_dict[t]['Q']<0.05]['GENE']))
    seed_dict[t+'_bonf']=(set(ctrl_dict[t][ctrl_dict[t]['P']<0.05/len(ctrl_dict[t])]['GENE']))
    seed_dict[t+'_top500']=set(ctrl_dict[t][(ctrl_dict[t]['GENE'].isin(all_nodes))].nsmallest(500,'P')['GENE'])

NameError: name 'seed_dict' is not defined

In [None]:
# Load network propagation scores for the seed sets via the project helper import_NPS_scores.
# NOTE(review): the helper is not defined in the visible cells; the exact contents of the two
# returned objects should be confirmed against its definition.
NPS_dict,NPS_dict_series=import_NPS_scores(seed_dict,UUIDs)

In [None]:
# old version

In [None]:
# Seed-gene tables for the actual traits of interest (used for GWAS).
mag_dir = 'magma/seed_genes/'
file_dict = dict(
    # tables stored under mag_dir
    loco=mag_dir + 'loco_win10_annot.tsv',
    loco_gsem=mag_dir + 'loco_gsem_annot.tsv',
    ext_munged=mag_dir + 'ext_munged_annot.tsv',
    ext=mag_dir + 'ext_orig_annot.tsv',
    ext_st22=mag_dir + 'all_tests_ext1_st22_genes.csv',
    # FUSION seed tables stored in their own directories
    loco_mega_fus_naac='loco_twas_dan/loco_fusion_NACC_seed.tsv',
    ext_fus_naac='ext_FUSION/ext_fusion_NACC_seed.tsv',
)

In [None]:
#dictionary of human control traits
ctrl_dict={}
ctrl_traits=['facial_hair', 'age_smkinit', 'antisoc', 'friend_sat', 'hr', 'infant_bw', 'LDL', 'maternal_smok', 'townsend', 'age_menarche', 'neurot','addict-rf']
for t in ctrl_traits:
    ctrl_dict[t]=pd.read_csv('gwas_ctrl_hm/magma/seed_genes/'+t+'_annot.tsv',sep='\t')

In [None]:
# Start from an empty mapping of seed-set name -> set of genes; the cells below fill it.
seed_dict = dict()

In [None]:
#locomotor mega analysis
loco=pd.read_csv(file_dict['loco'],sep='\t')
loco_bonf_cut=2.6389402016150313e-06 #calculated in define_seed_genes_orthologs- from raw magma results before adding in ortholog info
seed_dict['loco_FDR']=(set(loco[loco['Q']<0.05]['HM_ORTHO']))
seed_dict['loco_bonf']=(set(loco[loco['P']<loco_bonf_cut]['HM_ORTHO']))

In [None]:
#locomotor- gsem common factor
loco_gsem=pd.read_csv(file_dict['loco_gsem'],sep='\t')
loco_gsem_bonf_cut=2.650129856362962e-06 #calculated in define_seed_genes_orthologs- from raw magma results before adding in ortholog info
seed_dict['loco_gsem_FDR']=(set(loco_gsem[loco_gsem['Q']<0.05]['HM_ORTHO']))
seed_dict['loco_gsem_bonf']=(set(loco_gsem[loco_gsem['P']<loco_gsem_bonf_cut]['HM_ORTHO']))

In [None]:
#locomotor- mega FUSION
loco_fus=pd.read_csv(file_dict['loco_mega_fus_naac'],sep='\t')
loco_fus_bonf_cut=9.338812103100487e-06 #calculated in define_seed_genes_orthologs- from raw magma results before adding in ortholog info
seed_dict['loco_mega_fus_naac_FDR']=(set(loco_fus[loco_fus['Q']<0.05]['human_ortholog']))
seed_dict['loco_mega_fus_naac_bonf']=(set(loco_fus[loco_fus['TWAS.P']<loco_fus_bonf_cut]['human_ortholog']))

In [None]:
#externalizing- FUSION
ext=pd.read_csv(file_dict['ext_fus_naac'],sep='\t')
ext_bonf_cut=0.05/len(ext) # no ortholog adding, so can calculate from this table
seed_dict['ext_fus_naac_FDR']=(set(ext[ext['Q']<0.05]['ID']))
seed_dict['ext_fus_naac_bonf']=(set(ext[ext['TWAS.P']<ext_bonf_cut]['ID']))
seed_dict['ext_fus_naac_top500']=set(ext[(ext['ID'].isin(all_nodes))].nsmallest(500,'TWAS.P')['ID'])

In [None]:
# Externalizing gene table (file_dict['ext']): seed sets are taken from the GENE column.
# Note that this rebinds the name `ext`.
ext = pd.read_csv(file_dict['ext'], sep='\t')
# No orthologs are added to this table, so the Bonferroni cutoff can be computed from it directly.
ext_bonf_cut = 0.05 / len(ext)
seed_dict['ext_FDR'] = set(ext.loc[ext['Q'] < 0.05, 'GENE'])
seed_dict['ext_bonf'] = set(ext.loc[ext['P'] < ext_bonf_cut, 'GENE'])
# restrict to network genes first, then keep the 500 smallest P-values
seed_dict['ext_top500'] = set(ext.loc[ext['GENE'].isin(all_nodes)].nsmallest(500, 'P')['GENE'])
# The genes from ext1.0 supplemental table 22.
# NOTE(review): the file has a .csv extension but is parsed as tab-separated — confirm the delimiter.
ext_st22 = pd.read_csv(file_dict['ext_st22'], sep='\t')
seed_dict['ext_st22'] = set(ext_st22['GENE NAME'])

In [None]:
# Add three seed sets per control trait: FDR-significant genes, Bonferroni-significant
# genes, and the 500 network genes with the smallest P.
for t in ctrl_traits:
    seed_dict[t + '_FDR'] = set(ctrl_dict[t].loc[ctrl_dict[t]['Q'] < 0.05, 'GENE'])
    # Bonferroni cutoff: 0.05 divided by the number of rows in the trait's table
    seed_dict[t + '_bonf'] = set(ctrl_dict[t].loc[ctrl_dict[t]['P'] < 0.05 / len(ctrl_dict[t]), 'GENE'])
    seed_dict[t + '_top500'] = set(ctrl_dict[t].loc[ctrl_dict[t]['GENE'].isin(all_nodes)].nsmallest(500, 'P')['GENE'])

In [None]:
# Largest P value among the rows of ext whose P is below 1.556900e-08.
# This reads the P column, so ext must be a table that has one.
max(ext[ext.P<1.556900e-08].P)

In [None]:
# Show the names of all seed sets defined so far.
seed_dict.keys()

In [None]:
# When False, existing z-score files are left untouched by the saving cells.
overwrite = False

In [None]:
# Show the names of all seed sets defined so far.
seed_dict.keys()

# run straight propagation- no sampling

In [None]:
# When False, existing z-score files are left untouched by the saving cells.
overwrite = False

In [None]:
# Run network propagation for every seed set in `ls` and, when save_file is set,
# write the resulting z-scores to network_scores/.
# `ls` currently holds all seed sets; to run only the addiction traits use instead:
#ls=[i for i in seed_dict.keys() if 'addict' in i]
ls=seed_dict.keys()
# The node list and degree map are the same for every seed set, so build them once.
network_nodes = list(graph.nodes())
node_degrees = dict(graph.degree)
for k in ls:
    # keep only the seed genes that are present in the network
    seed_genes = list(seed_dict[k].intersection(network_nodes))

    NPSc, Fnew_score, Fnew_rand_score = netprop_zscore.calculate_heat_zscores(
        w_double_prime,
        network_nodes,
        node_degrees,
        seed_genes, num_reps=1000,
        minimum_bin_size=100,
        random_seed=random_seed)
    print(NPSc.head())
    if save_file:
        file_path='network_scores/'+k+'_'+interactome_name+'_zscore.tsv'
        print(file_path)
        # never replace an existing result unless overwrite was explicitly enabled
        if os.path.exists(file_path) and not overwrite:
            print('File already exists. If you would like to overwrite this file, set overwrite=True, and rerun')
        else:
            NPSc.to_csv(file_path,sep='\t',header=False)

In [None]:
# Run network propagation for every seed set in seed_dict and, when save_file is
# set, write the resulting z-scores to network_scores/.
# The node list and degree map are the same for every seed set, so build them once.
network_nodes = list(graph.nodes())
node_degrees = dict(graph.degree)
for k in seed_dict.keys():
    # keep only the seed genes that are present in the network
    seed_genes = list(seed_dict[k].intersection(network_nodes))

    NPSc, Fnew_score, Fnew_rand_score = netprop_zscore.calculate_heat_zscores(
        w_double_prime,
        network_nodes,
        node_degrees,
        seed_genes, num_reps=1000,
        minimum_bin_size=100,
        random_seed=random_seed)
    print(NPSc.head())
    if save_file:
        file_path='network_scores/'+k+'_'+interactome_name+'_zscore.tsv'
        print(file_path)
        # never replace an existing result unless overwrite was explicitly enabled
        if os.path.exists(file_path) and not overwrite:
            print('File already exists. If you would like to overwrite this file, set overwrite=True, and rerun')
        else:
            NPSc.to_csv(file_path,sep='\t',header=False)

# run for an individual seed gene set

In [None]:
# Show the names of all seed sets that can be chosen for the single run below.
seed_dict.keys()

In [None]:
# Show the control traits for which a gene table was loaded.
ctrl_dict.keys()

In [None]:
# When False, an existing z-score file is left untouched by the saving cell below.
overwrite = False

In [None]:
# Run network propagation for one chosen seed set and, when save_file is set,
# write the resulting z-scores to network_scores/.
k='loco_gsem_FDR'
print(len(seed_dict[k]))  # size of the seed set before restricting it to the network
# keep only the seed genes that are present in the network
seed_genes = list(seed_dict[k].intersection(graph.nodes()))

NPSc, Fnew_score, Fnew_rand_score = netprop_zscore.calculate_heat_zscores(
    w_double_prime,
    list(graph.nodes()),
    dict(graph.degree),
    seed_genes, num_reps=1000,
    minimum_bin_size=100,
    random_seed=random_seed)
print(NPSc.head())
if save_file:
    file_path='network_scores/'+k+'_'+interactome_name+'_zscore.tsv'
    print(file_path)
    # never replace an existing result unless overwrite was explicitly enabled
    if os.path.exists(file_path) and not overwrite:
        print('File already exists. If you would like to overwrite this file, set overwrite=True, and rerun')
    else:
        NPSc.to_csv(file_path,sep='\t',header=False)