network colocalization for human control traits (positive and negative)

# set-up

In [1]:
import os
import pandas as pd
import ndex2
import networkx as nx
from netcoloc import netprop_zscore
from netcoloc import netprop
from netcoloc import network_colocalization
import sys
import random

In [2]:
import os
os.chdir('/tscc/projects/ps-palmer/brittany/rare_common_alcohol/rare_common_alcohol_comparison/notebooks/')
from rca_functions import *
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/scripts')
from network_functions import *
from plotting_functions import *
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [3]:
random_seed=random.seed(211)

In [4]:
save_fig=True

In [5]:
#create a file called environ_ndex_meta.py where you save variables 'ndex_user' and 'ndex_password'
#otherwise will prompt you to define those within the notebooks
if os.path.isfile('../environ_ndex_meta.py'):
    print ('NDEx credentials imported from meta file')
    sys.path.insert(1, '../')
    from environ_ndex_meta import *
    sys.path.pop(1)
else:
    # Prompt the user for a username
    ndex_user = input("Enter your NDEx username: ")
    # Prompt the user for a password
    ndex_password = input("Enter your NDEx password: ")

NDEx credentials imported from meta file


In [6]:
plt.rcParams.update({'font.size': 16})

# Interactome Set-up

pcnet2- versions 
from wright et al. 2024 preprint:
PCNet 2.0= best-performing ranked composite (top 15 interactomes, 3.85M interactions)
PCNet 2.1= top 8 interactomes, 1.75M interactions
PCNet 2.2= top 10 co-citation-free interactomes, 3.32M interactions 

In [7]:
interactome_name='PCNet2.0'

In [8]:
graph=import_interactome(UUIDs=UUIDs,interactome_name=interactome_name)
all_nodes=list(graph.nodes())

PCNet2.0
Name: PCNet 2.0
Nodes: 19267
Edges: 3852119
Node Attributes: 19267
Edge Attributes: 19260595

number of nodes:
19267

number of edges:
3852119


In [9]:
# pre calculate the matricies used for network propagation
print('\ncalculating w_prime')
w_prime = netprop.get_normalized_adjacency_matrix(graph, conserve_heat=True)

print('\ncalculating w_double_prime')
w_double_prime = netprop.get_individual_heats_matrix(w_prime, .5)


calculating w_prime

calculating w_double_prime


# import NPS scores and seed genes

In [10]:
seed_dict=import_seed_dict(mag_dir,file_dict,bonf_dict,gene_col_dict,all_nodes)
seed_dict.keys()

dict_keys(['loco_bonf', 'loco_top500', 'loco_FDR', 'loco_gsem_bonf', 'loco_gsem_top500', 'loco_gsem_FDR', 'ext_bonf', 'ext_top500', 'ext_FDR', 'ext_st22', 'loco_mega_fus_naac_bonf', 'loco_mega_fus_naac_top500', 'loco_mega_fus_naac_FDR', 'ext_fus_naac_bonf', 'ext_fus_naac_top500', 'ext_fus_naac_FDR', 'loco_final_cf_bonf', 'loco_final_cf_top500', 'loco_final_cf_FDR', 'loco_final_mega_bonf', 'loco_final_mega_top500', 'loco_final_mega_FDR'])

In [14]:
#dictionary of human control traits
ctrl_dict={}
ctrl_traits=['facial_hair', 'age_smkinit', 'antisoc', 'friend_sat', 'hr', 'infant_bw', 'LDL', 'maternal_smok', 'townsend', 'age_menarche', 'neurot','addict-rf']
for t in ctrl_traits:
    ctrl_dict[t]=pd.read_csv('gwas_ctrl_hm/magma/seed_genes/'+t+'_annot.tsv',sep='\t')
for t in ctrl_traits:
    seed_dict[t+'_FDR']=(set(ctrl_dict[t][ctrl_dict[t]['Q']<0.05]['GENE']))
    seed_dict[t+'_bonf']=(set(ctrl_dict[t][ctrl_dict[t]['P']<0.05/len(ctrl_dict[t])]['GENE']))
    seed_dict[t+'_top500']=set(ctrl_dict[t][(ctrl_dict[t]['GENE'].isin(all_nodes))].nsmallest(500,'P')['GENE'])

In [15]:
NPS_dict,NPS_dict_series=import_NPS_scores(seed_dict,UUIDs)
NPS_dict.keys()

dict_keys(['loco_bonf_PCNet2.0', 'loco_bonf_PCNet2.1', 'loco_bonf_PCNet2.2', 'loco_FDR_PCNet2.0', 'loco_FDR_PCNet2.1', 'loco_FDR_PCNet2.2', 'loco_gsem_bonf_PCNet2.0', 'loco_gsem_bonf_PCNet2.1', 'loco_gsem_bonf_PCNet2.2', 'loco_gsem_FDR_PCNet2.0', 'loco_gsem_FDR_PCNet2.1', 'loco_gsem_FDR_PCNet2.2', 'ext_bonf_PCNet2.0', 'ext_bonf_PCNet2.1', 'ext_bonf_PCNet2.2', 'ext_top500_PCNet2.0', 'ext_top500_PCNet2.1', 'ext_top500_PCNet2.2', 'ext_FDR_PCNet2.0', 'ext_FDR_PCNet2.1', 'ext_FDR_PCNet2.2', 'ext_st22_PCNet2.0', 'ext_st22_PCNet2.1', 'ext_st22_PCNet2.2', 'loco_mega_fus_naac_bonf_PCNet2.0', 'loco_mega_fus_naac_bonf_PCNet2.1', 'loco_mega_fus_naac_bonf_PCNet2.2', 'loco_mega_fus_naac_FDR_PCNet2.0', 'loco_mega_fus_naac_FDR_PCNet2.1', 'loco_mega_fus_naac_FDR_PCNet2.2', 'ext_fus_naac_bonf_PCNet2.0', 'ext_fus_naac_bonf_PCNet2.1', 'ext_fus_naac_bonf_PCNet2.2', 'ext_fus_naac_top500_PCNet2.0', 'ext_fus_naac_top500_PCNet2.1', 'ext_fus_naac_top500_PCNet2.2', 'ext_fus_naac_FDR_PCNet2.0', 'ext_fus_naac_FDR_

# choose datasets for analysis

In [20]:
seed_dict.keys()

dict_keys(['loco_bonf', 'loco_top500', 'loco_FDR', 'loco_gsem_bonf', 'loco_gsem_top500', 'loco_gsem_FDR', 'ext_bonf', 'ext_top500', 'ext_FDR', 'ext_st22', 'loco_mega_fus_naac_bonf', 'loco_mega_fus_naac_top500', 'loco_mega_fus_naac_FDR', 'ext_fus_naac_bonf', 'ext_fus_naac_top500', 'ext_fus_naac_FDR', 'loco_final_cf_bonf', 'loco_final_cf_top500', 'loco_final_cf_FDR', 'loco_final_mega_bonf', 'loco_final_mega_top500', 'loco_final_mega_FDR', 'facial_hair_FDR', 'facial_hair_bonf', 'facial_hair_top500', 'age_smkinit_FDR', 'age_smkinit_bonf', 'age_smkinit_top500', 'antisoc_FDR', 'antisoc_bonf', 'antisoc_top500', 'friend_sat_FDR', 'friend_sat_bonf', 'friend_sat_top500', 'hr_FDR', 'hr_bonf', 'hr_top500', 'infant_bw_FDR', 'infant_bw_bonf', 'infant_bw_top500', 'LDL_FDR', 'LDL_bonf', 'LDL_top500', 'maternal_smok_FDR', 'maternal_smok_bonf', 'maternal_smok_top500', 'townsend_FDR', 'townsend_bonf', 'townsend_top500', 'age_menarche_FDR', 'age_menarche_bonf', 'age_menarche_top500', 'neurot_FDR', 'neurot

In [22]:
trait_h!=None

False

In [44]:
def return_analysis_datasets(trait_r,cut_r,trait_h,cut_h,seed_dict,NPS_dict,interactome_name):
    #labels
	if not (trait_h==None):
	    if cut_h==None:
	        label_h=trait_h
	    else:
	        label_h=trait_h+'_'+cut_h
	    seed_h=seed_dict[label_h]
	    NPS_h=NPS_dict[label_h+'_'+interactome_name]
	else:
		trait_h=None
		label_h=None
		cut_h=None
		seed_h=None
		NPS_h=None

	if not (trait_r==None):
	    if cut_r==None:
	        label_r=trait_r
	    else:
	        label_r=trait_r+'_'+cut_r
	    seed_r=seed_dict[label_r]
	    NPS_r=NPS_dict[label_r+'_'+interactome_name]

	else:
		trait_r=None
		label_r=None
		cut_r=None
		seed_r=None
		NPS_r=None
    #seed genes
    #NPS scores
	if ((trait_h!=None) and (trait_r!=None)):
	    NPS = NPS_h.join(NPS_r, lsuffix="h", rsuffix="r")
	    NPS = NPS.assign(zhr=NPS.zh * NPS.zr)
	else:
	    NPS=None
	return label_h,label_r,seed_h,seed_r,NPS_h,NPS_r,NPS

In [51]:
#modify for correct genesets
trait_r='loco_final_cf'
cut_r='FDR'
trait_h=None
cut_h=None

_,label_r,_,seed_r,_,NPS_r,_=return_analysis_datasets(trait_r,cut_r,trait_h,cut_h,seed_dict,NPS_dict,interactome_name)

In [52]:
#set the Z-score cutoffs to calculate network enrichment significance
zlist = [1,2,3,4,5,6,7,8,9,10]
z12list = [1,1.5,2,3,4,5]
cutoff_comb=3
cutoff_single=1.5

In [None]:
for t in ctrl_traits:
    trait_h=str(t)
    if len(seed_dict[t+'_bonf'])<=500:
        print('more than 500 genes in most stringent cutoff- testing top500')
        test500=True
    else:
        test500=False
    for u in ['FDR','bonf']:
        trait_h=str(t)
        cut_h=u
        label_h=trait_h+'_'+cut_h
        seed_h=seed_dict[label_h]
        NPS_h=NPS_dict[label_h+'_'+interactome_name]
        netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(NPS_r,NPS_h,
                                                                                     zthresh_list = zlist,
                                                                                     z12thresh_list=z12list,
                                                                                     verbose=False)
        #netcoloc_enrichment_df=netcoloc_enrichment_df[netcoloc_enrichment_df['z_comb']>=netcoloc_enrichment_df['NPS_single']]
        print(netcoloc_enrichment_df)
        netcoloc_enrichment_df['rat_dataset']=label_r
        netcoloc_enrichment_df['human_dataset']=label_h
        if save_fig:
            netcoloc_enrichment_df.to_csv('colocalization_scores/colocScore_'+label_r+'_'+label_h+'_'+interactome_name+'.tsv',sep='\t',index=False)
        pOverlap=float(netcoloc_enrichment_df[(netcoloc_enrichment_df['z_comb']==cutoff_comb) & (netcoloc_enrichment_df['z_12']==cutoff_single)]['empirical_p'])
    if (test500==True):
        trait_h=str(t)
        cut_h='top500'
        label_h=trait_h+'_'+cut_h
        seed_h=seed_dict[label_h]
        NPS_h=NPS_dict[label_h+'_'+interactome_name]
        netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(NPS_r,NPS_h,
                                                                                     zthresh_list = zlist,
                                                                                     z12thresh_list=z12list,
                                                                                     verbose=False)
        #netcoloc_enrichment_df=netcoloc_enrichment_df[netcoloc_enrichment_df['z_comb']>=netcoloc_enrichment_df['NPS_single']]
        print(netcoloc_enrichment_df)
        netcoloc_enrichment_df['rat_dataset']=label_r
        netcoloc_enrichment_df['human_dataset']=label_h
        if save_fig:
            netcoloc_enrichment_df.to_csv('colocalization_scores/colocScore_'+label_r+'_'+label_h+'_'+interactome_name+'.tsv',sep='\t',index=False)
        pOverlap=float(netcoloc_enrichment_df[(netcoloc_enrichment_df['z_comb']==cutoff_comb) & (netcoloc_enrichment_df['z_12']==cutoff_single)]['empirical_p'])

more than 500 genes in most stringent cutoff- testing top500
    z_comb  z_12  observed_overlap  expected_overlap_mean  \
0        1   1.0               752                 699.57   
1        1   1.5               247                 243.24   
2        1   2.0                86                  87.45   
3        1   3.0                21                  22.79   
4        1   4.0                11                  14.68   
5        1   5.0                10                  12.27   
6        2   1.0               555                 537.54   
7        2   1.5               247                 240.42   
8        2   2.0                86                  87.17   
9        2   3.0                21                  22.06   
10       2   4.0                11                  13.96   
11       2   5.0                10                  12.24   
12       3   1.0               352                 367.51   
13       3   1.5               215                 220.30   
14       3   2.0        

In [None]:
print('analysis complete')