network colocalization for human control traits (positive and negative)

# set-up

In [1]:
import os
import pandas as pd
import ndex2
import networkx as nx
from netcoloc import netprop_zscore
from netcoloc import netprop
from netcoloc import network_colocalization
import sys
import random

In [2]:
# Load the project's helper modules. Each os.chdir() moves the working directory
# to the folder that contains the module(s) imported right after it; the final
# os.chdir() leaves the notebook in the project root, which is where the relative
# paths used further down (e.g. 'colocalization_scores/...') are resolved from.
# NOTE(review): the star imports below are the only visible source of names such
# as `plt`, `UUIDs`, `mag_dir`, `file_dict`, `import_interactome` and
# `import_NPS_scores` used later in this notebook — confirm which module supplies each.
import os
os.chdir('/tscc/projects/ps-palmer/brittany/rare_common_alcohol/rare_common_alcohol_comparison/notebooks/')
from rca_functions import *
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/scripts')
from network_functions import *
from plotting_functions import *
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [3]:
# Seed Python's global random number generator so stochastic steps are repeatable.
# random.seed() returns None, so the seed value is stored first and then applied;
# this keeps `random_seed` usable as the actual seed (211) rather than None.
random_seed=211
random.seed(random_seed)

In [4]:
# Output switch: besides its name, this flag also gates writing the
# colocalization score tables (.tsv) in the analysis loops further down.
save_fig=True

In [5]:
#create a file called environ_ndex_meta.py where you save variables 'ndex_user' and 'ndex_password'
#otherwise will prompt you to define those within the notebooks
if os.path.isfile('../environ_ndex_meta.py'):
    print ('NDEx credentials imported from meta file')
    # Temporarily expose the parent directory so the meta file can be imported,
    # and always remove the entry again, even when the import fails.
    sys.path.insert(1, '../')
    try:
        from environ_ndex_meta import *
    finally:
        sys.path.pop(1)
else:
    # Imported here because it is only needed on the interactive path.
    from getpass import getpass
    # Prompt the user for a username
    ndex_user = input("Enter your NDEx username: ")
    # Prompt the user for a password without echoing it into the notebook output
    ndex_password = getpass("Enter your NDEx password: ")

NDEx credentials imported from meta file


In [6]:
# Set the default font size for every matplotlib figure produced by this notebook.
# NOTE(review): `plt` is not imported explicitly above; presumably it is provided
# by one of the star imports — TODO confirm.
plt.rcParams.update({'font.size': 16})

# Interactome Set-up

In [7]:
# tissue_network selects which branch of the interactome set-up runs below:
# False -> the PCNet2.0 interactome is imported, True -> a node list is read from
# 'tissue_networks/intermediate/'. `tissue` is interpolated into that file name
# and into the interactome name used for the tissue branch.
tissue_network=False
tissue='global'

PCNet2 versions
(from the Wright et al. 2024 preprint):
PCNet 2.0= best-performing ranked composite (top 15 interactomes, 3.85M interactions)
PCNet 2.1= top 8 interactomes, 1.75M interactions
PCNet 2.2= top 10 co-citation-free interactomes, 3.32M interactions 

In [8]:
# Build `all_nodes` (and, for PCNet, the propagation matrices) for the chosen network.
if not tissue_network:
    interactome_name='PCNet2.0'
    interactome=import_interactome(UUIDs=UUIDs,interactome_name=interactome_name)
    all_nodes=list(interactome.nodes())
    # pre-calculate the matrices used for network propagation
    print('\ncalculating w_prime')
    w_prime = netprop.get_normalized_adjacency_matrix(interactome, conserve_heat=True)

    print('\ncalculating w_double_prime')
    # NOTE(review): the second positional argument (.5) is presumably netcoloc's
    # propagation `alpha` — confirm against the installed netcoloc version.
    w_double_prime = netprop.get_individual_heats_matrix(w_prime, .5)
else:
    netdir='tissue_networks/intermediate/'
    interactome_name=f'hb_tissue_{tissue}_top'
    #import node list
    with open(f'{netdir}node_list_{tissue}_top.txt', 'r') as file:
        lines = file.readlines()
    # Remove newline characters from each line and skip blank lines so that an
    # empty string never ends up in the node list.
    all_nodes=[line.strip() for line in lines if line.strip()]

PCNet2.0
Name: PCNet 2.0
Nodes: 19267
Edges: 3852119
Node Attributes: 19267
Edge Attributes: 19260595

number of nodes:
19267

number of edges:
3852119

calculating w_prime

calculating w_double_prime


# import NPS scores and seed genes

In [11]:
# Analysis flags. NOTE(review): neither flag is read anywhere in the visible part
# of this notebook; presumably they are consumed by imported helpers or by cells
# that were removed — TODO confirm or delete.
magma=True
rat_network=False

In [13]:
# Human control traits. Each name is used by import_seed_dict to build the path
# '<ctrl_mag_dir><trait>_annot.tsv' and becomes the prefix of three seed-set labels
# ('<trait>_FDR', '<trait>_bonf', '<trait>_top500').
ctrl_traits=['facial_hair', 'age_smkinit', 'antisoc', 'hr', 'infant_bw', 'LDL', 'maternal_smok', 'age_menarche','addict-rf','adhd', 'dpw', 'risk', 'auto_speed', 'nsex', 'bmi', 'height']

# Psychiatric / neurological traits, handled the same way but read from '<psych_mag_dir>'.
psych_traits=['anxiety','panic', 'asd','adhd2022', 'scz', 'bipolar', 'dep', 'ptsd', 'ocd', 'alz', 'park', 'als','epilepsy', 'anorexia','park2019','bipolar_euro', 'dep_euro', 'ptsd_euro', 'epilepsy_euro']


def _bonferroni_cutoff(n_tests):
    """Return 0.05 divided by the number of tests, treating an empty table as one test."""
    return 0.05/max(n_tests,1)


def _annot_seed_sets(t,label,gene_col,bonf_cutoff,node_col,all_nodes):
    """Return the FDR, Bonferroni and top-500 seed sets of one '<trait>_annot.tsv' table."""
    return {
        f'{label}_FDR': set(t[t['Q']<0.05][gene_col]),
        f'{label}_bonf': set(t[t['P']<bonf_cutoff][gene_col]),
        f'{label}_top500': set(t[(t[node_col].isin(all_nodes))].nsmallest(500,'P')[gene_col]),
    }


def import_seed_dict(mag_dir,file_dict,ctrl_traits,ctrl_traits_rat,psych_traits,bonf_dict,gene_col_dict,all_nodes):
    """Build a dictionary of seed-gene sets keyed by '<dataset>_<cutoff>'.

    Written for MAGMA output - needs rewriting for fusion or ratXcan.

    Parameters
    ----------
    mag_dir : str
        Directory prefix of the rat control tables ('<mag_dir><trait>_annot.tsv').
    file_dict : dict
        Maps a dataset name to the path of its tab-separated gene table.
    ctrl_traits, psych_traits : iterable of str
        Human trait names; their tables are read from the module-level globals
        `ctrl_mag_dir` and `psych_mag_dir`, which are NOT parameters of this
        function and must exist whenever these lists are non-empty.
    ctrl_traits_rat : iterable of str
        Rat control trait names; each must have an entry in `bonf_dict`.
    bonf_dict : dict
        Optional per-dataset Bonferroni cutoffs; datasets from `file_dict` that
        are missing here fall back to 0.05 / number of rows.
    gene_col_dict : dict
        Maps a dataset name (plus the keys 'hm_ctrl' and 'rat_ctrl') to the
        column that holds the gene identifiers.
    all_nodes : collection
        Network node names; the top-500 sets are restricted to these.

    Returns
    -------
    dict
        '<name>_bonf', '<name>_top500' and '<name>_FDR' sets for each dataset;
        the dataset 'ext_st22' is stored whole under its own name.
    """
    seed_dict={}
    for f in file_dict.keys():
        t=pd.read_csv(file_dict[f],sep='\t')
        gene_col=gene_col_dict[f]
        if f in bonf_dict.keys():
            bonf_cutoff=bonf_dict[f]
        else:
            bonf_cutoff=_bonferroni_cutoff(len(t))
        # Tables whose name contains 'fus' store their p-values in 'TWAS.P'.
        if ('fus' in f):
            Pcol='TWAS.P'
        else:
            Pcol='P'
        if (f=='ext_st22'):
            # This table is already a curated gene list: take every gene in it.
            seed_dict[f]=(set(t[gene_col]))
        else:
            # Best effort: a table that lacks an expected column keeps whichever
            # sets were built before the failure, and the cause is reported.
            try:
                seed_dict[f'{f}_bonf']=(set(t[t[Pcol]<bonf_cutoff][gene_col]))
                seed_dict[f'{f}_top500']=set(t[(t[gene_col].isin(all_nodes))].nsmallest(500,Pcol)[gene_col])
                seed_dict[f'{f}_FDR']=(set(t[t['Q']<0.05][gene_col]))
            except Exception as err:
                print(f'error occurred importing {f}: {err!r}')
    # NOTE(review): the human control and psychiatric tables are filtered on the
    # literal 'GENE' column while the genes are reported from gene_col_dict['hm_ctrl'];
    # this is kept as in the original — confirm the two columns are meant to differ.
    for f in ctrl_traits:
        gene_col=gene_col_dict['hm_ctrl']
        t=pd.read_csv(f'{ctrl_mag_dir}{f}_annot.tsv',sep='\t')
        seed_dict.update(_annot_seed_sets(t,f,gene_col,_bonferroni_cutoff(len(t)),'GENE',all_nodes))
    for f in psych_traits:
        gene_col=gene_col_dict['hm_ctrl']
        t=pd.read_csv(f'{psych_mag_dir}{f}_annot.tsv',sep='\t')
        seed_dict.update(_annot_seed_sets(t,f,gene_col,_bonferroni_cutoff(len(t)),'GENE',all_nodes))

    for f in ctrl_traits_rat:
        gene_col=gene_col_dict['rat_ctrl']
        t=pd.read_csv(f'{mag_dir}{f}_annot.tsv',sep='\t')
        seed_dict.update(_annot_seed_sets(t,f,gene_col,bonf_dict[f],gene_col,all_nodes))

    return seed_dict


In [14]:
# Build the seed-gene dictionary. Both branches pass the full argument list of
# import_seed_dict; only the node collection used for the top-500 filter differs.
if not tissue_network:
    seed_dict=import_seed_dict(mag_dir,file_dict,ctrl_traits,ctrl_traits_rat,psych_traits,bonf_dict,gene_col_dict,all_nodes)
else:
    # The tissue node list is matched against HGNC 'entrez_id' values, so the
    # nodes are translated to gene symbols before filtering the seed genes.
    hgnc=pd.read_csv('hgnc_complete_set.txt',sep='\t',low_memory=False)
    hgnc=hgnc[['symbol','entrez_id']].dropna()
    hgnc['entrez_id']=hgnc['entrez_id'].astype(int).astype(str)
    tissue_node_symbols=hgnc[hgnc.entrez_id.isin(all_nodes)]['symbol']
    seed_dict=import_seed_dict(mag_dir,file_dict,ctrl_traits,ctrl_traits_rat,psych_traits,bonf_dict,gene_col_dict,tissue_node_symbols)
seed_dict.keys()

dict_keys(['loco_bonf', 'loco_top500', 'loco_FDR', 'loco_gsem_bonf', 'loco_gsem_top500', 'loco_gsem_FDR', 'ext_bonf', 'ext_top500', 'ext_FDR', 'ext_st22', 'loco_mega_fus_naac_bonf', 'loco_mega_fus_naac_top500', 'loco_mega_fus_naac_FDR', 'ext_fus_naac_bonf', 'ext_fus_naac_top500', 'ext_fus_naac_FDR', 'loco_final_cf_bonf', 'loco_final_cf_top500', 'loco_final_cf_FDR', 'loco_final_mega_bonf', 'loco_final_mega_top500', 'loco_final_mega_FDR', 'ext_rat_bonf', 'ext_rat_top500', 'ext_rat_FDR', 'loco_final_cf_rat_bonf', 'loco_final_cf_rat_top500', 'loco_final_cf_rat_FDR', 'ext_db_bonf', 'ext_db_top500', 'ext_db_FDR', 'ext_rtb_bonf', 'ext_rtb_top500', 'ext_rtb_FDR', 'loco_final_cf_25_bonf', 'loco_final_cf_25_top500', 'loco_final_cf_25_FDR', 'facial_hair_FDR', 'facial_hair_bonf', 'facial_hair_top500', 'age_smkinit_FDR', 'age_smkinit_bonf', 'age_smkinit_top500', 'antisoc_FDR', 'antisoc_bonf', 'antisoc_top500', 'hr_FDR', 'hr_bonf', 'hr_top500', 'infant_bw_FDR', 'infant_bw_bonf', 'infant_bw_top500', 

In [None]:
# Load the network propagation scores (NPS) for the seed sets on this interactome.
# import_NPS_scores is not defined in this notebook (presumably it comes from one
# of the star imports — TODO confirm). NPS_dict is indexed below with keys of the
# form '<seed label>_<interactome_name>'.
NPS_dict,NPS_dict_series=import_NPS_scores(seed_dict,interactome_name)

# choose datasets for analysis

In [None]:
def _select_dataset(trait,cut,seed_dict,NPS_dict,interactome_name):
    """Return (label, seed genes, NPS scores) for one trait, or three Nones when trait is None."""
    if trait is None:
        return None,None,None
    # The label is the bare trait name, optionally followed by '_<cutoff>'.
    label=trait if cut is None else trait+'_'+cut
    return label,seed_dict[label],NPS_dict[label+'_'+interactome_name]


def return_analysis_datasets(trait_r,cut_r,trait_h,cut_h,seed_dict,NPS_dict,interactome_name):
    """Look up the seed genes and NPS scores for a rat and/or a human dataset.

    Parameters
    ----------
    trait_r, trait_h : str or None
        Rat / human trait name; None skips that species.
    cut_r, cut_h : str or None
        Cutoff suffix appended to the trait as '<trait>_<cut>'; None uses the
        trait name alone as the label.
    seed_dict : dict
        Seed-gene sets keyed by label.
    NPS_dict : dict
        NPS tables keyed by '<label>_<interactome_name>'.
    interactome_name : str
        Suffix used in the NPS_dict keys.

    Returns
    -------
    tuple
        (label_h, label_r, seed_h, seed_r, NPS_h, NPS_r, NPS). Entries of a
        skipped species are None. NPS is the human table joined with the rat
        table (column suffixes 'h' and 'r') plus a 'zhr' column holding
        zh * zr; it is None unless both traits are given.

    Raises
    ------
    KeyError
        If a requested label is missing from seed_dict or NPS_dict.
    """
    # Human is resolved before rat, as in the original lookup order.
    label_h,seed_h,NPS_h=_select_dataset(trait_h,cut_h,seed_dict,NPS_dict,interactome_name)
    label_r,seed_r,NPS_r=_select_dataset(trait_r,cut_r,seed_dict,NPS_dict,interactome_name)

    if trait_h is not None and trait_r is not None:
        NPS = NPS_h.join(NPS_r, lsuffix="h", rsuffix="r")
        NPS = NPS.assign(zhr=NPS.zh * NPS.zr)
    else:
        NPS=None
    return label_h,label_r,seed_h,seed_r,NPS_h,NPS_r,NPS

In [None]:
# Z-score thresholds handed to calculate_network_enrichment as zthresh_list and
# z12thresh_list, and the single (z_comb, z_12) pair used to read off one p-value.
zlist = list(range(1, 11))
z12list = [1, 1.5, 2, 3, 4, 5]
cutoff_comb, cutoff_single = 3, 1.5

# loop over all traits and loco cutoffs

In [25]:
#ctrl_traits.insert(len(ctrl_traits),'ext')
# Keep the seed-set labels that contain one of the substrings in `t` and none of
# the substrings in `rm`. The earlier `ls=` assignment was dropped: it read a
# name (`traits`) that is not defined in this notebook and its result was
# overwritten immediately. Loop variables no longer shadow the lists `t` and `rm`.
t=['addict-rf','ext','bmi','height']
rm=['fus','rat','loco','db','rtb','st22']
ls=[label for label in seed_dict.keys() if any(keep in label for keep in t)]
ls=[label for label in ls if not any(drop in label for drop in rm)]
ls

In [14]:
# Every seed-set label that contains a psychiatric trait name as a substring.
ls=[label for label in seed_dict if any(trait in label for trait in psych_traits)]

In [24]:
# Every seed-set label that contains the substring 'park_'.
ls=[label for label in seed_dict if 'park_' in label]

In [25]:
#modify for correct genesets
# Network colocalization of the rat 'loco_final_cf' seed sets (one pass per
# cutoff) against every human seed-set label currently held in `ls`.
trait_r='loco_final_cf'
trait_h=None
cut_h=None
overwrite=True
#for cut_r in ['FDR','bonf','top500']:
for cut_r in ['FDR','bonf']:
    _,label_r,_,seed_r,_,NPS_r,_=return_analysis_datasets(trait_r,cut_r,trait_h,cut_h,seed_dict,NPS_dict,interactome_name)
    print(trait_r)
    for label_h in ls:
        print(label_h)
        coloc_filename=f'colocalization_scores/colocScore_{label_r}_{label_h}_{interactome_name}.tsv'
        # An existing result is kept only when overwriting is switched off.
        if os.path.exists(coloc_filename) and overwrite==False:
            print('file already exists')
            continue
        print('running analysis')
        seed_h=seed_dict[label_h]
        print(f'{len(seed_h)} seed genes for this dataset.')
        npsh_label=label_h+'_'+interactome_name
        # Seed sets without propagation scores are reported and skipped.
        if npsh_label not in NPS_dict.keys():
            print(f'{npsh_label} does not have scores, possibly due to number of seed genes.')
            continue
        NPS_h=NPS_dict[npsh_label]
        netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(
            NPS_r,
            NPS_h,
            zthresh_list=zlist,
            z12thresh_list=z12list,
            verbose=False,
        )
        netcoloc_enrichment_df['rat_dataset']=label_r
        netcoloc_enrichment_df['human_dataset']=label_h
        if save_fig:
            # coloc_filename is the same path the original rebuilt by concatenation.
            netcoloc_enrichment_df.to_csv(coloc_filename,sep='\t',index=False)

loco_final_cf
park_FDR
running analysis
2016 seed genes for this dataset.
park_bonf
running analysis
409 seed genes for this dataset.
park_top500
running analysis
500 seed genes for this dataset.
loco_final_cf
park_FDR
running analysis
2016 seed genes for this dataset.
park_bonf
running analysis
409 seed genes for this dataset.
park_top500
running analysis
500 seed genes for this dataset.


In [40]:
#modify for correct genesets
# Network colocalization of the rat 'body_length_rn6' seed sets (one pass per
# cutoff) against every human seed-set label in `ls`. Results that already
# exist on disk are never recomputed here.
trait_r='body_length_rn6'
trait_h=None
cut_h=None
#for cut_r in ['FDR','bonf','top500']:
for cut_r in ['FDR','bonf']:
    _,label_r,_,seed_r,_,NPS_r,_=return_analysis_datasets(trait_r,cut_r,trait_h,cut_h,seed_dict,NPS_dict,interactome_name)
    print(trait_r)
    for label_h in ls:
        print(label_h)
        coloc_filename=f'colocalization_scores/colocScore_{label_r}_{label_h}_{interactome_name}.tsv'
        if os.path.exists(coloc_filename):
            print('file already exists')
            continue
        print('running analysis')
        seed_h=seed_dict[label_h]
        npsh_label=label_h+'_'+interactome_name
        # Seed sets without propagation scores are reported and skipped instead
        # of aborting the whole loop with a KeyError.
        if npsh_label not in NPS_dict:
            print(f'{npsh_label} does not have scores, possibly due to number of seed genes.')
            continue
        NPS_h=NPS_dict[npsh_label]
        netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(NPS_r,NPS_h,
                                                                                     zthresh_list = zlist,
                                                                                     z12thresh_list=z12list,
                                                                                     verbose=False)
        print(netcoloc_enrichment_df)
        netcoloc_enrichment_df['rat_dataset']=label_r
        netcoloc_enrichment_df['human_dataset']=label_h
        if save_fig:
            # Write to the same path that was checked above.
            netcoloc_enrichment_df.to_csv(coloc_filename,sep='\t',index=False)

body_length_rn6
ext_bonf
running analysis
    z_comb  z_12  observed_overlap  expected_overlap_mean  \
0        1   1.0               949                 832.27   
1        1   1.5               394                 340.31   
2        1   2.0               168                 144.09   
3        1   3.0                59                  45.69   
4        1   4.0                38                  28.67   
5        1   5.0                33                  26.20   
6        2   1.0               779                 681.82   
7        2   1.5               394                 341.52   
8        2   2.0               168                 145.23   
9        2   3.0                59                  44.33   
10       2   4.0                38                  28.00   
11       2   5.0                33                  25.37   
12       3   1.0               600                 513.93   
13       3   1.5               371                 319.67   
14       3   2.0               168         

# old code

In [None]:
# Rebind ctrl_traits to every seed-set label that contains none of the substrings in `rm`.
# NOTE(review): the resulting entries are full labels (e.g. '<trait>_FDR'), not bare
# trait names — confirm this is what the loop over ctrl_traits below expects.
ctrl_traits=[label for label in seed_dict if not any(token in label for token in rm)]

In [None]:
# NOTE(review): this flag is not read anywhere in the visible part of the
# notebook — presumably a leftover from removed cells; TODO confirm or delete.
rerun_analysis=False

In [16]:
#modify for correct genesets
# Pick the rat dataset ('loco_final_cf_25', FDR cutoff); no human dataset is
# selected here, so only the rat label, seed genes and NPS scores are kept.
trait_r,cut_r='loco_final_cf_25','FDR'
trait_h=cut_h=None

_,label_r,_,seed_r,_,NPS_r,_=return_analysis_datasets(
    trait_r=trait_r,
    cut_r=cut_r,
    trait_h=trait_h,
    cut_h=cut_h,
    seed_dict=seed_dict,
    NPS_dict=NPS_dict,
    interactome_name=interactome_name,
)

In [None]:
# For every control trait, run network colocalization against the selected rat
# dataset (label_r / NPS_r) for the FDR and Bonferroni seed sets, plus the
# top-500 seed set when the Bonferroni set is larger than 500 genes.
for t in ctrl_traits:
    trait_h=str(t)
    # The original tested `<=500` while printing "more than 500 genes"; the
    # condition now matches the message.
    # NOTE(review): confirm that '>' (not the message) reflects the intended rule.
    test500=len(seed_dict[trait_h+'_bonf'])>500
    if test500:
        print('more than 500 genes in most stringent cutoff- testing top500')
    # One shared body replaces the duplicated top500 block; order is unchanged
    # (FDR, bonf, then top500 when requested).
    cuts=['FDR','bonf']+(['top500'] if test500 else [])
    for cut_h in cuts:
        label_h=trait_h+'_'+cut_h
        seed_h=seed_dict[label_h]
        NPS_h=NPS_dict[label_h+'_'+interactome_name]
        netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(NPS_r,NPS_h,
                                                                                     zthresh_list = zlist,
                                                                                     z12thresh_list=z12list,
                                                                                     verbose=False)
        print(netcoloc_enrichment_df)
        netcoloc_enrichment_df['rat_dataset']=label_r
        netcoloc_enrichment_df['human_dataset']=label_h
        if save_fig:
            netcoloc_enrichment_df.to_csv('colocalization_scores/colocScore_'+label_r+'_'+label_h+'_'+interactome_name+'.tsv',sep='\t',index=False)
        # Empirical p-value at the chosen (z_comb, z_12) cutoff pair. The first
        # matching row is read explicitly because float(Series) is deprecated.
        at_cutoff=netcoloc_enrichment_df[(netcoloc_enrichment_df['z_comb']==cutoff_comb) & (netcoloc_enrichment_df['z_12']==cutoff_single)]
        pOverlap=float(at_cutoff['empirical_p'].iloc[0])