network colocalization for human control traits (positive and negative)

# set-up

In [1]:
import os
import pandas as pd
import ndex2
import networkx as nx
from netcoloc import netprop_zscore
from netcoloc import netprop
from netcoloc import network_colocalization
import sys
import random

In [3]:
import os
os.chdir('/tscc/projects/ps-palmer/brittany/rare_common_alcohol/rare_common_alcohol_comparison/notebooks/')
from rca_functions import *
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/scripts')
from network_functions import *
from plotting_functions import *
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [3]:
random_seed=random.seed(211)

In [4]:
save_fig=True

In [5]:
#create a file called environ_ndex_meta.py where you save variables 'ndex_user' and 'ndex_password'
#otherwise will prompt you to define those within the notebooks
if os.path.isfile('../environ_ndex_meta.py'):
    print ('NDEx credentials imported from meta file')
    sys.path.insert(1, '../')
    from environ_ndex_meta import *
    sys.path.pop(1)
else:
    # Prompt the user for a username
    ndex_user = input("Enter your NDEx username: ")
    # Prompt the user for a password
    ndex_password = input("Enter your NDEx password: ")

NDEx credentials imported from meta file


In [6]:
plt.rcParams.update({'font.size': 16})

# Interactome Set-up

In [7]:
tissue='global'

pcnet2- versions 
from wright et al. 2024 preprint:
PCNet 2.0= best-performing ranked composite (top 15 interactomes, 3.85M interactions)
PCNet 2.1= top 8 interactomes, 1.75M interactions
PCNet 2.2= top 10 co-citation-free interactomes, 3.32M interactions 

In [8]:
interactome_name='PCNet2.2'
interactome=import_interactome(UUIDs=UUIDs,interactome_name=interactome_name)
all_nodes=list(interactome.nodes())
# pre calculate the matricies used for network propagation
print('\ncalculating w_prime')
w_prime = netprop.get_normalized_adjacency_matrix(interactome, conserve_heat=True)

print('\ncalculating w_double_prime')
w_double_prime = netprop.get_individual_heats_matrix(w_prime, .5)

PCNet2.2
Name: PCNet 2.2
Nodes: 18558
Edges: 3323928
Node Attributes: 18558
Edge Attributes: 16619640

number of nodes:
18558

number of edges:
3323928

calculating w_prime

calculating w_double_prime


# import NPS scores and seed genes

In [None]:
seed_dict=import_seed_dict(mag_dir,file_dict,ctrl_traits,ctrl_traits_rat,psych_traits,bonf_dict,gene_col_dict,all_nodes)
NPS_dict,NPS_dict_series=import_NPS_scores(seed_dict,interactome_name)

# choose datasets for analysis

In [14]:
#set the Z-score cutoffs to calculate network enrichment significance
zlist = [1,2,3,4,5,6,7,8,9,10] 
z12list = [1,1.5,2,3,4,5]
cutoff_comb=3
cutoff_single=1.5

In [15]:
zlist=[cutoff_comb]
z12list=[cutoff_single]

# loop over all traits and loco cutoffs

In [16]:
ctrl_traits.insert(len(ctrl_traits),'ext')
ls=[x for x in seed_dict.keys() if any(ctrl_trait in x for ctrl_trait in traits)]
t=['addict-rf','ext','bmi','height']
rm=['fus','rat','loco','db','rtb','st22']
ls=[x for x in seed_dict.keys() if any(t in x for t in t)]
ls=[x for x in ls if not any(rm in x for rm in rm)]
ls

NameError: name 'traits' is not defined

In [None]:
ls=[x for x in seed_dict.keys() if any(ctrl_trait in x for ctrl_trait in ['park_'])]

In [None]:
ls=[x for x in seed_dict.keys() if any(ctrl_trait in x for ctrl_trait in [x for x in psych_traits if 'euro' in x])]

In [None]:
#modify for correct genesets
trait_r='loco_final_cf'
trait_h=None
cut_h=None
overwrite=True
#for cut_r in ['FDR','bonf','top500']:
for cut_r in ['FDR','bonf']:
    _,label_r,_,seed_r,_,NPS_r,_=return_analysis_datasets(trait_r,cut_r,trait_h,cut_h,seed_dict,NPS_dict,interactome_name)
    print(trait_r)
    for label_h in ls:
            print(label_h)
            coloc_filename=f'colocalization_scores/colocScore_{label_r}_{label_h}_{interactome_name}.tsv'
            if not (os.path.exists(coloc_filename)and overwrite==False):
                print('running analysis')
                seed_h=seed_dict[label_h]
                print(f'{len(seed_h)} seed genes for this dataset.')
                npsh_label=label_h+'_'+interactome_name
                if not (npsh_label in NPS_dict.keys()):
                    print(f'{npsh_label} does not have scores, possibly due to number of seed genes.')
                else:
                    NPS_h=NPS_dict[npsh_label]
                    netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(NPS_r,NPS_h,
                                                                                                 zthresh_list = zlist,
                                                                                                 z12thresh_list=z12list,
                                                                                                 verbose=False)
                    #netcoloc_enrichment_df=netcoloc_enrichment_df[netcoloc_enrichment_df['z_comb']>=netcoloc_enrichment_df['NPS_single']]
                    #print(netcoloc_enrichment_df)
                    netcoloc_enrichment_df['rat_dataset']=label_r
                    netcoloc_enrichment_df['human_dataset']=label_h
                    if save_fig:
                        netcoloc_enrichment_df.to_csv('colocalization_scores/colocScore_'+label_r+'_'+label_h+'_'+interactome_name+'.tsv',sep='\t',index=False)
            else:
                print('file already exists')

In [25]:
NPS_dict.keys()

dict_keys(['loco_bonf_PCNet2.1', 'loco_FDR_PCNet2.1', 'loco_gsem_bonf_PCNet2.1', 'loco_gsem_FDR_PCNet2.1', 'ext_bonf_PCNet2.1', 'ext_top500_PCNet2.1', 'ext_FDR_PCNet2.1', 'ext_st22_PCNet2.1', 'loco_mega_fus_naac_bonf_PCNet2.1', 'loco_mega_fus_naac_FDR_PCNet2.1', 'ext_fus_naac_bonf_PCNet2.1', 'ext_fus_naac_top500_PCNet2.1', 'ext_fus_naac_FDR_PCNet2.1', 'facial_hair_FDR_PCNet2.1', 'facial_hair_bonf_PCNet2.1', 'facial_hair_top500_PCNet2.1', 'age_smkinit_FDR_PCNet2.1', 'age_smkinit_bonf_PCNet2.1', 'age_smkinit_top500_PCNet2.1', 'antisoc_FDR_PCNet2.1', 'antisoc_bonf_PCNet2.1', 'antisoc_top500_PCNet2.1', 'hr_FDR_PCNet2.1', 'hr_bonf_PCNet2.1', 'hr_top500_PCNet2.1', 'infant_bw_FDR_PCNet2.1', 'infant_bw_bonf_PCNet2.1', 'infant_bw_top500_PCNet2.1', 'LDL_FDR_PCNet2.1', 'LDL_bonf_PCNet2.1', 'LDL_top500_PCNet2.1', 'maternal_smok_FDR_PCNet2.1', 'maternal_smok_bonf_PCNet2.1', 'maternal_smok_top500_PCNet2.1', 'age_menarche_FDR_PCNet2.1', 'age_menarche_bonf_PCNet2.1', 'age_menarche_top500_PCNet2.1', 'a

In [17]:
ls=[x[0:(len(x)-len(f'_{interactome_name}'))] for x in list(NPS_dict.keys())]

In [19]:
#modify for correct genesets
trait_r='loco_final_cf'
trait_h=None
cut_h=None
#for cut_r in ['FDR','bonf','top500']:
for cut_r in ['FDR']:
    _,label_r,_,seed_r,_,NPS_r,_=return_analysis_datasets(trait_r,cut_r,trait_h,cut_h,seed_dict,NPS_dict,interactome_name)
    print(trait_r)
    for label_h in ls:
            print(label_h)
            coloc_filename=f'colocalization_scores/colocScore_{label_r}_{label_h}_{interactome_name}.tsv'
            if not (os.path.exists(coloc_filename)):
                print('running analysis')
                seed_h=seed_dict[label_h]
                NPS_h=NPS_dict[label_h+'_'+interactome_name]
                netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(NPS_r,NPS_h,
                                                                                             zthresh_list = zlist,
                                                                                             z12thresh_list=z12list,
                                                                                             verbose=False)
                #netcoloc_enrichment_df=netcoloc_enrichment_df[netcoloc_enrichment_df['z_comb']>=netcoloc_enrichment_df['NPS_single']]
                print(netcoloc_enrichment_df)
                netcoloc_enrichment_df['rat_dataset']=label_r
                netcoloc_enrichment_df['human_dataset']=label_h
                if save_fig:
                    netcoloc_enrichment_df.to_csv('colocalization_scores/colocScore_'+label_r+'_'+label_h+'_'+interactome_name+'.tsv',sep='\t',index=False)
            else:
                print('file already exists')

loco_final_cf
loco_bonf
file already exists
loco_FDR
file already exists
loco_gsem_bonf
file already exists
loco_gsem_FDR
file already exists
ext_bonf
file already exists
ext_top500
file already exists
ext_FDR
file already exists
ext_st22
file already exists
loco_mega_fus_naac_bonf
file already exists
loco_mega_fus_naac_FDR
file already exists
ext_fus_naac_bonf
file already exists
ext_fus_naac_top500
file already exists
ext_fus_naac_FDR
file already exists
loco_final_cf_FDR
file already exists
facial_hair_FDR
file already exists
facial_hair_bonf
file already exists
facial_hair_top500
file already exists
age_smkinit_FDR
file already exists
age_smkinit_bonf
file already exists
age_smkinit_top500
file already exists
antisoc_FDR
file already exists
antisoc_bonf
file already exists
antisoc_top500
file already exists
hr_FDR
file already exists
hr_bonf
file already exists
hr_top500
file already exists
infant_bw_FDR
file already exists
infant_bw_bonf
file already exists
infant_bw_top500
file a

# old code

In [None]:
ctrl_traits=[x for x in seed_dict.keys() if not any(rm in x for rm in rm)]

In [None]:
rerun_analysis=False

In [16]:
#modify for correct genesets
trait_r='loco_final_cf_25'
cut_r='FDR'
trait_h=None
cut_h=None

_,label_r,_,seed_r,_,NPS_r,_=return_analysis_datasets(trait_r,cut_r,trait_h,cut_h,seed_dict,NPS_dict,interactome_name)

In [None]:
for t in ctrl_traits:
        trait_h=str(t)
        if len(seed_dict[t+'_bonf'])<=500:
            print('more than 500 genes in most stringent cutoff- testing top500')
            test500=True
        else:
            test500=False
        for u in ['FDR','bonf']:
            trait_h=str(t)
            cut_h=u
            label_h=trait_h+'_'+cut_h
            seed_h=seed_dict[label_h]
            NPS_h=NPS_dict[label_h+'_'+interactome_name]
            netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(NPS_r,NPS_h,
                                                                                         zthresh_list = zlist,
                                                                                         z12thresh_list=z12list,
                                                                                         verbose=False)
            #netcoloc_enrichment_df=netcoloc_enrichment_df[netcoloc_enrichment_df['z_comb']>=netcoloc_enrichment_df['NPS_single']]
            print(netcoloc_enrichment_df)
            netcoloc_enrichment_df['rat_dataset']=label_r
            netcoloc_enrichment_df['human_dataset']=label_h
            if save_fig:
                netcoloc_enrichment_df.to_csv('colocalization_scores/colocScore_'+label_r+'_'+label_h+'_'+interactome_name+'.tsv',sep='\t',index=False)
            pOverlap=float(netcoloc_enrichment_df[(netcoloc_enrichment_df['z_comb']==cutoff_comb) & (netcoloc_enrichment_df['z_12']==cutoff_single)]['empirical_p'])
        if (test500==True):
            trait_h=str(t)
            cut_h='top500'
            label_h=trait_h+'_'+cut_h
            seed_h=seed_dict[label_h]
            NPS_h=NPS_dict[label_h+'_'+interactome_name]
            netcoloc_enrichment_df = network_colocalization.calculate_network_enrichment(NPS_r,NPS_h,
                                                                                         zthresh_list = zlist,
                                                                                         z12thresh_list=z12list,
                                                                                         verbose=False)
            #netcoloc_enrichment_df=netcoloc_enrichment_df[netcoloc_enrichment_df['z_comb']>=netcoloc_enrichment_df['NPS_single']]
            print(netcoloc_enrichment_df)
            netcoloc_enrichment_df['rat_dataset']=label_r
            netcoloc_enrichment_df['human_dataset']=label_h
            if save_fig:
                netcoloc_enrichment_df.to_csv('colocalization_scores/colocScore_'+label_r+'_'+label_h+'_'+interactome_name+'.tsv',sep='\t',index=False)
            pOverlap=float(netcoloc_enrichment_df[(netcoloc_enrichment_df['z_comb']==cutoff_comb) & (netcoloc_enrichment_df['z_12']==cutoff_single)]['empirical_p'])