# purpose and use notes

purpose: generates NPS scores for traits used in this paper. If overwrite_file==TRUE, the network propagation scores will be overwritten with the newly generated scores. This will affect all downstream analyses.

runs network propagation (typically in pcnet v1.4) from seed genes saved in a file- accessed from meta data csv. if rerun==TRUE, then network propagation scores will be recalculated.

# setup

In [1]:
#read in libraries
from rca_functions import *
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import matplotlib
from matplotlib_venn import venn2 
from scipy.stats import hypergeom
import statsmodels.stats.multitest
import rca_functions
import ndex2
import networkx as nx
from netcoloc import netprop_zscore
from netcoloc import netprop
from netcoloc import network_colocalization
import sys

In [2]:
os.chdir('../')

In [3]:
os.getcwd()

'/tscc/projects/ps-palmer/brittany/rare_common_alcohol_comparison'

# Interactome Set-up

In [4]:
#create a file called environ_ndex_meta.py where you save variables 'ndex_user' and 'ndex_password'
#otherwise will prompt you to define those within the notebooks
if os.path.isfile('../environ_ndex_meta.py'):
    print ('NDEx credentials imported from meta file')
    sys.path.insert(1, '../')
    from environ_ndex_meta import *
    sys.path.pop(1)
else:
    # Prompt the user for a username
    ndex_user = input("Enter your NDEx username: ")
    # Prompt the user for a password
    ndex_password = input("Enter your NDEx password: ")

NDEx credentials imported from meta file


In [5]:
os.getcwd()

'/tscc/projects/ps-palmer/brittany/rare_common_alcohol_comparison'

In [6]:
interactome_name='pcnet_v14'

In [7]:
interactome=import_interactome(interactome_name)

pcnet_v14
number of nodes:
18630

number of edges:
2687393


In [8]:
# pre calculate the matricies used for network propagation
print('\ncalculating w_prime')
w_prime = netprop.get_normalized_adjacency_matrix(interactome, conserve_heat=True)

print('\ncalculating w_double_prime')
w_double_prime = netprop.get_individual_heats_matrix(w_prime, .5)


calculating w_prime

calculating w_double_prime


In [9]:
w_double_prime

array([[5.00505179e-01, 4.74520511e-04, 9.79187768e-04, ...,
        1.05985691e-05, 5.33902865e-06, 5.32808702e-06],
       [7.86945862e-04, 5.00358784e-01, 7.44330387e-04, ...,
        7.28164864e-05, 2.11532107e-06, 9.52339521e-06],
       [1.07359981e-03, 4.92098827e-04, 5.00535331e-01, ...,
        1.18362017e-05, 5.38897524e-06, 4.46898669e-06],
       ...,
       [5.83943202e-08, 2.41915237e-07, 5.94784005e-08, ...,
        5.00134933e-01, 3.65652715e-08, 1.97560360e-08],
       [1.47080679e-08, 3.51382238e-09, 1.35401388e-08, ...,
        1.82826358e-08, 5.01153008e-01, 3.71865669e-08],
       [1.46779257e-08, 1.58195934e-08, 1.12286098e-08, ...,
        9.87801802e-09, 3.71865669e-08, 5.00197359e-01]])

# common gene data analysis

In [10]:
datasets=pd.read_csv('common_datasets_prepub.csv',sep=',')

In [11]:
datasets

Unnamed: 0,group,snp2gene_method,label,phenotype_group,seed_path,delim,zscore_file,zscore_path,seed_p,seed_gene_name,description,cutoff
0,GSCAN_DPW,magma,GSCAN_DPW_magma,alcohol,GSCAN_DPW/GSCAN_DPW_magma_results.csv,comma,gscan_dpw_magma_zscore.tsv,network_scores/gscan_dpw_magma_zscore.tsv,P,GENE_NAME,drinks_per_week_gscan,bonferroni
1,neale_20153_irnt,magma,neale_20153_irnt_magma,control,neale_ctrl/20153_gscan_dpw_magma_zscore.tsv,neale_20153_magma_zscore.tsv,network_scores/neale_20153_magma_zscore.tsv,P,SYMBOL,FEV1,bonferroni,
2,neale_30110_irnt,magma,neale_30110_irnt_magma,control,neale_ctrl/30110_gscan_dpw_magma_zscore.tsv,tab,neale_30110_magma_zscore.tsv,network_scores/neale_30110_magma_zscore.tsv,P,SYMBOL,platelet_width,bonferroni
3,neale_3148_irnt,magma,neale_3148_irnt_magma,control,neale_ctrl/3148_gscan_dpw_magma_zscore.tsv,tab,neale_3148_irnt_magma_zscore.tsv,network_scores/neale_3148_irnt_magma_zscore.tsv,P,SYMBOL,heel_bone_density,bonferroni
4,neale_4104_irnt,magma,neale_4104_irnt_magma,control,neale_ctrl/4104_gscan_dpw_magma_zscore.tsv,tab,neale_4104_irnt_magma_zscore.tsv,network_scores/neale_4104_irnt_magma_zscore.tsv,P,SYMBOL,heel_bone_density,bonferroni


In [12]:
row_common=0

In [34]:
for row_common in range(len(datasets)):
    print('processing '+datasets['label'][row_common])
    NPSc=run_net_prop(path='input_files/'+datasets['seed_path'][row_common], 
                 trait_name=datasets['label'][row_common],
                 pcol=datasets['seed_p'][row_common],
                 gene_col=datasets['seed_gene_name'][row_common],
                 delim=datasets['delim'][row_common],
                 cutoff=datasets['cutoff'][row_common],
                 graph=interactome,
                 interactome='pcnet_v14',
                 w_double_prime=w_double_prime, 
                 savefile=False)    
    print(NPSc.head)

processing GSCAN_DPW_magma
cutoff not defined/custom- using all genes 
        GENE  CHR   START    STOP  NSNPS  NPARAM       N    ZSTAT        P  \
0      79501    1   59091   80008      6       3  436159 -0.39952  0.65525   
1  100996442    1  131934  184394      5       3  437867  0.23430  0.40738   
2  105378947    1  576287  621297      7       4  444770 -0.45845  0.67668   
3      81399    1  675716  696654      2       1  405668  0.83144  0.20286   
4  105378580    1  793398  815130     71      19  473562  0.52224  0.30075   

      GENE_NAME  
0         OR4F5  
1  LOC100996442  
2  LOC105378947  
3        OR4F16  
4  LOC105378580  
using provided w_double_prime - please ensure that w_double_prime aligns to graph provided


TypeError: tuple indices must be integers or slices, not tuple

In [20]:
os.getcwd()

'/tscc/projects/ps-palmer/brittany/rare_common_alcohol_comparison'

In [26]:
def import_seedgenes(path,pcol='P',gene_col='GENE NAME',delim='comma', cutoff=None):
    if delim=='comma':
        df=pd.read_csv(path,sep=',')
    else:
        df=pd.read_csv(path,sep='\t')
    if pcol==None:
        print('pvalue column not specified- all genes will be used')
        cutoff=None
    if cutoff=='bonferroni':
        df=df[df[pcol]<0.05/len(df)]
    elif cutoff=='FDR':
        df['pval_FDR']=statsmodels.stats.multitest.fdrcorrection(df[pcol],alpha=0.05,method='indep',is_sorted=False)[1]
        df=df[df['pval_FDR']<0.05]
    else:
        print('cutoff not defined/custom- using all genes ')
        df=df
    print(df.head())
    #gene_ls=list(set(df[gene_col]))
    #return(gene_ls)
    return(df)

In [24]:
path='input_files/'+datasets['seed_path'][row_common], 
trait_name=datasets['label'][row_common],
pcol=datasets['seed_p'][row_common],
gene_col=datasets['seed_gene_name'][row_common],
delim=datasets['delim'][row_common],
cutoff=datasets['cutoff'][row_common],
graph=interactome,
interactome='pcnet_v14',
w_double_prime=w_double_prime, 
savefile=False

In [27]:
import_seedgenes(path, pcol, gene_col, delim)

ValueError: Invalid file path or buffer object type: <class 'tuple'>

In [16]:
data = import_seedgenes(path, pcol, gene_col, delim)
data = list(data[gene_col])
if graph is None:
    graph = import_interactome(interactome)
    print("importing network " + interactome)
if w_double_prime is None:
    # pre calculate mats used for netprop
    print('\ncalculating w_prime')
    w_prime = netprop.get_normalized_adjacency_matrix(graph, conserve_heat=True) 
    print('\ncalculating w_double_prime')
    w_double_prime = netprop.get_individual_heats_matrix(w_prime, 0.5)
else:
    print("using provided w_double_prime - please ensure that w_double_prime aligns to graph provided")
graph_nodes = list(graph.nodes())
#print(graph_nodes)
data = list(set(data).intersection(graph_nodes))
#print(data)
##calculate heats
z_score, Fnew_score, Fnew_rand_score = netprop_zscore.calculate_heat_zscores(
    w_double_prime,  
    graph_nodes,
    dict(graph.degree), 
    data, num_reps=1000,
    minimum_bin_size=100
)
if savefile:
    export_path = 'calculated_values/network_scores/'
    if graph is None and interactome == 'pcnet_v14':
        prefix = (export_path + trait_name).lower()
    elif graph is None and interactome != 'pcnet_v14':
        prefix = (export_path + trait_name + '_' + interactome).lower()
    elif graph is not None and interactome != 'pcnet_v14':
        prefix = (export_path + trait_name + '_' + interactome).lower()
    else:
        print("saving file without interactome_prefix, please provide an interactome name if prefix wanted")
        prefix = ('network_scores/' + trait_name).lower()

    z_score.to_csv(prefix + '_zscore.tsv', sep='\t', header=False)
    if saveheat:
        Fnew_score.to_csv(prefix + '_heats.tsv', sep='\t', header=False)
        pd.DataFrame(Fnew_rand_score, columns=z_score.index).to_csv((prefix+'_randheats.tsv'),sep='\t')
    else:
        print('calculated NPS not saved')

ValueError: Invalid file path or buffer object type: <class 'tuple'>

In [13]:
print('processing '+datasets['label'][row_common])
NPSc=run_net_prop(path='input_files/'+datasets['seed_path'][row_common], 
             trait_name=datasets['label'][row_common],
             pcol=datasets['seed_p'][row_common],
             gene_col=datasets['seed_gene_name'][row_common],
             delim=datasets['delim'][row_common],
             cutoff=datasets['cutoff'][row_common],
             graph=interactome,
             interactome='pcnet_v14',
             w_double_prime=w_double_prime, 
             savefile=False)    
print(NPSc.head)

processing GSCAN_DPW_magma
cutoff not defined/custom- using all genes 
        GENE  CHR   START    STOP  NSNPS  NPARAM       N    ZSTAT        P  \
0      79501    1   59091   80008      6       3  436159 -0.39952  0.65525   
1  100996442    1  131934  184394      5       3  437867  0.23430  0.40738   
2  105378947    1  576287  621297      7       4  444770 -0.45845  0.67668   
3      81399    1  675716  696654      2       1  405668  0.83144  0.20286   
4  105378580    1  793398  815130     71      19  473562  0.52224  0.30075   

      GENE_NAME  
0         OR4F5  
1  LOC100996442  
2  LOC105378947  
3        OR4F16  
4  LOC105378580  
using provided w_double_prime - please ensure that w_double_prime aligns to graph provided


  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
zscores

In [35]:
datasets['cutoff'][row_common]

'bonferroni'

In [15]:
def run_net_prop(path, trait_name, pcol, gene_col, delim, cutoff=None, graph=None, w_double_prime=None, interactome='pcnet_v14', ndex_user=None, ndex_password=None, savefile=False):
    """
    Executes network propagation analysis for a given trait using provided seed genes and provided interactome.

    Parameters:
    - path (str): The file path to the seed gene file.
    - trait_name (str): The name of the trait for which the analysis is being run.
    - pcol (str): The column name in the seed genes file that contains the p-values.
    - gene_col (str): The column name in the seed genes file that specifies the gene names.
    - delim (str): The delimiter used in the seed genes file.
    - cutoff (float, optional): The p-value cutoff for filtering seed genes. If None (Default), no filtering is applied. Defaults to None.
    - graph (NetworkX graph, optional): The interactome network graph. If None, the graph is imported using the interactome parameter. Defaults to None.
    - w_double_prime (numpy.ndarray, optional): Pre-calculated matrix for network propagation. If None, it is calculated in the function. Defaults to None.
    - interactome (str, optional): The name of the interactome. If no graph is provided, this will be imported using the import_interactome function which accepts UUIDs or keys to the UUIDs dictionary. Will used as a label for exported interactome files. Defaults to 'pcnet_v14', which was used for this analysis.
    - ndex_user (str, optional): NDEx account username, required if uploading results to NDEx. Defaults to None.
    - ndex_password (str, optional): NDEx account password, required if uploading results to NDEx. Defaults to None.

    Returns:
	NPS zscores
 
    Notes:
    - The function requires an external library for network propagation calculations.
    - The seed genes file should contain a column for genes and a column for their associated p-values.
    - The function saves three files: z-scores, raw heats, and randomized heats for the network analysis,
      with the trait name and optionally the interactome name as part of the filenames.
    - If using a private interactome, ensure the ndex_user and ndex_password are correctly provided.
    """
    data = import_seedgenes(path, pcol, gene_col, delim)
    data = list(data[gene_col])
    if graph is None:
        graph = import_interactome(interactome)
        print("importing network " + interactome)
    if w_double_prime is None:
        # pre calculate mats used for netprop
        print('\ncalculating w_prime')
        w_prime = netprop.get_normalized_adjacency_matrix(graph, conserve_heat=True) 
        print('\ncalculating w_double_prime')
        w_double_prime = netprop.get_individual_heats_matrix(w_prime, 0.5)
    else:
        print("using provided w_double_prime - please ensure that w_double_prime aligns to graph provided")
    graph_nodes = list(graph.nodes())
    #print(graph_nodes)
    data = list(set(data).intersection(graph_nodes))
    #print(data)
    ##calculate heats
    z_score, Fnew_score, Fnew_rand_score = netprop_zscore.calculate_heat_zscores(
        w_double_prime,  
        graph_nodes,
        dict(graph.degree), 
        data, num_reps=1000,
        minimum_bin_size=100
    )
    if savefile:
        export_path = 'calculated_values/network_scores/'
        if graph is None and interactome == 'pcnet_v14':
            prefix = (export_path + trait_name).lower()
        elif graph is None and interactome != 'pcnet_v14':
            prefix = (export_path + trait_name + '_' + interactome).lower()
        elif graph is not None and interactome != 'pcnet_v14':
            prefix = (export_path + trait_name + '_' + interactome).lower()
        else:
            print("saving file without interactome_prefix, please provide an interactome name if prefix wanted")
            prefix = ('network_scores/' + trait_name).lower()

        z_score.to_csv(prefix + '_zscore.tsv', sep='\t', header=False)
        if saveheat:
            Fnew_score.to_csv(prefix + '_heats.tsv', sep='\t', header=False)
            pd.DataFrame(Fnew_rand_score, columns=z_score.index).to_csv((prefix+'_randheats.tsv'),sep='\t')
        else:
            print('calculated NPS not saved')
    return z_score

In [16]:
def import_seedgenes(path,pcol='P',gene_col='GENE NAME',delim='comma', cutoff=None):
    if delim=='comma':
        df=pd.read_csv(path,sep=',')
    else:
        df=pd.read_csv(path,sep='\t')
    if pcol==None:
        print('pvalue column not specified- all genes will be used')
        cutoff=None
    if cutoff=='bonferroni':
        df=df[df[pcol]<0.05/len(df)]
    elif cutoff=='FDR':
        df['pval_FDR']=statsmodels.stats.multitest.fdrcorrection(df[pcol],alpha=0.05,method='indep',is_sorted=False)[1]
        df=df[df['pval_FDR']<0.05]
    else:
        print('cutoff not defined/custom- using all genes ')
        df=df
    print(df.head())
    #gene_ls=list(set(df[gene_col]))
    #return(gene_ls)
    return(df)

In [11]:
z_score, Fnew_score, Fnew_rand_score = netprop_zscore.calculate_heat_zscores(w_double_prime, pc_nodes, 
                                                            dict(interactome.degree), 
                                                            data, num_reps=1000,
                                                            minimum_bin_size=100)
trait_name=trait_name
z_score.to_csv('network_scores/'+trait_name+'_zscore.tsv',sep='\t',header=False)
Fnew_score.to_csv('network_scores/'+trait_name+'_heats.tsv',sep='\t',header=False)
pd.DataFrame(Fnew_rand_score, columns=z_score.index).to_csv('network_scores/'+trait_name+'_randheats.tsv',sep='\t')
print(str(trait_name+'_zscore.tsv'))
print(str('network_scores/'+trait_name+'_zscore.tsv'))

  0%|          | 0/1000 [00:00<?, ?it/s]

ADH1C_zscore.tsv
network_scores/ADH1C_zscore.tsv


# rare gene data analysis

In [21]:
os.getcwd()

'/tscc/projects/ps-palmer/brittany/rare_common_alcohol'

In [19]:
datasets=pd.read_csv('rare_datasets_prepub.csv')
runsets=datasets
runsets=runsets.reset_index()

In [20]:
runsets

Unnamed: 0,index,label,cutoff used,seed_path,delim,zscore_file,zscore_path,Unnamed: 6,seed_gene_name,phenotype_group
0,0,alcoholintake_FDR_25,all tests FDR <0.25,rare_variant_genebass/alcohol_intake/alcohol_i...,comma,alcoholintake_fdr_25_zscore.tsv,network_scores/alcoholintake_FDR_25_zscore.tsv,network_scores/alcoholintake_fdr_25_zscore.tsv,Gene Name,alcohol
1,1,rare_neale_20153_irnt_FDR_25,all tests FDR <0.25,rare_variant_genebass/20153_irnt/20153_irnt_25...,tab,rare_neale_20153_irnt_fdr_25_zscore.tsv,network_scores/rare_neale_20153_irnt_FDR_25_zs...,network_scores/rare_neale_20153_irnt_fdr_25_zs...,Gene Name,control
2,2,rare_neale_30110_irnt_FDR_25,all tests FDR <0.25,rare_variant_genebass/30110_irnt/30110_irnt_25...,tab,rare_neale_30110_irnt_fdr_25_zscore.tsv,network_scores/rare_neale_30110_irnt_FDR_25_zs...,network_scores/rare_neale_30110_irnt_fdr_25_zs...,Gene Name,control
3,3,rare_neale_20016_FDR_25,all tests FDR <0.25,rare_variant_genebass/20016/20016_25FDR.tsv,tab,rare_neale_20016_fdr_25_zscore.tsv,network_scores/rare_neale_20016_FDR_25_zscore.tsv,network_scores/rare_neale_20016_fdr_25_zscore.tsv,Gene Name,control
4,4,rare_neale_20502_FDR_25,all tests FDR <0.25,rare_variant_genebass/20502/20502_25FDR.tsv,tab,rare_neale_20502_fdr_25_zscore.tsv,network_scores/rare_neale_20502_FDR_25_zscore.tsv,network_scores/rare_neale_20502_fdr_25_zscore.tsv,Gene Name,control
5,5,rare_neale_2443_FDR_25,all tests FDR <0.25,rare_variant_genebass/2443/2443_25FDR.tsv,tab,rare_neale_2443_fdr_25_zscore.tsv,network_scores/rare_neale_2443_FDR_25_zscore.tsv,network_scores/rare_neale_2443_fdr_25_zscore.tsv,Gene Name,control
6,6,rare_neale_4194_FDR_25,all tests FDR <0.25,rare_variant_genebass/4194/4194_25FDR.tsv,tab,rare_neale_4194_fdr_25_zscore.tsv,network_scores/rare_neale_4194_FDR_25_zscore.tsv,network_scores/rare_neale_4194_fdr_25_zscore.tsv,Gene Name,control
7,7,rare_neale_78_FDR_25,all tests FDR <0.25,rare_variant_genebass/78/78_25FDR.tsv,tab,rare_neale_78_fdr_25_zscore.tsv,network_scores/rare_neale_78_FDR_25_zscore.tsv,network_scores/rare_neale_78_fdr_25_zscore.tsv,Gene Name,control
8,8,rare_neale_C50_FDR_25,all tests FDR <0.25,rare_variant_genebass/C50/C50_25FDR.tsv,tab,rare_neale_c50_fdr_25_zscore.tsv,network_scores/rare_neale_C50_FDR_25_zscore.tsv,network_scores/rare_neale_c50_fdr_25_zscore.tsv,Gene Name,control
9,9,rare_neale_C44_FDR_25,all tests FDR <0.25,rare_variant_genebass/C44/C44_25FDR.tsv,tab,rare_neale_c44_fdr_25_zscore.tsv,network_scores/rare_neale_C44_FDR_25_zscore.tsv,network_scores/rare_neale_c44_fdr_25_zscore.tsv,Gene Name,control


In [None]:
for row in range(len(runsets)):
    print('processing '+runsets['label'][row])
    run_net_prop(runsets['seed_path'][row], runsets['label'][row],'0',runsets['seed_gene_name'][row],
                 runsets['delim'][row],'no_cutoff',
                interactome_name)