# Import

In [1]:
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
from decimal import Decimal
import numpy as np
import pandas as pd
import json
import csv

# Download dataset

In [2]:
kegg_df = pd.read_pickle('../extract_pathway_df/output/kegg.pkl')
panther_df = pd.read_pickle('../extract_pathway_df/output/panther.pkl')
biocarta_df = pd.read_pickle('../extract_pathway_df/output/biocarta.pkl')
wikipathway_df = pd.read_pickle('../extract_pathway_df/output/wikipathway.pkl')
mirtarbase_df = pd.read_pickle('../extract_pathway_df/output/mirtarbase.pkl')

databases = [kegg_df, panther_df, biocarta_df, wikipathway_df, mirtarbase_df]

for df in databases:
    print('=====', list(df.database)[0], df.shape, '=====')
    print("number of pathways", len(set(df.pathway_id)))
    print("number of unique genes", len(set(df.entrez)))
    print("total number of genes (rows)", len(df.entrez))

merged_df = pd.concat(databases)

===== kegg (28617, 4) =====
number of pathways 322
number of unique genes 7301
total number of genes (rows) 28617
===== panther (5239, 4) =====
number of pathways 175
number of unique genes 2188
total number of genes (rows) 5239
===== biocarta (24477, 4) =====
number of pathways 520
number of unique genes 6405
total number of genes (rows) 24477
===== wikipathway (11068, 4) =====
number of pathways 253
number of unique genes 4095
total number of genes (rows) 11068
===== MiRTarBase (380639, 4) =====
number of pathways 2599
number of unique genes 15064
total number of genes (rows) 380639


In [3]:
# import our extracted signature
signature_genes = []
total_genes = []

# import our extracted signature
with open('../9. project-signature-extraction/data/signature.json', 'r') as f:
    signature_genes = json.loads(f.read())
    print("imported signature genes from" ,"data/signature.json")
    
# import our extracted signature
with open('../9. project-signature-extraction/data/total_genes.json', 'r') as f:
    total_genes = json.loads(f.read())
    print("imported signature genes from" ,"data/total_genes.json")
    
print("total_genes", len(total_genes))
print("signature", len(signature_genes))

imported signature genes from data/signature.json
imported signature genes from data/total_genes.json
total_genes 17285
signature 14


In [4]:
# remove genes that we don't have in our big list
# as they said in the method part (paper)

print("numer of genes before removal", merged_df.shape)
merged_df = merged_df[merged_df['entrez'].isin(total_genes)]
print("numer of genes after removal", merged_df.shape)
print("number of unique genes after removal", len(set(merged_df.entrez)))

numer of genes before removal (450040, 4)
numer of genes after removal (416507, 4)
number of unique genes after removal 15053


In [5]:
# get only the pathway that contains at least 1 of the genes in our signature
ids_pathway = sorted(list(set(merged_df.pathway_id)))
number_of_pathways = len(ids_pathway)
print("number of pathways", number_of_pathways)

# we are going to create a dictionary in which
# "keys" are pathway_id that contains at least one gene
# and the "values" are list of entrez_id of the gene of the signature
# that are in that pathway
relevant_pathways = {}

for pathway_id in ids_pathway:
    # get the dataframe of the pathway with a specific id
    pathway_df = merged_df[merged_df.pathway_id == pathway_id]
    
    # extract all the genes in that pathway
    pathway_genes_list = list(pathway_df.entrez)
    
    # for each gene in our signature
    # check if it is in the payway gene list
    # and append to the corresponding list
    # inside the dictionary
    for sig_gene in signature_genes:
        if sig_gene in pathway_genes_list:
            if pathway_id not in relevant_pathways:
                relevant_pathways[pathway_id] = []
            relevant_pathways[pathway_id].append(sig_gene)

print("number of relevant pathways:", len(relevant_pathways.keys()))
relevant_pathways

number of pathways 3834
number of relevant pathways: 152


{'00062': ['79993'],
 '00230': ['5146'],
 '03010': ['6192'],
 '03013': ['9086'],
 '04144': ['8853'],
 '04145': ['3117'],
 '04360': ['27289'],
 '04514': ['3117'],
 '04610': ['735'],
 '04612': ['3117'],
 '04640': ['3117'],
 '04658': ['3117'],
 '04659': ['3117'],
 '04666': ['8853'],
 '04672': ['3117'],
 '04940': ['3117'],
 '05020': ['735'],
 '05140': ['3117'],
 '05145': ['3117'],
 '05146': ['735'],
 '05150': ['3117'],
 '05152': ['3117'],
 '05164': ['3117'],
 '05166': ['3117'],
 '05168': ['3117'],
 '05169': ['3117'],
 '05310': ['3117'],
 '05320': ['3117'],
 '05321': ['3117'],
 '05322': ['735', '3117'],
 '05323': ['3117'],
 '05330': ['3117'],
 '05332': ['3117'],
 '05416': ['3117'],
 '67139': ['6192'],
 '89328': ['9086'],
 '89636': ['735'],
 '90000': ['27289'],
 '92211': ['735', '3117'],
 '94793': ['8853', '3117'],
 '95109': ['735'],
 'BIOCARTA_ARAP_PATHWAY': ['8853'],
 'BIOCARTA_CLASSIC_PATHWAY': ['735'],
 'BIOCARTA_COMP_PATHWAY': ['735'],
 'BIOCARTA_LECTIN_PATHWAY': ['735'],
 'HALLMARK_ALL

In [6]:
greater_2 = []
uniq = [] 
for p_id in relevant_pathways.keys():
    if len(relevant_pathways[p_id]) >= 2:
        greater_2.append(relevant_pathways[p_id])
        uniq += relevant_pathways[p_id]

print("set of unique matching genes from the signature")
set(uniq)

set of unique matching genes from the signature


{'3117', '735', '8284', '8853', '9086'}

In [7]:
# hypergeometric test 

# keys: pathway_id
# value: numpy array with probability mass function
pathway_pmf_dict = {}

for pathway_id in relevant_pathways.keys():
    
    if len(relevant_pathways[pathway_id]) < 2:
        continue 
    
    pathway_df = merged_df[merged_df.pathway_id == pathway_id]
    
    # M: is the number of total genes
    # n: is our signature (95)
    # N: is the number of genes present in the pathway
    number_of_genes_in_our_signature = len(signature_genes)
    total_number_of_our_genes = merged_df.shape[0]
    [M, n, N] = [total_number_of_our_genes, len(pathway_df.entrez), number_of_genes_in_our_signature]
    print(pathway_id, M,n,N)
    
    # run the test
    rv = hypergeom(M, n, N)
    x = np.arange(0, n+1)
    
    # probability mass function
    pmf_genes_pathway = rv.pmf(x)
    pathway_pmf_dict[pathway_id] = pmf_genes_pathway
    
pathway_pmf_dict

hsa-miR-26b-5p 416507 1743 14
94793 416507 171 14
05322 416507 88 14
92211 416507 54 14


{'05322': array([  9.97046079e-01,   2.94991131e-03,   4.00611625e-06,
          3.30950411e-09,   1.85778066e-12,   7.49518865e-16,
          2.24093741e-19,   5.04327081e-23,   8.58382949e-27,
          1.09939685e-30,   1.04285810e-34,   7.10329283e-39,
          3.28368888e-43,   9.22003869e-48,   1.18613859e-52,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,

In [8]:
# keys: pathway_id
# value: significance value
pathway_significance_associated_value = {}

for pathway_id in pathway_pmf_dict.keys():
    list_of_genes = relevant_pathways[pathway_id]
    number_of_genes_present_in_pathway = len(list_of_genes)
    #print(pathway_id, number_of_genes_present_in_pathway)
    significance_value = pathway_pmf_dict[pathway_id][number_of_genes_present_in_pathway]
#    if significance_value < 0.0000000001:
    pathway_significance_associated_value[pathway_id] = '%.2E' % Decimal(significance_value)

results = []
for path_id in pathway_significance_associated_value.keys():
    name = list(merged_df[merged_df.pathway_id == path_id].pathway_name)[0]
    database = list(merged_df[merged_df.pathway_id == path_id].database)[0]
    result = {'database': database, 'pathway_id': path_id, 'pathway_name': name, 'significance_value': pathway_significance_associated_value[path_id]}
    results.append(result)
    
results_df = pd.DataFrame(results)
results_df.to_pickle("data/deregulated_pathways.pkl")
results_df

Unnamed: 0,database,pathway_id,pathway_name,significance_value
0,MiRTarBase,hsa-miR-26b-5p,hsa-miR-26b-5p,0.00151
1,kegg,05322,Systemic lupus erythematosus,4.01e-06
2,wikipathway,94793,Vitamin D Receptor Pathway,1.52e-05
3,wikipathway,92211,Allograft Rejection,1.5e-06


## Obtain the second input for pathifier
We are extracting each of the resulting pathway with all its associated genesm, this because that one is the input required by Pathifier to compute the PDS.

In [9]:
ids_pathway = list(results_df.pathway_id)

dictios= []
for pathway_id in ids_pathway:
    # get the dataframe of the pathway with a specific id
    pathway_df = merged_df[merged_df.pathway_id == pathway_id]
    list_of_entrez = list(pathway_df["entrez"])
    dictio = {"id":pathway_id, "blank":"", "entrez_id":list_of_entrez}
    dictios.append(dictio)

# sort by length of the entrez_id value list
# it is important to use a sorted list in descending order
# because then R use the length of the first row as a fixed length
# for all the rows in the dataframe
# sorting the list fix
sorted_list = sorted(dictios, reverse=True, key=lambda k: len(k["entrez_id"])) 

with open('data/second_input_pathifier.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    for dictio in sorted_list:
        writer.writerow([dictio["id"]] + [""] + dictio["entrez_id"])
    print("output in second_input_pathifier.csv")

output in second_input_pathifier.csv
