# Import

In [1]:
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
from decimal import Decimal
import numpy as np
import pandas as pd
import json
import csv

# Download dataset

In [2]:
kegg_df = pd.read_pickle('../extract_pathway_df/output/kegg.pkl')
panther_df = pd.read_pickle('../extract_pathway_df/output/panther.pkl')
biocarta_df = pd.read_pickle('../extract_pathway_df/output/biocarta.pkl')
wikipathway_df = pd.read_pickle('../extract_pathway_df/output/wikipathway.pkl')
mirtarbase_df = pd.read_pickle('../extract_pathway_df/output/mirtarbase.pkl')

databases = [kegg_df, panther_df, biocarta_df, wikipathway_df, mirtarbase_df]

for df in databases:
    print('=====', list(df.database)[0], df.shape, '=====')
    print("number of pathways", len(set(df.pathway_id)))
    print("number of unique genes", len(set(df.entrez)))
    print("total number of genes (rows)", len(df.entrez))

merged_df = pd.concat(databases)

===== kegg (28617, 4) =====
number of pathways 322
number of unique genes 7301
total number of genes (rows) 28617
===== panther (5239, 4) =====
number of pathways 175
number of unique genes 2188
total number of genes (rows) 5239
===== biocarta (24477, 4) =====
number of pathways 520
number of unique genes 6405
total number of genes (rows) 24477
===== wikipathway (11068, 4) =====
number of pathways 253
number of unique genes 4095
total number of genes (rows) 11068
===== MiRTarBase (380639, 4) =====
number of pathways 2599
number of unique genes 15064
total number of genes (rows) 380639


In [3]:
# import our extracted signature
signature_genes = []
total_genes = []

# import our extracted signature
with open('../3. signature-extraction/data/signature.json', 'r') as f:
    signature_genes = json.loads(f.read())
    print("imported signature genes from" ,"data/signature.json")
    
# import our extracted signature
with open('../3. signature-extraction/data/total_genes.json', 'r') as f:
    total_genes = json.loads(f.read())
    print("imported signature genes from" ,"data/total_genes.json")
    
print("total_genes", len(total_genes))
print("signature", len(signature_genes))

imported signature genes from data/signature.json
imported signature genes from data/total_genes.json
total_genes 15452
signature 63


In [4]:
# remove genes that we don't have in our big list
# as they said in the method part (paper)

print("numer of genes before removal", merged_df.shape)
merged_df = merged_df[merged_df['entrez'].isin(total_genes)]
print("numer of genes after removal", merged_df.shape)
print("number of unique genes after removal", len(set(merged_df.entrez)))

numer of genes before removal (450040, 4)
numer of genes after removal (396017, 4)
number of unique genes after removal 13829


In [5]:
# get only the pathway that contains at least 1 of the genes in our signature
ids_pathway = sorted(list(set(merged_df.pathway_id)))
number_of_pathways = len(ids_pathway)
print("number of pathways", number_of_pathways)

# we are going to create a dictionary in which
# "keys" are pathway_id that contains at least one gene
# and the "values" are list of entrez_id of the gene of the signature
# that are in that pathway
relevant_pathways = {}

for pathway_id in ids_pathway:
    # get the dataframe of the pathway with a specific id
    pathway_df = merged_df[merged_df.pathway_id == pathway_id]
    
    # extract all the genes in that pathway
    pathway_genes_list = list(pathway_df.entrez)
    
    # for each gene in our signature
    # check if it is in the payway gene list
    # and append to the corresponding list
    # inside the dictionary
    for sig_gene in signature_genes:
        if sig_gene in pathway_genes_list:
            if pathway_id not in relevant_pathways:
                relevant_pathways[pathway_id] = []
            relevant_pathways[pathway_id].append(sig_gene)

print("number of relevant pathways:", len(relevant_pathways.keys()))
relevant_pathways

number of pathways 3830
number of relevant pathways: 706


{'00140': ['3290'],
 '00240': ['1890'],
 '00532': ['55790'],
 '00561': ['84803'],
 '00564': ['84803'],
 '00565': ['7941'],
 '00830': ['10170'],
 '00910': ['761'],
 '00980': ['3290'],
 '00983': ['1890'],
 '01100': ['10170', '55790', '1890', '3290', '7941', '84803'],
 '01522': ['4854', '4318'],
 '03320': ['11001'],
 '04010': ['2246', '929'],
 '04014': ['2246'],
 '04015': ['2246'],
 '04060': ['6351', '4982', '3576', '2920', '6355'],
 '04062': ['6351', '3576', '2920', '6355'],
 '04064': ['6351', '3576', '2920', '929'],
 '04066': ['7076'],
 '04071': ['2207'],
 '04072': ['3576', '2207'],
 '04142': ['1520'],
 '04145': ['4360', '929', '1520'],
 '04146': ['11001'],
 '04151': ['2246', '1292', '6696'],
 '04210': ['1520'],
 '04218': ['3576'],
 '04310': ['6424'],
 '04330': ['4854'],
 '04371': ['4854', '1490', '6696'],
 '04380': ['4982', '7305'],
 '04390': ['2246', '1490'],
 '04392': ['9770'],
 '04510': ['1292', '6696'],
 '04512': ['1292', '6696'],
 '04610': ['714', '11326', '2162', '713'],
 '04611'

In [None]:
greater_2 = []
uniq = [] 
for p_id in relevant_pathways.keys():
    if len(relevant_pathways[p_id]) >= 2:
        greater_2.append(relevant_pathways[p_id])
        uniq += relevant_pathways[p_id]

print("set of unique matching genes from the signature")
set(uniq)

set of unique matching genes from the signature


{'10170',
 '10894',
 '11001',
 '11326',
 '115908',
 '1292',
 '1490',
 '1520',
 '1880',
 '1890',
 '1959',
 '2014',
 '2124',
 '2162',
 '219972',
 '2207',
 '2246',
 '25878',
 '25975',
 '28959',
 '2920',
 '3290',
 '3512',
 '3576',
 '3936',
 '4069',
 '4239',
 '4318',
 '4332',
 '4360',
 '4854',
 '4982',
 '5396',
 '5552',
 '55790',
 '5803',
 '6351',
 '6355',
 '6424',
 '6696',
 '7076',
 '7107',
 '713',
 '714',
 '7262',
 '7305',
 '761',
 '7941',
 '80114',
 '8076',
 '83483',
 '84803',
 '85379',
 '8839',
 '929',
 '9770'}

In [None]:
# hypergeometric test 

# keys: pathway_id
# value: numpy array with probability mass function
pathway_pmf_dict = {}

for pathway_id in relevant_pathways.keys():
    
    if len(relevant_pathways[pathway_id]) < 2:
        continue 
    
    pathway_df = merged_df[merged_df.pathway_id == pathway_id]
    
    # M: is the number of total genes
    # n: is our signature (60)
    # N: is the number of genes present in the pathway
    number_of_genes_in_our_signature = len(signature_genes)
    total_number_of_our_genes = merged_df.shape[0]
    [M, n, N] = [total_number_of_our_genes, number_of_genes_in_our_signature, len(pathway_df.entrez)]
    print(pathway_id, M,n,N)
    
    # run the test
    rv = hypergeom(M, n, N)
    x = np.arange(0, n+1)
    
    # probability mass function
    pmf_genes_pathway = rv.pmf(x)
    pathway_pmf_dict[pathway_id] = pmf_genes_pathway
    
pathway_pmf_dict

hsa-miR-200c-3p 396017 63 199
hsa-miR-526b-3p 396017 63 578
hsa-miR-6819-3p 396017 63 102
hsa-miR-133b 396017 63 73
05120 396017 63 65
hsa-miR-326 396017 63 116
BIOCARTA_CLASSIC_PATHWAY 396017 63 11
04060 396017 63 219
hsa-miR-4480 396017 63 115
hsa-miR-4662b 396017 63 84
hsa-miR-491-5p 396017 63 157
hsa-miR-508-5p 396017 63 396
hsa-miR-1-3p 396017 63 822
HALLMARK_TNFA_SIGNALING_VIA_NFKB 396017 63 190
hsa-miR-30a-5p 396017 63 686
hsa-miR-17-5p 396017 63 1077
05146 396017 63 94
01522 396017 63 93
04657 396017 63 84
hsa-miR-150-5p 396017 63 465
hsa-miR-519d-3p 396017 63 832
hsa-miR-942-5p 396017 63 186
NABA_SECRETED_FACTORS 396017 63 251
hsa-miR-6749-3p 396017 63 340
04810 396017 63 188
hsa-miR-155-5p 396017 63 835
hsa-miR-5011-5p 396017 63 586
NABA_MATRISOME 396017 63 821
HALLMARK_ESTROGEN_RESPONSE_EARLY 396017 63 186
05202 396017 63 151
hsa-miR-4426 396017 63 84
04151 396017 63 309
hsa-miR-410-3p 396017 63 228
hsa-miR-6867-5p 396017 63 405
hsa-miR-143-3p 396017 63 198
hsa-miR-302c-3p 3

In [None]:
# keys: pathway_id
# value: significance value
pathway_significance_associated_value = {}

for pathway_id in pathway_pmf_dict.keys():
    list_of_genes = relevant_pathways[pathway_id]
    number_of_genes_present_in_pathway = len(list_of_genes)
    #print(pathway_id, number_of_genes_present_in_pathway)
    significance_value = pathway_pmf_dict[pathway_id][number_of_genes_present_in_pathway]
    pathway_significance_associated_value[pathway_id] = '%.2E' % Decimal(significance_value)

results = []
for path_id in pathway_significance_associated_value.keys():
    name = list(merged_df[merged_df.pathway_id == path_id].pathway_name)[0]
    database = list(merged_df[merged_df.pathway_id == path_id].database)[0]
    result = {'database': database, 'pathway_id': path_id, 'pathway_name': name, 'significance_value': pathway_significance_associated_value[path_id]}
    results.append(result)
    
results_df = pd.DataFrame(results)
results_df.to_pickle("data/deregulated_pathways.pkl")
results_df

## Obtain the second input for pathifier
We are extracting each of the resulting pathway with all its associated genesm, this because that one is the input required by Pathifier to compute the PDS.

In [None]:
ids_pathway = list(results_df.pathway_id)

dictios= []
for pathway_id in ids_pathway:
    # get the dataframe of the pathway with a specific id
    pathway_df = merged_df[merged_df.pathway_id == pathway_id]
    list_of_entrez = list(pathway_df["entrez"])
    dictio = {"id":pathway_id, "blank":"", "entrez_id":list_of_entrez}
    dictios.append(dictio)

# sort by length of the entrez_id value list
# it is important to use a sorted list in descending order
# because then R use the length of the first row as a fixed length
# for all the rows in the dataframe
# sorting the list fix
sorted_list = sorted(dictios, reverse=True, key=lambda k: len(k["entrez_id"])) 

with open('data/second_input_pathifier.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    for dictio in sorted_list:
        writer.writerow([dictio["id"]] + [""] + dictio["entrez_id"])
    print("output in second_input_pathifier.csv")