In [1]:
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from cmcrameri import cm

In [2]:
## Import full enrichment results
data_dir = r"C:\Users\neil_\DellXPS_June2024\OneDrive\Desktop\New UHN\Precision Medicine\carotid_plaque\workflow\pathway_enrichment\\"
kegg_mirna = pd.read_csv(data_dir+"dem_pathways_kegg.csv")
go_mirna = pd.read_csv(data_dir+"dem_pathways_go.csv")
go_mirna.drop(columns=["pthwy_id"],inplace=True)

In [3]:
## Import filtered lists (filtered by SR)
data_dir = r"C:\Users\neil_\DellXPS_June2024\OneDrive\Desktop\New UHN\Precision Medicine\carotid_plaque\data\filtered_lists\\"
kegg_top = pd.read_csv(data_dir+"dem_kegg_top.txt", sep="\t", header=None) # NP removed redundant pathways "Longevity regulating pathway - multiple species", "Apoptosis - multiple species"
kegg_ec =  pd.read_csv(data_dir+"dem_kegg_ec.txt", sep="\t", header=None)

go_top = pd.read_csv(data_dir+"dem_go_top.txt", sep="\t", header=None)
go_ec =  pd.read_csv(data_dir+"dem_go_ec.txt", sep="\t", header=None)

In [4]:
class Enrichment:
    def __init__(self, db, all_df, top_df, ec_df):
        self.database = db
        self.enrichment_df = all_df
        self.top_pathways = list(set(top_df[0]))
        self.endo_pathways = list(set(top_df[0]))

In [5]:
GO_miRNA = Enrichment("GO BP", go_mirna, go_top, go_ec)
KEGG_miRNA = Enrichment("KEGG", kegg_mirna, kegg_top, kegg_ec)

In [6]:
def subset_plot_df(plot_df, n=20):
    '''Include the top n (by fold enrichment) pathways for both cohorts in plot_df'''
    pathways_to_plot = set()
    for cohort in ["symptomatic", "asymptomatic"]:
        cohort_df = plot_df[plot_df["cohort"] == cohort]
        cohort_pthwys = cohort_df.sort_values(by="fold_enrichment", ascending=False).head(n)["pathway"]
        pathways_to_plot.update(cohort_pthwys)

    # Filter plot_df to include top n pathways for either cohort
    filtered_df = plot_df[plot_df["pathway"].isin(pathways_to_plot)]

    return filtered_df


In [7]:
for Enr in [GO_miRNA, KEGG_miRNA]:
    full_df = Enr.enrichment_df
    top_pthwys = Enr.top_pathways
    # Only include pathways specified by SR
    Enr.prebubble_df = full_df[full_df["pathway"].isin(top_pthwys)]
    Enr.plot_df = subset_plot_df(Enr.prebubble_df)

In [8]:
### KEGG

In [10]:
from bioservices import *
import time

kg = KEGG()
kg.organism = "hsa" #specify human as organism 


res = kg.list("pathway", organism="hsa")
all_kegg = res.rstrip("\n").split("\n")
all_kegg = [x.rstrip("Homo sapiens (human)") for x in all_kegg]
all_kegg = [x.rstrip("-") for x in all_kegg]
all_kegg = [x.rstrip() for x in all_kegg]
#pathway_dict = {kegg_id: kegg_pthwy for kegg_id, kegg_pthwy in (pathway.split("\t") for pathway in pathways)}

# Initialize an empty dictionary
kegg_id_pthwy_dict = {}

# Iterate over each string in the pathways list
for pathway in all_kegg:
    # Split the string into KEGG ID and KEGG pathway
    kegg_id, kegg_pthwy = pathway.split("\t")
    # Add the KEGG ID and pathway to the dictionary
    kegg_id_pthwy_dict[kegg_pthwy] = kegg_id



In [11]:
# Check if all KEGG pathways we need to annotate have IDs
cp_kegg_list = list(KEGG_miRNA.plot_df["pathway"])
set(cp_kegg_list).issubset(kegg_id_pthwy_dict.keys())


True

In [12]:
def get_kegg_class(kegg_pthwy):
    kegg_id = kegg_id_pthwy_dict[kegg_pthwy]
    class_ = (kg.parse(kg.get(kegg_id))["CLASS"]).split("; ") #This will be a list

    class_0 = class_[0] # Take the first level class
    return class_0

KEGG_miRNA.plot_df["class"] = KEGG_miRNA.plot_df["pathway"].apply(get_kegg_class)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  KEGG_miRNA.plot_df["class"] = KEGG_miRNA.plot_df["pathway"].apply(get_kegg_class)


In [13]:
def encode_cohort(plot_df):
    encode_cohort_map = {"asymptomatic":0,
                         "symptomatic":1}
    encoded_df = plot_df.copy()
    encoded_df["cohort"] = encoded_df["cohort"].map(encode_cohort_map)
    encoded_df.rename(columns = {"cohort":"cohort_encoded"}, inplace=True)
    return encoded_df

KEGG_miRNA.plot_df = encode_cohort(KEGG_miRNA.plot_df)

Unnamed: 0,pathway,q-value,fold_enrichment,cohort_encoded,class
4,AGE-RAGE signaling pathway in diabetic complic...,1.068315e-18,3.200126,1,Human Diseases
9,p53 signaling pathway,3.870042e-11,2.862847,1,Cellular Processes
11,Cellular senescence,5.984576e-21,2.804925,1,Cellular Processes
13,Prolactin signaling pathway,1.56937e-09,2.705646,1,Organismal Systems
16,Cell cycle,1.360287e-15,2.686084,1,Cellular Processes
20,Type II diabetes mellitus,6.373188e-06,2.555558,1,Human Diseases
21,FoxO signaling pathway,1.721645e-14,2.542553,1,Environmental Information Processing
22,ErbB signaling pathway,7.895223e-10,2.535514,1,Environmental Information Processing
25,TNF signaling pathway,3.026586e-12,2.507388,1,Environmental Information Processing
27,Apoptosis,2.062204e-14,2.497097,1,Cellular Processes


In [14]:
def format_bubble_df(bubble_df_):
    '''Formats bubble_df by taking the columns needed for plot maker. Also adds rows for wrapping figure'''
    
    #new_df = bubble_df_[["pathway", "cohort_encoded", "q-value", "fold_enrichment"]].copy()
    
    wrap_row1 = pd.DataFrame([{'pathway': 'wrap', 'cohort_encoded': -0.5, 'q-value': 0.001, 'fold_enrichment': 1, 'class':'z'}])
    wrap_row2 = pd.DataFrame([{'pathway': 'wrap', 'cohort_encoded': 1.5, 'q-value': 0.001, 'fold_enrichment': 1, 'class':'z'}])
    
    wrap_rows = pd.concat([wrap_row1, wrap_row2])
    
    formatted_bubble_df = pd.concat([bubble_df_, wrap_rows], ignore_index=True).copy()
    formatted_bubble_df = formatted_bubble_df[["pathway", "cohort_encoded", "q-value", "fold_enrichment", "class"]]
    return formatted_bubble_df

KEGG_miRNA.plot_df = format_bubble_df(KEGG_miRNA.plot_df)
#KEGG_miRNA.plot_df = KEGG_miRNA.plot_df[["pathway", "cohort_encoded", "q-value", "fold_enrichment", "class"]]
KEGG_miRNA.plot_df


Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,AGE-RAGE signaling pathway in diabetic complic...,1.0,1.068315e-18,3.200126,Human Diseases
1,p53 signaling pathway,1.0,3.870042e-11,2.862847,Cellular Processes
2,Cellular senescence,1.0,5.984576e-21,2.804925,Cellular Processes
3,Prolactin signaling pathway,1.0,1.56937e-09,2.705646,Organismal Systems
4,Cell cycle,1.0,1.360287e-15,2.686084,Cellular Processes
5,Type II diabetes mellitus,1.0,6.373188e-06,2.555558,Human Diseases
6,FoxO signaling pathway,1.0,1.721645e-14,2.542553,Environmental Information Processing
7,ErbB signaling pathway,1.0,7.895223e-10,2.535514,Environmental Information Processing
8,TNF signaling pathway,1.0,3.026586e-12,2.507388,Environmental Information Processing
9,Apoptosis,1.0,2.062204e-14,2.497097,Cellular Processes


In [21]:
# Shorten class text for figure
KEGG_miRNA.plot_df.loc[KEGG_miRNA.plot_df["class"] == "Environmental Information Processing", "class"] = "Env. Info. Processing"
KEGG_miRNA.plot_df.loc[KEGG_miRNA.plot_df["class"] == "Human Diseases", "class"] = "Diseases"
KEGG_miRNA.plot_df

Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,AGE-RAGE signaling pathway in diabetic complic...,1.0,1.068315e-18,3.200126,Diseases
1,p53 signaling pathway,1.0,3.870042e-11,2.862847,Cellular Processes
2,Cellular senescence,1.0,5.984576e-21,2.804925,Cellular Processes
3,Prolactin signaling pathway,1.0,1.56937e-09,2.705646,Organismal Systems
4,Cell cycle,1.0,1.360287e-15,2.686084,Cellular Processes
5,Type II diabetes mellitus,1.0,6.373188e-06,2.555558,Diseases
6,FoxO signaling pathway,1.0,1.721645e-14,2.542553,Env. Info. Processing
7,ErbB signaling pathway,1.0,7.895223e-10,2.535514,Env. Info. Processing
8,TNF signaling pathway,1.0,3.026586e-12,2.507388,Env. Info. Processing
9,Apoptosis,1.0,2.062204e-14,2.497097,Cellular Processes


In [22]:

KEGG_miRNA.plot_df.to_csv('dem_kegg_bubble.txt', sep ='\t',index=False)

## GO

In [23]:
GO_miRNA.plot_df

Unnamed: 0,pathway,fold_enrichment,q-value,cohort
0,cochlear nucleus development,9.996117,0.008949369,symptomatic
4,sphingolipid translocation,9.996117,0.008902322,symptomatic
5,regulation of mast cell apoptotic process,9.996117,0.008928398,symptomatic
10,fungiform papilla formation,9.996117,0.008881571,symptomatic
11,osteoblast fate commitment,9.996117,0.008933632,symptomatic
12,regulation of cell proliferation involved in h...,9.996117,0.008876398,symptomatic
14,apoptotic process involved in embryonic digit ...,9.996117,0.008938871,symptomatic
15,intrinsic apoptotic signaling pathway in respo...,9.996117,0.00891795,symptomatic
17,central nervous system morphogenesis,9.996117,0.008944117,symptomatic
18,regulation of primary miRNA processing,8.330097,0.0007402361,symptomatic


In [28]:
# GO doesnt have class info like KEGG. Assign categories manually
go_bp_categories = {
    'cochlear nucleus development': 'Developmental Processes',
    'sphingolipid translocation': 'Metabolism and Transport',
    'regulation of mast cell apoptotic process': 'Immune and Apoptotic Processes',
    'fungiform papilla formation': 'Developmental Processes',
    'osteoblast fate commitment': 'Developmental Processes',
    'regulation of cell proliferation involved in heart valve morphogenesis': 'Developmental Processes',
    'apoptotic process involved in embryonic digit morphogenesis': 'Immune and Apoptotic Processes',
    'intrinsic apoptotic signaling pathway in response to hypoxia': 'Immune and Apoptotic Processes',
    'central nervous system morphogenesis': 'Developmental Processes',
    'regulation of primary miRNA processing': 'Gene Expression and Regulation',
    'cytoplasmic sequestering of NF-kappaB': 'Gene Expression and Regulation',
    'maintenance of DNA repeat elements': 'Gene Expression and Regulation',
    'regulation of protein localization to endoplasmic reticulum': 'Metabolism and Transport',
    'regulation of termination of DNA-templated transcription': 'Gene Expression and Regulation',
    'cranial ganglion development': 'Developmental Processes',
    'glial cell apoptotic process': 'Immune and Apoptotic Processes',
    'hair follicle placode formation': 'Developmental Processes',
    'fungiform papilla morphogenesis': 'Developmental Processes',
    'regulation of metalloendopeptidase activity': 'Metabolism and Transport',
    'regulation of ureteric bud formation': 'Developmental Processes',
    'regulation of epithelial cell differentiation': 'Developmental Processes'
}


In [30]:
GO_miRNA.plot_df["class"] = GO_miRNA.plot_df["pathway"].map(go_bp_categories)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GO_miRNA.plot_df["class"] = GO_miRNA.plot_df["pathway"].map(go_bp_categories)


In [32]:
GO_miRNA.plot_df = encode_cohort(GO_miRNA.plot_df)
GO_miRNA.plot_df = format_bubble_df(GO_miRNA.plot_df)

Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,cochlear nucleus development,1.0,0.008949369,9.996117,Developmental Processes
1,sphingolipid translocation,1.0,0.008902322,9.996117,Metabolism and Transport
2,regulation of mast cell apoptotic process,1.0,0.008928398,9.996117,Immune and Apoptotic Processes
3,fungiform papilla formation,1.0,0.008881571,9.996117,Developmental Processes
4,osteoblast fate commitment,1.0,0.008933632,9.996117,Developmental Processes
5,regulation of cell proliferation involved in h...,1.0,0.008876398,9.996117,Developmental Processes
6,apoptotic process involved in embryonic digit ...,1.0,0.008938871,9.996117,Immune and Apoptotic Processes
7,intrinsic apoptotic signaling pathway in respo...,1.0,0.00891795,9.996117,Immune and Apoptotic Processes
8,central nervous system morphogenesis,1.0,0.008944117,9.996117,Developmental Processes
9,regulation of primary miRNA processing,1.0,0.0007402361,8.330097,Gene Expression and Regulation


In [34]:
# Shorten class text for figure
GO_miRNA.plot_df.loc[GO_miRNA.plot_df["class"] == "Gene Expression and Regulation", "class"] = "Gene Regulation"
GO_miRNA.plot_df.loc[GO_miRNA.plot_df["class"] == "Immune and Apoptotic Processes", "class"] = "Immune and Apoptosis"
GO_miRNA.plot_df.loc[GO_miRNA.plot_df["class"] == "Metabolism and Transport", "class"] = "Metabolism"
GO_miRNA.plot_df

Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,cochlear nucleus development,1.0,0.008949369,9.996117,Developmental Processes
1,sphingolipid translocation,1.0,0.008902322,9.996117,Metabolism
2,regulation of mast cell apoptotic process,1.0,0.008928398,9.996117,Immune and Apoptosis
3,fungiform papilla formation,1.0,0.008881571,9.996117,Developmental Processes
4,osteoblast fate commitment,1.0,0.008933632,9.996117,Developmental Processes
5,regulation of cell proliferation involved in h...,1.0,0.008876398,9.996117,Developmental Processes
6,apoptotic process involved in embryonic digit ...,1.0,0.008938871,9.996117,Immune and Apoptosis
7,intrinsic apoptotic signaling pathway in respo...,1.0,0.00891795,9.996117,Immune and Apoptosis
8,central nervous system morphogenesis,1.0,0.008944117,9.996117,Developmental Processes
9,regulation of primary miRNA processing,1.0,0.0007402361,8.330097,Gene Regulation


In [35]:
GO_miRNA.plot_df.to_csv('dem_go_bubble.txt', sep ='\t',index=False)