In [1]:
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from cmcrameri import cm

In [2]:
## Import full enrichment results
data_dir = r"C:\Users\neil_\DellXPS_June2024\OneDrive\Desktop\New UHN\Precision Medicine\carotid_plaque\workflow\pathway_enrichment\\"

# Read each per-module enrichment table, discard the exported index column and
# the module column, and switch to the column names used by the rest of the notebook.
kegg_permod, go_permod = [
    pd.read_csv(data_dir + file_name)
    .drop(columns=["Unnamed: 0", "module"])
    .rename(
        columns={
            "Term": "pathway",
            "Adjusted P-value": "q-value",
            "Fold_Enrichment": "fold_enrichment",
        }
    )
    for file_name in ("permod_KEGG.csv", "permod_GO.csv")
]

In [3]:
## Import filtered lists (filtered by SR)
data_dir = r"C:\Users\neil_\DellXPS_June2024\OneDrive\Desktop\New UHN\Precision Medicine\carotid_plaque\data\filtered_lists\\"

# Every list is a header-less, tab-separated text file; the pathway names end up in column 0.
list_read_options = {"sep": "\t", "header": None}

# NP removed redundant pathways "Longevity regulating pathway - multiple species", "Apoptosis - multiple species"
kegg_top = pd.read_csv(data_dir + "permod_kegg_top.txt", **list_read_options)
kegg_ec = pd.read_csv(data_dir + "permod_kegg_ec.txt", **list_read_options)

go_top = pd.read_csv(data_dir + "permod_go_top.txt", **list_read_options)
go_ec = pd.read_csv(data_dir + "permod_go_ec.txt", **list_read_options)

In [4]:
class Enrichment:
    """Bundle the enrichment results for one pathway database with its curated pathway lists.

    Attributes
    ----------
    database : label of the pathway database (e.g. "KEGG", "GO BP").
    enrichment_df : the full enrichment table, stored as passed in.
    top_pathways : unique pathway names from column 0 of ``top_df`` (order not guaranteed).
    endo_pathways : unique pathway names from column 0 of ``ec_df`` (order not guaranteed).
    """

    def __init__(self, db, all_df, top_df, ec_df):
        """Store the database label and table, and de-duplicate both curated lists.

        ``top_df`` and ``ec_df`` must expose their pathway names under column
        label ``0`` (as produced by ``pd.read_csv(..., header=None)``).
        """
        self.database = db
        self.enrichment_df = all_df
        self.top_pathways = list(set(top_df[0]))
        # Fix: this was previously built from top_df as well, so ec_df was
        # silently ignored and endo_pathways duplicated top_pathways.
        # NOTE(review): "ec" presumably stands for endothelial cell -- confirm.
        self.endo_pathways = list(set(ec_df[0]))

In [5]:
# One Enrichment container per pathway database
GO_permod = Enrichment(db="GO BP", all_df=go_permod, top_df=go_top, ec_df=go_ec)
KEGG_permod = Enrichment(db="KEGG", all_df=kegg_permod, top_df=kegg_top, ec_df=kegg_ec)

In [6]:
def subset_plot_df(plot_df, n=20):
    '''Keep, for both cohorts, every pathway that ranks in the top n of either cohort.

    Within each cohort the rows are ordered by descending fold enrichment and
    only the first (highest) row per pathway is retained. The pathways found
    in the first n rows of either cohort form the selection; the returned
    frame holds the symptomatic rows for that selection followed by the
    asymptomatic rows, with the original index labels preserved.
    '''
    def _best_row_per_pathway(cohort):
        # Highest fold-enrichment row per pathway within a single cohort
        ranked = plot_df[plot_df["cohort"] == cohort].sort_values(by="fold_enrichment", ascending=False)
        return ranked.drop_duplicates(subset="pathway")

    per_cohort = [_best_row_per_pathway(cohort) for cohort in ("symptomatic", "asymptomatic")]

    # Union of the top-n pathway names across the cohorts
    pathways_to_plot = set()
    for cohort_rows in per_cohort:
        pathways_to_plot.update(cohort_rows.head(n)["pathway"])

    # Restrict each cohort to the selected pathways and stack them
    return pd.concat(
        [cohort_rows[cohort_rows["pathway"].isin(pathways_to_plot)] for cohort_rows in per_cohort]
    )


In [7]:
for enrichment in (GO_permod, KEGG_permod):
    # Only include pathways specified by SR
    is_curated = enrichment.enrichment_df["pathway"].isin(enrichment.top_pathways)
    enrichment.prebubble_df = enrichment.enrichment_df[is_curated]
    # Narrow down to the top pathways per cohort (default n of subset_plot_df)
    enrichment.plot_df = subset_plot_df(enrichment.prebubble_df)

In [8]:
### KEGG

In [9]:
from bioservices import *
import time

kg = KEGG()
kg.organism = "hsa"  # specify human as organism

# One "<KEGG id>\t<pathway name>" record per line
res = kg.list("pathway", organism="hsa")
all_kegg = res.rstrip("\n").split("\n")

organism_label = "Homo sapiens (human)"


def _strip_organism_label(entry):
    """Remove a trailing '- Homo sapiens (human)' from one KEGG list record.

    str.rstrip(chars) treats its argument as a SET of characters, not as a
    suffix, so rstrip("Homo sapiens (human)") would also eat the tail of a
    pathway name whenever the " - " separator is missing. The label is
    therefore removed only when the record really ends with it; the separator
    dash and surrounding whitespace are cleaned up afterwards.
    """
    if not entry.endswith(organism_label):
        return entry.rstrip()
    return entry[: -len(organism_label)].rstrip().rstrip("-").rstrip()


all_kegg = [_strip_organism_label(x) for x in all_kegg]

# Map pathway name -> KEGG ID
kegg_id_pthwy_dict = {}

for pathway in all_kegg:
    # Split on the first tab only, so a tab inside the name cannot break the unpacking
    kegg_id, kegg_pthwy = pathway.split("\t", 1)
    kegg_id_pthwy_dict[kegg_pthwy] = kegg_id



In [10]:
# Check if all KEGG pathways we need to annotate have IDs
cp_kegg_list = list(KEGG_permod.plot_df["pathway"])
set(cp_kegg_list).issubset(kegg_id_pthwy_dict.keys())


False

In [11]:
# Pathways to annotate that have no hsa ID in the lookup
[name for name in cp_kegg_list if name not in kegg_id_pthwy_dict]

['Mitophagy']

In [12]:
# 'Mitophagy' is the only pathway the check above reports as missing from the
# lookup, so its ID is added by hand.
# NOTE(review): presumably KEGG lists this pathway under a longer name -- confirm.
kegg_id_pthwy_dict['Mitophagy'] = 'hsa04137' # Looked up on https://www.genome.jp/entry/pathway+hsa04137

In [13]:
def get_kegg_class(kegg_pthwy):
    """Return the first-level KEGG class of a pathway, looked up by pathway name.

    The name is translated to a KEGG ID through ``kegg_id_pthwy_dict``
    (KeyError if absent), the entry is fetched and parsed with the ``kg``
    client, and its "CLASS" field -- a "; "-separated hierarchy -- is reduced
    to its first element.
    """
    entry = kg.parse(kg.get(kegg_id_pthwy_dict[kegg_pthwy]))
    # "CLASS" lists the hierarchy levels separated by "; "; keep the first level
    top_level, *_ = entry["CLASS"].split("; ")
    return top_level

# A pathway can appear once per cohort, and get_kegg_class performs a KEGG
# request on every call: look each distinct pathway up once, then map.
kegg_class_by_pathway = {
    pathway_name: get_kegg_class(pathway_name)
    for pathway_name in KEGG_permod.plot_df["pathway"].unique()
}
KEGG_permod.plot_df["class"] = KEGG_permod.plot_df["pathway"].map(kegg_class_by_pathway)

In [14]:
def encode_cohort(plot_df):
    """Return a copy of plot_df whose cohort column is numeric and named 'cohort_encoded'.

    "asymptomatic" becomes 0 and "symptomatic" becomes 1; any other value
    becomes NaN. The column keeps its position and the input frame is left
    untouched.
    """
    cohort_codes = {"asymptomatic": 0, "symptomatic": 1}
    numeric_cohort = plot_df["cohort"].map(cohort_codes)
    return plot_df.assign(cohort=numeric_cohort).rename(columns={"cohort": "cohort_encoded"})

# Swap the string "cohort" column for its numeric "cohort_encoded" version.
# Not idempotent: running this cell a second time raises a KeyError because
# the "cohort" column no longer exists after the first run.
KEGG_permod.plot_df = encode_cohort(KEGG_permod.plot_df)

In [15]:
def format_bubble_df(bubble_df_):
    '''Append the two figure-wrapping rows and keep only the columns the plot maker needs.

    The wrap rows use the pathway name 'wrap', class 'z', and cohort positions
    -0.5 and 1.5. The result has a fresh 0..n-1 index.
    '''
    plot_columns = ["pathway", "cohort_encoded", "q-value", "fold_enrichment", "class"]

    # One placeholder row on each side of the encoded cohort positions (0 and 1)
    wrap_rows = pd.DataFrame(
        [
            {'pathway': 'wrap', 'cohort_encoded': cohort_position, 'q-value': 0.001, 'fold_enrichment': 1, 'class': 'z'}
            for cohort_position in (-0.5, 1.5)
        ]
    )

    with_wrap_rows = pd.concat([bubble_df_, wrap_rows], ignore_index=True)
    return with_wrap_rows[plot_columns]

# Add the two 'wrap' rows and reduce the table to the plot columns.
# Re-running this cell appends another pair of 'wrap' rows each time.
KEGG_permod.plot_df = format_bubble_df(KEGG_permod.plot_df)
KEGG_permod.plot_df


Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,"Phenylalanine, tyrosine and tryptophan biosynt...",1.0,0.02884242,443.791667,Metabolism
1,Complement and coagulation cascades,1.0,1.230231e-11,250.611765,Organismal Systems
2,Steroid biosynthesis,1.0,0.02394276,177.516667,Metabolism
3,Caffeine metabolism,1.0,0.002677471,83.537255,Metabolism
4,RNA polymerase,1.0,1.713876e-55,66.15528,Genetic Information Processing
5,Cholesterol metabolism,1.0,5.896419e-05,53.255,Organismal Systems
6,p53 signaling pathway,1.0,0.01273469,53.05604,Cellular Processes
7,Proteasome,1.0,7.108092e-65,36.7132,Genetic Information Processing
8,Spliceosome,1.0,4.488386e-149,32.840583,Genetic Information Processing
9,Phototransduction,1.0,0.03939781,27.664935,Organismal Systems


In [16]:
# Shorten class text for figure
kegg_class_abbreviations = {
    "Genetic Information Processing": "Gene. Info. Processing",
    "Human Diseases": "Diseases",
    "Cellular Processes": "Cell Proc.",
}
# Classes without an abbreviation (and the 'z' wrap rows) are left as they are
KEGG_permod.plot_df["class"] = KEGG_permod.plot_df["class"].replace(kegg_class_abbreviations)
KEGG_permod.plot_df

Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,"Phenylalanine, tyrosine and tryptophan biosynt...",1.0,0.02884242,443.791667,Metabolism
1,Complement and coagulation cascades,1.0,1.230231e-11,250.611765,Organismal Systems
2,Steroid biosynthesis,1.0,0.02394276,177.516667,Metabolism
3,Caffeine metabolism,1.0,0.002677471,83.537255,Metabolism
4,RNA polymerase,1.0,1.713876e-55,66.15528,Gene. Info. Processing
5,Cholesterol metabolism,1.0,5.896419e-05,53.255,Organismal Systems
6,p53 signaling pathway,1.0,0.01273469,53.05604,Cell Proc.
7,Proteasome,1.0,7.108092e-65,36.7132,Gene. Info. Processing
8,Spliceosome,1.0,4.488386e-149,32.840583,Gene. Info. Processing
9,Phototransduction,1.0,0.03939781,27.664935,Organismal Systems


In [17]:

# Write the KEGG bubble-plot table (including the 'wrap' rows) as a
# tab-separated file in the current working directory, without the index.
KEGG_permod.plot_df.to_csv('permod_kegg_bubble.txt', sep ='\t',index=False)

## GO

In [18]:
# Inspect the GO plot table before class annotation and cohort encoding
GO_permod.plot_df

Unnamed: 0,pathway,q-value,fold_enrichment,cohort
122,Regulation Of Hepatocyte Proliferation (GO:200...,0.02846111,608.628571,symptomatic
116,C-terminal Protein Lipidation (GO:0006501),0.02339896,532.55,symptomatic
120,Regulation Of Endothelial Cell Differentiation...,4.575242e-06,499.265625,symptomatic
109,Sphingomyelin Biosynthetic Process (GO:0006686),0.001556115,387.309091,symptomatic
84,"Complement Activation, Lectin Pathway (GO:0001...",3.212822e-08,156.632353,symptomatic
101,Protein Retention In ER Lumen (GO:0006621),0.002776245,148.445993,symptomatic
86,Telomere Maintenance Via Recombination (GO:000...,7.081041e-05,139.228758,symptomatic
111,Polyol Metabolic Process (GO:0019751),0.03041205,119.338936,symptomatic
37,snRNA Transcription By RNA Polymerase II (GO:0...,1.9582e-14,59.539752,symptomatic
100,Histone H3-K14 Acetylation (GO:0044154),1.783098e-14,58.272948,symptomatic


In [19]:
# List the GO pathway labels so that each one can be given a category by hand
list(GO_permod.plot_df["pathway"])

['Regulation Of Hepatocyte Proliferation (GO:2000345)',
 'C-terminal Protein Lipidation (GO:0006501)',
 'Regulation Of Endothelial Cell Differentiation (GO:0045601)',
 'Sphingomyelin Biosynthetic Process (GO:0006686)',
 'Complement Activation, Lectin Pathway (GO:0001867)',
 'Protein Retention In ER Lumen (GO:0006621)',
 'Telomere Maintenance Via Recombination (GO:0000722)',
 'Polyol Metabolic Process (GO:0019751)',
 'snRNA Transcription By RNA Polymerase II (GO:0042795)',
 'Histone H3-K14 Acetylation (GO:0044154)',
 'U2-type Prespliceosome Assembly (GO:1903241)',
 'Ethanol Catabolic Process (GO:0006068)',
 'Histone H2A Monoubiquitination (GO:0035518)',
 'Regulation Of Macrophage Proliferation (GO:0120040)',
 'ISG15-protein Conjugation (GO:0032020)',
 'Nuclear-Transcribed mRNA Catabolic Process, Deadenylation-Independent Decay (GO:0031086)',
 'Neutrophil Degranulation (GO:0043312)',
 'Regulation Of Mitochondrial mRNA Stability (GO:0044528)',
 'Regulation Of SA Node Cell Action Potential

In [20]:
# GO doesn't have class info like KEGG. Assign categories manually.
# Keys are the full GO BP labels (including the GO ID) as they appear in the
# "pathway" column. Every label is listed exactly once: a repeated key in a
# dict literal silently overwrites the earlier entry and can hide a
# conflicting category.
go_bp_categories = {
    'Regulation Of Hepatocyte Proliferation (GO:2000345)': 'Developmental and Differentiation Processes',
    'C-terminal Protein Lipidation (GO:0006501)': 'Gene Expression and Regulation',
    'Regulation Of Endothelial Cell Differentiation (GO:0045601)': 'Developmental and Differentiation Processes',
    'Sphingomyelin Biosynthetic Process (GO:0006686)': 'Metabolism and Biosynthesis',
    'Complement Activation, Lectin Pathway (GO:0001867)': 'Immune and Stress Response',
    'Protein Retention In ER Lumen (GO:0006621)': 'Cellular Structure and Transport',
    'Telomere Maintenance Via Recombination (GO:0000722)': 'Gene Expression and Regulation',
    'Polyol Metabolic Process (GO:0019751)': 'Metabolism and Biosynthesis',
    'snRNA Transcription By RNA Polymerase II (GO:0042795)': 'Gene Expression and Regulation',
    'Histone H3-K14 Acetylation (GO:0044154)': 'Gene Expression and Regulation',
    'U2-type Prespliceosome Assembly (GO:1903241)': 'Gene Expression and Regulation',
    'Ethanol Catabolic Process (GO:0006068)': 'Metabolism and Biosynthesis',
    'Histone H2A Monoubiquitination (GO:0035518)': 'Gene Expression and Regulation',
    'Regulation Of Macrophage Proliferation (GO:0120040)': 'Immune and Stress Response',
    'ISG15-protein Conjugation (GO:0032020)': 'Gene Expression and Regulation',
    'Nuclear-Transcribed mRNA Catabolic Process, Deadenylation-Independent Decay (GO:0031086)': 'Gene Expression and Regulation',
    'Neutrophil Degranulation (GO:0043312)': 'Immune and Stress Response',
    'Regulation Of Mitochondrial mRNA Stability (GO:0044528)': 'Gene Expression and Regulation',
    'Regulation Of SA Node Cell Action Potential (GO:0098907)': 'Developmental and Differentiation Processes',
    'DNA Replication-Dependent Chromatin Assembly (GO:0006335)': 'Gene Expression and Regulation',
    'Parallel Actin Filament Bundle Assembly (GO:0030046)': 'Cellular Structure and Transport',
    'Histone H4-K5 Acetylation (GO:0043981)': 'Gene Expression and Regulation',
    'Regulation Of T-helper 1 Cell Differentiation (GO:0045625)': 'Immune and Stress Response',
    'Inclusion Body Assembly (GO:0070841)': 'Cellular Structure and Transport',
    'Nucleobase Catabolic Process (GO:0046113)': 'Metabolism and Biosynthesis',
    'Formation Of Cytoplasmic Translation Initiation Complex (GO:0001732)': 'Gene Expression and Regulation',
    'Mitochondrial Proton-Transporting ATP Synthase Complex Assembly (GO:0033615)': 'Metabolism and Biosynthesis',
    'Vesicle-Mediated Transport (GO:0016192)': 'Cellular Structure and Transport',
    'Gap Junction Assembly (GO:0016264)': 'Cellular Structure and Transport',
}



In [21]:
# Attach the manually assigned category to each GO pathway. Labels that are
# not keys of go_bp_categories get NaN here rather than raising an error.
GO_permod.plot_df["class"] = GO_permod.plot_df["pathway"].map(go_bp_categories)

In [22]:
# Same preparation as for KEGG: numeric cohort column, then the 'wrap' rows
# and the plot-column selection. Not idempotent: a second run fails in
# encode_cohort because the "cohort" column is gone after the first run.
GO_permod.plot_df = encode_cohort(GO_permod.plot_df)
GO_permod.plot_df = format_bubble_df(GO_permod.plot_df)

In [23]:
# Inspect the formatted GO table (numeric cohort, 'wrap' rows appended)
GO_permod.plot_df

Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,Regulation Of Hepatocyte Proliferation (GO:200...,1.0,0.02846111,608.628571,Developmental and Differentiation Processes
1,C-terminal Protein Lipidation (GO:0006501),1.0,0.02339896,532.55,Gene Expression and Regulation
2,Regulation Of Endothelial Cell Differentiation...,1.0,4.575242e-06,499.265625,Developmental and Differentiation Processes
3,Sphingomyelin Biosynthetic Process (GO:0006686),1.0,0.001556115,387.309091,Metabolism and Biosynthesis
4,"Complement Activation, Lectin Pathway (GO:0001...",1.0,3.212822e-08,156.632353,Immune and Stress Response
5,Protein Retention In ER Lumen (GO:0006621),1.0,0.002776245,148.445993,Cellular Structure and Transport
6,Telomere Maintenance Via Recombination (GO:000...,1.0,7.081041e-05,139.228758,Gene Expression and Regulation
7,Polyol Metabolic Process (GO:0019751),1.0,0.03041205,119.338936,Metabolism and Biosynthesis
8,snRNA Transcription By RNA Polymerase II (GO:0...,1.0,1.9582e-14,59.539752,Gene Expression and Regulation
9,Histone H3-K14 Acetylation (GO:0044154),1.0,1.783098e-14,58.272948,Gene Expression and Regulation


In [24]:
# Shorten class text for figure
go_class_abbreviations = {
    "Gene Expression and Regulation": "Gene Regulation",
    "Immune and Stress Response": "Immune",
    "Metabolism and Biosynthesis": "Metabolism",
    "Developmental and Differentiation Processes": "Development",
    "Cellular Structure and Transport": "Cell Structure",
}
# Values without an abbreviation (such as the 'z' class of the wrap rows) are left as they are
GO_permod.plot_df["class"] = GO_permod.plot_df["class"].replace(go_class_abbreviations)
GO_permod.plot_df

Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,Regulation Of Hepatocyte Proliferation (GO:200...,1.0,0.02846111,608.628571,Development
1,C-terminal Protein Lipidation (GO:0006501),1.0,0.02339896,532.55,Gene Regulation
2,Regulation Of Endothelial Cell Differentiation...,1.0,4.575242e-06,499.265625,Development
3,Sphingomyelin Biosynthetic Process (GO:0006686),1.0,0.001556115,387.309091,Metabolism
4,"Complement Activation, Lectin Pathway (GO:0001...",1.0,3.212822e-08,156.632353,Immune
5,Protein Retention In ER Lumen (GO:0006621),1.0,0.002776245,148.445993,Cell Structure
6,Telomere Maintenance Via Recombination (GO:000...,1.0,7.081041e-05,139.228758,Gene Regulation
7,Polyol Metabolic Process (GO:0019751),1.0,0.03041205,119.338936,Metabolism
8,snRNA Transcription By RNA Polymerase II (GO:0...,1.0,1.9582e-14,59.539752,Gene Regulation
9,Histone H3-K14 Acetylation (GO:0044154),1.0,1.783098e-14,58.272948,Gene Regulation


In [25]:
def format_go_bp_label(go_bp_label):
    """Return the label text that precedes the first "(GO:", without trailing whitespace.

    Labels that contain no "(GO:" marker are returned with only trailing
    whitespace removed.
    """
    term_name, _marker, _go_id = go_bp_label.partition("(GO:")
    return term_name.rstrip()

In [26]:
# Drop the "(GO:...)" ID from every pathway label for display; the 'wrap'
# rows contain no "(GO:" marker and stay unchanged.
GO_permod.plot_df["pathway"] = GO_permod.plot_df["pathway"].apply(format_go_bp_label)
GO_permod.plot_df

Unnamed: 0,pathway,cohort_encoded,q-value,fold_enrichment,class
0,Regulation Of Hepatocyte Proliferation,1.0,0.02846111,608.628571,Development
1,C-terminal Protein Lipidation,1.0,0.02339896,532.55,Gene Regulation
2,Regulation Of Endothelial Cell Differentiation,1.0,4.575242e-06,499.265625,Development
3,Sphingomyelin Biosynthetic Process,1.0,0.001556115,387.309091,Metabolism
4,"Complement Activation, Lectin Pathway",1.0,3.212822e-08,156.632353,Immune
5,Protein Retention In ER Lumen,1.0,0.002776245,148.445993,Cell Structure
6,Telomere Maintenance Via Recombination,1.0,7.081041e-05,139.228758,Gene Regulation
7,Polyol Metabolic Process,1.0,0.03041205,119.338936,Metabolism
8,snRNA Transcription By RNA Polymerase II,1.0,1.9582e-14,59.539752,Gene Regulation
9,Histone H3-K14 Acetylation,1.0,1.783098e-14,58.272948,Gene Regulation


In [27]:
# Write the GO bubble-plot table (including the 'wrap' rows) as a
# tab-separated file in the current working directory, without the index.
GO_permod.plot_df.to_csv('permod_go_bubble.txt', sep ='\t',index=False)