In [None]:
# install libraries if you haven't
# !pip install scanpy
# !pip install pywikipathways
# !pip install gseapy

import html
import urllib
import urllib.parse

import gseapy as gp
import pandas as pd
import pywikipathways as pwpw
import requests
import scanpy as sc
from IPython.display import display, HTML

In [None]:
# Dataset example #1 - SLE-related genes (Lupus) - just hardcoded for now
#################################
# gene_list = [ "IFI44L", "CCL2", "MMP9", "RSAD2", "STAT1", "IRF7", "OAS1", "MX1", "CD3D", "IL2RB", "GZMK", "PRF1", "NKG7"]
# gene_values_dict = None

# print("Gene List: ", gene_list)
# print("Gene Values Dict: ", gene_values_dict)

In [None]:
# Dataset example #2 - RPE cell TF genes + their associated scores, pulled from another notebook
# NOTE(review): the original header describes the scores as -log10(p-value), but
# 0.000367 looks like a raw p-value - confirm against the source notebook.
#################################
# the score mapping is the single source of truth; the gene list is derived from its keys
gene_values_dict = {
    "IRX1": 0.000367,
    "OTX2": 0.000367,
    "TBX15": 0.000367,
}
gene_list = list(gene_values_dict)

print("Gene List: ", gene_list)
print("Gene Values Dict: ", gene_values_dict)

In [None]:
# Dataset example #3 - sc-rnaseq data from a cellxgene study on eye tissue, processed via nsforest
#################################
# # get nsforest eye data from google cloud bucket
# !gcloud storage cp gs://fc-331a3eab-697c-4910-85fe-5528e6c8d3e1/uploads/nsforest-eye-data/li_eye_2023_author_cell_type_nsforest.tsv .
#################################
# # load nsforest eye data into a dataframe
# eye_nsforest_df = pd.read_csv("li_eye_2023_author_cell_type_nsforest.tsv", sep='\t')
# eye_nsforest_df.sort_values(by="f_score", ascending=False, inplace=True)

# # show eye data in dataframe
# eye_nsforest_df[:50]

# # for now we're just looking at the genes for cell clusters of interest (pulled from results of running nsforest on scRNA-seq data generated from eye tissue)
# # filter nsforest results dataframe for cell types of interest and get binary genes
# target_cell_list = ["RPE", "Rod", "Cone3", "Cone1", "Cone2"]
# genes_list_arr = eye_nsforest_df[eye_nsforest_df["clusterName"].isin(target_cell_list)]["binary_genes"].to_list()

# # do some formatting to convert strings to a single list
# gene_list = "".join(genes_list_arr).replace("]", ",").replace("[", " ").replace("'", "")[1:-1].split(", ")
# gene_values_dict = None

# print("Gene List: ", gene_list)
# print("Gene Values Dict: ", gene_values_dict)

In [None]:
# name of the Enrichr library passed to gseapy's enrichr tool as `gene_sets`
enrichr_gene_set = "WikiPathways_2024_Human"

# run gseapy's enrichr on the gene list selected in the dataset cell above
enr_res = gp.enrichr(
    gene_list=gene_list,
    gene_sets=enrichr_gene_set,
    organism="human",
)

# wrap the enrichr results in a dataframe and show it as the cell output
results_df = pd.DataFrame(enr_res.results)
results_df

In [None]:
# assemble URL output for visualization tool
# function to generate urls
def generate_url(base_url, pathway, genes, gene_values_dict=None):
    """Build the visualization-tool URL for one enrichment result.

    Parameters
    ----------
    base_url : str
        Address of the visualization page; the query string is appended
        after a "?".
    pathway : str
        Value of the "Term" column. Its last whitespace-separated token is
        used as the pathway id.
    genes : str
        Semicolon-delimited gene symbols (the "Genes" column).
    gene_values_dict : dict, optional
        Maps a gene symbol to a value. When truthy, every gene that has a
        value is serialized as "GENE,value"; a gene with no entry (or whose
        entry is None) is serialized as the bare symbol. When falsy, all
        genes are serialized as bare symbols.

    Returns
    -------
    str
        The ``url`` attribute of the prepared ``requests.Request``.
    """
    # the WikiPathways id is the last token of the "Term" value
    pathway_id = pathway.split()[-1]

    if gene_values_dict:
        serialized_items = []
        for gene in genes.split(";"):
            value = gene_values_dict.get(gene)
            # A gene without a value is sent bare instead of as "GENE,None";
            # `is None` (not truthiness) keeps legitimate values such as 0.
            serialized_items.append(gene if value is None else f"{gene},{value}")
        # keep ";" and "," literal so they are not percent-encoded ("%3B", "%2C")
        serialized_genes = urllib.parse.quote(";".join(serialized_items), safe=";,")
    else:
        # keep ";" literal so it is not percent-encoded as "%3B"
        serialized_genes = urllib.parse.quote(genes, safe=";")

    # build the query string by hand so the separators above stay as written
    params_str = f"pathway={pathway_id}&genes={serialized_genes}"

    # prepare (but never send) a GET request to obtain the full URL
    prepared_request = requests.Request('GET', f"{base_url}?{params_str}").prepare()
    return prepared_request.url

# add WPID and url columns to the results dataframe
base_url = "https://bioithackathons.github.io/Interactive-Analysis-with-Biological-Pathways/"

# Column-wise operations are used instead of DataFrame.apply(axis=1): on an
# empty results frame, apply returns a DataFrame rather than a Series, which
# cannot be assigned to a single column.
# the WikiPathways id is the last whitespace-separated token of "Term"
results_df['WPID'] = results_df["Term"].str.split().str[-1]

# generate_url handles a None/empty gene_values_dict itself, so no branching is needed here
results_df['url'] = [
    generate_url(base_url, term, genes, gene_values_dict=gene_values_dict)
    for term, genes in zip(results_df['Term'], results_df['Genes'])
]

# show results_df: make URLs clickable, make sure URLs aren't truncated, restrict df to two columns
def make_clickable(val):
    """Return an HTML anchor that opens `val` in a new window.

    `val` is HTML-escaped before being placed in the href attribute and in
    the link text, so characters such as "&" (which separates the query
    parameters) or quotes cannot break the markup. The result is meant to be
    rendered with escaping disabled (e.g. ``DataFrame.to_html(escape=False)``).
    """
    escaped = html.escape(str(val))
    # target _blank to open new window
    return '<a target="_blank" href="{}">{}</a>'.format(escaped, escaped)
# replace each URL with its clickable anchor markup
results_df['url'] = results_df['url'].map(make_clickable)

# lift the column-width limit so long URLs are rendered in full
pd.set_option('display.max_colwidth', None)

# render only the WPID and url columns; escape=False keeps the anchors as live HTML
display(
    HTML(
        results_df[['WPID', 'url']].to_html(escape=False)
    )
)