In [26]:

from appyter import magic
magic.init(lambda _ = globals: _())

# GeneSet Library Set Appyter
This appyter is designed to perform basic statistics, analysis, and visualizations on a Gene Matrix Transpose (.GMT) file. This will allow bioinformatics researchers to analyze relationships between many different gene sets from several gene set libraries.
 To create your own GMT file, please see Enrichr. Enrichr, hosted by the Ma'ayan Laboratory at Mt. Sinai Icahn School of Medicine, is a collection of geneset libraries. 

In [47]:
import numpy as np 
import pandas as pd
import itertools 
import bokeh
import pathlib
import scanpy as sc
from IPython.display import display, FileLink, HTML
import anndata
from sklearn.feature_extraction.text import TfidfVectorizer
from maayanlab_bioinformatics.enrichment import crisp
from collections import OrderedDict
from bokeh.palettes import Category20
from bokeh.io import output_notebook
from bokeh.models import HoverTool, ColumnDataSource, RangeSlider
from bokeh.plotting import figure, show, save, output_file
import os

output_notebook()

In [75]:
%%appyter hide_code


{% do SectionField(name='GMTSubmission', title='1. Submit a GMT file', subtitle = 'Sumbit a GMT (Gene Matrix Transpose file) for analysis.', img = 'bulb.png') %}
{% do SectionField (name = 'UMAP_visualization', title = '2. Scatterplot Visualization', subtitle= 'Visualize relative Geneset similarities on an interactive scatterplot', img = 'bulb.png') %}
{% do SectionField(name = 'Pairwise Similarity Table', title = '3. GeneSet Pairwise Similarity Table', subtitle = 'In this table, the value in row A, column B, is the size of the intersection of A and B. If you would like to get a list of genes from a specific intersection of two library terms, please see the Intersection Search Section.', img = 'bulb.png') %}
{% do SectionField(name = 'Jaccard Similarity Table', title = '4. Jaccard Similarity Table', subtitle = '##TODO', img = 'bulb.png') %}
{% do SectionField(name = 'Intersection Search', title = '5. Gene Intersection Search', subtitle = '###TODO', img = 'bulb.png')%}
{% do SectionField(name = 'GMT Descriptive Statistics', title = '6. Descriptive Statistics', subtitle = '#TODO', img = 'bulb.png') %}


## 1. Submitted Variables

In [96]:
%%appyter code_exec

{% set gs = TabField(
    name='gs_type',
    label='Gene Sets',
    default='Upload',
    choices={'Upload': [
            FileField(
                name='gs',
                label='Gene Set Files',
                default='static/gene_sets_for_breast_cancer.gmt',
                example={
                    'example.gmt': url_for('static', filename = 'Geneshot_PainGenes.gmt')
                }
            ),
        ],},
    
section = 'GMTSubmission',)%}

gs = {{gs.value[0]}}



int_tbl = {{BoolField(name = 'SimilarityTbl', label = 'Intersection Size Table', default = 'true', description = 'In this table, the value in row A, column B, is the size of the intersection of A and B. If you would like to get a list of genes from a specific intersection of two library terms, please see the Intersection Search Section. Select \'Yes\' if you would like to generate a Intersection Size Table. Otherwise, select \'No\'', section = 'Pairwise Similarity Table') }}
jaccard_tbl = {{BoolField(name = 'JaccardTbl', label = 'Jaccard Similarity Table', default = 'true', description = '##TODO', section = 'Jaccard Similarity Table') }}


umap = {{ BoolField(name = 'umap', label = 'ScatterPlot Visualization', default = 'true', description = 'Select \'Yes\' if you would like to generate a Scatter Plot. Otherwise, select \'No\'', section = 'UMAP_visualization')}}

umap_num_neighbors = {{ IntField(name = 'nneighbors', label = 'Number of Neighbors', default = 5, min = 1, max = 30, description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}
umap_maxdf = {{ ChoiceField(name = 'max_df', label = 'Max df setting', choices = {'0.5': '0.5', '0.75': '.75', '0.9': '.9', '1.0': '1'}, default = '0.5',  description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}
umap_mindf = {{ ChoiceField(name = 'min_df', label = 'Min df setting', choices = {'0.1' : '0.1', '0.25' : '0.25', '0.5': '0.5' }, default = '0.25', description = '##TODO: Play around with parameter settings', section = 'UMAP_visualization')}}

{% if gs == '' %}:
    raise Exception('Please upload a GMT File!')
{% endif %}

```python
gs = 'static/gene_sets_for_breast_cancer.gmt'
int_tbl = True
jaccard_tbl = True
umap = True
umap_num_neighbors = 5
umap_maxdf = 0.5
umap_mindf = 0.25
```

## 2. Process the GMT FILE

In [64]:
%%appyter code_exec

def series_to_list(gene_list):
    ##helper function to convert a gene pd.series to a gene list
    ret_list = []
    for genes in gene_list:
        if type(genes) is str:
            ret_list.append(genes)
        else: ##pd series case
            genes = genes.tolist()        
            ret_list.append(' '.join(genes))
    return ret_list

def load_set(file):
    ''' Load a set of files into pairs of labeled sets
    '''
    lst= []
    path = pathlib.Path(file)
    with open(path) as f:
        lines = f.readlines()
        for line in lines:
            parsed_line = line.split('\t')
            term, library, genes = parsed_line[0], parsed_line[1], parsed_line[2:]
            if genes[-1][:-2] == '\n':
                genes[-1] = genes[-1][:-2] ##trim off newline regex '\n'
            lst.append((term,  library, ' '.join(genes)))
    zip_lst = [list(i) for i in zip(*lst)]
    term, library, genes = zip_lst[0], zip_lst[1], zip_lst[2]
    genes = series_to_list(genes)

    df = pd.DataFrame({'Genes': genes, 'Library': library}, index = term)
    return df              

```python
def series_to_list(gene_list):
    ##helper function to convert a gene pd.series to a gene list
    ret_list = []
    for genes in gene_list:
        if type(genes) is str:
            ret_list.append(genes)
        else: ##pd series case
            genes = genes.tolist()
            ret_list.append(' '.join(genes))
    return ret_list
def load_set(file):
    ''' Load a set of files into pairs of labeled sets
    '''
    lst= []
    path = pathlib.Path(file)
    with open(path) as f:
        lines = f.readlines()
        for line in lines:
            parsed_line = line.split('\t')
            term, library, genes = parsed_line[0], parsed_line[1], parsed_line[2:]
            if genes[-1][:-2] == '\n':
                genes[-1] = genes[-1][:-2] ##trim off newline regex '\n'
            lst.append((term,  library, ' '.join(genes)))
    zip_lst = [list(i) for i in zip(*lst)]
    term, library, genes = zip_lst[0], zip_lst[1], zip_lst[2]
    genes = series_to_list(genes)
    df = pd.DataFrame({'Genes': genes, 'Library': library}, index = term)
    return df
```

In [6]:

df = load_set(gs)
if df.shape[0] < umap_num_neighbors:
    umap_num_neighbors = int(np.ceil(df.shape[0]/2)) ##arbitrary right now. May want to change based on parameter settings
    print('Number of Neighbors parameter in scatterplot is too large for the submitted dataset. Resetting number of neighbors to '+ str(umap_num_neighbors)+'')
        


In [7]:
def calculate_FET(set1, set2, background = 20000):
    ##inputs: set1, set2 - python sets
    ##output: p-value of the fisher exact test
    res = crisp.fisher_overlap(set1, set2, n_background_entities= background, preserve_overlap=True)
    if res == None:
        return 0
    else:
        return res.pvalue

In [83]:
def series_to_str(el):
    if type(el) == str:
        return el
    else:
        return ' '.join(el.tolist())

def generate_pairs_df(df, background = 20000):
    ##inputs: df - pandas dataframe that is the result of GMT_to_df transformation
    ##output: pair_df - pandas dataframe whose rows are indexed by a tuple/ pair of terms in the set of Gene set 
    # #terms and columns represent calculated set properties between the two sets

    intersection = []
    in_A_not_B = []
    in_B_not_A = []
    union = []
    jaccard = []
    FET_pval = []

    to_set = lambda el: set(series_to_str(el).split(' '))
    space_counter = lambda str1: str1.count(" ") +1


    terms = list(df.index.values)
    int_df = pd.DataFrame(index = terms, columns = terms)
    jac_df = pd.DataFrame(index = terms, columns = terms)
    pairwise_perms = list(itertools.combinations(terms,2))
    for term1,term2 in pairwise_perms:
        setA, setB = df.loc[term1]['Genes'], df.loc[term2]['Genes']
        set1, set2 = to_set(setA), to_set(setB)
        intersect = set1 & set2
        union_set = set1 | set2
        intersection.append(' '.join(intersect))
        in_A_not_B.append(' '.join(list(set1 -set2)))
        in_B_not_A.append(' '.join(list(set2 - set1)))
        union.append(' '.join(list(union_set)))
        pval = calculate_FET(set1, set2)
        FET_pval.append(pval)

        int_size = len(intersect)
        uni_size = len(union_set)
        jaccard = int_size/uni_size
        
        jac_df.loc[term1, term2] = jaccard
        jac_df.loc[term2, term1] = jaccard
        

        int_df.loc[term1, term2] = int_size
        int_df.loc[term2,term1] = int_size





    pair_df = pd.DataFrame({'Intersection' : intersection, 'A-B' : in_A_not_B, 'B-A' : in_B_not_A, 'Union': union, 'FET_pval': FET_pval}, index = pairwise_perms)
    pair_df['intersect_size'] = pair_df['Intersection'].map(space_counter)
    pair_df['union_size'] = pair_df['Union'].map(space_counter)
    pair_df['Jaccard'] = pair_df['intersect_size'] / pair_df['union_size']
    
    np.fill_diagonal(int_df.values,0)
    np.fill_diagonal(jac_df.values, 0)
    
    

    return pair_df, int_df, jac_df




In [87]:
pair_df, int_df, jac_df = generate_pairs_df(df)

## 3. Pairwise Intersection Matrix

In [67]:
if int_tbl:
    os.makedirs("P.I_matrix", exist_ok = True)
    int_df.to_csv('P.I_matrix/intersection_matrix.csv')
    display(int_df.head())
    display(FileLink('P.I_matrix/intersection_matrix.csv', result_html_prefix= str('Download Pairwise Intersection Matrix:')))

Unnamed: 0,Integrated breast cancer pathway,Stathmin and breast cancer resistance to antimicrotubule agents,Integrated breast cancer pathway WP1984,Breast cancer pathway WP4262,Breast cancer,NOTCH1 Signaling in Breast Cancer,Genes with Mutations Associated with Breast Cancer,Genes with Mutations Associated with Hereditary Breast and/or Ovarian Cancer Syndrome,Proteins Involved in Breast Cancer Related to ERBB2/VEGFR/Akt Signaling Pathway,Hereditary Breast and Ovarian Cancer Syndrome,...,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:39,Interleukin-13 human Breast cancer - MCF10CA1a cell line (pLKO-shSCR) GSE57677 ligand:242,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:41,Interleukin-13 human Breast cancer - MCF10CA1a cell line (pLKO-shSCR) GSE57677 ligand:242.1,estradiol human MCF-7 breast cancer cells GDS3283 ligand:42,estradiol human MCF-7 breast cancer cells GDS3283 ligand:43,Interleukin-13 human Breast cancer - MCF10CA1a cell line (pLKO-shIL13RA2) GSE57677 ligand:243,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:39.1,estradiol human estrogen receptor (ER)-positive MCF7 breast cancer cells GDS3217 ligand:40,estradiol human MCF-7 breast cancer cells GDS3105 ligand:38
Integrated breast cancer pathway,0,0,146,29,25,12,4,6,23,16,...,5,0,6,0,3,12,2,5,5,6
Stathmin and breast cancer resistance to antimicrotubule agents,0,0,0,0,0,1,0,0,1,0,...,5,1,3,1,1,1,0,5,6,2
Integrated breast cancer pathway WP1984,146,0,0,30,25,12,4,6,23,16,...,6,0,7,0,3,12,2,6,5,6
Breast cancer pathway WP4262,29,0,30,0,139,14,6,6,27,14,...,5,1,6,1,11,13,1,5,3,9
Breast cancer,25,0,25,139,0,15,3,6,27,10,...,5,1,6,1,11,13,2,5,3,9


## 4. Jaccard Similarity Matrix

In [90]:
if jaccard_tbl:
    display(jac_df.head())
    jac_df.to_csv('P.I_matrix/jaccard_matrix.csv')
    display(FileLink('P.I_matrix/jaccard_matrix.csv', result_html_prefix= str('Download Pairwise Intersection Matrix:')))
    

## 5. ScatterPlot Visualization

In [60]:
class NoResults(Exception):
    pass


class APIFailure(Exception):
    pass


class NotValidFile(Exception):
    pass


class UMAP_Visualization:

    def __init__(self, query_set=[], gene_libraries=[], sig_value=.05, gmt_files=[], gmt_df = []):
        self.query_set = [gene.strip() for gene in query_set]
        self.gene_libraries = gene_libraries
        self.significant_value = sig_value
        self.term_library_map = {}
        self.dataset = OrderedDict()
        self.dataset.update(self.process_gmt_df(gmt_df))
        
        
    def process_gmt_df(self, gmt_df):
        if gmt_df == []:
            return OrderedDict() ##return the empty Dictionary when no gmt passed in
        else:
            gmt_df = gmt_df[0]
            self.term_library_map.update(pd.Series(gmt_df['Library'].values,index=gmt_df.index.values).to_dict())
            print(self.term_library_map)
            return OrderedDict(pd.Series(gmt_df['Genes'].values,index=gmt_df.index.values).to_dict())
            
   

    def process_scatterplot(self, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
        libdict = self.dataset
        print("\tTF-IDF vectorizing gene set data...")
        # computes tdfidf score--look this up
        vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
        X = vec.fit_transform(libdict.values())
        print(X.shape)
        adata = anndata.AnnData(X)
        adata.obs.index = libdict.keys()

        print("\tPerforming Leiden clustering...")
        # the n_neighbors and min_dist parameters can be altered
        sc.pp.neighbors(adata, n_neighbors=nneighbors)
        sc.tl.leiden(adata, resolution=1.0)
        sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

        new_order = adata.obs.sort_values(by='leiden').index.tolist()
        adata = adata[new_order, :]
        adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

        df = pd.DataFrame(adata.obsm['X_umap'])
        df.columns = ['x', 'y']

        df['cluster'] = adata.obs['leiden'].values
        df['term'] = adata.obs.index
        df['genes'] = [libdict[l] for l in df['term']]
        df['library'] = [self.term_library_map[l] for l in df['term']]

        return df

    def get_scatter_colors(self, df):
        clusters = pd.unique(df['cluster']).tolist()
        colors = list(Category20[20])[::2] + list(Category20[20])[1::2]
        color_mapper = {clusters[i]: colors[i % 20]
                        for i in range(len(clusters))}
        return color_mapper

    # def get_marker_mapper(self, df):
    #     markers = ["circle", "square", "triangle",
    #                "hex", "inverted_triangle", "diamond"]
    #     libs = pd.unique(df['library']).tolist()
    #     marker_mapper = {libs[i]: markers[i] for i in range(len(libs))}
    #     return marker_mapper

    def get_scatterplot(self, scatterdf):
        df = scatterdf.copy()
        color_mapper = self.get_scatter_colors(df)
        # marker_mapper = self.get_marker_mapper(df)
        df['color'] = df['cluster'].apply(lambda x: color_mapper[x])
        # df['marker'] = df['library'].apply(lambda x: marker_mapper[x])

        # range_slider = RangeSlider("title = Adjust x-axis",
        #                            start=0,
        #                            end=10,
        #                            step=1)

        tooltips = [
            ("Gene Set", "@gene_set"),
            ("Cluster", "@label"),
            ("Library", "@library")
        ]

        hover_emb = HoverTool(tooltips=tooltips)
        tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

        plot_emb = figure(
            width=900,
            height=700,
            tools=tools_emb
        )

        source = ColumnDataSource(
            data=dict(
                x=df['x'],
                y=df['y'],
                gene_set=df['term'],
                colors=df['color'],
                label=df['cluster'],
                library=df['library'],
                # markers=df['marker']

            )
        )

        # hide axis labels and grid lines
        plot_emb.xaxis.major_tick_line_color = None
        plot_emb.xaxis.minor_tick_line_color = None
        plot_emb.yaxis.major_tick_line_color = None
        plot_emb.yaxis.minor_tick_line_color = None
        plot_emb.xaxis.major_label_text_font_size = '0pt'
        plot_emb.yaxis.major_label_text_font_size = '0pt'

        plot_emb.output_backend = "svg"

        plot_emb.xaxis.axis_label = "UMAP_1"
        plot_emb.yaxis.axis_label = "UMAP_2"

        s = plot_emb.scatter(
            'x',
            'y',
            size=4,
            source=source,
            color='colors',
            legend_group='label',
            # marker='markers'
        )

        plot_emb.add_layout(plot_emb.legend[0], 'right')

        return plot_emb


In [61]:
%%appyter code_eval
if umap:
    umap = UMAP_Visualization(gmt_df = [df])
    umap_df = umap.process_scatterplot(maxdf = umap_maxdf, mindf = umap_mindf, nneighbors = umap_num_neighbors)
    fig = umap.get_scatterplot(umap_df)
    show(fig)

```python
if umap:
    umap = UMAP_Visualization(gmt_df = [df])
    umap_df = umap.process_scatterplot(maxdf = umap_maxdf, mindf = umap_mindf, nneighbors = umap_num_neighbors)
    fig = umap.get_scatterplot(umap_df)
    show(fig)
```

{'Integrated breast cancer pathway': 'BioPlanet_2019', 'Stathmin and breast cancer resistance to antimicrotubule agents': 'BioPlanet_2019', 'Integrated breast cancer pathway WP1984': 'WikiPathway_2021_Human', 'Breast cancer pathway WP4262': 'WikiPathways_2019_Human', 'Breast cancer': 'KEGG_2021_Human', 'NOTCH1 Signaling in Breast Cancer': 'Elsevier_Pathway_Collection', 'Genes with Mutations Associated with Breast Cancer': 'Elsevier_Pathway_Collection', 'Genes with Mutations Associated with Hereditary Breast and/or Ovarian Cancer Syndrome': 'Elsevier_Pathway_Collection', 'Proteins Involved in Breast Cancer Related to ERBB2/VEGFR/Akt Signaling Pathway': 'Elsevier_Pathway_Collection', 'Hereditary Breast and Ovarian Cancer Syndrome': 'Elsevier_Pathway_Collection', 'Proteins Involved in Breast Cancer Related to ESR1 Signaling Pathway': 'Elsevier_Pathway_Collection', 'Proteins Involved in Breast Cancer Related to IGF1R/Akt Signaling Pathway': 'Elsevier_Pathway_Collection', 'Proteins Involved

  adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')


None

NameError: name 'umap_df' is not defined