In [21]:
import sys
sys.path.append('../../')

import mudata

from src.evaluation import compute_categorical_association
from src.evaluation import compute_explained_variance_ratio
from src.evaluation import compute_geneset_enrichment
from src.evaluation import compute_trait_enrichment
from src.evaluation import compute_motif_enrichment

from plotly.subplots import make_subplots

import plotly.graph_objects as go
import plotly.express as px

import ipywidgets as w
from IPython.display import display

import numpy as np
import pandas as pd
from scipy import stats

from tqdm.auto import tqdm

In [2]:
# Load the test dataset, then keep only the first 200 observations of every
# (batch, sample) combination found in the 'cNMF' modality
mdata = mudata.read('../../../../../data/TeloHAEC_Perturb-seq_2kG/2kG.library_K60_kangh.h5mu')

cnmf_obs = mdata['cNMF'].obs
sample_labels = cnmf_obs['sample'].unique()

# Batches form the outer loop and samples the inner one, so the collected
# observation names are ordered by batch first and by sample second
sel_idx = []
for batch in cnmf_obs['batch'].unique():
    in_batch = cnmf_obs['batch'] == batch
    for samp in sample_labels:
        in_group = in_batch & (cnmf_obs['sample'] == samp)
        sel_idx.extend(cnmf_obs.loc[in_group].iloc[:200].index.tolist())

# Subset to the selected observations and display the resulting object
mdata = mdata[sel_idx].copy()
mdata



In [3]:
# Explained variance ratio of the 'cNMF' programs with respect to the 'rna' data;
# the result is kept in a variable and plotted in the dashboard cell
explained_ratios = compute_explained_variance_ratio(
    mdata,
    prog_key='cNMF',
    data_key='rna',
    inplace=False,
)

Computing explained variance: 100%|██████████| 60/60 [01:01<00:00,  1.02s/programs]


In [4]:
# Categorical association between the 'cNMF' programs and 'batch',
# using 'sample' as the pseudobulk key
association_df, posthoc_df = compute_categorical_association(
    mdata,
    prog_key='cNMF',
    pseudobulk_key='sample',
    categorical_key='batch',
    inplace=False,
    n_jobs=-1,
)

INFO:root:Perform testing by averaging over sample
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dropna().reset_index()
  prog_data = prog_data.groupby([pseudobulk_key, categorical_key]).mean().dr

In [5]:
# Gene-set enrichment of the 'cNMF' programs against the 'Reactome_2013' library
gsea_df = compute_geneset_enrichment(
    mdata,
    prog_key='cNMF',
    library='Reactome_2013',
    inplace=False,
    n_jobs=-1,
)

INFO:root:Downloading and generating Enrichr library gene sets...
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be arbitrary, which may produce unexpected results.
The order of those genes will be

In [6]:
# Trait enrichment of the 'cNMF' programs using the OpenTargets L2G table
gwas_df = compute_trait_enrichment(
    mdata,
    '../../smk/resources/OpenTargets_L2G_Filtered.csv.gz',
    prog_key='cNMF',
    inplace=False,
)

Running Fisher enrichment: 100%|██████████| 60/60 [00:03<00:00, 16.28programs/s]


In [7]:
# Motif enrichment of the 'cNMF' programs; the call yields three tables
# (matches, counts, enrichment), unpacked in that order
(motif_match_df,
 motif_count_df,
 motif_enrichment_df) = compute_motif_enrichment(
    mdata,
    prog_key='cNMF',
    data_key='rna',
    motif_file='../tests/test_data/motifs.meme',
    seq_file='../../../../../data/hg38/hg38.fa',
    coords_file='../tests/test_data/p2g_links.txt',
    n_jobs=1,
    inplace=False,
)

Motif scanning: 100%|██████████| 505/505 [00:29<00:00, 16.93genes/s]/s]
Motif scanning: 100%|██████████| 505/505 [00:29<00:00, 17.17genes/s]83s/motifs]
Motif scanning: 100%|██████████| 505/505 [00:30<00:00, 16.43genes/s]59s/motifs]
Motif scanning: 100%|██████████| 505/505 [00:28<00:00, 17.49genes/s]11s/motifs]
Motif scanning: 100%|██████████| 505/505 [00:31<00:00, 16.25genes/s]62s/motifs]
Motif scanning: 100%|██████████| 505/505 [00:33<00:00, 15.23genes/s]15s/motifs]
Motif scanning: 100%|██████████| 505/505 [00:34<00:00, 14.81genes/s]18s/motifs]
Motif scanning: 100%|██████████| 505/505 [00:33<00:00, 15.14genes/s]14s/motifs]
Matching motifs to sequences: 100%|██████████| 8/8 [04:10<00:00, 31.32s/motifs]
  motif_match_df = pd.concat(motif_match_dfs)
Computing motif enrichment: 100%|██████████| 8/8 [00:02<00:00,  3.43motifs/s]


In [14]:
# # Correlation matrix

# def sparse_corr(A):
#     N = A.shape[0]
#     C=((A.T*A -(sum(A).T*sum(A)/N))/(N-1)).todense()
#     V=np.sqrt(np.mat(np.diag(C)).T*np.mat(np.diag(C)))
#     COR = np.divide(C,V+1e-119)
#     return COR

# corr = sparse_corr(mdata['rna'].X)


In [37]:
# Cumulative number of unique terms per program (input for the dashboard panels)
def count_unique(categorical_var, count_var, dataframe):
    """
    Cumulative count of distinct `count_var` values per `categorical_var` group.

    Each (group, value) pair is counted once, no matter how many rows of
    `dataframe` repeat it. A value that occurs in several groups is counted
    once in each of those groups. Groups are ordered from the largest to the
    smallest distinct count and the counts are accumulated in that order.

    categorical_var: name of the column that defines the groups
    count_var: name of the column whose distinct values are counted
    dataframe: dataframe containing both columns

    Returns a dataframe with two columns: `categorical_var` holds the group
    labels and `count_var` holds the running total of distinct values.
    """
    # value_counts() collapses repeated rows into one entry per pair; the
    # entries themselves are what must be counted (summing their row counts
    # would count duplicated rows again).
    pair_counts = dataframe.value_counts([categorical_var, count_var])
    observed_pairs = pair_counts[pair_counts > 0]
    per_group = observed_pairs.groupby(level=categorical_var).size()

    # A stable sort keeps the order of tied groups reproducible between runs.
    cumulative = per_group.sort_values(ascending=False, kind='stable').cumsum()

    return pd.DataFrame(list(zip(cumulative.index, cumulative.to_numpy())),
                        columns=[categorical_var, count_var])

In [56]:
# Assemble dashboard

# Grid layout: rows 1-2 each hold two half-width panels and row 3 spans the
# full width. No trace is added to row 3 in this cell, and row 4 is empty.
fig = make_subplots(
    rows=4, cols=4,
    specs=[
           [{"rowspan": 1, "colspan":2}, None, {"rowspan": 1, "colspan":2}, None],
           [{"rowspan": 1, "colspan":2}, None, {"rowspan": 1, "colspan":2}, None],
           [{"rowspan": 1, "colspan":4}, None, None, None],
           [None, None, None, None]
          ],
    print_grid=True,
    subplot_titles=('Component wise R2 scores', 'Cumulative unique enriched gene-sets',
                     'Cumulative unique enriched GWAS traits', 'Cumulative unique enriched Motifs'),
    vertical_spacing = 0.05, horizontal_spacing = 0.1)


def _add_scatter_panel(dashboard, scatter_fig, row, col, tickvals, x_title=None, y_title=None):
    """
    Copy the first trace of `scatter_fig` into the (row, col) cell of `dashboard`.

    Only the trace is copied, so any layout set on `scatter_fig` (axis titles
    included) is not carried over; axis titles for the panel therefore have to
    be passed here and are applied to the dashboard axes directly.
    X tick labels are hidden and the y axis uses the given `tickvals`.
    """
    dashboard.add_trace(scatter_fig['data'][0], row=row, col=col)

    x_opts = dict(showticklabels=False)
    y_opts = dict(tickvals=tickvals, ticksuffix="  ")
    if x_title is not None:
        x_opts['title_text'] = x_title
    if y_title is not None:
        y_opts['title_text'] = y_title

    dashboard.update_xaxes(row=row, col=col, **x_opts)
    dashboard.update_yaxes(row=row, col=col, **y_opts)


# Tick positions shared by the three cumulative-count panels
count_tickvals = np.arange(0,300, 25)

# Explained variance ratios, sorted by the first column in descending order
explained_ratios = explained_ratios.sort_values(explained_ratios.columns[0], ascending=False)
plot_1 = px.scatter(x=explained_ratios.index.values, y=explained_ratios.values[:,0])
_add_scatter_panel(fig, plot_1, row=1, col=1,
                   tickvals=np.arange(-5, 5, 0.25),
                   x_title='Components', y_title='R2 score')

# Plot unique GSEA Terms per program (FDR q-val <= 0.05)
gsea_unique_df = count_unique('program_name', 'Term', gsea_df.loc[gsea_df['FDR q-val']<=0.05])
plot_2 = px.scatter(data_frame=gsea_unique_df,
                    x='program_name',
                    y='Term')
_add_scatter_panel(fig, plot_2, row=1, col=3, tickvals=count_tickvals)

# Plot unique GWAS Terms per program (Adjusted P-value <= 0.05)
gwas_unique_df = count_unique('program_name', 'Term', gwas_df.loc[gwas_df['Adjusted P-value']<=0.05])
plot_3 = px.scatter(data_frame=gwas_unique_df,
                    x='program_name',
                    y='Term')
_add_scatter_panel(fig, plot_3, row=2, col=1, tickvals=count_tickvals)

# Plot unique Motif Terms per program (an 'fdr' column is added to motif_enrichment_df, then fdr <= 0.05)
motif_enrichment_df['fdr'] = stats.false_discovery_control(motif_enrichment_df.pval.astype(float))
motif_unique_df = count_unique('index', 'motif', motif_enrichment_df.loc[motif_enrichment_df['fdr']<=0.05])
plot_4 = px.scatter(data_frame=motif_unique_df,
                    x='index',
                    y='motif')
_add_scatter_panel(fig, plot_4, row=2, col=3, tickvals=count_tickvals)

# Loadings table and its title; both are consumed by the gen_bar call in a later cell
loadings = pd.DataFrame(mdata['cNMF'].varm['loadings'],
                        index=mdata['cNMF'].var_names,
                        columns=mdata['cNMF'].uns['var_names']).T
title = "Loadings per component"


fig.update_layout(height=1415, width=1000, title_text="GEP Dashboard - v0.1")
fig.show()


This is the format of your plot grid:
[ (1,1) x,y             -      ]  [ (1,3) x2,y2           -      ]
[ (2,1) x3,y3           -      ]  [ (2,3) x4,y4           -      ]
[ (3,1) x5,y5           -                -                -      ]
    (empty)          (empty)          (empty)          (empty)    



In [81]:
def gen_bar(dataframe, title, n_top=100):

    """
    Displays an interactive plotly bar chart with one dropdown entry per column.

    For every column of the dataframe a bar trace of its `n_top` largest
    values is added (x: row labels, y: values). Only the trace of the first
    column is visible at the start; picking another column in the dropdown
    shows that column's trace and updates the figure title.

    dataframe: dataframe whose columns are plotted one at a time
    title: base title for the visualization; the shown column name is appended
    n_top: number of largest entries plotted per column (default 100)

    Returns None; the figure is rendered with fig.show().
    """

    col_opts = list(dataframe.columns)

    def _panel_title(col):
        # Title shown while `col` is the selected column
        return f"{title} - {col}"

    # Define plot
    fig = go.Figure()

    for k, col in enumerate(col_opts):

        # Ascending sort + tail keeps the largest values, smallest first.
        # NaNs are dropped beforehand because sort_values puts them last,
        # where they would otherwise take the place of the top entries.
        top = dataframe[col].dropna().sort_values(ascending=True).tail(n_top)
        fig.add_trace(
            go.Bar(x=top.index,
                   y=top.values,
                   customdata=top.index.values,
                   name='',
                   orientation='v',
                   hovertemplate="%{customdata}: %{y}",
                   visible=(k == 0),
                   marker_color='cyan',
                   ))

    # Define buttons for dropdown
    buttons_opts = []
    for i, opt in enumerate(col_opts):
        visible_mask = [j == i for j in range(len(col_opts))]
        buttons_opts.append(
            dict(
                # 'update' takes a trace patch and a layout patch; the title
                # lives in the layout, so a 'restyle' button cannot change it
                method='update',
                label=str(opt),
                args=[
                    {'visible': visible_mask, 'showlegend': False},
                    {'title.text': _panel_title(opt)},
                ]
            )
        )

    # Styling
    initial_title = _panel_title(col_opts[0]) if col_opts else f"{title}"
    fig.update_layout(
        updatemenus = [go.layout.Updatemenu(
            active=0,
            buttons=buttons_opts,
            x=1.12,
            xanchor='right',
            y=1.1,
            yanchor='top'
            )],
        xaxis={
            # Reversed axis: the largest value (last after the ascending sort) is drawn first
            'autorange': "reversed",
            'showline': True,
            'linecolor': 'black',
            'title': None,
            'showticklabels': False
        },
        yaxis={'showticklabels': False},
        title=dict(text=initial_title),
        showlegend=False,
        width=1000,
        height=400,
        plot_bgcolor='whitesmoke',
        #paper_bgcolor='#f0f0f0',
        margin=dict(l=85, r=85, t=95, b=45)
    )

    fig.show()
# Show the bar chart for the `loadings` frame and `title` prepared in the dashboard cell
gen_bar(loadings, title)