In [1]:
import sys
sys.path.append('../../')

import mudata

from src.evaluation import compute_categorical_association
from src.evaluation import compute_explained_variance_ratio
from src.evaluation import compute_geneset_enrichment
from src.evaluation import compute_trait_enrichment
from src.evaluation import compute_motif_enrichment

from plotly.subplots import make_subplots

import plotly.graph_objects as go
import plotly.express as px

import ipywidgets as w
from IPython.display import display

import numpy as np
import pandas as pd
from scipy import stats

from tqdm.auto import tqdm

In [2]:
# Load the test MuData object from disk
mdata = mudata.read('../../../../../data/TeloHAEC_Perturb-seq_2kG/2kG.library_K60_kangh.h5mu')

# Subsample: keep at most the first 200 observations of every (batch, sample)
# combination, visiting batches in the outer loop and samples in the inner loop.
cnmf_obs = mdata['cNMF'].obs
sel_idx = []
for batch in cnmf_obs['batch'].unique():
    in_batch = cnmf_obs['batch'] == batch
    for samp in cnmf_obs['sample'].unique():
        in_sample = cnmf_obs['sample'] == samp
        sel_idx.extend(cnmf_obs.loc[in_batch & in_sample].index[:200].tolist())

# Restrict the whole MuData object to the selected observations
mdata = mdata[sel_idx].copy()
mdata



In [3]:
# Compute every evaluation table the dashboard needs. All calls use
# inplace=False so the results come back as return values.
from sklearn.decomposition import PCA

# Explained variance ratio plot
explained_ratios = compute_explained_variance_ratio(mdata, prog_key='cNMF', data_key='rna', inplace=False)

# Since original data wasn't available we use this as a stand in:
# the explained variance ratios of a PCA fitted on the cNMF matrix.
# One component per row of explained_ratios, because the ratios are assigned
# as a column of that table and the lengths therefore have to match.
pca = PCA(n_components=len(explained_ratios))

explained_ratios['explained_variance_ratio_X'] = pca.fit(mdata['cNMF'].X).explained_variance_ratio_

# Categorical association plot
association_df, posthoc_df = compute_categorical_association(mdata, prog_key='cNMF',
                                                             pseudobulk_key='sample',
                                                             categorical_key='batch',
                                                             inplace=False,
                                                             n_jobs=-1)

# Geneset enrichment
gsea_df = compute_geneset_enrichment(mdata, prog_key='cNMF', library='Reactome_2013', inplace=False, n_jobs=-1)

# Trait enrichment
gwas_df = compute_trait_enrichment(mdata, '../../smk/resources/OpenTargets_L2G_Filtered.csv.gz', prog_key='cNMF', inplace=False)

# Motif enrichment
motif_match_df, motif_count_df, motif_enrichment_df = \
compute_motif_enrichment(mdata, prog_key='cNMF', data_key='rna', motif_file='../tests/test_data/motifs.meme',
                         seq_file='../../../../../data/hg38/hg38.fa', coords_file='../tests/test_data/p2g_links.txt',
                         n_jobs=1, inplace=False)

Computing explained variance:   0%|          | 0/60 [00:00<?, ?programs/s]

In [1]:
# Assemble dashboard - page 1

# Plot unique terms per program
def count_unique(categorical_var, count_var, dataframe):
    """Cumulative count of distinct ``count_var`` values per ``categorical_var`` group.

    For every group of ``categorical_var`` the number of *distinct* values in
    ``count_var`` is computed, so repeated (group, value) rows are counted
    once. Groups are then ordered from the largest to the smallest count and
    the counts are accumulated in that order.

    Parameters
    ----------
    categorical_var : str
        Column of ``dataframe`` to group by.
    count_var : str
        Column of ``dataframe`` whose distinct values are counted.
    dataframe : pandas.DataFrame
        Table containing both columns.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``categorical_var`` and ``count_var``; the latter holds
        the running total of distinct values, one row per group.
    """
    # nunique() counts each distinct value once per group
    distinct_per_group = dataframe.groupby(categorical_var)[count_var].nunique()

    # Largest groups first, then accumulate
    cumulative = distinct_per_group.sort_values(ascending=False).cumsum()

    return cumulative.rename(count_var).reset_index()

# Metric columns in the order they occupy the 2x2 grid (row-major). The same
# strings are used as subplot titles and as column names of the exported
# table, so every panel is drawn under its own title.
metric_cols = ('Cummulative component wise R2 scores', 'Cummulative enriched gene-sets',
               'Cummulative enriched GWAS traits', 'Cummulative enriched Promoter Motifs')

fig = make_subplots(
    rows=2, cols=2,
    specs=[
           [{}, {}],
           [{}, {}],
          ],
    print_grid=True,
    subplot_titles=metric_cols,
    vertical_spacing = 0.05, horizontal_spacing = 0.1)

# Explained variance ratios, largest first (ordered by the first column)
explained_ratios = explained_ratios.sort_values(explained_ratios.columns[0], ascending=False)

# Unique GSEA terms per program (FDR <= 0.05)
gsea_unique_df = count_unique('program_name', 'Term', gsea_df.loc[gsea_df['FDR q-val']<=0.05])

# Unique GWAS terms per program (adjusted p-value <= 0.05)
gwas_unique_df = count_unique('program_name', 'Term', gwas_df.loc[gwas_df['Adjusted P-value']<=0.05])

# Unique motif terms per program (FDR <= 0.05)
motif_enrichment_df['fdr'] = stats.false_discovery_control(motif_enrichment_df.pval.astype(float))
significant_motifs = motif_enrichment_df.loc[motif_enrichment_df['fdr'] <= 0.05]
if 'index' not in significant_motifs.columns:
    # NOTE(review): assumes the program names are carried by the table's index
    # when there is no 'index' column -- confirm against compute_motif_enrichment.
    significant_motifs = significant_motifs.rename_axis('index').reset_index()
# This table was referenced below without ever being created (NameError).
motif_unique_df = count_unique('index', 'motif', significant_motifs)

# Generate example data for dashapp

a = explained_ratios.reset_index()

a['prog_name'] = a['index']
a['Cummulative component wise R2 scores'] = a['explained_variance_ratio_X'].cumsum()
a = a.drop(['index', 'explained_variance_ratio_X'], axis=1)

gwas_unique_df['Cummulative enriched GWAS traits'] = gwas_unique_df['Term']
gwas_unique_df['prog_name'] = gwas_unique_df['program_name']
a = a.merge(gwas_unique_df.loc[:, ['prog_name', 'Cummulative enriched GWAS traits']], on='prog_name', how='left')

gsea_unique_df['Cummulative enriched gene-sets'] = gsea_unique_df['Term']
gsea_unique_df['prog_name'] = gsea_unique_df['program_name']
a = a.merge(gsea_unique_df.loc[:, ['prog_name', 'Cummulative enriched gene-sets']], on='prog_name', how='left')

motif_unique_df['Cummulative enriched Promoter Motifs'] = motif_unique_df['motif']
motif_unique_df['prog_name'] = motif_unique_df['index']
a = a.merge(motif_unique_df.loc[:, ['prog_name', 'Cummulative enriched Promoter Motifs']], on='prog_name', how='left')

# Programs absent from an enrichment table get that table's maximum cumulative value
for enrich_col in ('Cummulative enriched GWAS traits',
                   'Cummulative enriched gene-sets',
                   'Cummulative enriched Promoter Motifs'):
    a[enrich_col] = a[enrich_col].fillna(a[enrich_col].max())

a.to_csv('dashapp/example_data/fit_metrics.txt', sep='\t', index=False)

# One scatter per metric, placed row-major on the grid in the same order as the titles
plots = {}
for i, col in enumerate(metric_cols):

    ordered = a.sort_values(col)
    plots[col] = px.scatter(x=ordered['prog_name'],
                            y=ordered[col],)
    plots[col].update_layout(xaxis_title='Components', yaxis_title=col)

    row_num, col_num = (pos + 1 for pos in divmod(i, 2))
    fig.add_trace(plots[col]['data'][0], row=row_num, col=col_num)
    fig.update_xaxes(showticklabels=False, row=row_num, col=col_num)
    fig.update_yaxes(ticksuffix = "  ", row=row_num, col=col_num)

fig.update_traces(hovertemplate="Program Name: %{x} <br> Value: %{y}", marker_color='silver')
fig.update_layout(height=700, width=1000,
                  plot_bgcolor='whitesmoke',
                  title_text="GEP Dashboard - v0.1 - Goodness of fit measures")
fig.show()


NameError: name 'make_subplots' is not defined

In [2]:
# Page 2 - program assessment

# Batch association: FDR-adjust the Kruskal-Wallis p-values, then take the
# negative natural log of the adjusted values and the natural log of the statistic.
association_df['batch_sample_kruskall_wallis_fdr'] = stats.false_discovery_control(
    association_df['batch_sample_kruskall_wallis_pval'])
association_df['batch_sample_kruskall_wallis_neg_log_fdr'] = -np.log(
    association_df['batch_sample_kruskall_wallis_fdr'].astype(float))

association_df['batch_sample_kruskall_wallis_log_stat'] = np.log(
    association_df['batch_sample_kruskall_wallis_stat'].astype(float))

# Program scores across batch: one row per observation, one positional column
# per column of the cNMF matrix, plus the sample and batch annotations.
cnmf = mdata['cNMF']
prog_df = pd.DataFrame(cnmf.X, index=cnmf.obs.index)
prog_df['sample'] = cnmf.obs['sample']
prog_df['batch'] = cnmf.obs['batch']

# Loadings: after the transpose, rows follow uns['var_names'] and columns
# follow the cNMF var_names.
loadings = pd.DataFrame(cnmf.varm['loadings'],
                        index=cnmf.var_names,
                        columns=cnmf.uns['var_names']).T
# Scale every column by its largest absolute value
loadings = loadings.div(loadings.abs().max(axis=0), axis='columns')

# Make figure: volcano (top-left), box plot per batch (top-right) and a
# full-width bar chart of the top loadings (bottom row).
fig = make_subplots(rows=2, cols=2,
                    specs=[
                           [{}, {}],
                           [{'colspan':2}, None],
                          ],
                    print_grid=True,
                    subplot_titles=('Enrichment w.r.t. batch',
                                    'Program distribution across batch',
                                    'Program-gene loadings (normalised)'),
                    vertical_spacing = 0.1,
                    horizontal_spacing = 0.1)

# Score columns of prog_df (everything except the two annotation columns)
program_cols = [col for col in prog_df.columns if col not in ['sample', 'batch']]

# Plot volcano: one identical trace per program so that this panel has the
# same number of traces as the other two and the dropdown can toggle them
# position by position.
for k in range(len(program_cols)):
    volcano = go.Scatter(x=association_df.batch_sample_kruskall_wallis_log_stat,
                         y=association_df.batch_sample_kruskall_wallis_neg_log_fdr,
                         customdata=association_df.index.values,
                         hovertemplate=" Program Name: %{customdata}",
                         showlegend=False, mode='markers',
                         visible=(k == 0),
                         marker_color='silver',
                         )
    fig.add_trace(volcano, row=1, col=1)
# Axis styling does not depend on the trace, so it is applied once
fig.update_xaxes(showticklabels=False, row=1, col=1)
fig.update_yaxes(title='Neg. log adjusted pval', ticksuffix = "  ", row=1, col=1)

# Plot box: scores of each program split by batch
for k, r in enumerate(program_cols):

    fig.add_trace(
                  go.Box(x=prog_df.batch,
                         y=prog_df[r],
                         visible=(k == 0),
                         customdata=prog_df.index.values,
                         hovertemplate="Cell barcode: %{customdata}",
                         marker_color='silver',
                        ),
                   row=1, col=2)
fig.update_yaxes(ticksuffix = "  ", row=1, col=2)

# Plot loadings: the 100 largest normalised loadings of each column
for k, r in enumerate(loadings.columns):

    top_loadings = loadings.sort_values(r, ascending=False)[:100]
    fig.add_trace(
                  go.Bar(x=top_loadings.index,
                         y=top_loadings[r],
                         name='',
                         orientation='v',
                         hovertemplate="Gene Name: %{x} <br> Loading: %{y}",
                         visible=(k == 0),
                         marker_color='silver',
                        ),
                   row=2, col=1)
fig.update_xaxes(showticklabels=False, row=2, col=1)
fig.update_yaxes(tickvals=np.arange(0,1.25,0.25), ticksuffix = "  ", row=2, col=1)

# Define buttons for dropdown
col_opts = list(loadings.columns)
# Traces were added as three consecutive groups (volcano, box, bar). The
# visibility mask spells out one entry per trace of every group instead of
# relying on how a shorter list would be applied to the remaining traces.
group_sizes = (len(program_cols), len(program_cols), len(col_opts))
buttons_opts = []
for i, opt in enumerate(col_opts):
    visible_mask = [j == i for size in group_sizes for j in range(size)]
    buttons_opts.append(
        dict(
             method='update',
             label=opt,
             # NOTE(review): 'title' is passed in the trace-update dict, not as a
             # layout update -- confirm whether a figure title change was intended.
             args=[{
                    'visible': visible_mask, #this is the key line!
                    'title': opt,
                    'showlegend': False
                   }]
            ))

fig.update_layout(height=700, width=1000,
                  plot_bgcolor='whitesmoke',

                  title_text="GEP Dashboard - v0.1 - Investigate GEPs",
                  updatemenus = [go.layout.Updatemenu(
                                 active=0,
                                 buttons=buttons_opts,
                                 x=-0.1,
                                 xanchor='left',
                                 y=1,
                                 yanchor='bottom')]
                 )
fig.show()





NameError: name 'stats' is not defined