In [29]:
# Project-local helpers used throughout this notebook (reading inputs, processing counts, plotting).
import scripts.notebook_utils as utils
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "ticks" style as the plotting theme for the figures in this notebook.
sns.set(style = 'ticks')
# NOTE(review): importlib is not referenced in any visible cell — confirm it is still needed.
import importlib
import warnings
import os
# Suppress every Python warning for the rest of the session; note that this also hides
# deprecation and numerical warnings raised by the libraries above.
warnings.filterwarnings("ignore")

# Load Data

In [3]:
# Fragment-level table produced by the bias model (one of the Snakemake inputs).
fragments = utils.read_bias_file(snakemake.input.fragments)

# Bias-corrected count matrix and the result of processing it.
corrected_data = utils.read_sparse_countmatrix(
    snakemake.input.barcodes,
    snakemake.input.peaks,
    snakemake.input.corrected_counts,
)
corrected_data_lsi_model = utils.process_counts(corrected_data)

# Uncorrected count matrix, read with the same barcode and peak files, processed the same way.
raw_data = utils.read_sparse_countmatrix(
    snakemake.input.barcodes,
    snakemake.input.peaks,
    snakemake.input.counts,
)
raw_data_lsi_model = utils.process_counts(raw_data)

# Restrict/align the fragment table to the barcodes present in the count matrix.
# The right join keeps every barcode from corrected_data.obs, including barcodes that
# have no matching fragment rows.
# NOTE(review): assumes the obs index is unnamed, so reset_index() yields a column
# called 'index' — confirm against read_sparse_countmatrix.
matrix_barcodes = (
    corrected_data.obs
    .reset_index()
    .rename(columns = {'index' : 'barcode'})['barcode']
)
fragments = fragments.merge(matrix_barcodes, on = 'barcode', how = 'right')

# Fragment Model Performance

In [4]:
# 2-D density of the 'log_duprate' column against the 'true_log_duprate' column
# (presumably model prediction vs. observation, going by the axis labels).
points = utils.benchmark_fragment_model(fragments)

# displot returns a FacetGrid rather than a single Axes, hence the name `grid`.
grid = sns.displot(
    data = points,
    x = 'log_duprate',
    y = 'true_log_duprate',
    kind = 'kde',
)
# NOTE(review): the plotted columns are log duplication rates but the labels say
# "Fragment Count" — confirm the labels are what is intended.
grid.set(xlabel = 'Predicted Fragment Count', ylabel = 'Observed Fragment Count')

# Compare UMAPs

In [16]:
# Two panels side by side: corrected embedding on the left, uncorrected on the right.
fig, (corrected_ax, raw_ax) = plt.subplots(1, 2, figsize = (14, 5))

# The corrected embedding is coloured with the leiden labels taken from raw_data, so
# both panels share one colouring.
utils.plot_umap(
    corrected_data,
    color_key = raw_data.obs.leiden,
    quantitative = False,
    ax = corrected_ax,
)
corrected_ax.set(title = 'Bias-corrected UMAP', xlabel = 'Colored by uncorrected clustering')

utils.plot_umap(
    raw_data,
    color_key = 'leiden',
    quantitative = False,
    ax = raw_ax,
    legend = True,
)
raw_ax.set(title = 'Uncorrected UMAP')

# Observe Bias Concentration

In [6]:
# Aggregate the fragment table with the project helper, then attach the leiden label
# from raw_data.obs by index (presumably the cell barcode — verify against the helper).
barcode_stats = (
    utils.aggregate_cell_stats(fragments)
    .join(raw_data.obs['leiden'])
)

In [7]:
# One point per row of barcode_stats, coloured by leiden label (legend hidden).
ax = sns.scatterplot(
    data = barcode_stats,
    x = 'fragment_count',
    y = 'mean_log_duprate',
    hue = 'leiden',
    legend = False,
)
# The x axis is log-scaled rather than the data being log-transformed.
ax.set_xscale('log')
ax.set_xlabel('log(Fragment Count)')
ax.set_ylabel('Mean log(duplication rate) per cell')
sns.despine()

In [8]:
# Attach the per-cell bias summaries to raw_data.obs so the UMAPs below can be coloured by them.
#
# Any existing copies of these columns are dropped first. DataFrame.join raises a
# ValueError on overlapping column names, so without the drop this cell fails whenever
# it is executed a second time in the same kernel. On a first run the result is
# identical to a plain join.
cell_stat_columns = ['mean_log_duprate', 'fragment_count']
raw_data.obs = (
    raw_data.obs
    .drop(columns = cell_stat_columns, errors = 'ignore')
    .join(barcode_stats[cell_stat_columns])
)

In [9]:
# Uncorrected embedding shown twice, coloured by the two per-cell columns that were
# joined onto raw_data.obs.
fig, (bias_ax, depth_ax) = plt.subplots(1, 2, figsize = (14, 5))

utils.plot_umap(raw_data, ax = bias_ax, color_key = 'mean_log_duprate')
bias_ax.set(title = 'Mean Bias Per Cell')

utils.plot_umap(raw_data, ax = depth_ax, color_key = 'fragment_count')
depth_ax.set(title = 'Fragment Count Per Cell')

# Enrichment of Biased Peaks Per Cluster

In [10]:
# Rank peaks with the project helper and discard entries for which no rank is available.
peak_ranks = utils.rank_peaks(fragments).dropna()

# Per-cluster enrichment computed from the fragments, the uncorrected leiden labels and
# the peak ranks; cast to float for the density plot in the next cell.
cluster_enrichments = utils.get_bias_peak_enrichment(
    fragments,
    raw_data.obs['leiden'],
    peak_ranks,
).astype(float)

In [11]:
# KDE of the 'rank' column with one curve per cluster. common_norm = False normalises
# each cluster's density independently, so clusters of different size stay comparable.
sns.displot(
    data = cluster_enrichments.reset_index(),
    x = 'rank',
    hue = 'cluster',
    kind = 'kde',
    common_norm = False,
)

# Cell-Specific Effects Analysis

In [14]:
# Only cells whose leiden label is '0' or '1' are passed to the helper.
leiden_labels = raw_data.obs['leiden']
stratified_sample = utils.get_fragment_distribution_by_peak_and_cluster(
    fragments,
    leiden_labels[leiden_labels.isin(['0', '1'])],
    peak_ranks,
)

In [15]:
fig, ax = plt.subplots(figsize = (15, 5))

# Peaks are laid out along the x axis in ascending order of rank, using the first
# 'rank' value recorded for each peak_id.
peak_order = (
    stratified_sample
    .groupby('peak_id')['rank']
    .first()
    .sort_values()
    .index
    .values
)

sns.swarmplot(
    data = stratified_sample,
    x = 'peak_id',
    y = 'log_duprate_y',
    hue = 'cluster',
    order = peak_order,
    dodge = True,
    size = 1.5,
    ax = ax,
)
sns.despine()
# Individual peak ids are not shown as ticks; the axis label carries the ordering instead.
ax.set(xticks = [], ylabel = 'Log(Duplication Rate)', xlabel = 'less bias ← Peaks → more bias')

# Differential Peaks Analysis

In [18]:
# Differential peaks computed by the project helper from the uncorrected data.
# The export cell that follows iterates over diffpeaks.T row by row, i.e. it treats each
# column of this result as one cluster and the column's values as peak identifiers.
diffpeaks = utils.get_differential_peaks(raw_data)

In [30]:
# Write one BED file per cluster into the directory given by snakemake.output[0].
#
# Each peak identifier is expected to look like "<chrom>_<start>_<end>".
# NOTE(review): that format is inferred from the underscore-to-tab conversion — confirm
# against get_differential_peaks.

# The output path is used as a directory below; make sure it exists before opening files in it.
os.makedirs(snakemake.output[0], exist_ok = True)

for cluster, peak_ids in diffpeaks.T.iterrows():
    bed_path = os.path.join(snakemake.output[0], 'cluster_{}_diffpeaks.bed'.format(str(cluster)))
    # Split from the right into at most three fields so that contig names which themselves
    # contain underscores (e.g. "chr1_KI270706v1_random") stay intact in the first BED
    # column; replacing every underscore would spread them over extra columns.
    # dropna() skips missing entries (e.g. if clusters have different numbers of peaks),
    # which are not strings and cannot be split.
    bed_lines = ['\t'.join(peak_id.rsplit('_', 2)) for peak_id in peak_ids.dropna()]
    with open(bed_path, 'w') as f:
        print('\n'.join(bed_lines), file = f)