In [1]:
import numpy as np
import pandas as pd
import anndata as ad
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from scipy import sparse

%matplotlib inline

# Load adata & csv

1. Run `brain1_vs_brain3_gene_qc.ipynb` first; it generates `../results/Brain_1_3_TH_ZI_geneQC.h5ad`, which is loaded in the next cell.

In [2]:
# Gene-QC AnnData produced by brain1_vs_brain3_gene_qc.ipynb. On a fresh
# Code Ocean run that notebook has to be executed first so this file exists.
geneqc_h5ad_path = '/root/capsule/results/Brain_1_3_TH_ZI_geneQC.h5ad'
adata_qc = ad.read_h5ad(geneqc_h5ad_path)

In [3]:
# Display the object returned by ad.read_h5ad to inspect what was loaded.
adata_qc

In [4]:
# Show the table stored under 'mean_diff_plot_results_flags' in .uns; later
# cells select its rows by gene name and read its b1/b3 difference and flag
# columns.
adata_qc.uns['mean_diff_plot_results_flags']

In [5]:
# Preliminary gene list CSV provided by Brian.
wmb_prelim_genes_path = '/code/resources/brain3_PreliminaryGeneList_testmousewb_fromBrian.csv'
wmb_prelim_genes_csv = pd.read_csv(wmb_prelim_genes_path)

In [6]:
# Index the table by gene name so rows can be selected by gene with .loc.
# The assignment is non-inplace and guarded so that re-running this cell is a
# no-op instead of raising KeyError once "Vizgen Gene" is already the index.
if wmb_prelim_genes_csv.index.name != "Vizgen Gene":
    wmb_prelim_genes_csv = wmb_prelim_genes_csv.set_index("Vizgen Gene")
wmb_prelim_genes_csv

## Find list of overlapping genes

In [8]:
# Work out which genes appear in the csv, in the AnnData flag table, and in both.
flag_table_index = adata_qc.uns['mean_diff_plot_results_flags'].index

genes_csv = sorted(wmb_prelim_genes_csv.index.tolist())
genes_ad = sorted(flag_table_index.tolist())
genes_both = sorted(set(genes_csv).intersection(genes_ad))

print(f'{len(genes_csv)=}')
print(f'{len(genes_ad)=}')
print(f'{len(genes_both)=}')

# When every AnnData gene is also in the csv, use the AnnData gene list as is;
# otherwise fall back to the genes the two sources have in common.
genes = genes_ad if set(genes_ad) <= set(genes_csv) else genes_both

## Subset to just those genes

In [9]:
# Restrict the mean-difference flag table to the selected genes (rows in `genes` order).
mean_diff_flags_all = adata_qc.uns['mean_diff_plot_results_flags']
mean_diff_plot_results_df = mean_diff_flags_all.loc[genes, :]
mean_diff_plot_results_df

In [10]:
# Restrict the 'gene_qc_flags' table to the selected genes (rows in `genes` order).
gene_qc_flags_all = adata_qc.uns['gene_qc_flags']
blank_qc_df = gene_qc_flags_all.loc[genes, :]
blank_qc_df

In [11]:
# Restrict the csv table to the selected genes (rows in `genes` order).
wmb_df = wmb_prelim_genes_csv.loc[genes]
wmb_df

# Plot csv data

In [12]:
# Scatter each gene's number of probe target regions (x) against its abundance
# (y) on log-log axes, and label every point with the gene name.
fig, ax = plt.subplots(figsize=(20, 10))
sc = ax.scatter(wmb_df['Target Regions'],
                wmb_df['Abundance'],
                s=18, color='cornflowerblue')
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('# of target regions on gene')
ax.set_ylabel('gene abundance')

# Walk the rows once and hand annotate() plain scalars. The previous per-gene
# boolean-mask lookup rescanned the whole table for every gene and passed
# length-1 Series as coordinates.
for gene, n_regions, abundance in zip(wmb_df.index,
                                      wmb_df['Target Regions'],
                                      wmb_df['Abundance']):
    ax.annotate(gene, (n_regions, abundance), fontsize=10, color='k')

plt.show()

# Plot target regions vs QC metrics

## Blank metrics

In [17]:
# Scatter gene abundance (x) against the number of probe target regions (y) on
# log-log axes and label every gene. Label styling follows the blank-QC flags:
# magenta/large if 'below_blanks_all_cells_brain3' is set, otherwise
# lightsteelblue/large if 'below_blanks_all_cells_brain1' is set, otherwise
# small black text.
fig, ax = plt.subplots(figsize=(20, 10))
sc = ax.scatter(wmb_df['Abundance'], wmb_df['Target Regions'],
                s=18, color='cornflowerblue')
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('gene abundance')
ax.set_ylabel('# of target regions on gene')

# Single pass over the rows with scalar coordinates; the flags are read with
# scalar .at lookups and bool() instead of the deprecated Series.bool().
for gene, abundance, n_regions in zip(wmb_df.index,
                                      wmb_df['Abundance'],
                                      wmb_df['Target Regions']):
    if bool(blank_qc_df.at[gene, 'below_blanks_all_cells_brain3']):
        color = 'magenta'
        fontsize = 16
    elif bool(blank_qc_df.at[gene, 'below_blanks_all_cells_brain1']):
        color = 'lightsteelblue'
        fontsize = 16
    else:
        color = 'k'
        fontsize = 10

    ax.annotate(gene, (abundance, n_regions),
                fontsize=fontsize, color=color)

# Colour key, placed at fixed data coordinates.
ax.annotate('Genes flagged for reads below mean blank reads in brain 1',
            (0.04, 350),
            fontsize=12, color='lightsteelblue')
ax.annotate('Genes flagged for reads below mean blank reads in brain 3',
            (0.04, 300),
            fontsize=12, color='magenta')

plt.show()

## Mean-difference plot metrics

In [14]:
# Columns of the mean-difference results table used by the plots below.
# Each row groups one metric's (difference column, brain3 flag column,
# brain1 flag column), so the three lists built from it stay aligned by position.
_metric_columns = [
    ('b1b3_diff_95pctReads_allCells',
     'b3_flagged_95pctReads_allCells',
     'b1_flagged_95pctReads_allCells'),
    ('b1b3_diff_99pctLogReads_allCells',
     'b3_flagged_99pctLogReads_allCells',
     'b1_flagged_99pctLogReads_allCells'),
    ('b1b3_diff_99pctLogReads_neurons',
     'b3_flagged_99pctLogReads_neurons',
     'b1_flagged_99pctLogReads_neurons'),
    ('b1b3_diff_99pctLogReads_nn',
     'b3_flagged_99pctLogReads_nn',
     'b1_flagged_99pctLogReads_nn'),
    ('b1b3_diff_99pctLogReads_exc',
     'b3_flagged_99pctLogReads_exc',
     'b1_flagged_99pctLogReads_exc'),
    ('b1b3_diff_99pctLogReads_inh',
     'b3_flagged_99pctLogReads_inh',
     'b1_flagged_99pctLogReads_inh'),
    ('b1b3_diff_95pctReads_diffAsPctOfMean_allCells',
     'b3_flagged_95pctReads_diffAsPctOfMean_allCells',
     'b1_flagged_95pctReads_diffAsPctOfMean_allCells'),
]

# Unzip the table into the three parallel lists the plotting cell indexes.
data_col, flag_col_b3, flag_col_b1 = (list(cols) for cols in zip(*_metric_columns))

In [24]:
# For every mean-difference metric, scatter the metric value (x) against the
# number of probe target regions (y, log scale) and label each gene. Labels
# are magenta/large when the gene's brain3 flag column is set, otherwise
# lightsteelblue/large when its brain1 flag column is set, otherwise small
# black text. After each figure the brain3-flagged genes are printed.

# The three column lists are consumed in lockstep, so they must line up.
assert len(data_col) == len(flag_col_b3) == len(flag_col_b1)

for diff_col, b3_flag_col, b1_flag_col in zip(data_col, flag_col_b3, flag_col_b1):
    fig, ax = plt.subplots(figsize=(20, 10))

    sc = ax.scatter(mean_diff_plot_results_df[diff_col],
                    wmb_df['Target Regions'],
                    s=18,
                    color='cornflowerblue')
    ax.set_yscale('log')
    ax.set_xlabel(diff_col)
    ax.set_ylabel('# of target regions on gene')

    for gene in genes:
        # Scalar .at lookups give annotate() plain numbers and replace the
        # deprecated Series.bool() for reading the flags.
        diff_value = mean_diff_plot_results_df.at[gene, diff_col]
        n_regions = wmb_df.at[gene, 'Target Regions']

        if bool(mean_diff_plot_results_df.at[gene, b3_flag_col]):
            color = 'magenta'
            fontsize = 16
        elif bool(mean_diff_plot_results_df.at[gene, b1_flag_col]):
            color = 'lightsteelblue'
            fontsize = 16
        else:
            color = 'k'
            fontsize = 10
        ax.annotate(gene, (diff_value, n_regions),
                    fontsize=fontsize, color=color)

    # Colour key, placed at fixed data coordinates.
    ax.annotate('genes flagged in brain1 for low reads (relative to brain3)',
                (-1.3, 40),
                fontsize=12, color='lightsteelblue')
    ax.annotate('genes flagged in brain3 for low reads (relative to brain1)',
                (0.5, 40),
                fontsize=12, color='magenta')

    plt.show()
    # Release the figure so looping over many metrics does not accumulate
    # open figures.
    plt.close(fig)

    print('brain3 flagged genes:', mean_diff_plot_results_df[mean_diff_plot_results_df[b3_flag_col]].index.tolist())