# Introduction

STAR went through several revisions to get SPLiT-seq working. Do those changes make any difference to the results for 10x data?

In [12]:
import csv
from collections import Counter
import scanpy
import pysam
from pathlib import Path
import pandas
import requests
import numpy
import os
import re
import shutil
import scipy
import gzip
import sys
from matplotlib import pyplot
import matplotlib
import upsetplot
import warnings
from urllib.parse import urljoin
import yaml

from common import scanpy_load_solo_mtx

In [2]:
# Make the local encoded_client checkout importable: append its expanded
# path to sys.path (only if it is not already listed) before importing from it.
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)
from encoded_client import ENCODED

In [5]:
# Make the local mex_gene_archive checkout importable: append its expanded
# path to sys.path (only if it is not already listed) before importing from it.
MEX = str(Path("~/proj/mex_gene_archive").expanduser())
if MEX not in sys.path:
    sys.path.append(MEX)
from mex_gene_archive.reader import read_mex_archive_as_anndata

In [4]:
# Make the local woldlab-rna-seq checkout importable: append its expanded
# path to sys.path (only if it is not already listed), then pull in the
# single-cell matrix QC helpers from woldrnaseq.
WRS = str(Path("~/proj/woldlab-rna-seq").expanduser())
if WRS not in sys.path:
    sys.path.append(WRS)
from woldrnaseq.plots.scrna_matrix_qc import (
    calculate_qc,
    generate_violin_plot,
    make_pct_mt_scatter,
    make_gene_by_count_scatter,
)


In [26]:
def compute_spearman_pandas(left, right):
    """Return the column-by-column Spearman correlation of two tables.

    Parameters
    ----------
    left, right
        Tables with identical shape. Columns are looked up in both tables
        using the labels of ``left.columns``.

    Returns
    -------
    pandas.Series
        One correlation coefficient per column, indexed by ``left.columns``.

    Raises
    ------
    AssertionError
        If the two tables do not have the same shape.
    """
    assert left.shape == right.shape, "Shapes must be the same"

    # spearmanr returns (correlation, pvalue); only the correlation is kept.
    coefficients = [
        scipy.stats.spearmanr(left[column], right[column])[0]
        for column in left.columns
    ]
    return pandas.Series(coefficients, index=left.columns)


In [22]:
def compute_correlations(tables):
    """Correlate every table with itself and with each later table.

    Parameters
    ----------
    tables
        Mapping of name -> table. All tables must share one shape.

    Returns
    -------
    dict
        Nested mapping ``result[name_x][name_y]`` holding the per-column
        correlations from ``compute_spearman_pandas``. Only pairs where
        ``name_y`` is ``name_x`` or comes after it in key order are present.

    Raises
    ------
    AssertionError
        If two tables differ in shape, or if the number of correlations
        does not match the number of columns.
    """
    names = list(tables.keys())
    cell_correlations = {}

    for offset, name_x in enumerate(names):
        table_x = tables[name_x]
        row = {}
        # Start at the current name so the self-comparison is included.
        for name_y in names[offset:]:
            table_y = tables[name_y]
            assert table_x.shape == table_y.shape
            per_column = compute_spearman_pandas(table_x, table_y)
            assert len(per_column) == len(table_x.columns)
            row[name_y] = per_column
        cell_correlations[name_x] = row
    return cell_correlations

In [23]:
def set_all_spines(ax, state):
    """Set the visibility of all four spines of *ax* and remove its ticks.

    Parameters
    ----------
    ax
        Axes-like object exposing a ``spines`` mapping keyed by
        "left"/"right"/"top"/"bottom" and a ``tick_params`` method.
    state
        Value passed to ``set_visible`` on each spine.

    Notes
    -----
    Tick marks and tick labels are switched off on every side regardless
    of *state*; only the spines follow it.
    """
    for name in ["left", "right", "top", "bottom"]:
        ax.spines[name].set_visible(state)

    # The tick settings do not depend on the spine being processed, so they
    # are applied once instead of once per spine.
    ax.tick_params(
        axis='both',
        which='both',
        bottom=False,
        labelbottom=False,
        top=False,
        labeltop=False,
        right=False,
        labelright=False,
        left=False,
        labelleft=False,
    )

def plot_cell_correlation_histogram(
    table, *, programs=None, bins=50, y_annot=0.6, count_nans=False,
    correlation_name='Spearman',
    xlim=None
):
    """Plot a lower-triangular grid of correlation histograms.

    One histogram is drawn for every pair made of a program and a program
    that follows it in *programs*. The grid has ``len(programs) - 1`` rows
    and columns; cells above the diagonal are blanked with
    ``set_all_spines``.

    Parameters
    ----------
    table
        Nested mapping; ``table[name_x][name_y]`` holds the correlation
        values for that pair. NaN values are dropped before plotting.
    programs
        Names to plot, in order. Defaults to ``list(table.keys())``.
    bins
        Passed to ``ax.hist``; histograms are drawn with ``density=True``.
    y_annot
        Vertical position (axes fraction) of the summary annotation; the
        horizontal position is fixed at 0.1.
    count_nans
        When true, the annotation also reports ``.isna().sum()`` of the
        unfiltered entry, so the entry must provide ``isna``.
    correlation_name
        Accepted for callers but not used by the current body.
    xlim
        If not None, passed to ``set_xlim`` on every drawn axes.

    Returns
    -------
    The figure created with ``pyplot.figure(figsize=(7, 7))``.
    """
    override_labels = {
        'Alevin': 'Salmon Alevin (EM)',
        'Kallisto EM': 'Kallisto Bus (EM)'
    }
    text_style = {'fontname': 'DejaVu Sans', 'fontsize': 14}

    def display_name(program):
        # Use the long display label when one is registered.
        return override_labels.get(program, program)

    if programs is None:
        programs = list(table.keys())
    cell_hists = {}
    f = pyplot.figure(figsize=(7, 7))
    plot_size = len(programs) - 1

    # squeeze=False always yields a 2-D array of axes, including a 1x1 grid.
    axes = f.subplots(
        plot_size, plot_size, sharex=True, sharey=True, squeeze=False
    )

    # Blank every cell above the diagonal; no pair is drawn there.
    for row in range(plot_size):
        for col in range(row + 1, plot_size):
            set_all_spines(axes[row, col], False)

    for x, name_x in enumerate(programs):
        partners = programs[programs.index(name_x) + 1:]
        for y, name_y in enumerate(partners):
            # Column x holds the pairs of name_x; rows run down from the diagonal.
            ax = axes[y + x, x]
            if xlim is not None:
                ax.set_xlim(xlim)
            if x == 0:
                ax.set_ylabel(display_name(name_y), **text_style)

            per_cell = table[name_x][name_y]
            finite = numpy.array(per_cell)
            finite = finite[~numpy.isnan(finite)]
            count = len(finite)
            median = numpy.median(finite)
            mean = numpy.mean(finite)
            cell_hists.setdefault(name_x, {})[name_y] = ax.hist(
                finite, bins=bins, density=True
            )
            nantext = (
                "\nNaNs {}".format(per_cell.isna().sum()) if count_nans else ""
            )
            ax.annotate(
                f'Mean {mean:0.2}\nMedian {median:0.2}\nCells {count}{nantext}',
                xy=(0.1, y_annot),
                xycoords='axes fraction',
            )

    # Label each column on the top row (title) and the bottom row (x label).
    for col in range(plot_size):
        column_label = display_name(programs[col])
        axes[0, col].set_title(column_label, **text_style)
        axes[plot_size - 1, col].set_xlabel(column_label, **text_style)
    return f

In [24]:
# Build a gene_id -> gene_name lookup from geneInfo.tab in the genome
# index directory.
gene_id_name_map = {}
genome_dir = Path("~/proj/genome/GRCh38-V29-male-2.7.8a").expanduser()
with open(genome_dir / "geneInfo.tab", "rt") as instream:
    # The first line is consumed here and never parsed as a gene row
    # (presumably a record-count header -- confirm against the file).
    count = instream.readline()
    for line in instream:
        # Every remaining row must split into exactly three tab-separated
        # fields, otherwise the unpacking raises ValueError.
        gene_id, gene_name, gene_type = line.rstrip().split("\t")
        # Rows whose name is the literal "NULL" fall back to the gene id.
        if gene_name == "NULL":
            gene_name = gene_id
        gene_id_name_map[gene_id] = gene_name
        


In [6]:
# Relative directory that holds the result directories compared below.
experiment_root = Path("adrenal/ENCSR726IPC_59f_nuc")

In [9]:
# The two result directories under experiment_root that are compared below.
# NOTE(review): the names suggest a STAR 2.7.9a development run versus a
# library-named run -- confirm which STAR version produced each directory.
full_eoi_dir = experiment_root / "fullsolo_multi_dev_EoI_2.7.9a_2021-09-10"
full_ssc_dir = experiment_root / "ENCLB025XJD"

In [15]:
scanpy_load_solo_mtx?

In [19]:
# Load the first matrix straight from the solo output directory; the archive
# reader call below was the earlier way of loading it and is kept disabled.
#full_eoi = read_mex_archive_as_anndata(full_eoi_dir / "GeneFull_Ex50pAS_EM_filtered.tar.gz")
full_eoi = scanpy_load_solo_mtx(full_eoi_dir, gene="GeneFull_Ex50pAS")

# The return value is discarded, so calculate_qc presumably annotates
# full_eoi in place -- confirm in woldrnaseq.
calculate_qc(full_eoi, gene_id_name_map)

# Summary numbers used to compare against the other run.
# NOTE(review): if X.sum() already returns a scalar, the trailing .sum() is
# redundant.
print("count sum", full_eoi.X.sum().sum())
print("shape", full_eoi.shape)

count sum 55598700.0
shape (12698, 59526)


In [20]:
# Load the second matrix from its packaged archive (the "Unique" filtered
# GeneFull_Ex50pAS matrix).
full_ssc = read_mex_archive_as_anndata(full_ssc_dir / "GeneFull_Ex50pAS_Unique_filtered.tar.gz")

# The return value is discarded, so calculate_qc presumably annotates
# full_ssc in place -- confirm in woldrnaseq.
calculate_qc(full_ssc, gene_id_name_map)

# Same summary numbers as printed for full_eoi, for a side-by-side check.
# NOTE(review): if X.sum() already returns a scalar, the trailing .sum() is
# redundant.
print("count sum", full_ssc.X.sum().sum())
print("shape", full_ssc.shape)

count sum 55598700.0
shape (12698, 59526)


In [27]:
# Transposed dataframes of both matrices, keyed by a short label.
# compute_correlations works column by column, so after the transpose each
# correlation is computed over one original row.
# NOTE(review): assumes to_df() is cells x genes, which would make these
# per-cell correlations -- confirm.
raw_datasets = {
    "old": full_eoi.to_df().T,
    "new": full_ssc.to_df().T,
}

# Nested dict of correlations for every (label, same-or-later label) pair.
raw_cors = compute_correlations(raw_datasets)

In [29]:
# Draw the histogram grid for the correlations computed above, using the
# default programs (all keys of raw_cors) and default binning.
f = plot_cell_correlation_histogram(raw_cors)

  return n/db/n.sum(), bin_edges
  return n/db/n.sum(), bin_edges


In [30]:
# Elementwise equality check of the two count matrices.
# NOTE(review): the recorded output of this cell is a sparse boolean matrix,
# not a single True/False, so numpy.all did not reduce the comparison here.
# A scalar answer needs a different test, e.g.
# (full_eoi.X != full_ssc.X).nnz == 0.
numpy.all(full_eoi.X == full_ssc.X)

  exec(code_obj, self.user_global_ns, self.user_ns)


<12698x59526 sparse matrix of type '<class 'numpy.bool_'>'
	with 755861148 stored elements in Compressed Sparse Row format>

In [31]:
# Keep the previous cell's displayed result; in IPython `_` holds the most
# recent output.
a = _

In [32]:
# Number of True entries in the comparison result. The recorded value,
# 755861148, equals 12698 * 59526 (the full matrix size), i.e. every
# position compared equal.
a.sum()

755861148

In [33]:
# Sum of the elementwise differences between the two matrices.
# NOTE(review): a zero sum alone does not rule out positive and negative
# differences cancelling; it is only conclusive alongside the equality
# count from the earlier cells.
(full_eoi.X - full_ssc.X).sum()

0.0