# Introduction

For the minimal datasets there are many cells for which the spearman calculation fails. I thought I should look a little harder into why.

In [20]:
import anndata
import numpy
import pandas
import scanpy
import scipy
from matplotlib import pyplot
from pathlib import Path
from collections import namedtuple

from common import (
    scanpy_load_alevin_mtx,
    scanpy_load_solo_mtx,
    scanpy_load_kallisto_gene_mtx,
    compute_spearman_anndata,
    load_barcode_list,
)

In [2]:
project_dir = Path('~/proj/encode-202006-jamboree-detrout-rna-sc-pipeline/').expanduser()
genome_dir = project_dir / 'genome' / 'mm10-M21_minimal-male'
analysis_dir = project_dir / 'ENCSR874BOF_e10_5_minimal_limb'

In [3]:
solo_filtered = scanpy_load_solo_mtx(analysis_dir / 'star_solo', mode='filtered')
solo_filtered.shape

(6289, 31635)

In [4]:
kallisto_em_raw = scanpy_load_kallisto_gene_mtx(analysis_dir / 'kallisto_em' / 'genecount')
filtered = analysis_dir / 'kallisto_em' / 'filtered-barcodes.txt'
kallisto_em_filtered = scanpy_load_kallisto_gene_mtx(analysis_dir / 'kallisto_em' / 'genecount', filtered)

In [5]:
kallisto_em_raw.shape

(584238, 31635)

In [6]:
def compute_spearman_anndata(left, right):
    """Compute spearman on AnnData using scipy.

    Does need memory
    """
    common_names = list(set(left.obs_names).intersection(right.obs_names))
    left = left[common_names,:]
    right = right[common_names,:]
    
    assert left.shape == right.shape, "Shapes must be the same"
    obs_names = left.obs_names
    if isinstance(left, anndata._core.anndata.AnnData):
        left = left.X.todense()
    if isinstance(right, anndata._core.anndata.AnnData):
        right = right.X.todense()

    #print(left.shape[0])
    #print(left[0].shape)
    #print(right[0].shape)
    cors=[]
    for i in range(left.shape[0]):
        spearman = scipy.stats.spearmanr(left[:,i], right[:,i])[0]
        if pandas.isnull(spearman):
            print(i, common_names[i])
            return
        cors.append(spearman)
    cors = pandas.Series(cors, index=obs_names)
    #[~numpy.isnan(cors)]
    return cors


In [7]:
cors = compute_spearman_anndata(solo_filtered, kallisto_em_raw)

1 GATCAGTAGGATGCGT


  c /= stddev[:, None]


In [8]:
solo_filtered_dense = solo_filtered['CGGACGTCATTATCTC'].X.todense()
kallisto_em_raw_dense = kallisto_em_raw['CGGACGTCATTATCTC'].X.todense()

In [9]:
solo_filtered_rank = scipy.stats.rankdata(solo_filtered_dense)

In [10]:
kallisto_em_raw_rank = scipy.stats.rankdata(kallisto_em_raw_dense)

In [11]:
solo_filtered_rank_std = numpy.std(solo_filtered_rank)
solo_filtered_rank_std

6092.285348806457

In [12]:
kallisto_em_raw_raw_std = numpy.std(kallisto_em_raw_rank)
kallisto_em_raw_raw_std

6174.819301269869

In [13]:
scipy.stats.pearsonr(solo_filtered_rank, kallisto_em_raw_rank)

(0.9172305172141176, 0.0)

In [17]:
cov = numpy.cov(solo_filtered_rank, kallisto_em_raw_rank)
cov

array([[37117114.06396598, 34506166.51554498],
       [34506166.51554498, 38129598.70122337]])

In [18]:
cov / (solo_filtered_rank_std * kallisto_em_raw_raw_std)

array([[0.98666498, 0.91725951],
       [0.91725951, 1.01357933]])

In [31]:
import numpy.ma as ma
import numpy as np
from scipy.stats.mstats_basic import _chk_asarray

SpearmanrResult = namedtuple('SpearmanrResult', ('correlation', 'pvalue'))


def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate'):
    """
    Calculates a Spearman rank-order correlation coefficient and the p-value
    to test for non-correlation.
    The Spearman correlation is a nonparametric measure of the linear
    relationship between two datasets. Unlike the Pearson correlation, the
    Spearman correlation does not assume that both datasets are normally
    distributed. Like other correlation coefficients, this one varies
    between -1 and +1 with 0 implying no correlation. Correlations of -1 or
    +1 imply a monotonic relationship. Positive correlations imply that
    as `x` increases, so does `y`. Negative correlations imply that as `x`
    increases, `y` decreases.
    Missing values are discarded pair-wise: if a value is missing in `x`, the
    corresponding value in `y` is masked.
    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.
    Parameters
    ----------
    x, y : 1D or 2D array_like, y is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
    use_ties : bool, optional
        DO NOT USE.  Does not do anything, keyword is only left in place for
        backwards compatibility reasons.
    axis : int or None, optional
        If axis=0 (default), then each column represents a variable, with
        observations in the rows. If axis=1, the relationship is transposed:
        each row represents a variable, while the columns contain observations.
        If axis=None, then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.
    Returns
    -------
    correlation : float
        Spearman correlation coefficient
    pvalue : float
        2-tailed p-value.
    References
    ----------
    [CRCProbStat2000] section 14.7
    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follow (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == 'omit':
        x = ma.masked_invalid(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        if not np.any(x.data):
            return SpearmanrResult(np.nan, np.nan)

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            raise ValueError("The input must have at least 3 entries!")

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        #with np.errstate(divide='ignore'):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
        t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))

        prob = 2 * distributions.t.sf(np.abs(t), dof)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            return SpearmanrResult(rs[1, 0], prob[1, 0])
        else:
            return SpearmanrResult(rs, prob)

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        for var1 in range(n_vars - 1):
            for var2 in range(var1+1, n_vars):
                result = _spearmanr_2cols(x[:, [var1, var2]])
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        return SpearmanrResult(rs, prob)

In [35]:
numpy.concat([solo_filtered_dense, kallisto_em_raw_dense])

AttributeError: module 'numpy' has no attribute 'concat'

In [39]:
spearmanr()

IndexError: tuple index out of range

In [44]:
combined = numpy.asarray([solo_filtered_dense[0], kallisto_em_raw_dense[0]])
combined.shape

(2, 1, 31635)

(2, 1, 31635)

In [48]:
solo_filtered_dense.flatten().shape

(1, 31635)