In [1]:
import scvelo as scv
import scanpy as sc
import numpy as np


In [2]:
# Load the pancreatic endocrinogenesis dataset via scVelo's dataset helper
# and display the summary of the returned object.
# NOTE(review): the variable is called `anndata`, the same name as the AnnData
# package; importing that package under its own name later in this notebook
# would rebind this variable.
anndata = scv.datasets.pancreatic_endocrinogenesis()
anndata

AnnData object with n_obs × n_vars = 3696 × 27998
    obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score'
    var: 'highly_variable_genes'
    uns: 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_umap'
    layers: 'spliced', 'unspliced'
    obsp: 'distances', 'connectivities'

In [3]:
# --- tunable preprocessing parameters ---
min_counts_genes = 10  # minimum number of counts a gene needs to be kept
n_hvg = 200  # number of highly variable genes (HVGs) to retain

# Drop genes whose counts fall below the threshold.
sc.pp.filter_genes(anndata, min_counts=min_counts_genes)

# Cell-level count filtering is currently disabled:
# min_counts_cell = None
# sc.pp.filter_cells(anndata, min_counts=min_counts_cell)

# Normalize the counts of every cell.
sc.pp.normalize_total(anndata)

# log1p transformation
# (before log1p the row sums are all equal; after log1p they differ slightly)
sc.pp.log1p(anndata)

# Keep only the top `n_hvg` highly variable genes (subset=True drops the rest).
sc.pp.highly_variable_genes(anndata, n_top_genes=n_hvg, subset=True)

In [8]:
# Show the names in the `var` index (the genes) still present in the object.
list(anndata.var.index)

['Resp18',
 'Scg2',
 'Rgs4',
 'Fcgr3',
 'Fcer1g',
 'Cenpf',
 'Atf3',
 'Col3a1',
 'Igfbp2',
 'Ccl20',
 'Arg1',
 'Srgn',
 'Cdk1',
 'Lyz2',
 'Nab2',
 'Ppp1r14c',
 'Pkib',
 'Dcn',
 'Tspan8',
 'Igfbp3',
 'Rasd1',
 'Serpinf2',
 'Ccl9',
 'Ccl3',
 'Lhx1',
 'Neurod2',
 'Top2a',
 'Krtap17-1',
 'Hap1',
 'Ppy',
 'Igfbp1',
 'Upp1',
 'Mfap4',
 'Aurkb',
 'Ccl2',
 'Ccl4',
 'Col1a1',
 'Gip',
 'Hoxb4',
 'Hoxb2',
 'Krtap31-1',
 'Gast',
 'Wnt3',
 'Mapt',
 'Ttyh2',
 'Birc5',
 'Rtn1',
 'Serpina6',
 'Serpina1b',
 'Serpina1e',
 'Rrm2',
 'Sostdc1',
 'Pou6f2',
 'Fbp1',
 'Irx1',
 'Tmem171',
 'Cartpt',
 'Ccnb1',
 'Fam159b',
 'Cks2',
 'Gadd45g',
 'Irx2',
 'Pnoc',
 'Pcdh8',
 'Lgals3',
 'Stmn4',
 'Rac2',
 'Ppp1r1a',
 'Aard',
 'Slc30a8',
 'Lgals1',
 'Prph',
 'Tmem114',
 'Sdf2l1',
 'Sst',
 'Gap43',
 'Hes1',
 'Tff3',
 'Tff2',
 'Hspa1a',
 'Lrpprc',
 'Pde10a',
 'Dpysl3',
 'Ttr',
 'Ifit1bl1',
 'Fgf8',
 'Hhex',
 'Ins1',
 'Spc25',
 'Mdk',
 'Epb42',
 'Tgm7',
 'Avp',
 'Stmn3',
 'Olfm1',
 'Dapl1',
 'G6pc2',
 'Nusap1',
 'Ctxn2'

In [4]:
# Densify the expression matrix and sum along axis 1, i.e. one total per row
# (per observation) of the current matrix.
anndata.X.toarray().sum(axis=1)

array([20.012465, 10.617302, 29.051683, ..., 17.684248, 39.785583,
       37.8008  ], dtype=float32)

In [7]:
# Display the per-observation annotation table.
anndata.obs

Unnamed: 0_level_0,clusters_coarse,clusters,S_score,G2M_score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACCTGAGAGGGATA,Pre-endocrine,Pre-endocrine,-0.224902,-0.252071
AAACCTGAGCCTTGAT,Ductal,Ductal,-0.014707,-0.232610
AAACCTGAGGCAATTA,Endocrine,Alpha,-0.171255,-0.286834
AAACCTGCATCATCCC,Ductal,Ductal,0.599244,0.191243
AAACCTGGTAAGTGGC,Ngn3 high EP,Ngn3 high EP,-0.179981,-0.126030
...,...,...,...,...
TTTGTCAAGTGACATA,Pre-endocrine,Pre-endocrine,-0.235896,-0.266101
TTTGTCAAGTGTGGCA,Ngn3 high EP,Ngn3 high EP,0.279374,-0.204047
TTTGTCAGTTGTTTGG,Ductal,Ductal,-0.045692,-0.208907
TTTGTCATCGAATGCT,Endocrine,Alpha,-0.240576,-0.206865


In [21]:
def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray:
        """
        Digitize the data into bins. This method spreads data uniformly when bins
        have same values.

        Args:

        x (:class:`np.ndarray`):
            The data to digitize.
        bins (:class:`np.ndarray`):
            The bins to use for digitization, in increasing order.

        Returns:

        :class:`np.ndarray`:
            The digitized data.
        """
        assert x.ndim == 1 and bins.ndim == 1

        left_digits = np.digitize(x, bins)
        right_digits = np.digitize(x, bins, right=True)

        rands = np.random.rand(len(x))  # uniform random numbers

        digits = rands * (right_digits - left_digits) + left_digits
        digits = np.ceil(digits).astype(np.int64)
        return digits

In [24]:
# get full data matrix (includes zeros)
data = anndata.X.toarray()
n_bins = 10
binned_rows = []
bin_edges = []
# perform value binning per cell; only row 1 is selected here so that the
# intermediate values printed below stay readable
for row in data[1:2]:
    non_zero_ids = row.nonzero()
    print(non_zero_ids)
    if non_zero_ids[0].size == 0:
        # Nothing is expressed in this cell: there are no values to take
        # quantiles of (np.quantile on an empty array errors or yields NaN
        # edges depending on the NumPy version). Emit an all-zero row and
        # all-zero edges with the same length as the regular case
        # (1 + (n_bins - 1) == n_bins).
        binned_rows.append(np.zeros_like(row, dtype=np.int64))
        bin_edges.append(np.zeros(n_bins))
        continue
    non_zero_row = row[non_zero_ids]
    print(non_zero_row)
    # get borders of equally distributed bins (n_bins - 1 quantile edges)
    bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1))
    print(bins)
    # spread all values equally across the bins
    non_zero_digits = _digitize(non_zero_row, bins)
    print(non_zero_digits)
    # zero entries keep bin 0; non-zero entries get their digitized bin
    binned_row = np.zeros_like(row, dtype=np.int64)
    # assign genes to bins
    binned_row[non_zero_ids] = non_zero_digits
    binned_rows.append(binned_row)
    # prepend 0 as the edge for the zero bin
    bin_edges.append(np.concatenate([[0], bins]))
binned_data = np.stack(binned_rows)

(array([  1,  13,  20,  37,  47,  62,  75,  80,  90,  97, 106, 109, 111,
       117, 118, 119, 138, 143, 146, 155, 158, 186, 195, 202, 208, 219,
       230, 231, 243, 244, 265, 272, 279, 281, 282, 298, 309, 310, 330,
       354, 365, 375, 395, 401, 410, 412, 428, 429, 431, 439, 443, 444,
       449, 460, 462, 465, 476, 481]),)
[1.0123336 0.6291461 1.0123336 1.2886881 1.0123336 0.6291461 0.6291461
 0.6291461 1.0123336 1.2886881 0.6291461 1.2886881 0.6291461 1.2886881
 0.6291461 0.6291461 1.2886881 0.6291461 1.0123336 3.0921414 0.6291461
 1.5049729 0.6291461 0.6291461 2.5167365 1.0123336 1.2886881 1.0123336
 1.2886881 0.6291461 1.8335485 1.5049729 0.6291461 0.6291461 1.6826957
 0.6291461 1.0123336 1.0123336 0.6291461 1.0123336 0.6291461 0.6291461
 0.6291461 1.0123336 1.0123336 1.0123336 1.5049729 0.6291461 1.2886881
 0.6291461 1.5049729 2.4433982 0.6291461 1.2886881 0.6291461 1.6826957
 0.6291461 0.6291461]
[0.6291461  0.6291461  0.6291461  0.6291461  1.01233363 1.01233363
 1.28868806 1.

In [25]:
# Display the stacked integer bin assignments produced by the binning cell.
binned_data

array([[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 5, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 5, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7,
        0, 2, 0, 0, 0, 0, 0, 7, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 3, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
        0, 9, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
        0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 8, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 2, 0, 2, 8, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [32]:
# Display the dense expression matrix that the binning cell operated on.
data

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.2886881 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.47240812,
        0.47240812],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [33]:
from scanpy.get import _get_obs_rep, _set_obs_rep