In [16]:
import scvelo as scv
import scanpy as sc
import numpy as np


In [17]:
# Load the pancreatic endocrinogenesis dataset through scvelo's `datasets` module.
# The bare `anndata` on the last line makes the notebook display the object summary.
anndata = scv.datasets.pancreatic_endocrinogenesis()
anndata

AnnData object with n_obs × n_vars = 3696 × 27998
    obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score'
    var: 'highly_variable_genes'
    uns: 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_umap'
    layers: 'spliced', 'unspliced'
    obsp: 'distances', 'connectivities'

In [18]:
# Gene filter: keep only genes that reach a minimum count.
min_counts_genes = 10
sc.pp.filter_genes(anndata, min_counts=min_counts_genes)

# Cell filter: currently disabled, no minimum count per cell is enforced.
# min_counts_cell = None
# sc.pp.filter_cells(anndata, min_counts=min_counts_cell)

# Normalize the counts of every cell.
sc.pp.normalize_total(anndata)

# Apply the log1p transformation.
# Before this step all row sums are equal; afterwards they differ slightly.
sc.pp.log1p(anndata)

# Select the highly variable genes (HVG) and subset the object to them.
n_hvg = 500
sc.pp.highly_variable_genes(anndata, subset=True, n_top_genes=n_hvg)

In [19]:
# Densify the expression matrix and sum along axis 1, i.e. one total per row (cell).
anndata.X.toarray().sum(axis=1)

array([ 97.20135 ,  60.387436, 105.9458  , ...,  72.68019 , 110.29435 ,
       120.79845 ], dtype=float32)

In [20]:
# Display the per-gene annotation table (`.var`) of the preprocessed object.
anndata.var

Unnamed: 0_level_0,highly_variable_genes,n_counts,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fhl2,True,888.0,True,0.232534,0.818194,2.813627
Akr1cl,True,719.0,True,0.134404,0.679629,2.966423
Mreg,False,245.0,True,0.072796,0.551569,2.326211
Fev,True,7929.0,True,1.267251,2.473334,3.330596
Cryba2,True,7184.0,True,1.141365,2.059144,3.168725
...,...,...,...,...,...,...
Pak3,True,3246.0,True,0.651349,1.196751,2.431485
Wnk3,True,692.0,True,0.174621,0.883510,3.985689
Cypt3,True,43.0,True,0.012533,1.141024,5.273086
Rai2,True,919.0,True,0.232043,0.652113,2.112570


In [21]:
def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray:
        """
        Digitize the data into bins. This method spreads data uniformly when bins
        have same values.

        Args:

        x (:class:`np.ndarray`):
            The data to digitize.
        bins (:class:`np.ndarray`):
            The bins to use for digitization, in increasing order.

        Returns:

        :class:`np.ndarray`:
            The digitized data.
        """
        assert x.ndim == 1 and bins.ndim == 1

        left_digits = np.digitize(x, bins)
        right_digits = np.digitize(x, bins, right=True)

        rands = np.random.rand(len(x))  # uniform random numbers

        digits = rands * (right_digits - left_digits) + left_digits
        digits = np.ceil(digits).astype(np.int64)
        return digits

In [24]:
# get full data matrix (includes zeros)
data = anndata.X.toarray()
# Bin 0 is kept for zero entries; non-zero values are digitized against
# n_bins - 1 quantile edges.
n_bins = 10
binned_rows = []
bin_edges = []
# Perform value binning per cell.
# NOTE(review): only the row at index 1 is processed (`data[1:2]`), presumably
# to keep the debug prints below short; iterate over `data` to bin every cell.
for row in data[1:2]:
    non_zero_ids = row.nonzero()
    if non_zero_ids[0].size == 0:
        # Row without any non-zero entry: quantiles of an empty array are not
        # meaningful, so emit an all-zero binned row and all-zero edges.
        binned_rows.append(np.zeros_like(row, dtype=np.int64))
        bin_edges.append(np.zeros(n_bins))
        continue
    print(non_zero_ids)
    non_zero_row = row[non_zero_ids]
    print(non_zero_row)
    # get borders of equally distributed bins (quantiles of the non-zero values)
    bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1))
    print(bins)
    # spread all values equally across the bins (ties are broken randomly)
    non_zero_digits = _digitize(non_zero_row, bins)
    print(non_zero_digits)
    # start from all zeros so entries that are zero in `row` stay in bin 0
    binned_row = np.zeros_like(row, dtype=np.int64)
    # assign genes to bins
    binned_row[non_zero_ids] = non_zero_digits
    binned_rows.append(binned_row)
    # prepend 0 as the edge for the zero bin
    bin_edges.append(np.concatenate([[0], bins]))
binned_data = np.stack(binned_rows)

(array([  1,  13,  20,  37,  47,  62,  75,  80,  90,  97, 106, 109, 111,
       117, 118, 119, 138, 143, 146, 155, 158, 186, 195, 202, 208, 219,
       230, 231, 243, 244, 265, 272, 279, 281, 282, 298, 309, 310, 330,
       354, 365, 375, 395, 401, 410, 412, 428, 429, 431, 439, 443, 444,
       449, 460, 462, 465, 476, 481]),)
[1.0123336 0.6291461 1.0123336 1.2886881 1.0123336 0.6291461 0.6291461
 0.6291461 1.0123336 1.2886881 0.6291461 1.2886881 0.6291461 1.2886881
 0.6291461 0.6291461 1.2886881 0.6291461 1.0123336 3.0921414 0.6291461
 1.5049729 0.6291461 0.6291461 2.5167365 1.0123336 1.2886881 1.0123336
 1.2886881 0.6291461 1.8335485 1.5049729 0.6291461 0.6291461 1.6826957
 0.6291461 1.0123336 1.0123336 0.6291461 1.0123336 0.6291461 0.6291461
 0.6291461 1.0123336 1.0123336 1.0123336 1.5049729 0.6291461 1.2886881
 0.6291461 1.5049729 2.4433982 0.6291461 1.2886881 0.6291461 1.6826957
 0.6291461 0.6291461]
[0.6291461  0.6291461  0.6291461  0.6291461  1.01233363 1.01233363
 1.28868806 1.

In [25]:
# Display the stacked bin assignments; entries left at 0 are zero in the input row.
binned_data

array([[0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 5, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 5, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7,
        0, 2, 0, 0, 0, 0, 0, 7, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 3, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0,
        0, 9, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
        0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 8, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 2, 0, 2, 8, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [32]:
# Display the dense expression matrix obtained from `anndata.X.toarray()`.
data

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.2886881 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.47240812,
        0.47240812],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [33]:
from scanpy.get import _get_obs_rep, _set_obs_rep