In [38]:
import scvelo as scv
import scanpy as sc
import numpy as np

In [39]:
# load dataset
anndata = scv.datasets.pancreatic_endocrinogenesis()
anndata

AnnData object with n_obs × n_vars = 3696 × 27998
    obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score'
    var: 'highly_variable_genes'
    uns: 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_umap'
    layers: 'spliced', 'unspliced'
    obsp: 'distances', 'connectivities'

In [40]:
denta = scv.datasets.dentategyrus()

In [41]:
denta

AnnData object with n_obs × n_vars = 2930 × 13913
    obs: 'clusters', 'age(days)', 'clusters_enlarged'
    uns: 'clusters_colors'
    obsm: 'X_umap'
    layers: 'ambiguous', 'spliced', 'unspliced'

In [42]:
# filter by counts of genes
min_counts_genes = 10
sc.pp.filter_genes(anndata, 
                   min_counts=min_counts_genes
)

# filter by counts of cell
# min_counts_cell = None
# sc.pp.filter_cells(anndata,
#                    min_counts=min_counts_cell
# )

# normalize counts
sc.pp.normalize_total(anndata)

# log1p transformation
# without log1p row sums are all equal, with log1p they slightly differ
sc.pp.log1p(anndata)

In [43]:
# filter by counts of genes
min_counts_genes = 10
sc.pp.filter_genes(denta, 
                   min_counts=min_counts_genes
)

# filter by counts of cell
# min_counts_cell = None
# sc.pp.filter_cells(anndata,
#                    min_counts=min_counts_cell
# )

# normalize counts
sc.pp.normalize_total(denta)

# log1p transformation
# without log1p row sums are all equal, with log1p they slightly differ
sc.pp.log1p(denta)

# get highly varaible genes (hvg)
n_hvg = 200
sc.pp.highly_variable_genes(denta, n_top_genes=n_hvg, subset=True)

In [46]:
anndata.var.index.values

array(['Mrpl15', '4732440D04Rik', 'Gm26901', ..., 'Eif2s3y', 'Gm29650',
       'Erdr1'], dtype=object)

In [48]:
denta.var[denta.var.index.isin(anndata.var.index.values)]

Unnamed: 0_level_0,n_counts,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Col3a1,102.0,True,0.026731,2.138095,4.653627
Igfbp2,649.0,True,0.189566,2.202324,4.816107
Rgs5,404.0,True,0.091822,2.500787,5.571124
Pigm,64.0,True,0.019549,2.075384,4.494987
Ptpn14,33.0,True,0.011106,2.034065,4.390463
...,...,...,...,...,...
Plp1,8021.0,True,1.268913,4.950113,3.995849
Tbc1d25,66.0,True,0.021798,2.129013,4.630653
Rlim,153.0,True,0.041962,1.871540,3.979325
Itm2a,1097.0,True,0.246508,2.689193,6.047734


In [29]:
denta['index'][denta['index'].isin(anndata.var.index.values)]

66        Col3a1
117       Igfbp2
497       Resp18
503         Scg2
663         Rgs4
          ...   
12078       Wnk3
12160    Smarca1
12178      Fgf13
12269    Tmsb15l
12284     Maged2
Name: index, Length: 123, dtype: object

In [7]:
anndata.obs

Unnamed: 0_level_0,clusters_coarse,clusters,S_score,G2M_score
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAACCTGAGAGGGATA,Pre-endocrine,Pre-endocrine,-0.224902,-0.252071
AAACCTGAGCCTTGAT,Ductal,Ductal,-0.014707,-0.232610
AAACCTGAGGCAATTA,Endocrine,Alpha,-0.171255,-0.286834
AAACCTGCATCATCCC,Ductal,Ductal,0.599244,0.191243
AAACCTGGTAAGTGGC,Ngn3 high EP,Ngn3 high EP,-0.179981,-0.126030
...,...,...,...,...
TTTGTCAAGTGACATA,Pre-endocrine,Pre-endocrine,-0.235896,-0.266101
TTTGTCAAGTGTGGCA,Ngn3 high EP,Ngn3 high EP,0.279374,-0.204047
TTTGTCAGTTGTTTGG,Ductal,Ductal,-0.045692,-0.208907
TTTGTCATCGAATGCT,Endocrine,Alpha,-0.240576,-0.206865


In [21]:
def _digitize(x: np.ndarray, bins: np.ndarray) -> np.ndarray:
        """
        Digitize the data into bins. This method spreads data uniformly when bins
        have same values.

        Args:

        x (:class:`np.ndarray`):
            The data to digitize.
        bins (:class:`np.ndarray`):
            The bins to use for digitization, in increasing order.

        Returns:

        :class:`np.ndarray`:
            The digitized data.
        """
        assert x.ndim == 1 and bins.ndim == 1

        left_digits = np.digitize(x, bins)
        right_digits = np.digitize(x, bins, right=True)

        rands = np.random.rand(len(x))  # uniform random numbers

        digits = rands * (right_digits - left_digits) + left_digits
        digits = np.ceil(digits).astype(np.int64)
        return digits

In [24]:
# get full data matrix (includes zeros)
data = anndata.X.toarray()
n_bins = 10
binned_rows = []
bin_edges = []
# perform value binning for each cell
for row in data[1:2]:
    non_zero_ids = row.nonzero()
    print(non_zero_ids)
    non_zero_row = row[non_zero_ids]
    print(non_zero_row)
    # get borders of equally distributed bins
    bins = np.quantile(non_zero_row, np.linspace(0, 1, n_bins - 1))
    print(bins)
    # spread all values equally across the bins
    non_zero_digits = _digitize(non_zero_row, bins)
    print(non_zero_digits)
    binned_row = np.zeros_like(row, dtype=np.int64)
    # assign genes to bins
    binned_row[non_zero_ids] = non_zero_digits
    binned_rows.append(binned_row)
    bin_edges.append(np.concatenate([[0], bins]))
binned_data = np.stack(binned_rows)

(array([  1,  13,  20,  37,  47,  62,  75,  80,  90,  97, 106, 109, 111,
       117, 118, 119, 138, 143, 146, 155, 158, 186, 195, 202, 208, 219,
       230, 231, 243, 244, 265, 272, 279, 281, 282, 298, 309, 310, 330,
       354, 365, 375, 395, 401, 410, 412, 428, 429, 431, 439, 443, 444,
       449, 460, 462, 465, 476, 481]),)
[1.0123336 0.6291461 1.0123336 1.2886881 1.0123336 0.6291461 0.6291461
 0.6291461 1.0123336 1.2886881 0.6291461 1.2886881 0.6291461 1.2886881
 0.6291461 0.6291461 1.2886881 0.6291461 1.0123336 3.0921414 0.6291461
 1.5049729 0.6291461 0.6291461 2.5167365 1.0123336 1.2886881 1.0123336
 1.2886881 0.6291461 1.8335485 1.5049729 0.6291461 0.6291461 1.6826957
 0.6291461 1.0123336 1.0123336 0.6291461 1.0123336 0.6291461 0.6291461
 0.6291461 1.0123336 1.0123336 1.0123336 1.5049729 0.6291461 1.2886881
 0.6291461 1.5049729 2.4433982 0.6291461 1.2886881 0.6291461 1.6826957
 0.6291461 0.6291461]
[0.6291461  0.6291461  0.6291461  0.6291461  1.01233363 1.01233363
 1.28868806 1.

In [14]:
hvg_pancreas = anndata.var
hvg_pancreas

Unnamed: 0_level_0,highly_variable_genes,n_counts,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Resp18,True,1648.0,True,0.386105,1.364425,5.119360
Scg2,True,3518.0,True,0.646343,1.621541,3.553158
Rgs4,True,338.0,True,0.088585,1.408274,6.609155
Fcgr3,True,18.0,True,0.004750,1.828977,8.712379
Fcer1g,True,85.0,True,0.021694,1.126170,5.198826
...,...,...,...,...,...,...
Arx,True,2261.0,True,0.513403,1.514864,4.975472
Pou3f4,True,1648.0,True,0.414762,1.228591,3.892176
Zcchc18,True,3927.0,True,0.732543,1.676028,3.697033
Wnk3,True,692.0,True,0.174621,0.883510,3.985689


In [29]:
anndata.var.index.isin(denta.var.index.values)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False, False,  True,  True,  True,  True,  True,
        True, False, False, False, False, False,  True,  True,  True,
        True,  True, False, False, False,  True,  True,  True, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True,  True,  True,  True, False, False,  True,  True,
        True,  True, False, False, False,  True, False, False,  True,
       False, False,  True,  True,  True, False, False,  True,  True,
        True,  True, False,  True,  True, False, False,  True, False,
        True,  True,  True, False,  True, False,  True,  True,  True,
        True, False, False,  True,  True,  True, False, False, False,
       False, False,

In [30]:
anndata.var[anndata.var.index.isin(denta.var.index.values)]

Unnamed: 0_level_0,highly_variable_genes,n_counts,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Resp18,True,1648.0,True,0.386105,1.364425,5.119360
Scg2,True,3518.0,True,0.646343,1.621541,3.553158
Rgs4,True,338.0,True,0.088585,1.408274,6.609155
Fcgr3,True,18.0,True,0.004750,1.828977,8.712379
Fcer1g,True,85.0,True,0.021694,1.126170,5.198826
...,...,...,...,...,...,...
Zcchc12,True,128.0,True,0.034697,0.964286,4.389517
Arx,True,2261.0,True,0.513403,1.514864,4.975472
Pou3f4,True,1648.0,True,0.414762,1.228591,3.892176
Zcchc18,True,3927.0,True,0.732543,1.676028,3.697033
