In [3]:
import scvelo as scv
import scanpy as sc
import numpy as np

In [4]:
# Load the pancreatic endocrinogenesis example dataset through scVelo's
# `datasets` module and display the returned object's summary.
# NOTE(review): the variable name `anndata` matches the name of the AnnData
# package; a later `import anndata` would silently rebind it — confirm no
# such import exists further down.
anndata = scv.datasets.pancreatic_endocrinogenesis()
anndata

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50.0M/50.0M [00:08<00:00, 6.40MB/s]


AnnData object with n_obs × n_vars = 3696 × 27998
    obs: 'clusters_coarse', 'clusters', 'S_score', 'G2M_score'
    var: 'highly_variable_genes'
    uns: 'clusters_coarse_colors', 'clusters_colors', 'day_colors', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_umap'
    layers: 'spliced', 'unspliced'
    obsp: 'distances', 'connectivities'

In [5]:
# --- Gene filtering -------------------------------------------------------
# Keep only genes that reach the minimum count threshold.
min_counts_genes = 10
sc.pp.filter_genes(anndata, min_counts=min_counts_genes)

# Cell filtering by counts is currently disabled; kept for reference:
# min_counts_cell = None
# sc.pp.filter_cells(anndata, min_counts=min_counts_cell)

# --- Normalization --------------------------------------------------------
# Normalize the counts of every cell (modifies `anndata` in place).
sc.pp.normalize_total(anndata)

# Apply the log1p transformation.
# Before this step all row sums are equal; afterwards they differ slightly.
sc.pp.log1p(anndata)

# --- Highly variable genes (HVG) -------------------------------------------
# Select the top `n_hvg` highly variable genes and subset `anndata` to them.
n_hvg = 2400
sc.pp.highly_variable_genes(anndata, subset=True, n_top_genes=n_hvg)

In [6]:
# Sum the densified expression matrix along axis 1, giving one total per row.
np.sum(anndata.X.toarray(), axis=1)

array([462.78717, 339.6714 , 439.14117, ..., 324.27844, 382.44965,
       471.09863], dtype=float32)

In [7]:
# Display the per-variable (gene) annotation table stored on the object.
anndata.var

Unnamed: 0_level_0,highly_variable_genes,n_counts,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Sbspon,True,767.0,True,0.143064,0.277315,0.955128
Mcm3,True,1615.0,True,0.282309,0.514745,1.532715
Neurl3,True,31.0,True,0.007501,0.239100,0.764078
Creg2,False,10.0,True,0.003019,0.160805,0.372656
Fhl2,True,888.0,True,0.232534,0.818194,2.813627
...,...,...,...,...,...,...
Ap1s2,True,478.0,True,0.127904,0.319952,1.168286
Tmem27,True,9385.0,True,1.297930,2.476463,3.335991
Uty,False,69.0,True,0.018678,0.182476,0.480999
Ddx3y,True,675.0,True,0.165302,0.351228,1.324641


In [8]:
# Convert the expression matrix to a dense array so zeros are stored explicitly.
data = anndata.X.toarray()

# Inspect one row (index 1): print its values, then the column indices of its
# non-zero entries. Widen the slice to inspect more rows.
# A bare expression inside a loop body is never displayed by the notebook, so
# every result that should be visible is printed explicitly.
for row in data[1:2]:
    print(row)
    print(row.nonzero())

[0.        1.2886881 0.        ... 0.        0.        0.       ]
(array([   1,    5,    6,    9,   37,   41,   46,   52,   58,   59,   61,
         69,   87,  105,  118,  126,  134,  145,  152,  156,  166,  175,
        177,  178,  180,  187,  189,  194,  205,  212,  213,  214,  217,
        234,  241,  250,  253,  262,  277,  289,  291,  292,  297,  307,
        322,  340,  345,  347,  350,  351,  352,  357,  362,  376,  379,
        408,  410,  426,  434,  438,  447,  450,  454,  457,  461,  483,
        487,  491,  492,  493,  505,  512,  513,  514,  515,  516,  517,
        521,  536,  542,  543,  547,  548,  556,  567,  577,  587,  600,
        610,  618,  619,  629,  636,  638,  640,  648,  653,  665,  669,
        672,  673,  680,  688,  689,  696,  698,  708,  710,  716,  731,
        734,  736,  741,  755,  761,  772,  781,  786,  788,  790,  792,
        796,  809,  812,  818,  821,  835,  836,  851,  853,  859,  863,
        865,  874,  883,  884,  889,  891,  900,  918,  9

In [9]:
# Shape check on a four-row slice (rows 1 through 4) of the dense matrix.
data[1:5].shape

(4, 2400)