# Making a tiny version of the Tabula Sapiens data, containing only a small random subset of the cell types, so that it is small enough to run scSHC on.

## INPUT: 

    * data/test_data/TabulaSapiens_subset.h5ad

## OUTPUT: 

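    * data/test_data/TabulaSapiens_tiny.h5ad
    * data/test_data/TabulaSapiens_subset.h5ad (the input file, re-saved in place with an added 'counts' layer)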
    
# Environment Setup

In [1]:
work_dir = './' # Set this to the root of your local clone of Cytocipher_manuscript/
import os
# Run from the repository root so that the relative paths used below resolve.
os.chdir(work_dir)

import numpy as np
import pandas as pd
import scanpy as sc

# Directory (relative to work_dir) holding both the input and the output .h5ad files.
data_dir = 'data/test_data/'

# Loading the data

In [2]:
# Load the Tabula Sapiens subset from which the tiny dataset will be drawn.
data = sc.read_h5ad(data_dir+'TabulaSapiens_subset.h5ad')

In [3]:
# Dimensions of the loaded data: (cells, genes).
data.shape

(7385, 2435)

In [5]:
# List the per-cell annotation columns available in .obs.
data.obs.keys()

Index(['organ_tissue', 'method', 'donor', 'anatomical_information',
       'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation',
       'manually_annotated', 'compartment', 'gender', 'organ_tissue_merged',
       'overclusters', 'overclusters_merged'],
      dtype='object')

In [6]:
# Number of distinct labels in the 'overclusters' annotation.
print(len(np.unique(data.obs['overclusters'].values)))

503


# Subsetting the cell types...

In [26]:
# Cell-type label of every cell, as plain strings.
labels = data.obs['cell_ontology_class'].values.astype(str)
label_set = np.unique(labels)

# Draw 12 distinct cell types at random.
# NOTE: the draw is unseeded, so each run selects a different set of cell
# types (and therefore a different number of cells); the outputs recorded
# below reflect one particular draw.
sub_labels = np.random.choice(label_set, size=12, replace=False)

# Positional indices of every cell that belongs to a selected cell type,
# grouped by cell type in the order the types were drawn.
remaining_cells = np.concatenate(
    [np.where(labels == label)[0] for label in sub_labels]
).tolist()

print(len(remaining_cells))

500


In [27]:
# Show which cell types were drawn.
sub_labels

array(['eye photoreceptor cell', 'retinal ganglion cell',
       'small intestine goblet cell', 'smooth muscle cell', 'leucocyte',
       'lung ciliated cell', 'erythroid progenitor',
       'retinal blood vessel endothelial cell', 'mast cell',
       'endothelial cell of artery',
       'cd4-positive, alpha-beta memory t cell',
       'lacrimal gland functional unit cell'], dtype='<U59')

In [33]:
# Keep only the selected cells (all genes); .copy() makes `small` independent of `data`.
small = data[remaining_cells,:].copy()

In [34]:
# Dimensions of the reduced data: (cells, genes).
small.shape

(500, 2435)

In [35]:
# Filter genes in place, requiring each gene to be present in at least 3 of the remaining cells.
sc.pp.filter_genes(small, min_cells=3)

In [36]:
# Dimensions after gene filtering.
small.shape

(500, 2180)

In [43]:
# Take the matrix stored in `small.raw` (used as the counts in this notebook),
# convert it to a cells x genes DataFrame, and keep only the genes that
# survived filtering, in the same order as `small.var_names`.
counts = (
    small.raw
    .to_adata()
    .to_df()
    .loc[:, small.var_names]
)
counts.shape

(500, 2180)

In [58]:
# Sanity check: the counts table must line up with `small`, both by cell
# (row index vs. obs_names) and by gene (columns vs. var_names).
print(
    'Same cells: ',
    np.all(counts.index.values == small.obs_names.values),
)
print(
    'Same genes: ',
    np.all(counts.columns.values == small.var_names.values),
)

Same cells:  True
Same genes:  True


In [44]:
# Inspect the extracted count matrix as a plain array.
counts.values

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 1.,  0.,  0., ...,  0., 10.,  3.],
       [ 0.,  9.,  1., ...,  0., 68.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [45]:
# Build a minimal AnnData that carries only the expression matrix and the
# cell/gene annotations of `small`.
small_min = sc.AnnData(
    X=small.to_df(),
    obs=small.obs,
    var=small.var,
)
# Attach the aligned counts as a separate layer.
small_min.layers['counts'] = counts.values

In [46]:
# Save the tiny dataset to disk, gzip-compressed.
small_min.write_h5ad(data_dir+'TabulaSapiens_tiny.h5ad', compression='gzip')

## Also adding the counts as a 'counts' layer to TabulaSapiens_subset.h5ad (the file is overwritten in place)

In [48]:
# Back to the full subset loaded at the top: confirm its dimensions (cells, genes).
data.shape

(7385, 2435)

In [62]:
# Take the matrix stored in `data.raw`, convert it to a cells x genes
# DataFrame, and restrict/reorder its columns to match `data.var_names`.
subset_counts = (
    data.raw
    .to_adata()
    .to_df()
    .loc[:, data.var_names]
)

In [63]:
# Should match data.shape: (cells, genes).
subset_counts.shape

(7385, 2435)

In [64]:
# Sanity check: the counts table must line up with `data`, both by cell
# (row index vs. obs_names) and by gene (columns vs. var_names).
print(
    'Same cells: ',
    np.all(subset_counts.index.values == data.obs_names.values),
)
print(
    'Same genes: ',
    np.all(subset_counts.columns.values == data.var_names.values),
)

Same cells:  True
Same genes:  True


In [65]:
# Store the aligned counts as a 'counts' layer on the full subset, then display it.
data.layers['counts'] = subset_counts.values
data.layers['counts']

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 5., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [66]:
#### Saving...
# NOTE: this writes to the same path the data was loaded from at the top of
# the notebook, so the original input file is overwritten with the version
# that includes the 'counts' layer.
data.write_h5ad(data_dir+'TabulaSapiens_subset.h5ad', compression='gzip')