In [10]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [11]:
raw_data = sc.read("../data/Quake_Lung.h5ad", index_col=0)

In [12]:
processed_data = np.nan_to_num(raw_data.X, nan=0.0)
raw_data.X = processed_data
label = raw_data.obs['celltype']

In [13]:
print(raw_data)
print(len(label.to_list()))

AnnData object with n_obs × n_vars = 1676 × 23341
    obs: 'cell_ontology_class', 'cell_ontology_id', 'cell_type1', 'channel', 'cluster', 'dataset_name', 'donor', 'free_annotation', 'gender', 'organ', 'organism', 'platform', 'region', 'celltype'
    uns: 'creation_date', 'source_file'
1676


In [14]:
print(raw_data.X)

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 4271360 stored elements and shape (1676, 23341)>
  Coords	Values
  (0, 1)	200.0
  (0, 2)	45.0
  (0, 3)	119.0
  (0, 5)	185.0
  (0, 22)	224.0
  (0, 24)	55.0
  (0, 38)	101.0
  (0, 41)	379.0
  (0, 42)	76.0
  (0, 43)	107.0
  (0, 47)	34.0
  (0, 58)	149.0
  (0, 81)	126.0
  (0, 86)	203.0
  (0, 88)	108.0
  (0, 89)	45.0
  (0, 104)	241.0
  (0, 105)	235.0
  (0, 118)	55.0
  (0, 135)	83.0
  (0, 139)	179.0
  (0, 143)	63.0
  (0, 164)	15.0
  (0, 253)	355.0
  (0, 264)	4.0
  :	:
  (1675, 22710)	32.0
  (1675, 22713)	24.0
  (1675, 22714)	81.0
  (1675, 22718)	346.0
  (1675, 22758)	1.0
  (1675, 22778)	123.0
  (1675, 22838)	1.0
  (1675, 22839)	280.0
  (1675, 22841)	342.0
  (1675, 22854)	39.0
  (1675, 22869)	1.0
  (1675, 22893)	6.0
  (1675, 22895)	2.0
  (1675, 22896)	1.0
  (1675, 22908)	1.0
  (1675, 22948)	628.0
  (1675, 22954)	1.0
  (1675, 22963)	2.0
  (1675, 23075)	1.0
  (1675, 23113)	313.0
  (1675, 23231)	65.0
  (1675, 23259)	57.0
  (1675, 23263)

In [15]:
print(raw_data.X.shape)

(1676, 23341)


In [16]:
print(raw_data.obs.columns)

Index(['cell_ontology_class', 'cell_ontology_id', 'cell_type1', 'channel',
       'cluster', 'dataset_name', 'donor', 'free_annotation', 'gender',
       'organ', 'organism', 'platform', 'region', 'celltype'],
      dtype='object')


In [17]:
print(set(raw_data.var.index.to_list()))

{'Acot7', 'Vmn2r3', 'Sphkap', 'Gng10', 'Vmn2r60', 'Cldn19', 'Pcdh11x', 'Dmrtc1c1', 'Cnr1', 'Prdx6', 'Disp2', 'Tsn', 'Camp', 'Myh14', 'Mir5113', 'Npnt', 'Cep76', 'Kcnmb4', 'Adc', 'Pstpip1', 'Rbl1', 'Lhx2', 'Gm17689', 'Gkap1', 'Trex2', 'Klra14', 'Mx2', 'Mir453', '2410021H03Rik', 'Arid3b', 'Hp', 'Gm10823', 'Cd53', 'Prlh', 'Spam1', '5330411J11Rik', 'Prss52', 'Calr', 'Arntl', 'Dok5', 'Gm15412', '0610010O12Rik', 'Rabepk', 'Ppp1r32', 'Col4a4', 'Bcdin3d', 'Nuak1', 'Speer4f', 'Sae1', 'Ube2cbp', 'Daf2', 'Olfr92', '1810065E05Rik', 'Chac1', 'Limd2', 'M6pr', 'C1qb', 'Tmem221', 'Prph2', 'Gmip', 'Sytl4', 'Usp9x', 'Akap3', 'Catsper2', 'Ifna11', 'Nos1ap', 'C230035I16Rik', 'Ccdc65', 'Olfr761', 'Eif3i', '3632454L22Rik', '4930500L23Rik', 'Tssk4', 'Rnf43', 'Mir148b', 'Phox2a', 'Fam83g', 'Lsr', 'Med11', 'Ncstn', '4930447K03Rik', 'Klk15', 'Rp1', '1700019E19Rik', 'Tnni1', 'Ppp1r13b', 'Ephx4', 'Smtn', 'Cetn2', 'Ythdf2', 'Gm17830', 'Pbx2', 'Olfr183', 'Gcc1', 'Chchd3', 'Naip5', 'Chrna2', 'Olfr930', '2700086A05Ri

In [18]:
print(any(raw_data.var_names.str.startswith('mt'))) # 线粒体基因
print(any(raw_data.var_names.str.startswith('rps'))) # 核糖体蛋白 S 基因
print(any(raw_data.var_names.str.startswith('rpl'))) # 核糖体蛋白 L 基因

# raw_data.var['mt'] = raw_data.var_names.str.startswith('MT')
# raw_data.var['ribo'] = raw_data.var_names.str.startswith(('RPS', 'RPL'))

False
False
False
