In [1]:
# import package
import discotoolkit as dt
import scanpy as sc
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2



For the sake of this tutorial, we will first download one sample as an h5ad file using the download data function, and then apply the discotoolkit CELLiD annotation function to it.

In [2]:
# restrict the query to a single sample
# (named `sample_filter` so the Python builtin `filter` is not shadowed
# for the rest of the kernel session)
sample_filter = dt.Filter(sample="AML003_3p")

# filter the DISCO metadata with the criteria above
metadata = dt.filter_disco_metadata(sample_filter)

# download the matching data; samples that were downloaded before are skipped
dt.download_disco_data(metadata)

INFO:root:Retrieving metadata from DISCO database
INFO:root:Filtering sample
INFO:root:Retrieving cell type information of each sample from DISCO database
INFO:root:1 samples and 6086 cells were found
INFO:root: AML003_3p has been downloaded before. Ignore ...


In [3]:
# helper that lists the atlases available in the DISCO database
available_atlases = dt.get_atlas()
print(available_atlases)

['tonsil', 'skeletal_muscle', 'kidney', 'thymus', 'lung', 'gingiva', 'adipose', 'ovarian_cancer', 'placenta', 'bladder', 'adrenal_gland', 'fibroblast', 'pancreas', 'breast', 'breast_milk', 'PDAC', 'heart', 'ovary', 'bone_marrow', 'liver', 'stomach', 'testis', 'blood', 'intestine', 'eye', 'skin', 'brain']


In [4]:
# read the downloaded h5ad file; `adata` itself is left unmodified below
adata = sc.read_h5ad("DISCOtmp/AML003_3p.h5ad")

# scale every cell to 1e4 total counts, then apply log1p, on a copy of the data
### Skip the two preprocessing calls below if the data has already been normalised
# NOTE: `sc.pp.normalize_total` is used instead of the deprecated
# `sc.pp.normalize_per_cell`. Given a bare matrix, the deprecated function
# filters out cells with fewer than `min_counts` counts, so the result could
# end up with fewer rows than there are cluster labels.
adata_norm = adata.copy()
sc.pp.normalize_total(adata_norm, target_sum=1e4)
sc.pp.log1p(adata_norm)

# dense cells x genes table, one column per gene (works for sparse or dense X)
cell_expression = adata_norm.to_df()

cell_expression["cluster"] = np.array(adata.obs["seurat_clusters"])  # cluster label of each cell
integrated_data = cell_expression.groupby("cluster").mean().transpose()  # average expression for each cluster

# we want the rna format to have genes as the index and cluster categories as the columns
# here is an example: rows = gene, columns = cluster
integrated_data.head()


This is where adjacency matrices should go now.
  warn(


In [None]:
# Load the per-cluster average expression table; the first CSV column becomes the index.
# NOTE(review): no visible cell writes "rna_average.csv" - presumably it is a
# saved copy of `integrated_data`; confirm before relying on Restart & Run All.
data = pd.read_csv("rna_average.csv", index_col=0)

# convert the column labels to integer cluster ids and name the column axis
data.columns = pd.Index([int(label) for label in data.columns], name="cluster")
data

cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
MIR1302-2HG,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
FAM138A,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
OR4F5,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
AL627309.1,0.019681,0.010808,0.006906,0.006409,0.00191,0.010536,0.002414,0.024108,0.007150,0.008123,0.003571,0.016340,0.000000,0.027671,0.007016,0.037748,0.0
AL627309.3,0.000000,0.000000,0.002456,0.001417,0.00000,0.000000,0.009509,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC233755.2,0.000000,0.000000,0.000000,0.001783,0.00000,0.000000,0.000000,0.000000,0.005064,0.000000,0.000000,0.000000,0.016382,0.000000,0.000000,0.000000,0.0
AC233755.1,0.000982,0.000957,0.000000,0.002525,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.037422,0.000000,0.000000,0.000000,0.0
AC240274.1,0.031422,0.038345,0.037700,0.018076,0.03774,0.031602,0.017182,0.023132,0.019483,0.064549,0.016189,0.013304,0.000000,0.017479,0.026389,0.022270,0.0
AC213203.1,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [None]:
# annotate each cluster with CELLiD, using the bone_marrow atlas
# and n_predict=1
cell_type = dt.CELLiD_cluster(
    rna=data,
    atlas=["bone_marrow"],
    n_predict=1,
)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   7 out of  17 | elapsed:    3.3s remaining:    4.7s
[Parallel(n_jobs=10)]: Done  17 out of  17 | elapsed:    5.6s finished


In [None]:
# display the annotation result returned by dt.CELLiD_cluster
cell_type

Unnamed: 0,cell_type,atlas,score,input_index
0,MHCII low CD14 monocyte,bone_marrow,0.828,0
0,MHCII high CD14 monocyte,bone_marrow,0.818,1
0,MHCII high CD14 monocyte,bone_marrow,0.761,2
0,Common myeloid progenitor,bone_marrow,0.681,3
0,Cycling S100A+ preNeutrophil,bone_marrow,0.739,4
0,MHCII high CD14 monocyte,bone_marrow,0.837,5
0,pDC,bone_marrow,0.701,6
0,cDC2,bone_marrow,0.796,7
0,Myelocyte,bone_marrow,0.761,8
0,MHCII high CD14 monocyte,bone_marrow,0.796,9
