# CellOracle - TSS annotation

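This notebook annotates the ATAC peaks with TSS (transcription start site) information and integrates that annotation with the Cicero co-accessibility scores, so that promoter peaks and co-accessible distal peaks (e.g. enhancers and other genomic regions) are each linked to a gene.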

## Import libraries

In [1]:
import snapatac2 as snap
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
# import episcanpy.api as epi
import gzip
import os
import pathlib

In [2]:
from celloracle import motif_analysis as ma
import celloracle as co
# Record the installed CellOracle version in the cell output. A bare
# `co.__version__` expression that is not the last line of a cell is evaluated
# and then discarded, so the version has to be printed explicitly.
print("celloracle version:", co.__version__)

# Last expression of the cell, so its result is displayed: a table listing each
# required package with its installed version, the required version, and
# whether the requirement is satisfied.
co.check_python_requirements()

2023-09-21 14:33:30 - INFO - Failed to extract font properties from /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf: In FT2Font: Can not load face.  Unknown file format.


Unnamed: 0,package_name,installed_version,required_version,requirement_satisfied
0,numpy,1.21.6,auto,True
1,scipy,1.7.3,auto,True
2,cython,0.29.34,auto,True
3,numba,0.56.4,0.50.1,True
4,matplotlib,3.5.3,auto,True
5,seaborn,0.12.2,auto,True
6,scikit-learn,1.0.2,auto,True
7,h5py,3.8.0,3.1.0,True
8,pandas,1.3.5,1.0.3,True
9,velocyto,0.17.17,0.17,True


## Load the peaks and the Cicero results

In [5]:
# Directory the Cicero results are read from and the processed peaks are
# written to.
result_dir = "./base_GRN"

# AnnData file whose `var_names` are used as the peak list below.
atac_h5ad_path = "./ATAC_data/adata_atac_raw.h5ad"
adata = sc.read(atac_h5ad_path)

In [6]:
# Copy the peak identifiers (the AnnData variable names, formatted as
# "chr_start_end" in the output below) into a standalone NumPy array.
peaks = adata.var_names.to_numpy(copy=True)
peaks

array(['chr1_819722_820222', 'chr1_827288_827788', 'chr1_838176_838676',
       ..., 'chrX_155962954_155963454', 'chrX_155966781_155967281',
       'chrX_155971350_155971850'], dtype=object)

In [7]:
# Read the Cicero co-accessibility table (columns Peak1, Peak2, coaccess);
# the first CSV column is used as the row index.
cicero_csv_path = os.path.join(result_dir, "cicero_connections_stream.csv")
cicero_connections = pd.read_csv(cicero_csv_path, index_col=0)
cicero_connections.head()

Unnamed: 0,Peak1,Peak2,coaccess
1,chr10_100000290_100000790,chr10_99779229_99779729,-0.001031
2,chr10_100000290_100000790,chr10_99779759_99780259,0.001412
3,chr10_100000290_100000790,chr10_99785581_99786081,0.0
4,chr10_100000290_100000790,chr10_99788840_99789340,0.0
5,chr10_100000290_100000790,chr10_99845752_99846252,0.000482


In [8]:
# Show the (rows, columns) size of the Cicero connection table.
cicero_connections.shape

(22818160, 3)

## Annotate TSS

In [9]:
# Display the table of reference genomes (species, ref_genome, provider)
# exposed by celloracle.motif_analysis; pick the `ref_genome` value for the
# TSS annotation step from this list.
ma.SUPPORTED_REF_GENOME

Unnamed: 0,species,ref_genome,provider
0,Human,hg38,UCSC
1,Human,hg19,UCSC
2,Mouse,mm39,UCSC
3,Mouse,mm10,UCSC
4,Mouse,mm9,UCSC
5,S.cerevisiae,sacCer2,UCSC
6,S.cerevisiae,sacCer3,UCSC
7,Zebrafish,danRer7,UCSC
8,Zebrafish,danRer10,UCSC
9,Zebrafish,danRer11,UCSC


In [9]:
# IMPORTANT: `ref_genome` must be the reference genome of your own data.
# Change "hg38" below if your peaks come from a different assembly.
tss_annotated = ma.get_tss_info(
    peak_str_list=peaks,
    ref_genome="hg38",
)

# Inspect the last rows of the TSS annotation.
tss_annotated.tail()


que bed peaks: 246132
tss peaks in que: 33757


Unnamed: 0,chr,start,end,gene_short_name,strand
33752,chr5,149550467,149550967,CSNK1A1,-
33753,chr5,149551161,149551661,CSNK1A1,-
33754,chr20,10673802,10674302,JAG1,-
33755,chr20,10674620,10675120,JAG1,-
33756,chr9,122228522,122229022,LHX6,-


## Integrate the TSS annotation with the Cicero results

In [10]:
# Combine the TSS-annotated peaks with the Cicero connections; the result has
# one row per (peak_id, gene_short_name) pair with its coaccess score.
integrated = ma.integrate_tss_peak_with_cicero(
    tss_peak=tss_annotated,
    cicero_connections=cicero_connections,
)
print(integrated.shape)
integrated.head()

(938050, 3)


Unnamed: 0,peak_id,gene_short_name,coaccess
0,chr10_100000290_100000790,BLOC1S2,0.000119
1,chr10_100000290_100000790,CWF19L1,0.00171
2,chr10_100000290_100000790,DNMBP,0.003578
3,chr10_100000290_100000790,ERLIN1,0.002079
4,chr10_100000290_100000790,OLMALINC,0.000491


In [11]:
# Keep only the peak-gene pairs whose co-accessibility score is at least 0.8,
# retain just the identifying columns, and renumber the rows from 0.
peak = integrated.loc[
    integrated["coaccess"] >= 0.8, ["peak_id", "gene_short_name"]
].reset_index(drop=True)

print(peak.shape)
peak.head()

(31844, 2)


Unnamed: 0,peak_id,gene_short_name
0,chr10_100009740_100010240,DNMBP
1,chr10_100185817_100186317,ERLIN1
2,chr10_100186339_100186839,ERLIN1
3,chr10_100229388_100229888,CHUK
4,chr10_100267454_100267954,CWF19L1


## Save data

In [12]:
# Write the filtered peak-gene table next to the Cicero results. The DataFrame
# index is written as the first CSV column (pandas default).
processed_peak_path = os.path.join(result_dir, "processed_peak.csv")
peak.to_csv(processed_peak_path)