#### This section relies on the CellOracle package to annotate scATAC-seq peak calls with Transcription Start Site (TSS) information and the names of the genes associated with each TSS. For further information, please refer to the CellOracle documentation at https://morris-lab.github.io/CellOracle.documentation/index.html.

### Import libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm
from celloracle import motif_analysis as ma
import celloracle as co
# Record the CellOracle version used for this run. A bare `co.__version__`
# expression is only displayed when it is the last line of a cell, so print it.
print("celloracle version:", co.__version__)

import warnings
import numba
# Silence numba's deprecation warnings for the remainder of the session.
# The filter is installed at top level on purpose: a filter registered inside
# `with warnings.catch_warnings():` is discarded as soon as that block exits,
# so it would have no effect on any code that runs afterwards.
warnings.simplefilter("ignore", category=numba.NumbaDeprecationWarning)

In [4]:
# Left disabled: config InlineBackend.figure_format = 'retina'

# Default figure size (inches) and resolution used when saving figures.
plt.rcParams.update({
    'figure.figsize': [6, 4.5],
    "savefig.dpi": 300,
})

### Load data

In [5]:
# Dataset identifiers; both values are interpolated into the input/output
# file names used by the cells below.
tissue, sample_id = "Spleen", "62016_P2"

# Echo the selection so the run is self-describing.
print("Tissue:", tissue)
print("Sample ID:", sample_id)

Tissue: Spleen
Sample ID: 62016_P2


In [6]:
# Read the scATAC-seq peak table for this tissue/sample and keep only its
# "x" column as an array of peak strings.
peaks = pd.read_csv(
    "~/Desktop/scATAC-seq/res_" + tissue + "_" + sample_id + "_all_peaks.csv",
    index_col=0,
)["x"].values
print(peaks)

['chr1_3002478_3002968' 'chr1_3084739_3085712' 'chr1_3103576_3104022' ...
 'chrY_631222_631480' 'chrY_795887_796426' 'chrY_2397419_2397628']


In [7]:
# Read the Cicero co-accessibility table for this tissue/sample,
# using the first CSV column as the row index.
cicero_connections = pd.read_csv(
    "~/Desktop/scATAC-seq/res_" + tissue + "_" + sample_id + "_cicero_connections.csv",
    index_col=0,
)
print(cicero_connections.head())

                       Peak1                    Peak2  coaccess
1  chr10_100005476_100005689  chr10_99750341_99750805       0.0
2  chr10_100005476_100005689  chr10_99758759_99759127       0.0
3  chr10_100005476_100005689  chr10_99762576_99763431       0.0
4  chr10_100005476_100005689  chr10_99860291_99860632       0.0
5  chr10_100005476_100005689  chr10_99886048_99886763       0.0


### TSS annotation

In [8]:
# Display the reference genomes listed by CellOracle's motif_analysis module
# (kept as the last expression so the table is rendered as the cell output).
ma.SUPPORTED_REF_GENOME

Unnamed: 0,species,ref_genome,provider
0,Human,hg38,UCSC
1,Human,hg19,UCSC
2,Mouse,mm39,UCSC
3,Mouse,mm10,UCSC
4,Mouse,mm9,UCSC
5,S.cerevisiae,sacCer2,UCSC
6,S.cerevisiae,sacCer3,UCSC
7,Zebrafish,danRer7,UCSC
8,Zebrafish,danRer10,UCSC
9,Zebrafish,danRer11,UCSC


In [9]:
# IMPORTANT: set `ref_genome` to the reference genome that matches the peak
# coordinates; `ma.SUPPORTED_REF_GENOME` lists the supported genomes.
tss_annotated = ma.get_tss_info(
    peak_str_list=peaks,
    ref_genome="mm9",
)

# Preview the last rows of the annotation.
print(tss_annotated.tail())

que bed peaks: 427257
tss peaks in que: 24671
         chr     start       end gene_short_name strand
24666   chr2  60560211  60561602           Itgb6      -
24667  chr15   3975177   3978654        BC037032      -
24668  chr14  67690701  67692101         Ppp2r2a      -
24669  chr17  48455247  48455773   B430306N03Rik      +
24670  chr10  59861192  59861608         Gm17455      +


### Integrate TSS info and cicero connections

In [15]:
# Combine the TSS annotation with the Cicero connections.
# The integrated table has three columns:
#   peak_id         - a TSS peak, or a peak connected to a TSS peak
#   gene_short_name - name of the gene associated with that TSS
#   coaccess        - co-accessibility score between the peak and the TSS peak;
#                     a score of 1 means the peak is itself a TSS
integrated = ma.integrate_tss_peak_with_cicero(
    tss_peak=tss_annotated,
    cicero_connections=cicero_connections,
)
print(integrated.shape)
print(integrated.head())

(1487382, 3)
                     peak_id gene_short_name  coaccess
0  chr10_100006908_100007333   4930430F08Rik  0.005782
1  chr10_100007796_100007999   4930430F08Rik  0.054402
2  chr10_100007796_100007999         Gm35722  0.000877
3  chr10_100007796_100007999           Tmtc3  0.006258
4  chr10_100019332_100019577   4930430F08Rik  0.211422


### Filter out peaks with low coaccessibility scores

In [16]:
# Keep only peak/gene pairs with a co-accessibility score of at least 0.8,
# retain the two identifier columns, and renumber the rows from 0.
peak = integrated.loc[
    integrated["coaccess"] >= 0.8,
    ["peak_id", "gene_short_name"],
].reset_index(drop=True)
print(peak.shape)
print(peak.head())

# Write the filtered table for this tissue/sample (row index is written too).
peak.to_csv("~/Desktop/scATAC-seq/res_" + tissue + "_" + sample_id + "_processed_peak_file.csv")

(23739, 2)
                     peak_id gene_short_name
0  chr10_100050979_100052296   4930430F08Rik
1  chr10_100203726_100204441         Gm35722
2  chr10_100204553_100205270         Gm35722
3  chr10_101144061_101145000          Mgat4c
4  chr10_101621348_101622162          Mgat4c
