#### This section uses the CellOracle package to annotate scATAC-seq peak calls with Transcription Start Site (TSS) information and the names of the genes associated with each TSS. For further information, please refer to the CellOracle documentation at https://morris-lab.github.io/CellOracle.documentation/index.html.

### Import library

In [2]:
# Standard library
import glob
import importlib
import os
import shutil
import sys
import warnings

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numba
from tqdm.notebook import tqdm

# Silence Numba deprecation warnings for the rest of the session.
# The filter is installed at top level on purpose: wrapping it in
# `with warnings.catch_warnings():` restores the previous filters as soon as
# the block exits, so the suppression would never take effect.
# It is also installed before celloracle is imported, so any such warnings
# emitted while celloracle loads are covered as well.
warnings.simplefilter("ignore", category=numba.NumbaDeprecationWarning)

import celloracle as co
from celloracle import motif_analysis as ma

# Last expression of the cell, so the installed CellOracle version is shown
# in the cell output and recorded with the notebook.
co.__version__

In [3]:
# NOTE(review): presumably the `%config InlineBackend.figure_format = 'retina'`
# magic, left commented out — confirm whether it should be re-enabled.
# config InlineBackend.figure_format = 'retina'

# Default figure size (inches) and resolution used when saving figures.
plt.rcParams.update(
    {
        "figure.figsize": [6, 4.5],
        "savefig.dpi": 300,
    }
)

### Load data

In [4]:
# Read the scATAC-seq peak table (first CSV column is the index) and keep only
# the values of its "x" column, i.e. the peak strings.
peaks = pd.read_csv(
    "~/Desktop/Buenrostro/res_Buenrostro2018_all_peaks.csv", index_col=0
)["x"].values
print(peaks)

['chr1_10413_10625' 'chr1_13380_13624' 'chr1_16145_16354' ...
 'chrY_59004165_59004411' 'chrY_59013930_59014161'
 'chrY_59363205_59363360']


In [5]:
# Read the Cicero co-accessibility table (first CSV column is the index)
# and preview its first rows.
cicero_connections = pd.read_csv(
    "~/Desktop/Buenrostro/res_Buenrostro2018_cicero_connections.csv",
    index_col=0,
)
print(cicero_connections.head(n=5))

                       Peak1                    Peak2  coaccess
1  chr10_100019639_100020050  chr10_99769506_99769797  0.000000
2  chr10_100019639_100020050  chr10_99787890_99788254  0.002895
3  chr10_100019639_100020050  chr10_99790129_99790934 -0.022282
4  chr10_100019639_100020050  chr10_99790943_99791257 -0.003558
5  chr10_100019639_100020050  chr10_99801285_99801535  0.002933


### TSS annotation

In [6]:
# Show the table of reference genomes exposed by celloracle's motif_analysis
# module; the `ref_genome` passed to `ma.get_tss_info` is chosen from it.
ma.SUPPORTED_REF_GENOME

Unnamed: 0,species,ref_genome,provider
0,Human,hg38,UCSC
1,Human,hg19,UCSC
2,Mouse,mm39,UCSC
3,Mouse,mm10,UCSC
4,Mouse,mm9,UCSC
5,S.cerevisiae,sacCer2,UCSC
6,S.cerevisiae,sacCer3,UCSC
7,Zebrafish,danRer7,UCSC
8,Zebrafish,danRer10,UCSC
9,Zebrafish,danRer11,UCSC


In [7]:
##!! `ref_genome` must be set to the correct reference genome for these peaks.
tss_annotated = ma.get_tss_info(
    peak_str_list=peaks,
    ref_genome="hg19",
)

# Preview the last rows of the annotation.
print(tss_annotated.tail(n=5))

que bed peaks: 237450
tss peaks in que: 23840
         chr      start        end gene_short_name strand
23835  chr12   14720257   14720938           PLBD1      -
23836   chr4  156679570  156680679         GUCY1B1      +
23837   chrX  132549016  132549203            GPC4      -
23838   chrX  132549265  132550038            GPC4      -
23839   chr7  112727703  112728111           GPR85      -


### Integrate TSS info and cicero connections

In [8]:
# Merge the TSS annotation with the Cicero connections.
# The integrated table has three columns:
#   - "peak_id":         a TSS peak, or a peak connected to a TSS peak.
#   - "gene_short_name": name of the gene associated with that TSS.
#   - "coaccess":        coaccessibility score between the peak and a TSS peak;
#                        a score of 1 means the peak itself is a TSS.
integrated = ma.integrate_tss_peak_with_cicero(
    tss_peak=tss_annotated,
    cicero_connections=cicero_connections,
)
print(integrated.shape, integrated.head(), sep="\n")

(771702, 3)
                     peak_id gene_short_name  coaccess
0  chr10_100019639_100020050         PYROXD2  0.003578
1  chr10_100022300_100022741           LOXL4  0.130933
2  chr10_100022300_100022741         PYROXD2  0.018459
3  chr10_100027770_100028555          CRTAC1  0.051738
4  chr10_100027770_100028555            HPS1  0.020793


### Filter out peaks with low coaccessibility scores

In [9]:
# Keep only peak/gene pairs whose coaccessibility score is at least 0.8,
# drop the score column, and renumber the rows from 0.
peak = integrated.loc[
    integrated["coaccess"] >= 0.8, ["peak_id", "gene_short_name"]
].reset_index(drop=True)
print(peak.shape, peak.head(), sep="\n")

# Save the processed peak/gene table as CSV.
peak.to_csv("~/Desktop/Buenrostro/res_Buenrostro2018_processed_peak_file.csv")

(21823, 2)
                     peak_id gene_short_name
0  chr10_100027770_100028555           LOXL4
1  chr10_100174589_100175172         PYROXD2
2  chr10_100175241_100175630         PYROXD2
3  chr10_100205646_100207085            HPS1
4  chr10_100205646_100207085    LOC101927278
