#### This section uses the CellOracle package to scan TSS-annotated scATAC-seq peak calls for transcription factor (TF) binding-site motifs. For further information, please refer to the CellOracle documentation at https://morris-lab.github.io/CellOracle.documentation/index.html.

### Import libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm
import celloracle as co
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object
co.__version__

'0.14.0'

In [4]:
# NOTE(review): the next two lines look like IPython magics that were disabled
# (or lost their leading '%' on export) -- confirm before re-enabling them.
# config InlineBackend.figure_format = 'retina'
# matplotlib inline

# Notebook-wide matplotlib defaults: figure size (in inches) and the
# resolution used when figures are written with savefig.
plt.rcParams.update({
    "figure.figsize": (15, 7),
    "savefig.dpi": 600,
})

### Load reference genome and data

In [6]:
# Sample identifiers; they are spliced into the input file name below.
tissue = "Spleen"
sample_id = "62016_P2"

# PLEASE make sure reference genome is correct.
ref_genome = "mm9"

# Ask CellOracle whether the reference genome is already available locally,
# and show the list of genomes it supports.
genome_installation = ma.is_genome_installed(ref_genome=ref_genome)
print(ref_genome, "installation: ", genome_installation)
print(ma.SUPPORTED_REF_GENOME)

if genome_installation:
    print(ref_genome, "is installed.")
else:
    # Imported lazily: genomepy is only needed here when the genome is missing.
    import genomepy
    genomepy.install_genome(name=ref_genome, provider="UCSC")


# Load annotated peak data (first CSV column is used as the index).
peak_csv_path = "~/Desktop/scATAC-seq/res_" + tissue + "_" + sample_id + "_processed_peak_file.csv"
peaks = pd.read_csv(peak_csv_path, index_col=0)
print(peaks.head())


genome mm9 is not installed in this environment.
Please install genome using genomepy.
e.g.
    >>> import genomepy
    >>> genomepy.install_genome(name="mm9", provider="UCSC")
mm9 installation:  False
               species            ref_genome           provider
0                Human                  hg38               UCSC
1                Human                  hg19               UCSC
2                Mouse                  mm39               UCSC
3                Mouse                  mm10               UCSC
4                Mouse                   mm9               UCSC
5         S.cerevisiae               sacCer2               UCSC
6         S.cerevisiae               sacCer3               UCSC
7            Zebrafish               danRer7               UCSC
8            Zebrafish              danRer10               UCSC
9            Zebrafish              danRer11               UCSC
10  Xenopus tropicalis               xenTro2               UCSC
11  Xenopus tropicalis        

[32m13:38:46[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading assembly summaries from UCSC
[32m13:39:01[0m [1m|[0m [34mINFO[0m [1m|[0m Downloading genome from UCSC. Target URL: https://hgdownload.soe.ucsc.edu/goldenPath/mm9/bigZips/chromFa.tar.gz...


Download:   0%|          | 0.00/820M [00:00<?, ?B/s]

[32m13:48:28[0m [1m|[0m [34mINFO[0m [1m|[0m Genome download successful, starting post processing...
[32m13:49:21[0m [1m|[0m [34mINFO[0m [1m|[0m name: mm9
[32m13:49:21[0m [1m|[0m [34mINFO[0m [1m|[0m local name: mm9
[32m13:49:21[0m [1m|[0m [34mINFO[0m [1m|[0m fasta: /home/bio/.local/share/genomes/mm9/mm9.fa


Filtering Fasta: 0.00 lines [00:00, ? lines/s]

                     peak_id gene_short_name
0  chr10_100050979_100052296   4930430F08Rik
1  chr10_100203726_100204441         Gm35722
2  chr10_100204553_100205270         Gm35722
3  chr10_101144061_101145000          Mgat4c
4  chr10_101621348_101622162          Mgat4c


In [7]:
def decompose_chrstr(peak_str):
    """
    Split a peak identifier into chromosome name and coordinates.

    The last two underscore-separated fields are taken as start and end;
    everything in front of them is re-joined with "_" to form the chromosome
    name, so chromosome names that contain underscores are kept intact.

    Args:
        peak_str (str): peak identifier, e.g. 'chr1_3094484_3095479'

    Returns:
        tuple: chromosome name, start position, end position. The positions
            are returned as strings, exactly as they appear in peak_str.

    Raises:
        ValueError: if peak_str contains no underscore.
    """
    fields = peak_str.split("_")
    # Unpacking a slice shorter than two elements raises ValueError.
    start, end = fields[-2:]
    chromosome = "_".join(fields[:-2])
    return chromosome, start, end

from genomepy import Genome

def check_peak_format(peaks_df, ref_genome):
    """
    Check peak format and drop peaks that fail the checks.
     (1) Check chromosome name against the chromosomes of the reference genome.
     (2) Check peak size (length) and remove short DNA sequences (<5bp)

    A four-line summary (peaks before / invalid chromosome / invalid length /
    peaks after) is printed.

    Args:
        peaks_df (pandas.DataFrame): peak table with a "peak_id" column whose
            values look like 'chr1_3094484_3095479'.
        ref_genome (str): reference genome name, passed to genomepy's Genome.

    Returns:
        pandas.DataFrame: the rows of peaks_df that passed both checks.
            peaks_df itself is not modified.
    """

    df = peaks_df.copy()
    n_peaks_before = df.shape[0]

    # Decompose peaks and make df. Passing the column names to the constructor
    # (instead of going through a numpy array) also works for an empty table.
    decomposed = [decompose_chrstr(peak_str) for peak_str in df["peak_id"]]
    df_decomposed = pd.DataFrame(
        decomposed, index=peaks_df.index, columns=["chr", "start", "end"]
    )
    df_decomposed = df_decomposed.astype({"start": int, "end": int})

    # Load genome data
    genome_data = Genome(ref_genome)
    all_chr_list = list(genome_data.keys())

    # DNA length check. abs() means a peak whose end precedes its start is
    # judged by the magnitude of the difference rather than rejected outright.
    n_threshold = 5
    lengths = np.abs(df_decomposed["end"] - df_decomposed["start"])
    has_valid_length = lengths >= n_threshold

    # Chromosome name check
    has_valid_chr = df_decomposed["chr"].isin(all_chr_list)

    # Keep only peaks that pass both checks
    df = df[has_valid_length & has_valid_chr]

    # Data counting (each check is counted independently, so a peak failing
    # both checks appears in both counts)
    n_invalid_length = int((~has_valid_length).sum())
    n_peaks_invalid_chr = n_peaks_before - has_valid_chr.sum()
    n_peaks_after = df.shape[0]

    print("Peaks before filtering: ", n_peaks_before)
    print("Peaks with invalid chr_name: ", n_peaks_invalid_chr)
    print("Peaks with invalid length: ", n_invalid_length)
    print("Peaks after filtering: ", n_peaks_after)

    return df

In [8]:
# Drop peaks with an unknown chromosome name or a too-short length.
# Note: this rebinds `peaks`, so later cells see the filtered table.
peaks = check_peak_format(peaks, ref_genome)
print(peaks)

# Instantiate TFinfo object
tfi = ma.TFinfo(peak_data_frame=peaks, ref_genome=ref_genome) 
# print(tfi)

# NOTE(review): '##time' below looks like a '%%time' cell magic whose '%'
# characters were replaced on export -- confirm against the original notebook.
##time
# Scan motifs. !!CAUTION!! This step may take several hours if you have many peaks!
# import faulthandler
# faulthandler.enable()
# motifs=None: per the scan log, CellOracle then loads its default motif set.
tfi.scan(fpr=0.02, motifs=None, verbose=True)

Peaks before filtering:  23739
Peaks with invalid chr_name:  0
Peaks with invalid length:  0
Peaks after filtering:  23739
                         peak_id gene_short_name
0      chr10_100050979_100052296   4930430F08Rik
1      chr10_100203726_100204441         Gm35722
2      chr10_100204553_100205270         Gm35722
3      chr10_101144061_101145000          Mgat4c
4      chr10_101621348_101622162          Mgat4c
...                          ...             ...
23734     chrX_99839037_99840900           Phka1
23735         chrY_345823_347369         Eif2s3y
23736         chrY_581272_582790             Uty
23737         chrY_621772_623366           Ddx3y
23738         chrY_795887_796426           Usp9y

[23739 rows x 2 columns]
No motif data entered. Loading default motifs for your species ...
 Default motif for vertebrate: gimme.vertebrate.v5.0. 
 For more information, please see https://gimmemotifs.readthedocs.io/en/master/overview.html 

Initiating scanner... 



DEBUG:gimme.scanner:using background: genome mm9 with size 200


Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. 



2023-11-05 13:55:55,330 - INFO - determining FPR-based threshold
INFO:gimme.scanner:determining FPR-based threshold


Motif scan started .. It may take long time.



scanning:   0%|          | 0/21840 [00:00<?, ? sequences/s]

DEBUG:gimme.scanner:Scanning


In [13]:
# Save tfinfo object
# tfi.to_hdf5(file_path="~/Desktop/scATAC-seq/res_" + tissue + "_" + sample_id + ".celloracle.tfinfo")

# Check motif scan results
print(tfi.scanned_df.head())

# Reset filtering, so that re-running this cell starts from the full scan
# result rather than from a previously filtered one.
tfi.reset_filtering()

# Do filtering (motif-hit score threshold)
tfi.filter_motifs_by_score(threshold=10)

# Format post-filtering results.
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)

# Get final base GRN. The dataframe is built once and reused for both the
# preview and the export below.
df = tfi.to_dataframe()
print(df.head())

# Save result as a dataframe
df.to_parquet("~/Desktop/scATAC-seq/res_" + tissue + "_" + sample_id + "_base_GRN_dataframe.parquet")

                     seqname                 motif_id  \
0  chr10_100050979_100052296  GM.5.0.Homeodomain.0001   
1  chr10_100050979_100052296        GM.5.0.Mixed.0002   
2  chr10_100050979_100052296         GM.5.0.bHLH.0004   
3  chr10_100050979_100052296   GM.5.0.Paired_box.0001   
4  chr10_100050979_100052296          GM.5.0.Ets.0003   

                                      factors_direct  \
0                                              TGIF1   
1                                         PAX4, VSX2   
2          USF1, TFE3, MITF, TFE, USF2, EBOX, SREBF1   
3                                   PAX5, Pax8, Pax5   
4  ERF, Elk4, Elk1, GABPA, FLI1, ELK3, ERG, Elk3,...   

                                    factors_indirect      score  pos  strand  
0                             ENSG00000234254, TGIF1  10.403033  327       1  
1                             Pax4, PAX4, Lbx1, Vsx2  10.025440   41       1  
2                                                      8.951626  493       1  
3   

  0%|          | 0/21830 [00:00<?, ?it/s]

2. Converting results into dictionaries.


  0%|          | 0/18806 [00:00<?, ?it/s]

  0%|          | 0/1095 [00:00<?, ?it/s]

                     peak_id gene_short_name  9430076c15rik  Ac002126.6  \
0  chr10_100050979_100052296   4930430F08Rik            0.0         0.0   
1  chr10_100203726_100204441         Gm35722            0.0         0.0   
2  chr10_100204553_100205270         Gm35722            0.0         0.0   
3  chr10_101144061_101145000          Mgat4c            0.0         0.0   
4  chr10_101621348_101622162          Mgat4c            1.0         0.0   

   Ac012531.1  Ac226150.2  Afp  Ahctf1  Ahr  Ahrr  ...  Znf784  Znf8  Znf816  \
0         1.0         0.0  0.0     0.0  0.0   0.0  ...     0.0   0.0     0.0   
1         0.0         0.0  0.0     0.0  0.0   0.0  ...     0.0   0.0     0.0   
2         0.0         0.0  0.0     0.0  0.0   0.0  ...     0.0   0.0     0.0   
3         0.0         0.0  0.0     0.0  0.0   0.0  ...     0.0   0.0     0.0   
4         0.0         0.0  0.0     0.0  0.0   0.0  ...     0.0   0.0     0.0   

   Znf85  Zscan10  Zscan16  Zscan22  Zscan26  Zscan31  Zscan4  
0   