In [1]:
from __future__ import print_function
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'

import random
from optparse import OptionParser
import pandas as pd
import itertools

import bioframe
import akita_utils

from io import StringIO

2022-11-18 09:58:06.130779: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /spack/apps/linux-centos7-x86_64/gcc-8.3.0/python-3.9.2-uvcroioc4witkp6qf7mbebof5ix4wlb6/lib:/spack/apps/linux-centos7-x86_64/gcc-8.3.0/pmix-3.1.3-3sm6emyqaxapunh7rwbjvtaqoqe2e5z3/lib:/spack/apps/linux-centos7-x86_64/gcc-8.3.0/openmpi-4.0.2-ipm3dnvlbtxawpi4ifz7jma6jgr7mexq/lib:/spack/apps/linux-centos7-x86_64/gcc-8.3.0/openblas-0.3.8-2no6mfziiclwxb7lstxoos335gnhjpes/lib:/spack/apps/gcc/8.3.0/lib64::/home1/smaruj/software/GSL/lib:/home1/smaruj/software/HTSLIB/lib
2022-11-18 09:58:06.130815: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
import numpy as np

In [3]:
def filter_by_rmsk(
    sites,
    rmsk_file="/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz",
    exclude_window = 60,
    site_cols = ["chrom", "start", "end"],
    verbose=True,
):
    """
    Filter out sites that overlap any entry in rmsk.
    This is important for sineB2 in mice, and perhaps for other repetitive elements as well.

    Parameters
    -----------
    sites : dataFrame
        Set of genomic intervals, currently with columns "chrom","start_2","end_2"
        TODO: update this and allow columns to be passed
    rmsk_file : str
        File in repeatmasker format used for filtering sites.

    Returns
    --------
    sites : dataFrame
        Subset of sites that do not have overlaps with repeats in the rmsk_file.

    """
    if verbose:
        print("filtering sites by overlap with rmsk")

    rmsk_cols = list(
        pd.read_csv(
            StringIO(
                """bin swScore milliDiv milliDel milliIns genoName genoStart genoEnd genoLeft strand repName repClass repFamily repStart repEnd repLeft id"""
            ),
            sep=" ",
        )
    )

    rmsk = pd.read_table(
        rmsk_file,
        names=rmsk_cols,
    )
    rmsk.rename(
        columns={"genoName": "chrom", "genoStart": "start", "genoEnd": "end"},
        inplace=True,
    )
    
    rmsk = bioframe.expand(rmsk, pad=exclude_window)
    
    sites = bioframe.count_overlaps(
        sites, rmsk[site_cols], cols1=["chrom", "start_2", "end_2"]
    )
    
    sites = sites.iloc[sites["count"].values == 0]
    sites.reset_index(inplace=True, drop=True)

    return sites


def filter_by_ctcf(
    sites,
    ctcf_file = "/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz",
    exclude_window = 60,
    site_cols = ["chrom", "start", "end"],
    verbose=True,
    ):
    """
    Filter out sites that overlap any entry in ctcf within a window of 60bp up- and downstream.
    Parameters
    -----------
    sites : dataFrame
        Set of genomic intervals, currently with columns "chrom","start_2","end_2"
    ctcf_file : str
        File in tsv format used for filtering ctcf binding sites.
    Returns
    --------
    sites : dataFrame
        Subset of sites that do not have overlaps with ctcf binding sites in the ctcf_file.
    """
    if verbose:
        print("filtering sites by overlap with ctcfs")

    ctcf_cols = list(
        pd.read_csv(
            StringIO(
                """chrom start end name score pval strand"""
            ),
            sep=" ",
        )
    )

    ctcf_motifs = pd.read_table(
        ctcf_file,
        names=ctcf_cols,
    )
    
    ctct_motifs = bioframe.expand(ctcf_motifs, pad=exclude_window)
    
    sites = bioframe.count_overlaps(
        sites, ctcf_motifs[site_cols], cols1=["chrom", "start_2", "end_2"]
    )
    sites = sites.iloc[sites["count"].values == 0]
    sites.reset_index(inplace=True, drop=True)

    return sites

In [4]:
rmsk_file = "/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz"
jaspar_file = "/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz"

In [5]:
# loading motifs
score_key = "SCD"
weak_thresh_pct = 1
strong_thresh_pct = 99
pad_flank = 0
rmsk_exclude_window = 20
ctcf_exclude_window = 40

sites = akita_utils.filter_boundary_ctcfs_from_h5(
    h5_dirs="/project/fudenber_735/tensorflow_models/akita/v2/analysis/permute_boundaries_motifs_ctcf_mm10_model*/scd.h5",
    score_key=score_key,
    threshold_all_ctcf=5,
)

sites = filter_by_rmsk(
    sites,
    rmsk_file = rmsk_file, 
    exclude_window = rmsk_exclude_window,
    verbose=True
)

sites = filter_by_ctcf(sites,
    ctcf_file = jaspar_file,
    exclude_window = ctcf_exclude_window,
    verbose=True)

annotating each site with boundary-wide scores
filtering sites by overlap with rmsk
filtering sites by overlap with ctcfs


In [29]:
def filter_sites_by_score(
    sites,
    score_key="SCD",
    upper_threshold=100,
    lower_threshold=0,
    mode="head",
    num_sites=None,    # if num_sites == None -> return all filtered sites
    ):
    
    if mode not in ("head", "tail", "random"):
        raise ValueError("a mode has to be one from: head, tail, random")
    
    upper_thresh = np.percentile(sites[score_key].values, upper_threshold)
    lower_thresh = np.percentile(sites[score_key].values, lower_threshold)
        
    filtered_sites = (sites[(sites[score_key] >= lower_thresh) & (sites[score_key] <= upper_thresh)].copy().sort_values(score_key, ascending=False))
    
    if num_sites != None:
        assert num_sites <= len(filtered_sites), "length of dataframe is smaller than requested number of sites, change contraints"
        
        if mode == "head":
            filtered_sites = filtered_sites[:num_sites]
        elif mode == "tail":
            filtered_sites = filtered_sites[-num_sites:]
        else:
            filtered_sites = filtered_sites.sample(n=num_sites)
    
    return filtered_sites
    

In [30]:
strong_sites = filter_sites_by_score(
    sites,
    score_key="SCD",
    upper_threshold=99,
    lower_threshold=1,
    mode="head",
    num_sites=1000000000
    )

AssertionError: length of dataframe is smaller than requested number of sites, change contraints

In [31]:
strong_sites

Unnamed: 0,SCD,SSD,alt_INS-128,alt_INS-16,alt_INS-256,alt_INS-32,alt_INS-64,boundary_index,boundary_strength_200000,chrom,...,INS-32,INS-64,INS-128,INS-256,score_all_ctcf,score_10k,start_2,end_2,count,count.1
2950,53.78125,-46.9375,-0.461426,-0.287354,-0.284668,-0.391846,-0.457764,3096,1.33,chr12,...,-0.190186,-0.234619,-0.249512,-0.145508,53.90625,56.75,35192361,35192380,0,2
378,53.6875,-14.453125,-0.68457,-0.049316,-0.350098,-0.13208,-0.480225,449,1.226,chr2,...,-0.238525,-0.184326,-0.134766,-0.084229,53.5,53.875,48886252,48886271,0,1
2949,53.65625,-46.625,-0.463379,-0.28833,-0.285645,-0.393311,-0.459717,3096,1.33,chr12,...,-0.188721,-0.232666,-0.247559,-0.144531,53.90625,56.75,35192359,35192378,0,2
2072,53.5,-43.0625,-0.231323,-0.326416,-0.165283,-0.300537,-0.263672,2237,0.799,chr8,...,-0.24292,-0.270996,-0.275391,-0.148926,56.03125,78.375,102781112,102781131,0,1
1197,53.3125,-37.15625,-0.202271,-0.185669,-0.188232,-0.247314,-0.262207,1349,0.7466,chr5,...,-0.299072,-0.300781,-0.270508,-0.159912,53.75,49.125,49961991,49962010,0,1
2185,53.125,-29.78125,-0.150146,-0.161621,-0.263184,-0.255859,-0.237915,2336,0.2778,chr9,...,-0.149902,-0.175659,-0.227539,-0.132568,53.46875,49.9375,26776356,26776375,0,1
2250,53.0625,-17.140625,-0.268555,-0.214844,-0.23584,-0.275391,-0.184204,2387,0.6406,chr9,...,-0.240234,-0.207886,-0.156494,-0.060547,52.5625,56.25,51152589,51152608,0,1
2014,52.875,-12.054688,-0.057007,-0.050964,-0.110901,-0.055359,-0.05777,2154,0.5864,chr8,...,-0.179443,-0.195435,-0.209595,-0.110779,52.9375,53.3125,58566264,58566283,0,1
2960,52.84375,-36.96875,-0.635254,-0.40625,-0.471924,-0.518066,-0.57959,3106,1.109,chr12,...,-0.205078,-0.21875,-0.213867,-0.124756,75.75,89.375,40837731,40837750,0,1
1995,52.5,10.507812,-0.122314,-0.178467,-0.167725,-0.176514,-0.144287,2119,0.627,chr8,...,-0.241455,-0.22998,-0.173828,-0.060059,52.84375,56.6875,36523116,36523135,0,1


In [9]:
weak_sites = filter_sites_by_score(
    sites,
    score_key="SCD",
    upper_threshold=99,
    lower_threshold=1,
    mode="tail",
    num_sites=10
    )

In [10]:
site_df = pd.concat([strong_sites.copy(), weak_sites.copy()])

In [11]:
seq_coords_df = (
    site_df[["chrom", "start_2", "end_2", "strand_2", score_key]]
    .copy()
    .rename(
        columns={
            "start_2": "start",
            "end_2": "end",
            "strand_2": "strand",
            score_key: "genomic_" + score_key,
        }
    )
)
seq_coords_df.reset_index(drop=True, inplace=True)
seq_coords_df.reset_index(inplace=True)

In [12]:
seq_coords_df

Unnamed: 0,index,chrom,start,end,strand,genomic_SCD
0,0,chr12,35192361,35192380,+,53.78125
1,1,chr2,48886252,48886271,-,53.6875
2,2,chr12,35192359,35192378,-,53.65625
3,3,chr8,102781112,102781131,-,53.5
4,4,chr5,49961991,49962010,-,53.3125
5,5,chr9,26776356,26776375,-,53.125
6,6,chr9,51152589,51152608,-,53.0625
7,7,chr8,58566264,58566283,+,52.875
8,8,chr12,40837731,40837750,+,52.84375
9,9,chr8,36523116,36523135,-,52.5


In [13]:
# test -> comparison with the old version

strong_sites, weak_sites = akita_utils.filter_sites_by_score(
    sites,
    score_key=score_key,
    weak_thresh_pct=weak_thresh_pct,
    weak_num=10,
    strong_thresh_pct=strong_thresh_pct,
    strong_num=10,
)

site_df2 = pd.concat([strong_sites.copy(), weak_sites.copy()])
seq_coords_df2 = (
    site_df2[["chrom", "start_2", "end_2", "strand_2", score_key]]
    .copy()
    .rename(
        columns={
            "start_2": "start",
            "end_2": "end",
            "strand_2": "strand",
            score_key: "genomic_" + score_key,
        }
    )
)
seq_coords_df2.reset_index(drop=True, inplace=True)
seq_coords_df2.reset_index(inplace=True)

In [14]:
seq_coords_df2

Unnamed: 0,index,chrom,start,end,strand,genomic_SCD
0,0,chr12,35192361,35192380,+,53.78125
1,1,chr2,48886252,48886271,-,53.6875
2,2,chr12,35192359,35192378,-,53.65625
3,3,chr8,102781112,102781131,-,53.5
4,4,chr5,49961991,49962010,-,53.3125
5,5,chr9,26776356,26776375,-,53.125
6,6,chr9,51152589,51152608,-,53.0625
7,7,chr8,58566264,58566283,+,52.875
8,8,chr12,40837731,40837750,+,52.84375
9,9,chr8,36523116,36523135,-,52.5
