In [1]:
from __future__ import print_function
import os

os.environ["OPENBLAS_NUM_THREADS"] = "1"

import random
from optparse import OptionParser
import pandas as pd

from akita_utils.tsv_gen_utils import (
    filter_boundary_ctcfs_from_h5,
    filter_by_rmsk,
    filter_by_ctcf,
    add_orientation,
    add_background,
    add_diff_flanks_and_const_spacer,
    validate_df_lenght,
    filter_dataframe_by_column,
)

In [2]:
from io import StringIO

In [3]:
# Scoring configuration used when loading the motif sites.
# NOTE(review): weak_thresh_pct / strong_thresh_pct are not referenced by any
# other visible cell — confirm they are still needed here.
score_key = "SCD"
weak_thresh_pct = 1
strong_thresh_pct = 99

In [4]:
# Load the candidate sites from every h5 file matched by the glob below,
# using `score_key` as the score column.
# NOTE(review): the meaning of threshold_all_ctcf=5 is defined inside
# akita_utils — confirm before changing it.
sites = filter_boundary_ctcfs_from_h5(
    h5_dirs="/project/fudenber_735/tensorflow_models/akita/v2/analysis/permute_boundaries_motifs_ctcf_mm10_model*/scd.h5",
    score_key=score_key,
    threshold_all_ctcf=5,
)

annotating each site with boundary-wide scores


In [5]:
# Number of sites right after loading, before any filtering.
len(sites)

7454

In [6]:
# Exclusion windows for the filters below; both are derived from flank_end.
flank_end = 20
rmsk_exclude_window = flank_end  # passed as exclude_window/expand_window to the rmsk filter
ctcf_exclude_window = 2 * flank_end  # passed as exclude_window/expand_window to the CTCF filter

In [None]:
# Filter the sites against the rmsk table using the akita_utils implementation.
# NOTE(review): this rebinds `sites`, so re-running the cell filters an
# already-filtered frame.
sites = filter_by_rmsk(
    sites,
    rmsk_file="/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz",
    exclude_window=rmsk_exclude_window,
    verbose=True,
)

In [None]:
# Number of sites remaining after the rmsk filter.
len(sites)

In [7]:
import bioframe

In [None]:
# In top-to-bottom order, filter_by_ctcf here is still the version imported
# from akita_utils; the local redefinitions only appear in later cells.
sites = filter_by_ctcf(
    sites,
    ctcf_file="/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz",
    exclude_window=ctcf_exclude_window,
    verbose=True,
)

In [None]:
# Number of sites remaining after the CTCF filter.
len(sites)

In [8]:
# This is how the function looks on this branch.

def filter_by_ctcf(
    sites,
    ctcf_file="/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz",
    exclude_window=60,
    site_cols=["chrom", "start", "end"],
    verbose=True,
):
    """
    Keep only the sites that overlap no CTCF motif listed in ctcf_file.

    Every motif interval is widened by exclude_window on both sides before
    overlaps are counted, so a site is dropped when a motif lies within
    exclude_window of it.

    Parameters
    -----------
    sites : dataFrame
        Set of genomic intervals. The interval of each site is read from the
        columns "chrom", "start_2", "end_2".
    ctcf_file : str
        Tab-separated file of CTCF motifs, read without a header row as the
        columns chrom, start, end, name, score, pval, strand.
    exclude_window : int
        Padding added to both ends of every motif before counting overlaps.
    site_cols : list
        Columns of the motif table that are handed to the overlap count.
    verbose : bool
        If True, print a progress message.

    Returns
    --------
    sites : dataFrame
        Rows of sites whose overlap count is 0, re-indexed from 0, with the
        additional "count" column produced by the overlap count.
    """

    if verbose:
        print("filtering sites by overlap with ctcfs")

    ctcf_cols = ["chrom", "start", "end", "name", "score", "pval", "strand"]
    ctcf_motifs = pd.read_table(ctcf_file, names=ctcf_cols)
    ctcf_motifs = bioframe.expand(ctcf_motifs, pad=exclude_window)

    counted_sites = bioframe.count_overlaps(
        sites, ctcf_motifs[site_cols], cols1=["chrom", "start_2", "end_2"]
    )

    # NOTE(review): open question carried over from the original cell — if each
    # site's (start_2, end_2) interval were itself a motif in ctcf_file, every
    # site would have count >= 1 and this filter would return an empty frame,
    # yet it does not. Verify what start_2/end_2 hold before relying on
    # `== 0` here.
    no_overlap = counted_sites["count"].to_numpy() == 0

    # Build a fresh frame instead of resetting the index in place on a slice,
    # which can trigger pandas' SettingWithCopyWarning.
    return counted_sites.loc[no_overlap].reset_index(drop=True)

In [None]:
# Changed variant: keeps sites with exactly one overlap (count == 1) instead of none.

def filter_by_ctcf(
    sites,
    ctcf_file="/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz",
    exclude_window=60,
    site_cols=["chrom", "start", "end"],
    verbose=True,
):
    """
    Keep only the sites that overlap exactly one CTCF motif listed in ctcf_file.

    Experimental variant: it redefines (and therefore shadows) any earlier
    filter_by_ctcf in the notebook namespace. Every motif interval is widened
    by exclude_window on both sides before overlaps are counted.

    Parameters
    -----------
    sites : dataFrame
        Set of genomic intervals. The interval of each site is read from the
        columns "chrom", "start_2", "end_2".
    ctcf_file : str
        Tab-separated file of CTCF motifs, read without a header row as the
        columns chrom, start, end, name, score, pval, strand.
    exclude_window : int
        Padding added to both ends of every motif before counting overlaps.
    site_cols : list
        Columns of the motif table that are handed to the overlap count.
    verbose : bool
        If True, print a progress message.

    Returns
    --------
    sites : dataFrame
        Rows of sites whose overlap count is exactly 1, re-indexed from 0,
        with the additional "count" column produced by the overlap count.
    """

    if verbose:
        print("filtering sites by overlap with ctcfs")

    ctcf_cols = ["chrom", "start", "end", "name", "score", "pval", "strand"]
    ctcf_motifs = pd.read_table(ctcf_file, names=ctcf_cols)
    ctcf_motifs = bioframe.expand(ctcf_motifs, pad=exclude_window)

    counted_sites = bioframe.count_overlaps(
        sites, ctcf_motifs[site_cols], cols1=["chrom", "start_2", "end_2"]
    )

    # NOTE(review): presumably the single allowed overlap is the site's own
    # motif — confirm. Sites with zero overlaps are dropped as well.
    single_overlap = counted_sites["count"].to_numpy() == 1

    # Build a fresh frame instead of resetting the index in place on a slice,
    # which can trigger pandas' SettingWithCopyWarning.
    return counted_sites.loc[single_overlap].reset_index(drop=True)

In [None]:
# Re-run the CTCF filter. In top-to-bottom order this call resolves to the
# most recent local definition of filter_by_ctcf, not the akita_utils import.
sites = filter_by_ctcf(
    sites,
    ctcf_file="/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz",
    exclude_window=ctcf_exclude_window,
    verbose=True,
)

In [None]:
# Number of sites remaining after the locally redefined CTCF filter.
len(sites)

In [None]:
# checking if the new implementation works as well

In [None]:
def read_rmsk(rmsk_file="/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz"):
    """
    Load an rmsk table and give its interval columns the standard names.

    The file is read as tab-separated text without a header row, using the
    17 column names listed below. The columns genoName, genoStart and genoEnd
    are returned as chrom, start and end.

    Parameters
    -----------
    rmsk_file : str
        Path to the rmsk table.

    Returns
    --------
    rmsk : dataFrame
        One row per line of the file, with interval columns chrom, start, end.
    """
    column_names = """bin swScore milliDiv milliDel milliIns genoName genoStart genoEnd genoLeft strand repName repClass repFamily repStart repEnd repLeft id""".split(" ")

    raw_table = pd.read_table(rmsk_file, names=column_names)

    standard_names = {"genoName": "chrom", "genoStart": "start", "genoEnd": "end"}
    return raw_table.rename(columns=standard_names)

In [None]:
def filter_by_overlap_num(
    working_df,
    filter_df,
    expand_window=60,
    working_df_cols=["chrom","start","end"],
    filter_df_cols=["chrom","start","end"],
    max_overlap_num=0):
    
    """
    Filter out rows from working_df that overlap more than max_overlap_num entries of filter_df.

    Every interval of filter_df is widened by expand_window on both sides
    before the overlaps are counted. Neither input frame is modified.

    Parameters
    -----------
    working_df : dataFrame
        First set of genomic intervals.
    filter_df : dataFrame
        Second set of genomic intervals.
    expand_window : int
        Padding added to both ends of every filter_df interval before counting overlaps.
    working_df_cols : list
        Columns (chrom, start, end order) specifying genomic intervals in the working_df.
    filter_df_cols : list
        Columns (chrom, start, end order) specifying genomic intervals in the filter_df.
        They are used for selecting, padding and overlapping, so they may
        differ from the default names.
    max_overlap_num : int
        All the rows with number of overlaps above this threshold will be filtered out.
        
    Returns
    --------
    working_df : dataFrame
        Rows of working_df with at most max_overlap_num overlaps, re-indexed
        from 0, with an additional "count" column holding the overlap count.

    """
    
    working_df_cols = list(working_df_cols)
    filter_df_cols = list(filter_df_cols)

    # Only the interval columns are needed; selecting them first avoids
    # copying the other columns of a large filter table during padding.
    # The column names are forwarded explicitly: without `cols` / `cols2`
    # bioframe falls back to its default interval column names and any
    # non-default filter_df_cols would fail.
    padded_filter_df = bioframe.expand(
        filter_df[filter_df_cols], pad=expand_window, cols=filter_df_cols
    )

    counted_df = bioframe.count_overlaps(
        working_df,
        padded_filter_df,
        cols1=working_df_cols,
        cols2=filter_df_cols,
    )

    keep_mask = counted_df["count"].to_numpy() <= max_overlap_num

    # Build a fresh frame instead of resetting the index in place on a slice,
    # which can trigger pandas' SettingWithCopyWarning.
    return counted_df.loc[keep_mask].reset_index(drop=True)

In [None]:
# Load the rmsk table once so it can be handed to filter_by_overlap_num.
rmsk_df = read_rmsk("/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz")

In [None]:
# Inspect the column names of `sites` before choosing working_df_cols below.
sites.columns

In [None]:
# rmsk filtering through the generalized helper: max_overlap_num=0 drops every
# site whose (start_2, end_2) interval overlaps any padded rmsk entry.
sites = filter_by_overlap_num(
    sites,
    rmsk_df,
    expand_window=rmsk_exclude_window,
    working_df_cols=["chrom","start_2","end_2"],
    filter_df_cols=["chrom","start","end"],
    max_overlap_num=0)

In [None]:
# Number of sites remaining after the rmsk pass of filter_by_overlap_num.
len(sites)

In [None]:
# Remove the "count" column left behind by filter_by_overlap_num before the
# next filtering pass.
sites = sites.drop(columns=["count"])

In [None]:
# Load the CTCF motif table via bioframe's "jaspar" schema instead of the
# hand-written column list used in filter_by_ctcf.
ctcf_df = bioframe.read_table("/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz", schema="jaspar")

In [None]:
# CTCF filtering through the generalized helper: max_overlap_num=1 keeps sites
# with at most one overlapping padded motif.
# NOTE(review): presumably the one allowed overlap is the site's own motif — confirm.
sites = filter_by_overlap_num(
    sites,
    ctcf_df,
    expand_window=ctcf_exclude_window,
    working_df_cols=["chrom","start_2","end_2"],
    filter_df_cols=["chrom","start","end"],
    max_overlap_num=1)

In [None]:
# Number of sites remaining after the CTCF pass of filter_by_overlap_num.
len(sites)

In [9]:
# NOTE(review): the recorded output equals the count right after loading, and
# the filter cells above show no execution count — it looks like no filter
# had been applied in this kernel session. Confirm with Restart & Run All.
len(sites)

7454

In [19]:
# Pick out a single site by its SSD value to debug filter_by_ctcf on one row.
# NOTE(review): exact float equality against a rounded literal is fragile;
# selecting by index or with a tolerance would be more robust.
single_site_df = sites[sites["SSD"] == 5.972656]

In [20]:
# Run the CTCF filter on the single selected site; the result is displayed
# (not assigned) so that its "count" column can be inspected.
filter_by_ctcf(
    single_site_df,
    ctcf_file="/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz",
    exclude_window=ctcf_exclude_window,
    verbose=True,
)

filtering sites by overlap with ctcfs


Unnamed: 0,SCD,SSD,alt_INS-128,alt_INS-16,alt_INS-256,alt_INS-32,alt_INS-64,boundary_index,boundary_strength_200000,chrom,...,INS-16,INS-32,INS-64,INS-128,INS-256,score_all_ctcf,score_10k,start_2,end_2,count
