In [1]:
from optparse import OptionParser
import json
import bioframe as bf
import numpy as np
import pandas as pd
import os
from akita_utils.tsv_gen_utils import (
    filter_by_chrmlen,
    filter_by_overlap_num,
    filter_by_chromID,
)
from akita_utils.format_io import read_jaspar_to_numpy, read_rmsk

## Data and parameters

In [2]:
# --- run options -----------------------------------------------------------
# When True, the chromosomes listed in chromID_to_drop are removed from every
# table via filter_by_chromID.
autosomes_only = True
if autosomes_only:
    chromID_to_drop = ["chrX", "chrY", "chrM"]

# Sequence length handed to filter_by_chrmlen together with the chrom sizes.
seq_length = 1_310_720

# --- input files -----------------------------------------------------------
chrom_sizes_file = "/project/fudenber_735/genomes/mm10/mm10.chrom.sizes.reduced"
jaspar_file = "/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz"
rmsk_file = "/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz"
dot_file = "/project/fudenber_735/GEO/bonev_2017_GSE96107/distiller-0.3.1_mm10/results/coolers/features/mustache_HiC_ES.mm10.mapq_30.10000.tsv"

# --- overlap-filter windows ------------------------------------------------
# Passed as expand_window to filter_by_overlap_num for the CTCF and the
# rmsk filtering steps respectively.
ctcf_filter_expand_window = 60
rmsk_filter_expand_window = 20

In [3]:
# Load the JASPAR motif table (first row skipped), optionally drop the
# unwanted chromosomes, and renumber the rows from zero.
jaspar_raw = bf.read_table(jaspar_file, schema="jaspar", skiprows=1)
jaspar_df = (
    filter_by_chromID(jaspar_raw, chrID_to_drop=chromID_to_drop)
    if autosomes_only
    else jaspar_raw
)
jaspar_df = jaspar_df.reset_index(drop=True)

In [4]:
# repeat annotations, used further down as filter_df for the rmsk filter
rmsk_df = read_rmsk(rmsk_file)

# dot calls: tab-separated table whose header row supplies the column names
dots = pd.read_table(dot_file)

In [5]:
# Stack both anchors of every dot into one chrom/start/end table; each anchor
# row keeps the FDR and DETECTION_SCALE of the dot it came from.
# NOTE: `dots` is rebound to the stacked table, so this cell cannot be re-run
# without first reloading `dots` from dot_file.
dot_attr_cols = ["FDR", "DETECTION_SCALE"]

dots_bin1 = dots[["BIN1_CHR", "BIN1_START", "BIN1_END"] + dot_attr_cols].rename(
    columns={"BIN1_CHR": "chrom", "BIN1_START": "start", "BIN1_END": "end"}
)
dots_bin2 = dots[["BIN2_CHROMOSOME", "BIN2_START", "BIN2_END"] + dot_attr_cols].rename(
    columns={"BIN2_CHROMOSOME": "chrom", "BIN2_START": "start", "BIN2_END": "end"}
)

dots = pd.concat([dots_bin1, dots_bin2])

In [6]:
# Optionally drop the excluded chromosomes, then apply the chromosome-length
# filter and renumber the surviving anchors from zero.
if autosomes_only:
    dots = filter_by_chromID(dots, chrID_to_drop=chromID_to_drop)

dots = filter_by_chrmlen(dots, chrom_sizes_file, seq_length)
dots = dots.reset_index(drop=True)

In [7]:
# Number of dot-anchor rows left after the chromosome and chromosome-length filters.
len(dots)

18543

### Dot Anchors Summary
In total: 18543 dot anchors

In [8]:
# Insulation table; the "window_<N>" token in the file name is parsed later
# to build the boundary-strength / insulation-score column names.
boundaries_file = "/project/fudenber_735/GEO/bonev_2017_GSE96107/distiller-0.3.1_mm10/results/coolers/features/bonev2017.HiC_ES.mm10.mapq_30.1000.window_200000.insulation"
boundaries = pd.read_table(boundaries_file)

In [9]:
# Pull the window size out of the file name (the text between "window_" and
# the next ".") and derive the two per-window column names from it.
text_after_window = boundaries_file.split("window_")[1]
window_size = text_after_window.split(".")[0]

boundary_key = f"boundary_strength_{window_size}"
insulation_key = f"log2_insulation_score_{window_size}"

In [10]:
# Cut-offs for the boundary filter: keep rows whose boundary strength is
# above the first value and whose log2 insulation score is below the second.
boundary_strength_thresh, boundary_insulation_thresh = 0.25, 0

In [11]:
# Keep only rows that pass BOTH thresholds (strength above, insulation below).
is_strong = boundaries[boundary_key].values > boundary_strength_thresh
is_insulating = boundaries[insulation_key].values < boundary_insulation_thresh
boundaries = boundaries.iloc[is_strong & is_insulating]

In [13]:
# Same chromosome / chromosome-length filtering that is applied to the dot
# anchors, followed by a fresh zero-based index.
if autosomes_only:
    boundaries = filter_by_chromID(boundaries, chrID_to_drop=chromID_to_drop)

boundaries = filter_by_chrmlen(boundaries, chrom_sizes_file, seq_length)
boundaries = boundaries.reset_index(drop=True)

In [14]:
# Number of boundaries that pass the thresholds and the chromosome filters.
len(boundaries)

4474

### Boundaries Summary
In total: 4474 boundaries

In [16]:
# Pair every boundary with the CTCF motif hits it overlaps; motif columns
# receive the "_2" suffix.
boundary_ctcf_pairs = bf.overlap(
    boundaries, jaspar_df, suffixes=("", "_2"), return_index=False
)

# Keep only pairs that actually carry motif coordinates.
has_motif = (
    boundary_ctcf_pairs["start_2"].notnull()
    & boundary_ctcf_pairs["end_2"].notnull()
)
df_overlap = boundary_ctcf_pairs[has_motif]

# "start-end" label of the boundary each motif hit falls in.
span_labels = df_overlap["start"].astype(str) + "-" + df_overlap["end"].astype(str)
df_overlap = df_overlap.assign(span=span_labels)

# Motif coordinates and attributes first, then the boundary metrics.
df_keys = ["chrom", "start_2", "end_2", "span", "score_2", "strand_2"] + [
    insulation_key,
    boundary_key,
]

df_overlap = df_overlap.loc[:, df_keys]

In [17]:
# From here on the motif hit is the primary interval: its coordinates take
# the plain chrom/start/end names.
motif_column_names = {
    "start_2": "start",
    "end_2": "end",
    "strand_2": "strand",
    "score_2": "jaspar_score",
    "span": "boundary_span",
}
df_overlap = df_overlap.rename(columns=motif_column_names)

# CTCF filter against the motif table itself.
# NOTE(review): max_overlap_num=1 presumably allows for each site
# overlapping its own entry in jaspar_df - confirm against filter_by_overlap_num.
boundary_sites_ctcf_filtered = filter_by_overlap_num(
    df_overlap,
    filter_df=jaspar_df,
    expand_window=ctcf_filter_expand_window,
    max_overlap_num=1,
)

# rmsk filter, applied to the sites that survived the CTCF filter.
B_filtered_df = filter_by_overlap_num(
    boundary_sites_ctcf_filtered,
    rmsk_df,
    expand_window=rmsk_filter_expand_window,
    working_df_cols=["chrom", "start", "end"],
)

In [18]:
# Number of boundary-overlapping CTCF rows left after the CTCF and rmsk filters.
len(B_filtered_df)

7560

### CTCFs overlapping boundaries
In total: 7560 sites

In [19]:
# Pair every dot anchor with the CTCF motif hits it overlaps; motif columns
# receive the "_2" suffix.
dot_ctcf_pairs = bf.overlap(
    dots, jaspar_df, suffixes=("", "_2"), return_index=False
)

# Keep only pairs that actually carry motif coordinates.
has_motif = dot_ctcf_pairs["start_2"].notnull() & dot_ctcf_pairs["end_2"].notnull()
df_overlap = dot_ctcf_pairs[has_motif]

In [20]:
# "start-end" label of the dot anchor each motif hit falls in.
span_labels = df_overlap["start"].astype(str) + "-" + df_overlap["end"].astype(str)
df_overlap = df_overlap.assign(span=span_labels)

In [21]:
# Motif coordinates and attributes first, then the per-dot metrics.
df_keys = ["chrom", "start_2", "end_2", "span", "score_2", "strand_2"] + [
    "FDR",
    "DETECTION_SCALE",
]

df_overlap = df_overlap.loc[:, df_keys]

In [22]:
# Motif coordinates take the plain start/end/strand names.
# NOTE: "span" holds the dot-anchor interval here, but is still renamed to
# "boundary_span" so the column layout matches the boundary table.
motif_column_names = {
    "start_2": "start",
    "end_2": "end",
    "strand_2": "strand",
    "score_2": "jaspar_score",
    "span": "boundary_span",
}
df_overlap = df_overlap.rename(columns=motif_column_names)

In [23]:
# CTCF filter against the motif table itself, with the same settings as for
# the boundary-overlapping sites.
dot_sites_ctcf_filtered = filter_by_overlap_num(
    df_overlap,
    filter_df=jaspar_df,
    expand_window=ctcf_filter_expand_window,
    max_overlap_num=1,
)

# rmsk filter, applied to the sites that survived the CTCF filter.
filtered_df = filter_by_overlap_num(
    dot_sites_ctcf_filtered,
    rmsk_df,
    expand_window=rmsk_filter_expand_window,
    working_df_cols=["chrom", "start", "end"],
)

In [24]:
# Number of dot-anchor-overlapping CTCF rows left after the CTCF and rmsk filters.
len(filtered_df)

39226

### CTCFs overlapping dot anchors
In total: 39,226 sites

In [33]:
# Merging the DataFrames with an indicator
merged_df = pd.merge(filtered_df, B_filtered_df, on=['chrom', 'start', 'end'], how='left', indicator=True)

In [38]:
# Count the rows produced by an inner join of the two site tables on position.
sites_in_both = filtered_df.merge(
    B_filtered_df, how="inner", on=["chrom", "start", "end"]
)
num_overlap = len(sites_in_both)

In [39]:
# Display the size of the inner join between dot-anchor and boundary CTCF sites.
num_overlap

2278

### 2,278 CTCF sites overlap both boundaries and dot anchors

In [40]:
# Rows of the left join that found no partner in the boundary table.
is_dot_only = merged_df["_merge"] == "left_only"
unique_to_dot_anchors = merged_df[is_dot_only]

In [41]:
# Drop the columns contributed by the boundary table (the "_y" duplicates and
# the two boundary metrics) plus the merge indicator. The metric column names
# come from insulation_key / boundary_key rather than hard-coded
# "..._200000" strings, so this keeps working if the insulation window in
# boundaries_file changes.
unique_to_dot_anchors = unique_to_dot_anchors.drop(
    columns=[
        "boundary_span_y",
        "jaspar_score_y",
        "strand_y",
        insulation_key,
        boundary_key,
        "_merge",
    ]
)

In [42]:
# Strip the "_x" suffix that the merge added to the dot-anchor columns.
dot_side_names = {
    f"{name}_x": name for name in ("boundary_span", "jaspar_score", "strand")
}
unique_to_dot_anchors = unique_to_dot_anchors.rename(columns=dot_side_names)

In [43]:
# Number of dot-anchor CTCF rows with no match in the boundary table.
len(unique_to_dot_anchors)

36948

### 36,948 CTCF sites overlap dot anchors only (not boundaries)

In [44]:
# How many dot-anchor CTCF sites there are per shared site (floor division).
# Uses the live table size instead of a number copied from an earlier cell
# output, so the ratio stays correct if the upstream filtering changes.
len(filtered_df) // num_overlap

17

### About 1/17 of dot-CTCFs overlap boundaries

In [45]:
# How many boundary CTCF sites there are per shared site (floor division).
# Uses the live table size instead of a number copied from an earlier cell
# output, so the ratio stays correct if the upstream filtering changes.
len(B_filtered_df) // num_overlap

3

### 1/3 of boundary-CTCFs overlap dots