In [1]:
import pandas as pd
import pysam
import numpy as np
import akita_utils
import seaborn as sns
import bioframe as bf

In [2]:
from akita_utils.dna_utils import scan_motif, dna_1hot
from akita_utils.format_io import read_jaspar_to_numpy, read_rmsk
from akita_utils.tsv_gen_utils import filter_by_overlap_num

2023-10-13 10:50:22.125445: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home1/smaruj/software/GSL/lib:/home1/smaruj/software/HTSLIB/lib
2023-10-13 10:50:22.125493: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Reading the TSV with all mouse CTCF motifs overlapping genomic boundaries

In [3]:
# Tab-separated table of the mouse CTCF motifs that overlap genomic boundaries.
all_ctcf_path = "/project/fudenber_735/tensorflow_models/akita/v2/analysis/to_insert_boundaries.motifs.ctcf.mm10.tsv"
# read_table uses a tab separator by default.
df = pd.read_table(all_ctcf_path)

Since the set of motifs is the same for all backgrounds, we can focus on the subset of the table corresponding to a single background (e.g. the 0th background).

In [4]:
# Columns retained for clarity; everything else in the table is dropped.
columns_to_keep = [
    "boundary_index",
    "chrom",
    "boundary_end",
    "index",
    "num_ctcf",
    "span",
    "boundary_start",
    "strand",
    "start",
    "end",
]

# Keep only the rows of the 0th background and, in the same step, only the
# columns listed above.
# NOTE: "background_index" is not among the kept columns, so re-running this
# cell without reloading `df` raises a KeyError.
df = df.loc[df["background_index"] == 0, columns_to_keep]

# filtering

## 1. by ctcf

In [5]:
# JASPAR motif table, parsed by bioframe with its "jaspar" schema.
jaspar_file = "/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz"
jaspar_df = bf.read_table(jaspar_file, schema="jaspar")

# Filter the sites in `df` by how many JASPAR entries they overlap.
# NOTE(review): max_overlap_num=1 presumably tolerates the single hit that is
# the site itself and drops sites overlapping additional motifs -- confirm
# against filter_by_overlap_num.
filtered_df = filter_by_overlap_num(df, filter_df=jaspar_df, max_overlap_num=1)

In [6]:
# Number of sites removed by the CTCF-overlap filter.
df.shape[0] - filtered_df.shape[0]

2351

2351 sites have been excluded from the analysis because they overlap other CTCF sites.

## 2. by rmsk (a table of repetitive elements, e.g. SINE, LINE, etc.)

In [7]:
# Path to the mm10 rmsk table (gzip-compressed text), loaded with the
# akita_utils reader. Per the section heading, this table lists repeat
# elements such as SINEs and LINEs.
rmsk_file = "/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz"
rmsk_df = read_rmsk(rmsk_file)

In [8]:
# Second filtering pass: screen the already CTCF-filtered sites against the
# rmsk table. max_overlap_num is not passed here, so the helper's default applies.
# NOTE(review): expand_window=20 presumably widens each site before the
# overlap test -- confirm units and semantics in filter_by_overlap_num.
filtered_df = filter_by_overlap_num(
    filtered_df,
    rmsk_df,
    expand_window=20,
    working_df_cols=["chrom", "start", "end"],
)

In [9]:
# Number of sites remaining after both filters.
filtered_df.shape[0]

7560

More than half of the sites have been filtered out.    
This is acceptable, since we prefer to work with a smaller set of high-quality sites (meaning that the effect of their insertion or disruption is not affected by overlapping CTCF sites or repetitive genomic elements).

# adding seq_id

In [10]:
# Give every retained site a consecutive integer id (0 .. n-1, in current row
# order) so the same CTCF sites can be identified between experiments.
seq_id = list(range(len(filtered_df)))

# .assign returns a new frame instead of writing a column into the frame
# returned by the filter helper, which may be a slice of another frame; this
# avoids pandas' SettingWithCopyWarning. A list is assigned positionally, so
# the (non-contiguous) index left by filtering does not matter.
filtered_df = filtered_df.assign(seq_id=seq_id)

# tsv saving

In [12]:
# Destination of the filtered table, relative to the notebook's working directory.
filtered_ctcf_tsv_path = "./../../data_filtered_ctcf_table/filtered_base_mouse_ctcf.tsv"

# Write as tab-separated text without the pandas index column.
filtered_df.to_csv(filtered_ctcf_tsv_path, sep="\t", index=False)