In [1]:
import pandas as pd
import numpy as np
import bioframe
from akita_utils.tsv_gen_utils import filter_boundary_ctcfs_from_h5, filter_by_overlap
from akita_utils.format_io import read_rmsk

In [2]:
# Load the mm10 RepeatMasker (rmsk) annotation table.
rmsk_path = "/project/fudenber_735/genomes/mm10/database/rmsk.txt.gz"
rmsk_df = read_rmsk(rmsk_path)

In [3]:
# Display the repeat annotation table (one row per annotated repeat interval).
rmsk_df

Unnamed: 0,bin,swScore,milliDiv,milliDel,milliIns,chrom,start,end,genoLeft,strand,repName,repClass,repFamily,repStart,repEnd,repLeft,id
0,607,12955,105,9,10,chr1,3000000,3002128,-192469843,-,L1_Mus3,LINE,L1,-3055,3592,1466,1
1,607,1216,268,31,105,chr1,3003152,3003994,-192467977,-,L1Md_F,LINE,L1,-5902,617,1,2
2,607,234,279,0,0,chr1,3003993,3004054,-192467917,-,L1_Mus3,LINE,L1,-6034,297,237,3
3,607,3685,199,21,14,chr1,3004040,3004206,-192467765,+,L1_Rod,LINE,L1,1321,1492,-4355,4
4,607,376,62,31,0,chr1,3004206,3004270,-192467701,+,(CAAA)n,Simple_repeat,Simple_repeat,4,69,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5333734,586,8702,95,80,89,chrna_GL456050_alt,140098,140367,-1974,+,RMER16-int,LTR,ERVK,411,667,-4739,1
5333735,586,44,24,0,0,chrna_GL456050_alt,140367,140409,-1932,+,(AG)n,Simple_repeat,Simple_repeat,1,42,0,1
5333736,586,8702,95,80,89,chrna_GL456050_alt,140409,140631,-1710,+,RMER16-int,LTR,ERVK,668,930,-4476,1
5333737,586,5082,56,3,0,chrna_GL456050_alt,140631,141216,-1125,-,LTRIS_Mus,LTR,ERV1,0,587,1,1


In [4]:
# Load the JASPAR MA0139.1 (CTCF) motif table for mm10.
ctcf_motifs_path = "/project/fudenber_735/motifs/mm10/jaspar/MA0139.1.tsv.gz"
ctcf_df = bioframe.read_table(ctcf_motifs_path, schema="jaspar")

In [5]:
# Display the CTCF motif table (chrom/start/end, score, pval, strand per motif).
ctcf_df

Unnamed: 0,chrom,start,end,name,score,pval,strand
0,chr10,3103662,3103681,CTCF,814,410,-
1,chr10,3108121,3108140,CTCF,859,504,-
2,chr10,3119020,3119039,CTCF,801,386,-
3,chr10,3124081,3124100,CTCF,807,397,-
4,chr10,3127081,3127100,CTCF,810,403,+
...,...,...,...,...,...,...,...
825505,chrY,90829177,90829196,CTCF,842,465,+
825506,chrY,90839573,90839592,CTCF,804,392,+
825507,chrY,90839785,90839804,CTCF,885,565,-
825508,chrY,90839787,90839806,CTCF,842,465,+


In [6]:
# Build the data frame of CTCF sites from the per-model scd.h5 result files,
# using the "SCD" score.
# NOTE(review): threshold_all_ctcf=0 presumably disables boundary-level
# thresholding - confirm against akita_utils.

scd_h5_glob = "/project/fudenber_735/tensorflow_models/akita/v2/analysis/permute_boundaries_motifs_ctcf_mm10_model*/scd.h5"
sites = filter_boundary_ctcfs_from_h5(
    h5_dirs=scd_h5_glob,
    score_key="SCD",
    threshold_all_ctcf=0,
)

annotating each site with boundary-wide scores


In [7]:
# filtering out sites that overlap more than one CTCF motif

filtered_sites = filter_by_overlap(
                        sites,
                        ctcf_df,
                        exclude_window=60,
                        working_df_cols = ["chrom","start_2","end_2"],
                        filter_df_cols = ["chrom","start","end"],
                        overlap_threshold=1)

In [8]:
# filtering out sites that overlap repetitive elements

filtered_sites = filter_by_overlap(
                        filtered_sites,
                        rmsk_df,
                        exclude_window=60,
                        working_df_cols = ["chrom","start_2","end_2"],
                        filter_df_cols = ["chrom","start","end"])

In [9]:
# Display the sites that remain after the CTCF-motif and repeat-overlap filters.
filtered_sites

Unnamed: 0,SCD,SSD,alt_INS-128,alt_INS-16,alt_INS-256,alt_INS-32,alt_INS-64,boundary_index,boundary_strength_200000,chrom,...,INS-32,INS-64,INS-128,INS-256,score_all_ctcf,score_10k,start_2,end_2,count,count.1
0,26.125000,5.972656,-0.379639,-0.390381,-0.136719,-0.553223,-0.655273,1,1.0030,chr1,...,0.017090,0.016602,0.008545,0.005859,28.343750,29.125000,4770055,4770074,1,0
1,0.571289,-0.006458,-0.370850,-0.378662,-0.130859,-0.535645,-0.637695,1,1.0030,chr1,...,-0.000488,-0.000977,-0.000244,0.000000,28.343750,29.125000,4770180,4770199,1,0
2,0.190796,0.001311,-0.371094,-0.380127,-0.130859,-0.536621,-0.638672,1,1.0030,chr1,...,0.000488,0.000000,0.000000,0.000000,28.343750,29.125000,4770867,4770886,1,0
3,1.109375,0.053772,-0.370361,-0.376953,-0.130859,-0.534180,-0.637695,1,1.0030,chr1,...,-0.001953,-0.000977,-0.000732,0.000000,28.343750,29.125000,4773435,4773454,1,0
4,0.532715,0.063232,-0.370605,-0.379395,-0.130737,-0.536621,-0.638672,1,1.0030,chr1,...,0.000488,0.000000,-0.000488,-0.000122,28.343750,29.125000,4775739,4775758,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6549,8.218750,5.265625,-0.233643,-0.316162,-0.243408,-0.406006,-0.408936,4493,0.2927,chr19,...,-0.015869,-0.008057,0.001221,0.007812,7.843750,10.265625,60152656,60152675,1,0
6550,0.247925,-0.044434,-0.232544,-0.332031,-0.235596,-0.422119,-0.416992,4493,0.2927,chr19,...,0.000244,0.000000,0.000122,0.000000,7.843750,10.265625,60153312,60153331,1,0
6551,0.054901,-0.007572,-0.232300,-0.331787,-0.235596,-0.421875,-0.416748,4493,0.2927,chr19,...,0.000000,-0.000244,-0.000122,0.000000,7.843750,10.265625,60158407,60158426,1,0
6552,0.186401,-0.007526,-0.231079,-0.332031,-0.235107,-0.421631,-0.416016,4493,0.2927,chr19,...,-0.000244,-0.000977,-0.001343,-0.000488,7.843750,10.265625,60158927,60158946,1,0
