In [1]:
import h5py
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from akita_utils.tsv_gen_utils import add_orientation, add_const_flank_and_diff_spacer, add_background

## Goal
Modify the TSV of all single-span CTCF sites that I used for the insertion vs. disruption experiment.

In [3]:
# Path to the input TSV, relative to the notebook's working directory.
all_ctcf_path = "./filtered_base_mouse_ctcf.tsv"

In [4]:
# Load the tab-separated input table into a DataFrame.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which suggests
# the file was written with its index; that column is carried through to the
# exported TSVs. Consider index_col=0 here — confirm nothing downstream relies on it.
df = pd.read_csv(all_ctcf_path, sep="\t")

In [5]:
# Preview the loaded table.
df

Unnamed: 0,boundary_index,chrom,boundary_end,index,num_ctcf,span,boundary_start,strand,start,end
0,0,chr1,4410000,0,2,4403267-4403286,4400000,-,4403267,4403286
1,1,chr1,4780000,4,10,4770055-4770074,4770000,+,4770055,4770074
2,1,chr1,4780000,5,10,4770180-4770199,4770000,-,4770180,4770199
3,1,chr1,4780000,6,10,4770867-4770886,4770000,+,4770867,4770886
4,1,chr1,4780000,7,10,4773435-4773454,4770000,+,4773435,4773454
...,...,...,...,...,...,...,...,...,...,...
7501,4472,chr19,59390000,26376,10,59388426-59388445,59380000,+,59388426,59388445
7502,4472,chr19,59390000,26377,10,59388676-59388695,59380000,-,59388676,59388695
7503,4472,chr19,59390000,26378,10,59389390-59389409,59380000,+,59389390,59389409
7504,4473,chr19,59780000,26381,3,59770157-59770176,59770000,-,59770157,59770176


In [6]:
# Experiment dimensions used by the cells below.
nr_targets = 6          # NOTE(review): not referenced by any visible cell
nr_sites = df.shape[0]  # number of rows in the loaded table
nr_backgrounds = 10     # background indices 0 .. nr_backgrounds - 1 are used later

In [7]:
# Give every row a sequential integer identifier, 0 .. nr_sites - 1.
seq_id = list(range(nr_sites))
df["seq_id"] = seq_id

In [8]:
# Preview the table with the new seq_id column.
df

Unnamed: 0,boundary_index,chrom,boundary_end,index,num_ctcf,span,boundary_start,strand,start,end,seq_id
0,0,chr1,4410000,0,2,4403267-4403286,4400000,-,4403267,4403286,0
1,1,chr1,4780000,4,10,4770055-4770074,4770000,+,4770055,4770074,1
2,1,chr1,4780000,5,10,4770180-4770199,4770000,-,4770180,4770199,2
3,1,chr1,4780000,6,10,4770867-4770886,4770000,+,4770867,4770886,3
4,1,chr1,4780000,7,10,4773435-4773454,4770000,+,4773435,4773454,4
...,...,...,...,...,...,...,...,...,...,...,...
7501,4472,chr19,59390000,26376,10,59388426-59388445,59380000,+,59388426,59388445,7501
7502,4472,chr19,59390000,26377,10,59388676-59388695,59380000,-,59388676,59388695,7502
7503,4472,chr19,59390000,26378,10,59389390-59389409,59380000,+,59389390,59389409,7503
7504,4473,chr19,59780000,26381,3,59770157-59770176,59770000,-,59770157,59770176,7504


## Adding background index and experiment ID

In [9]:
# Combine the site table with the background indices 0 .. nr_backgrounds - 1
# (the result shown below carries a new `background_index` column).
# NOTE(review): `df` is rebound, so re-running this cell on its own feeds the
# already-expanded table back into add_background.
df = add_background(df, list(range(nr_backgrounds)))

In [10]:
# Assign a unique running experiment ID to every row of the expanded table.
# The IDs are derived from the frame's current length rather than from
# nr_sites * nr_backgrounds, so the cell no longer fails with a length-mismatch
# ValueError when nr_sites is stale (e.g. cell 6 re-run after the expansion) or
# the expansion returned a different number of rows. When the sizes agree, the
# values are identical to the previous implementation.
exp_id = list(range(len(df)))
df["exp_id"] = exp_id

In [11]:
# Preview the table with background_index and exp_id added.
df

Unnamed: 0,boundary_index,chrom,boundary_end,index,num_ctcf,span,boundary_start,strand,start,end,seq_id,background_index,exp_id
0,0,chr1,4410000,0,2,4403267-4403286,4400000,-,4403267,4403286,0,0,0
1,1,chr1,4780000,4,10,4770055-4770074,4770000,+,4770055,4770074,1,0,1
2,1,chr1,4780000,5,10,4770180-4770199,4770000,-,4770180,4770199,2,0,2
3,1,chr1,4780000,6,10,4770867-4770886,4770000,+,4770867,4770886,3,0,3
4,1,chr1,4780000,7,10,4773435-4773454,4770000,+,4773435,4773454,4,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75055,4472,chr19,59390000,26376,10,59388426-59388445,59380000,+,59388426,59388445,7501,9,75055
75056,4472,chr19,59390000,26377,10,59388676-59388695,59380000,-,59388676,59388695,7502,9,75056
75057,4472,chr19,59390000,26378,10,59389390-59389409,59380000,+,59389390,59389409,7503,9,75057
75058,4473,chr19,59780000,26381,3,59770157-59770176,59770000,-,59770157,59770176,7504,9,75058


## for boundary experiment

In [12]:
# Single orientation string used for the boundary experiment.
# NOTE(review): "<>" presumably encodes the relative direction of the inserted motifs — confirm in akita_utils.
orientation = ["<>"]

In [13]:
# Build the boundary table from df with the chosen orientation string
# (shows up as the `orientation` column in the output below).
# all_permutations=False: presumably only the listed string is used rather than
# every arrangement — TODO confirm in akita_utils.tsv_gen_utils.
boundary_df = add_orientation(df, orientation_strings=orientation, all_permutations=False)

In [14]:
# adding 20bp flanks
flank = 20
spacing_list = [70]

In [15]:
# Attach the constant flank size and the spacer size(s) to the boundary table
# (flank_bp / spacer_bp columns in the output below).
boundary_df = add_const_flank_and_diff_spacer(boundary_df, flank, spacing_list)

In [16]:
# Preview the finished boundary-experiment table.
boundary_df

Unnamed: 0,boundary_index,chrom,boundary_end,index,num_ctcf,span,boundary_start,strand,start,end,seq_id,background_index,exp_id,orientation,flank_bp,spacer_bp
0,0,chr1,4410000,0,2,4403267-4403286,4400000,-,4403267,4403286,0,0,0,<>,20,70
1,1,chr1,4780000,4,10,4770055-4770074,4770000,+,4770055,4770074,1,0,1,<>,20,70
2,1,chr1,4780000,5,10,4770180-4770199,4770000,-,4770180,4770199,2,0,2,<>,20,70
3,1,chr1,4780000,6,10,4770867-4770886,4770000,+,4770867,4770886,3,0,3,<>,20,70
4,1,chr1,4780000,7,10,4773435-4773454,4770000,+,4773435,4773454,4,0,4,<>,20,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75055,4472,chr19,59390000,26376,10,59388426-59388445,59380000,+,59388426,59388445,7501,9,75055,<>,20,70
75056,4472,chr19,59390000,26377,10,59388676-59388695,59380000,-,59388676,59388695,7502,9,75056,<>,20,70
75057,4472,chr19,59390000,26378,10,59389390-59389409,59380000,+,59389390,59389409,7503,9,75057,<>,20,70
75058,4473,chr19,59780000,26381,3,59770157-59770176,59770000,-,59770157,59770176,7504,9,75058,<>,20,70


In [17]:
# Write the boundary-experiment table as a tab-separated file, omitting the
# DataFrame index.
boundary_df.to_csv(
    "./new_correct_all_motifs_boundary.tsv",
    sep="\t",
    index=False,
)

## for dot experiment

In [18]:
# Single orientation string used for the dot experiment (rebinds `orientation`).
# NOTE(review): "><" presumably encodes the relative direction of the inserted motifs — confirm in akita_utils.
orientation = ["><"]

In [19]:
# Build the dot table from the same df with the chosen orientation string
# (shows up as the `orientation` column in the output below).
# all_permutations=False: presumably only the listed string is used rather than
# every arrangement — TODO confirm in akita_utils.tsv_gen_utils.
dot_df = add_orientation(df, orientation_strings=orientation, all_permutations=False)

In [20]:
# adding 20bp flanks
flank = 20
spacing_list = [199980]

In [21]:
# Attach the constant flank size and the spacer size(s) to the dot table
# (flank_bp / spacer_bp columns in the output below).
dot_df = add_const_flank_and_diff_spacer(dot_df, flank, spacing_list)

In [22]:
# Preview the finished dot-experiment table.
dot_df

Unnamed: 0,boundary_index,chrom,boundary_end,index,num_ctcf,span,boundary_start,strand,start,end,seq_id,background_index,exp_id,orientation,flank_bp,spacer_bp
0,0,chr1,4410000,0,2,4403267-4403286,4400000,-,4403267,4403286,0,0,0,><,20,199980
1,1,chr1,4780000,4,10,4770055-4770074,4770000,+,4770055,4770074,1,0,1,><,20,199980
2,1,chr1,4780000,5,10,4770180-4770199,4770000,-,4770180,4770199,2,0,2,><,20,199980
3,1,chr1,4780000,6,10,4770867-4770886,4770000,+,4770867,4770886,3,0,3,><,20,199980
4,1,chr1,4780000,7,10,4773435-4773454,4770000,+,4773435,4773454,4,0,4,><,20,199980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75055,4472,chr19,59390000,26376,10,59388426-59388445,59380000,+,59388426,59388445,7501,9,75055,><,20,199980
75056,4472,chr19,59390000,26377,10,59388676-59388695,59380000,-,59388676,59388695,7502,9,75056,><,20,199980
75057,4472,chr19,59390000,26378,10,59389390-59389409,59380000,+,59389390,59389409,7503,9,75057,><,20,199980
75058,4473,chr19,59780000,26381,3,59770157-59770176,59770000,-,59770157,59770176,7504,9,75058,><,20,199980


In [24]:
# Write the dot-experiment table as a tab-separated file, omitting the
# DataFrame index.
dot_df.to_csv(
    "./new_correct_all_motifs_dot.tsv",
    sep="\t",
    index=False,
)