In [1]:
import pandas as pd

In [2]:
# TSV listing the TAD boundary locations to process.
# To run on the other boundary set, swap in the commented-out path below.
tsv_file = "./input_data/TAD_boundaries/boundaries_no_strong_CTCF.tsv"
# tsv_file = "./input_data/TAD_boundaries/boundaries_strong_CTCFs.tsv"

# Tab-separated table of chromosome names and sizes for the mm10 assembly.
# NOTE(review): absolute, cluster-specific path - adjust when running elsewhere.
chrom_sizes = "/project/fudenber_735/genomes/mm10/mm10.fa.sizes"

In [3]:
# Chromosome-size table: the file carries no header row, so the two column
# names ("chrom", "size") are supplied here.
chrom_sizes_table = pd.read_csv(
    chrom_sizes,
    sep="\t",
    header=None,
    names=["chrom", "size"],
)

# Boundary table: keep every column except those whose header starts with
# "Unnamed" (the name pandas gives to header-less columns such as a saved index).
seq_coords_df = pd.read_csv(tsv_file, sep="\t", index_col=None).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

In [4]:
# Load the boundary table and drop every column whose header starts with "Unnamed".
# NOTE(review): this repeats the load already done in the previous cell; one of
# the two reads is redundant and can be removed once confirmed.
seq_coords_df = pd.read_csv(tsv_file, sep="\t", index_col=None).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

In [5]:
# Every TAD boundary is 10 kb long.
# For comparison purposes, each prediction window is centered on its TAD boundary.

# Length of one prediction window.
seq_length = 1_310_720

In [6]:
# Offset of the boundary's left edge inside the window: half the window minus
# half of the 10 kb boundary (5000), rounded down to a multiple of the
# 2048-wide bin so the boundary starts exactly on a bin edge.
rel_tad_start = ((seq_length // 2 - 5000) // 2048) * 2048

In [7]:
# A 10 kb boundary covers about 4.88 bins of 2048; round that up to 5 whole bins.
rel_tad_end = rel_tad_start + 5 * 2048

In [8]:
# Distance from a boundary's start coordinate to the right edge of its window.
# Together with rel_tad_start on the left side, this makes every window exactly
# seq_length long.
to_window_end = seq_length - rel_tad_start

In [9]:
# Window edges for every boundary: the window begins rel_tad_start before the
# boundary start and ends to_window_end after it.
seq_coords_df["window_start"] = seq_coords_df["start"].sub(rel_tad_start)
seq_coords_df["window_end"] = seq_coords_df["start"].add(to_window_end)

In [10]:
# Sanity check: number of windows whose left edge is at or before coordinate 0,
# i.e. windows that would run off the beginning of a chromosome.
int((seq_coords_df["window_start"] <= 0).sum())

0

In [11]:
# Largest window_end per chromosome, as a two-column frame (chrom, window_end).
check_chrom = seq_coords_df.groupby("chrom", as_index=False)["window_end"].max()

In [12]:
# Sanity check: for each chromosome, compare its right-most window end with the
# chromosome size and print "problem" once for every chromosome that is overrun.
# A chromosome missing from chrom_sizes_table raises IndexError at .iloc[0].
for boundary in check_chrom.itertuples(index=False):
    matching_sizes = chrom_sizes_table.loc[
        chrom_sizes_table["chrom"] == boundary.chrom, "size"
    ]
    if int(matching_sizes.iloc[0]) < boundary.window_end:
        print("problem")
    

In [13]:
# Build one copy of the boundary table per "down" bin: the 20 bins of width 2048
# that sit immediately before rel_tad_start (smaller relative coordinates).
# down1 is the bin adjacent to the boundary, down20 the farthest one.
all_down_dfs = []

# Loop through each bin (from down1 to down20)
for i in range(1, 21):
    # assign() returns a new frame and broadcasts each scalar to every row, so
    # neither an explicit copy nor a per-row list of constants is needed.
    temp_df = seq_coords_df[["chrom", "start", "end", "window_start", "window_end"]].assign(
        rel_disruption_start=rel_tad_start - i * 2048,
        rel_disruption_end=rel_tad_start - (i - 1) * 2048,
        type=f"down{i}",
    )

    all_down_dfs.append(temp_df)

# Stack the per-bin tables into one frame with a fresh 0..n-1 index.
combined_down_df = pd.concat(all_down_dfs, ignore_index=True)

In [14]:
# Build one copy of the boundary table per bin inside the boundary itself:
# 5 consecutive bins of width 2048 starting at rel_tad_start, labelled tad1..tad5.
tad_dfs = []

# i runs 0..4; the label is i + 1 so the bins are named tad1 to tad5.
for i in range(5):
    # assign() returns a new frame and broadcasts each scalar to every row, so
    # neither an explicit copy nor a per-row list of constants is needed.
    temp_df = seq_coords_df[["chrom", "start", "end", "window_start", "window_end"]].assign(
        rel_disruption_start=rel_tad_start + i * 2048,
        rel_disruption_end=rel_tad_start + (i + 1) * 2048,
        type=f"tad{i + 1}",
    )

    tad_dfs.append(temp_df)

# Stack the per-bin tables into one frame with a fresh 0..n-1 index.
combined_tad_df = pd.concat(tad_dfs, ignore_index=True)

In [15]:
# Build one copy of the boundary table per "up" bin: the 20 bins of width 2048
# that sit immediately after rel_tad_end (larger relative coordinates).
# up1 is the bin adjacent to the boundary, up20 the farthest one.
up_dfs = []

# Loop through each bin (from up1 to up20)
for i in range(1, 21):
    # assign() returns a new frame and broadcasts each scalar to every row, so
    # neither an explicit copy nor a per-row list of constants is needed.
    temp_df = seq_coords_df[["chrom", "start", "end", "window_start", "window_end"]].assign(
        rel_disruption_start=rel_tad_end + (i - 1) * 2048,
        rel_disruption_end=rel_tad_end + i * 2048,
        type=f"up{i}",
    )

    up_dfs.append(temp_df)

# Stack the per-bin tables into one frame with a fresh 0..n-1 index.
combined_up_df = pd.concat(up_dfs, ignore_index=True)

In [18]:
# Final table: all down bins, then the boundary (tad) bins, then all up bins,
# stacked row-wise and re-indexed from 0.
df_concatenated = pd.concat(
    (combined_down_df, combined_tad_df, combined_up_df),
    axis="index",
    ignore_index=True,
)

In [17]:
# df_concatenated.to_csv("boundaries_disruption_4.tsv", sep="\t", index=False)