In [1]:
import pandas as pd
import numpy as np

### Functions

In [2]:
def generate_good_windows(df_bad_windows):
    """Return the gap-free ("good") windows lying between consecutive bad windows.

    Each row of the input is compared with the row that follows it. A good
    window is emitted when both rows are on the same chromosome and the
    earlier row ends before the later one starts; it spans from the earlier
    row's 'end' to the later row's 'start'.

    Parameters
    ----------
    df_bad_windows : pandas.DataFrame
        Table with 'chr', 'start' and 'end' columns. Rows are compared in the
        order given (assumes they are sorted by chromosome and position --
        TODO confirm against the input file).

    Returns
    -------
    pandas.DataFrame
        Columns 'chr', 'start', 'end' with a fresh 0..n-1 index. Empty when
        the input has fewer than two rows or no gaps are found.
    """
    columns = ['chr', 'start', 'end']

    # With fewer than two rows there is no pair of neighbours to compare
    if len(df_bad_windows) < 2:
        return pd.DataFrame(columns=columns)

    # Align every row with its successor so they can be compared column-wise
    earlier = df_bad_windows.iloc[:-1].reset_index(drop=True)
    later = df_bad_windows.iloc[1:].reset_index(drop=True)

    # Only pair rows on the same chromosome: comparing the last bad window of
    # one chromosome with the first of the next would invent a window that
    # mixes coordinates from two chromosomes.
    same_chromosome = earlier['chr'] == later['chr']
    has_gap = same_chromosome & (earlier['end'] < later['start'])

    df_good = pd.DataFrame({
        'chr': earlier.loc[has_gap, 'chr'],
        'start': earlier.loc[has_gap, 'end'],
        'end': later.loc[has_gap, 'start'],
    }).reset_index(drop=True)

    return df_good


def create_expanded_df(df, seq_length=None, bin_size=None):
    """Expand each window into one row per bin-shifted start position.

    For every input window the number of start positions is
    ``(end - start - seq_length) // bin_size``; start position ``k`` is
    ``start + k * bin_size``. Windows for which that count is zero or
    negative contribute no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'chr', 'start' and 'end' columns.
    seq_length : int, optional
        Length subtracted from each window before counting start positions.
        When omitted, the notebook-level variable ``seq_length`` is used.
    bin_size : int, optional
        Step between consecutive start positions. When omitted, the
        notebook-level variable ``bin_size`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'chr', 'start', 'end' (copied from the source window) and
        'genome_window_start' (the shifted start position).

    Raises
    ------
    NameError
        If a setting is neither passed in nor defined at notebook level.
    """
    def _resolve(value, name):
        # Explicit arguments win; otherwise fall back to the notebook-level
        # variable so existing one-argument calls keep working.
        if value is not None:
            return value
        try:
            return globals()[name]
        except KeyError:
            raise NameError("name %r is not defined" % name) from None

    seq_length = _resolve(seq_length, 'seq_length')
    bin_size = _resolve(bin_size, 'bin_size')

    # Collect one record per (window, shift) pair and build the frame once
    records = []
    for good_window in df.itertuples(index=False):
        num_jumps = (good_window.end - good_window.start - seq_length) // bin_size

        # range() of a non-positive count is empty, so short windows are skipped
        for bin_shift_index in range(num_jumps):
            records.append((
                good_window.chr,
                good_window.start,
                good_window.end,
                good_window.start + bin_shift_index * bin_size,
            ))

    expanded_df = pd.DataFrame(
        records,
        columns=['chr', 'start', 'end', 'genome_window_start'],
    )

    return expanded_df

### Reading table with gap windows

In [3]:
# Tab-separated table of gap ("bad") windows; it is read without a header
# row and the three columns are named explicitly.
bad_windows_file = '/project/fudenber_735/backup/DNN_HiC/akita_paper_data/akita_mm10/200213_gaps_mm10_binSize2048_numconseq10.bed'
bad_windows = pd.read_csv(
    bad_windows_file,
    sep="\t",
    header=None,
    names=["chr", "start", "end"],
)

### Generating complementary windows (no gaps)

In [4]:
# Windows lying between consecutive gap windows
good_windows = generate_good_windows(bad_windows)

# Restrict to chr1 and keep only its first two good windows
chr1 = good_windows.loc[good_windows["chr"] == "chr1"]
df = chr1.head(2)

In [5]:
# Settings read by create_expanded_df
seq_length = 1_310_720  # equals 640 * 2048, i.e. 640 bins
bin_size = 2_048  # step between consecutive window starts
split = 10  # shifting by 1/10 bin

### Creating table with all the prediction windows

In [6]:
# Expand the selected windows into one row per bin-shifted start position.
# create_expanded_df reads the notebook-level seq_length and bin_size, so the
# parameters cell must have been run before this one.
expanded_df = create_expanded_df(df)
# Optional export of the table as a TSV file (left disabled):
# expanded_df.to_csv('chr1_2windows.tsv', sep="\t", index=False)