In [1]:
import pandas as pd
import numpy as np
import bioframe as bf

### Overlapping the Akita V1 test set with the Akita V2 sequences

In [2]:
# BED files that are read below as (chr, start, stop, type) tables,
# one for each Akita model version.
v1_data_split_path = (
    "/project/fudenber_735/tensorflow_models/akita/v1/data/sequences.bed"
)
v2_data_split_path = (
    "/project/fudenber_735/tensorflow_models/akita/v2/data/hg38/sequences.bed"
)

In [3]:
# Load the V1 windows. The file is parsed as header-less, tab-separated
# records whose columns are named (chr, start, stop, type).
sequences_V1 = pd.read_csv(
    v1_data_split_path,
    sep="\t",
    names=["chr", "start", "stop", "type"],
)

# Keep only the rows whose `type` label is 'test'. A boolean mask is a
# label-style selection, so it goes through .loc rather than .iloc, and the
# row index is renumbered by chaining instead of mutating the filtered frame
# in place (which keeps this cell idempotent on re-run).
sequences_test_V1 = (
    sequences_V1
    .loc[sequences_V1["type"] == "test"]
    .reset_index(drop=True)
)

In [4]:
# Load every V2 window (no filtering on `type` is applied here), using the
# same header-less, tab-separated (chr, start, stop, type) layout as for V1.
sequences_V2 = pd.read_csv(
    v2_data_split_path,
    sep="\t",
    names=["chr", "start", "stop", "type"],
)

In [5]:
# Pair each V1 test window with the V2 windows it overlaps. With
# how="inner" only overlapping pairs are returned; columns coming from the
# two inputs are distinguished by the "_v1" / "_v2" suffixes.
df_overlap = bf.overlap(
    sequences_test_V1,
    sequences_V2,
    how="inner",
    suffixes=("_v1", "_v2"),
    cols1=["chr", "start", "stop"],
    cols2=["chr", "start", "stop"],
)

In [6]:
# Midpoint of each window = start + half of the window length.
v1_half_length = 0.5 * (df_overlap["stop_v1"] - df_overlap["start_v1"])
df_overlap["v1_midpoint"] = df_overlap["start_v1"] + v1_half_length

v2_half_length = 0.5 * (df_overlap["stop_v2"] - df_overlap["start_v2"])
df_overlap["v2_midpoint"] = df_overlap["start_v2"] + v2_half_length

# Absolute distance between the two midpoints; used afterwards to rank the
# V2 candidates of each V1 window.
midpoint_offset = df_overlap["v1_midpoint"] - df_overlap["v2_midpoint"]
df_overlap["midpoint_dist"] = np.abs(midpoint_offset)

In [7]:
# For every V1 window, keep the single overlapping V2 window whose midpoint
# is closest to the V1 midpoint.
v1_window_key = ["chr_v1", "start_v1", "stop_v1"]

# Ascending sort (the default) groups the rows by V1 window and puts the
# candidate with the smallest midpoint distance first within each window.
df_sorted = df_overlap.sort_values(by=v1_window_key + ["midpoint_dist"])

# Keep the first *row* per V1 window. GroupBy.first() is not used here
# because it returns the first non-null value of each column independently,
# so a missing value in the closest match would be silently filled with a
# value taken from a farther V2 window.
df_unique = (
    df_sorted
    .drop_duplicates(subset=v1_window_key, keep="first")
    .reset_index(drop=True)
)

In [8]:
# Drop the "_v1" suffix from the V1 coordinate columns so that the table
# that gets saved uses plain chr / start / stop names.
v1_coordinate_names = {
    "chr_v1": "chr",
    "start_v1": "start",
    "stop_v1": "stop",
}
df_unique = df_unique.rename(columns=v1_coordinate_names)

In [9]:
# Keep only the V1 window coordinates and the `type` label of the V2 window
# that was matched to it.
df_unique = df_unique.loc[:, ["chr", "start", "stop", "type_v2"]]

In [12]:
# df_unique

In [11]:
# df_unique.to_csv("./data/v1_v2_sequences.tsv", sep="\t", index=False, header=True)