In [1]:
import pandas as pd
import numpy as np
import bioframe as bf

## Overlapping Akita V1 and Akita V2 test sets

In [2]:
# Location of the Akita V1 sequences table. It is read below as a tab-separated
# file with columns chr, start, stop, type (the "type" column holds the split
# label, e.g. "test").
# NOTE(review): absolute cluster path -- the notebook only runs where this mount exists.
v1_data_split_path = "/project/fudenber_735/tensorflow_models/akita/v1/data/sequences.bed"

In [3]:
# Location of the Akita V2 (hg38) sequences table. It is read below as a
# tab-separated file with columns chr, start, stop, type.
# NOTE(review): absolute cluster path -- the notebook only runs where this mount exists.
v2_data_split_path = "/project/fudenber_735/tensorflow_models/akita/v2/data/hg38/sequences.bed"

In [4]:
# Load every Akita V1 window (the file has no header row, so column names are
# supplied here), then keep only the rows labelled "test" and renumber them from 0.
sequences_V1 = pd.read_csv(
    v1_data_split_path,
    sep="\t",
    names=["chr", "start", "stop", "type"],
)
sequences_test_V1 = sequences_V1.loc[
    sequences_V1["type"].to_numpy() == "test"
].reset_index(drop=True)

In [5]:
# Load all Akita V2 windows (every split, not only the test rows); the file has
# no header row, so column names are supplied here.
sequences_V2 = pd.read_csv(
    v2_data_split_path,
    sep="\t",
    names=["chr", "start", "stop", "type"],
)

In [6]:
# Pair each V1 test window with the V2 windows it overlaps. how="inner" is an
# inner join on overlap, and the output columns carry the "_v1" / "_v2"
# suffixes (chr_v1, start_v1, ..., type_v2) used by the following cells.
df_overlap = bf.overlap(
    sequences_test_V1,
    sequences_V2,
    how="inner",
    suffixes=("_v1", "_v2"),
    cols1=["chr", "start", "stop"],
    cols2=["chr", "start", "stop"],
)

In [7]:
# # Function to check if all values in "type_v2" for each "start_v1" are the same
# def check_uniformity(group):
#     return group['type_v2'].nunique() == 1

In [8]:
# # Grouping by "start_v1" and applying the check
# uniformity_check = df_overlap.groupby('start_v1').apply(check_uniformity)

In [9]:
# -> Not every start_v1 maps to a single type_v2 value; therefore, for each V1 test window, I pick the closest overlapping V2 window (smallest midpoint distance) and use its type_v2.

In [10]:
# Distance between window centres: for every overlapping (V1, V2) pair, compute
# each window's midpoint as start + half its length, then the absolute gap
# between the two midpoints. The three columns are added to df_overlap in place.
df_overlap["v1_midpoint"] = df_overlap["start_v1"] + (df_overlap["stop_v1"] - df_overlap["start_v1"]) * 0.5
df_overlap["v2_midpoint"] = df_overlap["start_v2"] + (df_overlap["stop_v2"] - df_overlap["start_v2"]) * 0.5
df_overlap["midpoint_dist"] = (df_overlap["v1_midpoint"] - df_overlap["v2_midpoint"]).abs()

In [11]:
# For every V1 test window keep exactly one row: the overlapping V2 window whose
# midpoint is nearest. Sorting puts the smallest midpoint_dist first within each
# (chr_v1, start_v1, stop_v1) group, so the first row of each group is the
# closest V2 window.
df_sorted = df_overlap.sort_values(
    by=["chr_v1", "start_v1", "stop_v1", "midpoint_dist"],
    ascending=True,
)
# drop_duplicates(keep="first") retains that first row as a whole.
# groupby(...).first() is deliberately not used: it takes the first non-null
# value of each column independently, so a missing value in the closest row
# would silently be filled from a different V2 window.
df_unique = df_sorted.drop_duplicates(
    subset=["chr_v1", "start_v1", "stop_v1"],
    keep="first",
).reset_index(drop=True)

In [12]:
# Strip the "_v1" suffix from the V1 coordinate columns so the exported table
# uses plain chr / start / stop headers.
df_unique = df_unique.rename(
    columns={"chr_v1": "chr", "start_v1": "start", "stop_v1": "stop"}
)

In [13]:
# Keep only the V1 window coordinates and the split label of the matched V2 window.
df_unique = df_unique.loc[:, ["chr", "start", "stop", "type_v2"]]

In [14]:
# df_unique.to_csv("v1_v2_sequences.tsv", sep="\t", index=False, header=True)