In [1]:
import hashlib

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm

In [2]:
# Load the GENCODE intron table, discard introns whose span (end - start) is
# 4 or less, then index by (strand, start, end) in sorted order.
df_gencode = (
    pd.read_csv("data/01_gencode_introns.tsv", sep="\t")
    .loc[lambda introns: ~((introns["end"] - introns["start"]) <= 4)]
    .set_index(["strand", "start", "end"])
    .sort_index()
)
df_gencode

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,chr
strand,start,end,Unnamed: 3_level_1
+,11595,13151,chr18
+,11595,15616,chr18
+,11799,14770,chr3
+,11799,20556,chr3
+,11799,23760,chr3
...,...,...,...
-,248858321,248858511,chr1
-,248858321,248858917,chr1
-,248858321,248859014,chr1
-,248858385,248858917,chr1


In [3]:
# Distinct chromosome names, in order of first appearance in the intron table.
chrom_uniques = df_gencode["chr"].unique().tolist()

# Map each chromosome name to the subset of intron rows located on it, so a
# per-row lookup only has to scan one chromosome's introns.
dict_chr = {
    chrom_name: df_gencode.loc[df_gencode["chr"] == chrom_name]
    for chrom_name in tqdm(chrom_uniques)
}

dict_chr

100%|██████████| 24/24 [00:00<00:00, 74.00it/s]


{'chr18':                             chr
 strand start    end            
 +      11595    13151     chr18
                 15616     chr18
        13354    15616     chr18
        45235    45282     chr18
        45556    45640     chr18
 ...                         ...
 -      80160606 80202709  chr18
        80183150 80201963  chr18
                 80202709  chr18
        80202018 80202709  chr18
        80202932 80247276  chr18
 
 [7267 rows x 1 columns],
 'chr3':                              chr
 strand start     end            
 +      11799     14770      chr3
                  20556      chr3
                  23760      chr3
        12659     14770      chr3
                  20890      chr3
 ...                          ...
 -      198121656 198122027  chr3
        198122079 198122536  chr3
        198122645 198122729  chr3
        198122984 198123051  chr3
        198224435 198224520  chr3
 
 [24731 rows x 1 columns],
 'chr16':                             chr
 strand start

In [4]:
# Sample query: find the nearest (non-exact) annotated entry on chr1, strand '-', to position 14499
# query_chr = 'chr1'
# query_strand = '-'
# query_position = 14499

def find_nearest_match_start_ss(row, site):
    """Return the distance from a row's splice site to the nearest other annotated site.

    Looks up the introns stored in the module-level ``dict_chr`` for the row's
    chromosome and strand, and measures how far every annotated ``site``
    coordinate lies from the row's own ``site`` coordinate. Annotated
    coordinates identical to the query (distance 0) are ignored.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series``)
        Must provide ``"chr"``, ``"strand"`` and the key named by ``site``.
    site : str
        Name of the index level in the ``dict_chr`` frames to compare against,
        and of the field in ``row`` holding the query coordinate
        (``"start"`` or ``"end"`` in this notebook).

    Returns
    -------
    scalar
        The smallest non-zero absolute distance, or ``numpy.nan`` when the
        chromosome/strand has no annotated coordinate that differs from the
        query.

    Raises
    ------
    KeyError
        If the chromosome is not a key of ``dict_chr`` or the strand is not
        present in that chromosome's index.
    """
    # Selecting on the first index level leaves a frame indexed by (start, end).
    strand_introns = dict_chr[row["chr"]].loc[row["strand"]]

    # Work on a plain array: no per-row DataFrame copy or extra column needed.
    annotated_positions = strand_introns.index.get_level_values(site).to_numpy()
    distances = np.abs(annotated_positions - row[site])

    # Drop exact matches so the query site is never reported as its own neighbour.
    alt_distances = distances[distances != 0]
    if alt_distances.size == 0:
        return np.nan

    # Take the minimum directly instead of looking the row up again by label:
    # a label lookup returns several rows when (start, end) is duplicated in the
    # annotation, which would turn the result into a Series instead of a scalar.
    return alt_distances.min()

In [5]:
# Verify that the training set is byte-for-byte the expected file before
# loading it. The digest is computed with hashlib instead of shelling out to
# `sha512sum`, so the check does not depend on that command being installed.
hasher = hashlib.sha512()
with open("data/08_trainset_with_seq.tsv", "rb") as trainset_file:
    # Hash in 1 MiB chunks so the whole file never has to be held in memory.
    for chunk in iter(lambda: trainset_file.read(1 << 20), b""):
        hasher.update(chunk)
sha512sum = hasher.hexdigest()
assert sha512sum == "159da72ef546de9f15bfc662d51b70987134035499c64674f4240279ed9d120d34b0312bf98f1cda8c4d43222df7175f818c6d53c9dde2487ea7cf747b51ab05", (
    f"unexpected SHA-512 for data/08_trainset_with_seq.tsv: {sha512sum}"
)
# Read only the chr/start/end/strand columns of the training set.
df_training_data = pd.read_csv(
    "data/08_trainset_with_seq.tsv",
    sep="\t",
    usecols=["chr", "start", "end", "strand"],
)
df_training_data

Unnamed: 0,chr,start,end,strand
0,chr1,12227,12612,+
1,chr1,12721,13220,+
2,chr1,12057,12178,+
3,chr1,12697,12974,+
4,chr1,13052,13220,+
...,...,...,...,...
519029,chrY,25464577,25465486,+
519030,chrY,25513173,25513588,-
519031,chrY,25513745,25516715,-
519032,chrY,25525288,25527646,-


In [6]:
# Configure pandarallel to use 12 workers before the `parallel_apply` calls in later cells.
# NOTE(review): the worker count is hard-coded — confirm it suits the machine running this notebook.
pandarallel.initialize(nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [7]:
# %%time
# df_test = df_training_data.copy().head(10000)
# df_test["start_site_nearest"] = df_test.parallel_apply(find_nearest_match_start_ss, args=("start",), axis=1)
# df_test

In [8]:
%%time
df_training_data["nearest_alt_start_ss_dist"] = df_training_data.parallel_apply(find_nearest_match_start_ss, args=("start",), axis=1)

CPU times: user 35.2 ms, sys: 33.1 ms, total: 68.3 ms
Wall time: 58.2 s


In [9]:
%%time
df_training_data["nearest_alt_end_ss_dist"] = df_training_data.parallel_apply(find_nearest_match_start_ss, args=("end",), axis=1)

CPU times: user 38.6 ms, sys: 30.3 ms, total: 68.9 ms
Wall time: 56 s


In [10]:
# Display the training frame, now including the two nearest-alternative-site distance columns.
df_training_data

Unnamed: 0,chr,start,end,strand,nearest_alt_start_ss_dist,nearest_alt_end_ss_dist
0,chr1,12227,12612,+,170,362
1,chr1,12721,13220,+,24,232
2,chr1,12057,12178,+,170,434
3,chr1,12697,12974,+,24,246
4,chr1,13052,13220,+,322,232
...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,487,25
519030,chrY,25513173,25513588,-,777,464
519031,chrY,25513745,25516715,-,572,2963
519032,chrY,25525288,25527646,-,1178,2


In [12]:
# Write the training rows with the two distance columns as tab-separated text, without the index.
df_training_data.to_csv("data/09_trainset_nearest_alt_ss_dist_feature", sep="\t", index=False)

In [13]:
# Print the SHA-512 of the feature file written above so the output can be checked later.
!sha512sum data/09_trainset_nearest_alt_ss_dist_feature

9e40b0835c11b7b825e45e01f6a0daa675fa5f40cf5a991b1f266910bb5b5767bc0ae0f6f2eeab0e5d7b6496c9b5a733ca46c78ac63fadfea41177a857d5e11d  data/09_trainset_nearest_alt_ss_dist_feature
