In [2]:
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from tqdm import tqdm

In [3]:
df_gencode = pd.read_csv("data/1-rmv_dup_introns_gencode_v44.tsv", sep="\t")
df_gencode = df_gencode.set_index(['strand', 'start', 'end'])
df_gencode = df_gencode.sort_index()

chrom_uniques = list(df_gencode["chr"].unique())  # we get the list of unique chromosome names
dict_chr = {}
# Splitting the dataframe chromosome groups separately into key(chromosome)-value(rows) in a dictionary
for chrom in tqdm(chrom_uniques):
	dict_chr[chrom] = df_gencode[df_gencode.chr == chrom]

dict_chr

100%|██████████| 24/24 [00:00<00:00, 44.30it/s]


{'chr18':                             chr
 strand start    end            
 +      11595    13151     chr18
                 15616     chr18
        13354    15616     chr18
        45235    45282     chr18
        45556    45640     chr18
 ...                         ...
 -      80160606 80202709  chr18
        80183150 80201963  chr18
                 80202709  chr18
        80202018 80202709  chr18
        80202932 80247276  chr18
 
 [7267 rows x 1 columns],
 'chr3':                              chr
 strand start     end            
 +      11799     14770      chr3
                  20556      chr3
                  23760      chr3
        12659     14770      chr3
                  20890      chr3
 ...                          ...
 -      198121656 198122027  chr3
        198122079 198122536  chr3
        198122645 198122729  chr3
        198122984 198123051  chr3
        198224435 198224520  chr3
 
 [24732 rows x 1 columns],
 'chr16':                             chr
 strand start

In [10]:
# Sample query: Find the nearest entry of chr1 with strand '+' and nearest to 13579
# query_chr = 'chr1'
# query_strand = '-'
# query_position = 14499

def find_nearest_match_start_ss(row, site):
    # print(row)
    query_chr = row["chr"]
    query_strand = row["strand"]
    query_position = row[site]

    # Filter the DataFrame for the relevant entries
    filtered_df = dict_chr[query_chr].loc[(query_strand)].copy()


    # Calculate the distance to the specified position
    filtered_df['distance'] = np.abs(filtered_df.index.get_level_values(site) - query_position)

    # Exclude exact match from the DataFrame (if it exists)
    filtered_df = filtered_df[filtered_df['distance'] != 0]

    # Find the row with the minimum distance (i.e., the nearest entry after excluding exact matches)
    nearest_entry = filtered_df.loc[filtered_df['distance'].idxmin()]

    return nearest_entry.distance

In [11]:
df_training_data = pd.read_csv("data/4.2_hg38_paired_introns.tsv", sep="\t", usecols=["chr","start","end","strand"])
df_training_data

Unnamed: 0,chr,start,end,strand
0,chr1,12227,12612,+
1,chr1,12721,13220,+
2,chr1,12057,12178,+
3,chr1,12697,12974,+
4,chr1,13052,13220,+
...,...,...,...,...
499138,chrY,24883840,24886132,+
499139,chrY,24888605,24889352,+
499140,chrY,24889386,24901111,+
499141,chrY,24833970,24840730,+


In [12]:
pandarallel.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [13]:
%%time

df_test = df_training_data.copy().head(10000)
df_test["start_site_nearest"] = df_test.parallel_apply(find_nearest_match_start_ss, args=("start",), axis=1)
df_test

CPU times: user 32.8 ms, sys: 46.4 ms, total: 79.2 ms
Wall time: 5.66 s


Unnamed: 0,chr,start,end,strand,start_site_nearest
0,chr1,12227,12612,+,170
1,chr1,12721,13220,+,24
2,chr1,12057,12178,+,170
3,chr1,12697,12974,+,24
4,chr1,13052,13220,+,322
...,...,...,...,...,...
9995,chr1,41848445,41848870,-,644
9996,chr1,41849089,41918412,-,644
9997,chr1,41918524,42035806,-,52909
9998,chr1,41849089,41864577,-,644


In [14]:
%%time
df_training_data["nearest_start_ss_dist"] = df_training_data.parallel_apply(find_nearest_match_start_ss, args=("start",), axis=1)

CPU times: user 95 ms, sys: 73 ms, total: 168 ms
Wall time: 3min 36s


In [15]:
df_training_data

Unnamed: 0,chr,start,end,strand,nearest_start_ss_dist
0,chr1,12227,12612,+,170
1,chr1,12721,13220,+,24
2,chr1,12057,12178,+,170
3,chr1,12697,12974,+,24
4,chr1,13052,13220,+,322
...,...,...,...,...,...
499138,chrY,24883840,24886132,+,5
499139,chrY,24888605,24889352,+,2384
499140,chrY,24889386,24901111,+,781
499141,chrY,24833970,24840730,+,159


In [16]:
%%time
df_training_data["nearest_end_ss_dist"] = df_training_data.parallel_apply(find_nearest_match_start_ss, args=("end",), axis=1)

CPU times: user 102 ms, sys: 99.5 ms, total: 202 ms
Wall time: 3min 26s


In [17]:
df_training_data

Unnamed: 0,chr,start,end,strand,nearest_start_ss_dist,nearest_end_ss_dist
0,chr1,12227,12612,+,170,362
1,chr1,12721,13220,+,24,232
2,chr1,12057,12178,+,170,434
3,chr1,12697,12974,+,24,246
4,chr1,13052,13220,+,322,232
...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,5,16
499139,chrY,24888605,24889352,+,2384,819
499140,chrY,24889386,24901111,+,781,4072
499141,chrY,24833970,24840730,+,159,421


In [18]:
df_training_data.to_csv("data/4.4-nearest_ss_distance.tsv", sep="\t", index=False)