In [1]:
import pandas as pd
from pandarallel import pandarallel

In [2]:
# Load only the interval coordinates, strand and recount3 score columns.
feature_columns = ["chr", "start", "end", "strand", "recount3_score"]
df = pd.read_csv(
    "data/05_trainset_recount3_feature.tsv",
    sep="\t",
    usecols=feature_columns,
)
df

Unnamed: 0,chr,start,end,strand,recount3_score
0,chr1,12227,12612,+,59151
1,chr1,12721,13220,+,61021
2,chr1,12057,12178,+,0
3,chr1,12697,12974,+,21
4,chr1,13052,13220,+,526
...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0
519030,chrY,25513173,25513588,-,31374
519031,chrY,25513745,25516715,-,67
519032,chrY,25525288,25527646,-,18958


In [3]:
# Number of positions on either side of a splice-site coordinate that the
# recount3 tabix queries search, i.e. the window is [pos - gap, pos + gap].
gap = 5

# Start splice site (5')

In [4]:
def better_rc3_match_start_ss(row):
    """Tell whether recount3 holds a higher-scoring entry near ``row``'s start.

    Runs ``tabix`` on ``data/resources/recount3.bed.gz`` for the region
    ``chr:(start - gap)-(start + gap)`` and looks at columns 2 and 5 of each
    returned record. ``gap`` is the module-level window half-width.

    Parameters
    ----------
    row : pandas.Series
        One row of the feature table; must provide ``chr``, ``start`` and
        ``recount3_score``.

    Returns
    -------
    bool
        True if at least one returned record has column 2 within ``gap`` of
        ``row["start"]`` and column 5 strictly greater than
        ``row["recount3_score"]``; False otherwise (including when tabix
        returns nothing).

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status.
    """
    # Local import keeps the function self-contained when it is shipped to
    # parallel workers.
    import subprocess

    start = row["start"]
    lower, upper = start - gap, start + gap

    # Argument list instead of a shell string: row values are never
    # interpreted by a shell, and no extra `cut` process is needed.
    result = subprocess.run(
        ["tabix", "data/resources/recount3.bed.gz", f"{row['chr']}:{lower}-{upper}"],
        capture_output=True,
        text=True,
        check=True,
    )

    for line in result.stdout.splitlines():
        fields = line.split("\t")
        # tabix returns every record overlapping the region; keep only those
        # whose own start (column 2) lies inside the same +/- gap window.
        if not (lower <= int(fields[1]) <= upper):
            continue
        if int(fields[4]) > row["recount3_score"]:
            return True
    return False

In [5]:
# Initialize pandarallel with 12 workers for the parallel_apply calls in later cells.
pandarallel.initialize(nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
%%time
df_test = df.copy().head(10000)
df_test["better_match_5"] = df_test.parallel_apply(better_rc3_match_start_ss, axis=1)
df_test

CPU times: user 0 ns, sys: 58 ms, total: 58 ms
Wall time: 6.67 s


Unnamed: 0,chr,start,end,strand,recount3_score,better_match_5
0,chr1,12227,12612,+,59151,False
1,chr1,12721,13220,+,61021,True
2,chr1,12057,12178,+,0,False
3,chr1,12697,12974,+,21,True
4,chr1,13052,13220,+,526,False
...,...,...,...,...,...,...
9995,chr1,41848445,41848870,-,881344,False
9996,chr1,41849089,41918412,-,188405,False
9997,chr1,41918524,42035806,-,96929,False
9998,chr1,41849089,41864577,-,61385,True


In [7]:
%%time
df["recount3_near_start_ss_with_better_score"] = df.parallel_apply(better_rc3_match_start_ss, axis=1)
df

CPU times: user 61.3 ms, sys: 42.8 ms, total: 104 ms
Wall time: 5min 14s


Unnamed: 0,chr,start,end,strand,recount3_score,recount3_near_start_ss_with_better_score
0,chr1,12227,12612,+,59151,False
1,chr1,12721,13220,+,61021,True
2,chr1,12057,12178,+,0,False
3,chr1,12697,12974,+,21,True
4,chr1,13052,13220,+,526,False
...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,False
519030,chrY,25513173,25513588,-,31374,False
519031,chrY,25513745,25516715,-,67,True
519032,chrY,25525288,25527646,-,18958,False


# End splice site (3')

In [8]:
def better_rc3_match_end_ss(row):
    """Tell whether recount3 holds a higher-scoring entry near ``row``'s end.

    Runs ``tabix`` on ``data/resources/recount3.bed.gz`` for the region
    ``chr:(end - gap)-(end + gap)`` and looks at columns 2 and 5 of each
    returned record. ``gap`` is the module-level window half-width.

    Parameters
    ----------
    row : pandas.Series
        One row of the feature table; must provide ``chr``, ``end`` and
        ``recount3_score``.

    Returns
    -------
    bool
        True if at least one returned record has column 2 within ``gap`` of
        ``row["end"]`` and column 5 strictly greater than
        ``row["recount3_score"]``; False otherwise (including when tabix
        returns nothing).

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status.
    """
    # Local import keeps the function self-contained when it is shipped to
    # parallel workers.
    import subprocess

    end = row["end"]
    lower, upper = end - gap, end + gap

    # Argument list instead of a shell string: row values are never
    # interpreted by a shell, and no extra `cut` process is needed.
    result = subprocess.run(
        ["tabix", "data/resources/recount3.bed.gz", f"{row['chr']}:{lower}-{upper}"],
        capture_output=True,
        text=True,
        check=True,
    )

    for line in result.stdout.splitlines():
        fields = line.split("\t")
        # tabix returns every record overlapping the region; keep only those
        # whose column 2 lies inside the same +/- gap window around the end.
        if not (lower <= int(fields[1]) <= upper):
            continue
        if int(fields[4]) > row["recount3_score"]:
            return True
    return False

In [9]:
%%time
df["recount3_near_end_ss_with_better_score"] = df.parallel_apply(better_rc3_match_end_ss, axis=1)
df

CPU times: user 20 ms, sys: 49.4 ms, total: 69.4 ms
Wall time: 5min 11s


Unnamed: 0,chr,start,end,strand,recount3_score,recount3_near_start_ss_with_better_score,recount3_near_end_ss_with_better_score
0,chr1,12227,12612,+,59151,False,False
1,chr1,12721,13220,+,61021,True,False
2,chr1,12057,12178,+,0,False,False
3,chr1,12697,12974,+,21,True,False
4,chr1,13052,13220,+,526,False,False
...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,False,False
519030,chrY,25513173,25513588,-,31374,False,False
519031,chrY,25513745,25516715,-,67,True,False
519032,chrY,25525288,25527646,-,18958,False,False


In [11]:
# Write the feature table, including both new flag columns, as a TSV without the index.
output_path = "data/13_recount3_near_ss_with_better_score_feature.tsv"
df.to_csv(output_path, sep="\t", index=False)