In [1]:
import subprocess

import pandas as pd
from pandarallel import pandarallel

In [2]:
# Read the tab-separated train set produced by the previous step and show it.
df_trainset = pd.read_csv(
    "data/04_train_set.tsv",
    sep="\t",
)
df_trainset

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0
519030,chrY,25513173,25513588,-,0
519031,chrY,25513745,25516715,-,0
519032,chrY,25525288,25527646,-,0


# Generating repeat features
We generate the repeat features for the start splice site (donor) and the end splice site (acceptor) separately; in the final script the two steps will be merged to reduce run time.


In [3]:
def match_repeat_features_start_ss(row):
    """
    Return the repeat features that overlap the start splice site of one intron.

    Runs ``tabix`` on ``data/resources/repeat_features.bed.gz`` for the window
    ``{row.chr}:{row.start-2}-{row.start+2}`` -- the splice site itself plus the
    2 nt that precede it (in the exon) -- and collects the 4th tab-separated
    column of every record tabix reports for that window.

    Parameters
    ----------
    row : object exposing ``chr`` and ``start`` attributes
        One intron entry, e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    list of str
        The distinct 4th-column values, sorted. Sorting keeps the result (and any
        file written from it) identical between runs; a bare ``list(set(...))``
        has an order that depends on the interpreter's string-hash seed.
        The list is empty when tabix reports no record.

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status, so that a failed lookup is not
        silently recorded as a feature list.
    FileNotFoundError
        If the ``tabix`` executable cannot be found.
    """
    region = f"{row.chr}:{row.start-2}-{row.start+2}"
    # Argument list instead of a shell string: values coming from the table are
    # never interpreted by a shell, stderr is kept apart from the results, and
    # the function no longer needs an IPython kernel to run.
    completed = subprocess.run(
        ["tabix", "data/resources/repeat_features.bed.gz", region],
        capture_output=True,
        text=True,
        check=True,
    )
    features = set()
    for line in completed.stdout.splitlines():
        fields = line.split("\t")
        if len(fields) == 1:
            # Mirrors `cut -f4`: a line without any tab is passed through whole.
            features.add(line)
        else:
            # Mirrors `cut -f4`: a missing 4th field yields an empty string.
            features.add(fields[3] if len(fields) > 3 else "")
    return sorted(features)

In [4]:
def match_repeat_features_end_ss(row):
    """
    Return the repeat features that overlap the end splice site of one intron.

    Runs ``tabix`` on ``data/resources/repeat_features.bed.gz`` for the window
    ``{row.chr}:{row.end-2}-{row.end+2}`` -- the splice site itself plus the
    2 nt that follow it (in the exon) -- and collects the 4th tab-separated
    column of every record tabix reports for that window.

    Parameters
    ----------
    row : object exposing ``chr`` and ``end`` attributes
        One intron entry, e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    list of str
        The distinct 4th-column values, sorted. Sorting keeps the result (and any
        file written from it) identical between runs; a bare ``list(set(...))``
        has an order that depends on the interpreter's string-hash seed.
        The list is empty when tabix reports no record.

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status, so that a failed lookup is not
        silently recorded as a feature list.
    FileNotFoundError
        If the ``tabix`` executable cannot be found.
    """
    region = f"{row.chr}:{row.end-2}-{row.end+2}"
    # Argument list instead of a shell string: values coming from the table are
    # never interpreted by a shell, stderr is kept apart from the results, and
    # the function no longer needs an IPython kernel to run.
    completed = subprocess.run(
        ["tabix", "data/resources/repeat_features.bed.gz", region],
        capture_output=True,
        text=True,
        check=True,
    )
    features = set()
    for line in completed.stdout.splitlines():
        fields = line.split("\t")
        if len(fields) == 1:
            # Mirrors `cut -f4`: a line without any tab is passed through whole.
            features.add(line)
        else:
            # Mirrors `cut -f4`: a missing 4th field yields an empty string.
            features.add(fields[3] if len(fields) > 3 else "")
    return sorted(features)

In [5]:
# Start the pandarallel worker pool used by the parallel_apply cells below.
pandarallel.initialize(
    nb_workers=12,
    progress_bar=False,
)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# %%time
# NOTE: disabled smoke test, kept for reference. When uncommented it runs the
# start-site lookup serially (plain `apply`) on a copy of the first 20 rows.
# # Test cell
# df_draft = df_trainset.copy().head(20)
# df_draft["Repeat_overlap"] = df_draft.apply(match_repeat_features_start_ss, axis=1)
# df_draft

In [6]:
%%time
# Call match_repeat_features_start_ss once per row through pandarallel and store
# the returned list in a new column. Each call launches an external tabix lookup,
# so this cell is slow (the recorded run took close to six minutes wall time).
df_trainset["repeat_features_start_site"] = df_trainset.parallel_apply(match_repeat_features_start_ss, axis=1)
df_trainset

CPU times: user 186 ms, sys: 57.6 ms, total: 244 ms
Wall time: 5min 48s


Unnamed: 0,chr,start,end,strand,class,repeat_features_start_site
0,chr1,12227,12612,+,1,[]
1,chr1,12721,13220,+,1,[]
2,chr1,12057,12178,+,1,[]
3,chr1,12697,12974,+,1,[]
4,chr1,13052,13220,+,1,[]
...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,[]
519030,chrY,25513173,25513588,-,0,[]
519031,chrY,25513745,25516715,-,0,[]
519032,chrY,25525288,25527646,-,0,[]


In [7]:
%%time
# Call match_repeat_features_end_ss once per row through pandarallel and store
# the returned list in a new column. Each call launches an external tabix lookup,
# so this cell is slow (the recorded run took close to six minutes wall time).
df_trainset["repeat_features_end_site"] = df_trainset.parallel_apply(match_repeat_features_end_ss, axis=1)
df_trainset

CPU times: user 193 ms, sys: 79.9 ms, total: 273 ms
Wall time: 5min 47s


Unnamed: 0,chr,start,end,strand,class,repeat_features_start_site,repeat_features_end_site
0,chr1,12227,12612,+,1,[],[]
1,chr1,12721,13220,+,1,[],[]
2,chr1,12057,12178,+,1,[],[]
3,chr1,12697,12974,+,1,[],[]
4,chr1,13052,13220,+,1,[],[]
...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,[],[]
519030,chrY,25513173,25513588,-,0,[],[]
519031,chrY,25513745,25516715,-,0,[],[]
519032,chrY,25525288,25527646,-,0,[],[]


In [8]:
# Write the annotated train set as a tab-separated file without the pandas index.
# NOTE(review): the two repeat-feature columns hold Python lists, which end up in
# the file as their string form (e.g. "[]") -- readers must parse them back.
df_trainset.to_csv(
    "data/06_trainset_repeat_feature.tsv",
    sep="\t",
    index=False,
)

In [9]:
# Print the SHA-512 digest of the exported table (presumably so a later run can
# be checked against the value recorded in this cell's output).
!sha512sum data/06_trainset_repeat_feature.tsv

a2669a29d3663380ef41fcb5c8efe2ca8f1b2c9a2723f573af092ef1c0bc75ed6d2ba328333114a144c90902997f97a54e51cff672f875426ecff3fae6b7ab58  data/06_trainset_repeat_feature.tsv
