In [1]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel

In [10]:
df_merged = pd.read_csv("data/merged_train_set", sep="\t")
df_merged

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
517420,chrX,151403680,151404937,+,0
517421,chrX,151409211,151456968,+,0
517422,chrX,153906578,153906694,-,0
517423,chrX,153906410,153906520,-,0


In [3]:
df_recount3 = pd.read_csv("ftp-data/recount3.bed", sep="\t",
                          names=["Chrom", "Start", "End", "Features", "Score", "Strand"])

df_recount3.set_index(["Strand", "Start", "End"], inplace=True)
df_recount3.sort_index(inplace=True)
df_recount3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Chrom,Features,Score
Strand,Start,End,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
+,12,16498,chrM,GTExv2=15:69;TCGAv2=0:0;SRAv3h=3:7;AT:AC,76
+,15,355,chrUn_KI270303v1,GTExv2=0:0;TCGAv2=0:0;SRAv3h=33:36;GT:AG,36
+,17,65645,chr22_KI270736v1_random,GTExv2=0:0;TCGAv2=0:0;SRAv3h=7:36;GT:AG,36
+,20,608,chrUn_GL000224v1,GTExv2=0:0;TCGAv2=1:1;SRAv3h=27:65;GT:AG,66
+,20,801,chrUn_KI270539v1,GTExv2=0:0;TCGAv2=0:0;SRAv3h=33:35;GT:AG,35
...,...,...,...,...,...
-,248937793,248937887,chr1,GTExv2=3:3;TCGAv2=1:2;SRAv3h=15:16;GT:AG,21
-,248937886,248938073,chr1,GTExv2=2:2;TCGAv2=1:2;SRAv3h=26:27;GT:AG,31
-,248937979,248938073,chr1,GTExv2=4:4;TCGAv2=1:2;SRAv3h=19:20;GT:AG,26
-,248942862,248945072,chr1,GTExv2=2:2;TCGAv2=1:1;SRAv3h=11:29;GT:AG,32


In [4]:
chrom_uniques = list(df_recount3.Chrom.unique())  # we get the list of unique chromosome names
df_chrom = {}

In [None]:
# Splitting the dataframe chromosome groups separately into key(chromosome)-value(rows) in a dictionary
for chrom in tqdm(chrom_uniques):
    df_chrom[chrom] = df_recount3[df_recount3.Chrom == chrom]

df_chrom

In [25]:
def match_recount(row):
    try:
        score = df_chrom[row.chr].loc[(row.strand, row.start, row.end), ["Features", "Score"]].values
    except:
        score = [None, -1]
    # print(score)
    return score

In [26]:
pandarallel.initialize(progress_bar=False, nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [23]:
# %%time
# df_test = df_merged.copy().head(10)
# df_test["RC3"] = df_test.apply(match_recount, axis=1)
# df_test

CPU times: user 259 ms, sys: 11.7 ms, total: 271 ms
Wall time: 273 ms


Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...
2,chr1,12057,12178,+,1,"[None, -1]"
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]"
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...
5,chr1,13374,13452,+,1,"[None, -1]"
6,chr1,13670,14403,-,1,"[None, -1]"
7,chr1,14501,15004,-,1,"[None, -1]"
8,chr1,15038,15795,-,1,[GTExv2=19076:1598588;TCGAv2=11350:748204;SRAv...
9,chr1,15947,16606,-,1,[GTExv2=19058:612687;TCGAv2=11302:480455;SRAv3...


In [27]:
%%time
df_merged["RC3"] = df_merged.parallel_apply(match_recount, axis=1)

CPU times: user 1.54 s, sys: 294 ms, total: 1.83 s
Wall time: 57min 18s


In [30]:
df_merged

Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...
2,chr1,12057,12178,+,1,"[None, -1]"
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]"
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...
...,...,...,...,...,...,...
517420,chrX,151403680,151404937,+,0,"[None, -1]"
517421,chrX,151409211,151456968,+,0,"[None, -1]"
517422,chrX,153906578,153906694,-,0,"[None, -1]"
517423,chrX,153906410,153906520,-,0,"[None, -1]"


In [2]:
# Optional backup:
# df_merged = pd.read_csv("data/3-temp-output", sep="\t")
# df_merged.to_csv("modded_recount3_inc", sep="\t", index=False)
# df_merged

Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,['GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374...
1,chr1,12721,13220,+,1,['GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=1404...
2,chr1,12057,12178,+,1,"[None, -1]"
3,chr1,12697,12974,+,1,['GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG' 21]
4,chr1,13052,13220,+,1,['GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:...
...,...,...,...,...,...,...
517420,chrX,151403680,151404937,+,0,"[None, -1]"
517421,chrX,151409211,151456968,+,0,"[None, -1]"
517422,chrX,153906578,153906694,-,0,"[None, -1]"
517423,chrX,153906410,153906520,-,0,"[None, -1]"


# Adding repeat features to the merged train set


In [4]:
def match_repeat_features(row):
    matched_rows = !tabix ftp-data / repeat_features.bed.gz {row.chrom}:{row.intron_start}-{row.intron_ends} | cut -f4 | sort -u
    return list(matched_rows)

In [5]:
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Test cell
# %%time
# df_draft = df_merged.copy().head(50)
# df_draft["Repeat_overlap"] = df_draft.parallel_apply(match_repeat_features, axis=1)
# df_draft

In [11]:
%%time
df_merged["repeat_features"] = df_merged.parallel_apply(match_repeat_features, axis=1)
df_merged

CPU times: user 300 ms, sys: 160 ms, total: 460 ms
Wall time: 2min 13s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features
0,chr1,12227,12612,+,1,['GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374...,[[tabix] the index file either does not exist ...
1,chr1,12721,13220,+,1,['GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=1404...,[[tabix] the index file either does not exist ...
2,chr1,12057,12178,+,1,"[None, -1]",[[tabix] the index file either does not exist ...
3,chr1,12697,12974,+,1,['GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG' 21],[[tabix] the index file either does not exist ...
4,chr1,13052,13220,+,1,['GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:...,[[tabix] the index file either does not exist ...
...,...,...,...,...,...,...,...
517420,chrX,151403680,151404937,+,0,"[None, -1]",[[tabix] the index file either does not exist ...
517421,chrX,151409211,151456968,+,0,"[None, -1]",[[tabix] the index file either does not exist ...
517422,chrX,153906578,153906694,-,0,"[None, -1]",[[tabix] the index file either does not exist ...
517423,chrX,153906410,153906520,-,0,"[None, -1]",[[tabix] the index file either does not exist ...


In [12]:
df_merged.to_csv("data/3-output.tsv", sep="\t", index=False)