In [1]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel

In [2]:
df_merged = pd.read_csv("data/merged_train_set", sep="\t")
df_merged

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0
517421,chrX,151409210,151456968,+,0
517422,chrX,153906577,153906694,-,0
517423,chrX,153906409,153906520,-,0


In [3]:
df_recount3 = pd.read_csv("ftp-data/recount3.bed", sep="\t",
                          names=["Chrom", "Start", "End", "Features", "Score", "Strand"])

df_recount3.set_index(["Strand", "Start", "End"], inplace=True)
df_recount3.sort_index(inplace=True)
df_recount3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Chrom,Features,Score
Strand,Start,End,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
+,12,16498,chrM,GTExv2=15:69;TCGAv2=0:0;SRAv3h=3:7;AT:AC,76
+,15,355,chrUn_KI270303v1,GTExv2=0:0;TCGAv2=0:0;SRAv3h=33:36;GT:AG,36
+,17,65645,chr22_KI270736v1_random,GTExv2=0:0;TCGAv2=0:0;SRAv3h=7:36;GT:AG,36
+,20,608,chrUn_GL000224v1,GTExv2=0:0;TCGAv2=1:1;SRAv3h=27:65;GT:AG,66
+,20,801,chrUn_KI270539v1,GTExv2=0:0;TCGAv2=0:0;SRAv3h=33:35;GT:AG,35
...,...,...,...,...,...
-,248937793,248937887,chr1,GTExv2=3:3;TCGAv2=1:2;SRAv3h=15:16;GT:AG,21
-,248937886,248938073,chr1,GTExv2=2:2;TCGAv2=1:2;SRAv3h=26:27;GT:AG,31
-,248937979,248938073,chr1,GTExv2=4:4;TCGAv2=1:2;SRAv3h=19:20;GT:AG,26
-,248942862,248945072,chr1,GTExv2=2:2;TCGAv2=1:1;SRAv3h=11:29;GT:AG,32


In [4]:
chrom_uniques = list(df_recount3.Chrom.unique())  # we get the list of unique chromosome names
df_chrom = {}

In [5]:
# Splitting the dataframe chromosome groups separately into key(chromosome)-value(rows) in a dictionary
for chrom in tqdm(chrom_uniques):
    df_chrom[chrom] = df_recount3[df_recount3.Chrom == chrom]

df_chrom

100%|██████████| 170/170 [01:21<00:00,  2.09it/s]


{'chrM':                    Chrom                                    Features  Score
 Strand Start End                                                           
 +      12    16498  chrM    GTExv2=15:69;TCGAv2=0:0;SRAv3h=3:7;AT:AC     76
        53    85     chrM    GTExv2=8:8;TCGAv2=2:2;SRAv3h=26:28;GT:AG     38
              124    chrM    GTExv2=1:1;TCGAv2=2:3;SRAv3h=20:28;GT:AG     32
              184    chrM   GTExv2=0:0;TCGAv2=0:0;SRAv3h=21:257;GT:AG    257
              228    chrM    GTExv2=0:0;TCGAv2=0:0;SRAv3h=20:68;GT:AG     68
 ...                  ...                                         ...    ...
 -      16450 16511  chrM    GTExv2=1:1;TCGAv2=5:5;SRAv3h=46:54;GT:AG     60
        16466 16488  chrM  GTExv2=7:8;TCGAv2=10:10;SRAv3h=38:42;GT:AG     60
              16498  chrM    GTExv2=1:2;TCGAv2=4:4;SRAv3h=44:52;GT:AG     58
              16511  chrM    GTExv2=2:2;TCGAv2=4:4;SRAv3h=21:22;GT:AG     28
        16477 16527  chrM     GTExv2=0:0;TCGAv2=0:0;SRAv3h=6:24;GC:A

In [6]:
def match_recount(row):
    try:
        score = df_chrom[row.chr].loc[(row.strand, row.start, row.end), ["Features", "Score"]].values
    except:
        score = [None, -1]
    # print(score)
    return score

In [7]:
pandarallel.initialize(progress_bar=False, nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [8]:
# %%time
# df_test = df_merged.copy().head(10)
# df_test["RC3"] = df_test.apply(match_recount, axis=1)
# df_test

In [9]:
%%time
df_merged["RC3"] = df_merged.parallel_apply(match_recount, axis=1)

CPU times: user 1.26 s, sys: 327 ms, total: 1.58 s
Wall time: 52min 16s


In [10]:
df_merged

Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...
2,chr1,12057,12178,+,1,"[None, -1]"
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]"
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...
...,...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0,"[None, -1]"
517421,chrX,151409210,151456968,+,0,"[None, -1]"
517422,chrX,153906577,153906694,-,0,"[None, -1]"
517423,chrX,153906409,153906520,-,0,"[None, -1]"


In [11]:
# Optional backup:
# df_merged.to_csv("data/3-temp-output.tsv", sep="\t", index=False)
df_merged = pd.read_csv("data/3-temp-output.tsv", sep="\t")
df_merged

Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...
2,chr1,12057,12178,+,1,"[None, -1]"
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]"
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...
...,...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0,"[None, -1]"
517421,chrX,151409210,151456968,+,0,"[None, -1]"
517422,chrX,153906577,153906694,-,0,"[None, -1]"
517423,chrX,153906409,153906520,-,0,"[None, -1]"


# Adding repeat features to the merged train set


In [23]:
def match_repeat_features(row):
    matched_rows = !tabix ftp-data/repeat_features.bed.gz {row.chr}:{row.start}-{row.end} | cut -f4
        # | cut -f4 | sort -u
    return list(matched_rows)

In [24]:
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [22]:
# %%time
# # Test cell
# df_draft = df_merged.copy().head(100)
# df_draft["Repeat_overlap"] = df_draft.apply(match_repeat_features, axis=1)
# df_draft

CPU times: user 67.8 ms, sys: 12.4 ms, total: 80.2 ms
Wall time: 494 ms


Unnamed: 0,chr,start,end,strand,class,RC3,Repeat_overlap
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...,[]
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...,[]
2,chr1,12057,12178,+,1,"[None, -1]",[]
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]",[]
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...,[]
...,...,...,...,...,...,...,...
95,chr1,258567,263014,-,1,"[None, -1]","[Dust, LTRs, Tandem repeats, Type I Transposon..."
96,chr1,268816,289265,-,1,[GTExv2=6857:13834;TCGAv2=2556:3736;SRAv3h=610...,"[Dust, Low complexity regions, LTRs, Simple re..."
97,chr1,289370,297344,-,1,[GTExv2=273:365;TCGAv2=25:34;SRAv3h=1372:1898;...,"[Dust, Low complexity regions, LTRs, Type I Tr..."
98,chr1,267056,268121,+,1,[GTExv2=103:137;TCGAv2=132:206;SRAv3h=455:760;...,"[Dust, LTRs, Type I Transposons/SINE]"


In [25]:
%%time
df_merged["repeat_features"] = df_merged.parallel_apply(match_repeat_features, axis=1)
df_merged

CPU times: user 3.39 s, sys: 593 ms, total: 3.98 s
Wall time: 5min 35s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...,[]
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...,[]
2,chr1,12057,12178,+,1,"[None, -1]",[]
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]",[]
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...,[]
...,...,...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0,"[None, -1]","[Dust, Type I Transposons/SINE, Dust, Tandem r..."
517421,chrX,151409210,151456968,+,0,"[None, -1]","[Type II Transposons, Type I Transposons/SINE,..."
517422,chrX,153906577,153906694,-,0,"[None, -1]",[]
517423,chrX,153906409,153906520,-,0,"[None, -1]",[]


In [26]:
df_merged.to_csv("data/3-final-output.tsv", sep="\t", index=False)