In [2]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel

In [None]:
# Load the merged training set (tab-separated) and show it.
df_merged = pd.read_table("data/merged_train_set")
df_merged

In [None]:
# Read the recount3 BED file with explicit column names, then index it by
# (Strand, Start, End) and sort that index so later tuple lookups with .loc
# operate on a sorted MultiIndex.
df_recount3 = (
    pd.read_table(
        "ftp-data/recount3.bed",
        names=["Chrom", "Start", "End", "Features", "Score", "Strand"],
    )
    .set_index(["Strand", "Start", "End"])
    .sort_index()
)
df_recount3

In [None]:
# Distinct chromosome names present in the recount3 table.
chrom_uniques = list(df_recount3["Chrom"].unique())
# Starts empty; meant to map chromosome name -> the recount3 rows on that chromosome.
df_chrom = dict()

In [None]:
# Partition the recount3 frame by chromosome: each key of df_chrom is a
# chromosome name and each value holds the rows belonging to that chromosome.
df_chrom.update(
    (chrom, df_recount3[df_recount3["Chrom"] == chrom])
    for chrom in tqdm(chrom_uniques)
)

df_chrom

In [None]:
def match_recount(row):
    """Look up the recount3 annotation that exactly matches one row.

    Reads the module-level ``df_chrom`` mapping (chromosome name -> recount3
    frame indexed by ``(Strand, Start, End)``) and selects the entry whose
    index equals ``(row.strand, row.start, row.end)`` on chromosome ``row.chr``.

    Parameters
    ----------
    row
        Any object exposing ``chr``, ``strand``, ``start`` and ``end``
        attributes, e.g. a row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The ``.values`` array of the ``["Features", "Score"]`` selection when the
    key exists, otherwise the placeholder ``[None, -1]``.
    """
    try:
        chrom_frame = df_chrom[row.chr]
        key = (row.strand, row.start, row.end)
        return chrom_frame.loc[key, ["Features", "Score"]].values
    except KeyError:
        # A KeyError is the "no match" signal (unknown chromosome or absent
        # index key). Anything else is a real bug or an interrupt and must
        # not be disguised as a missing annotation.
        return [None, -1]

In [None]:
# Start pandarallel with a fixed pool of 12 workers and the progress bar off.
pandarallel.initialize(
    nb_workers=12,
    progress_bar=False,
)

In [None]:
# %%time
# df_test = df_merged.copy().head(10)
# df_test["RC3"] = df_test.apply(match_recount, axis=1)
# df_test

In [None]:
%%time
# Run match_recount on every row of df_merged (axis=1) through pandarallel and
# store the per-row result in a new "RC3" column. This adds the column to
# df_merged in place; each cell holds whatever match_recount returned for that row.
df_merged["RC3"] = df_merged.parallel_apply(match_recount, axis=1)

In [None]:
# Display df_merged to inspect the frame after the "RC3" column assignment.
df_merged

In [3]:
# Optional backup:
# df_merged.to_csv("modded_recount3_inc", sep="\t", index=False)
#
# Replace df_merged with the tab-separated checkpoint stored on disk.
# NOTE(review): no visible cell writes data/3-temp-output.tsv (the commented-out
# backup above targets a different path), so a fresh "Run All" relies on this
# file already existing -- confirm how it was produced.
df_merged = pd.read_table("data/3-temp-output.tsv")
df_merged

Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,['GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374...
1,chr1,12721,13220,+,1,['GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=1404...
2,chr1,12057,12178,+,1,"[None, -1]"
3,chr1,12697,12974,+,1,['GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG' 21]
4,chr1,13052,13220,+,1,['GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:...
...,...,...,...,...,...,...
517420,chrX,151403680,151404937,+,0,"[None, -1]"
517421,chrX,151409211,151456968,+,0,"[None, -1]"
517422,chrX,153906578,153906694,-,0,"[None, -1]"
517423,chrX,153906410,153906520,-,0,"[None, -1]"


# Adding repeat features to the merged train set


In [4]:
def match_repeat_features(row):
    """Return the distinct 4th-column values of the repeat-feature hits for one row.

    Runs a shell pipeline through IPython's ``!`` syntax (so this function only
    works inside IPython/Jupyter): ``tabix`` queries
    ``ftp-data/repeat_features.bed.gz`` for the region
    ``{row.chr}:{row.start}-{row.end}``, ``cut -f4`` keeps the 4th field of each
    hit, and ``sort -u`` sorts and de-duplicates the values.

    Parameters
    ----------
    row
        Object exposing ``chr``, ``start`` and ``end`` attributes.

    Returns
    -------
    list
        The captured output lines of the pipeline as a plain list; empty when
        the pipeline prints nothing.
    """
    # NOTE(review): the row values are interpolated into the shell command
    # unescaped -- acceptable only while the input table is trusted.
    # NOTE(review): the ordering produced by `sort -u` presumably follows the
    # shell's locale collation -- confirm if the order matters downstream.
    matched_rows = !tabix ftp-data/repeat_features.bed.gz {row.chr}:{row.start}-{row.end} | cut -f4 | sort -u
    return list(matched_rows)

In [5]:
# Initialise pandarallel again with the progress bar off. Unlike the earlier
# call, no nb_workers is passed here, so the worker count is left to pandarallel.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
# %%time
# # Test cell
# df_draft = df_merged.copy().head(100)
# df_draft["Repeat_overlap"] = df_draft.parallel_apply(match_repeat_features, axis=1)
# df_draft

In [7]:
%%time
# Run match_repeat_features on every row of df_merged (axis=1) through
# pandarallel and store each returned list in a new "repeat_features" column,
# added to df_merged in place. Each row triggers its own shell pipeline.
df_merged["repeat_features"] = df_merged.parallel_apply(match_repeat_features, axis=1)
df_merged

CPU times: user 491 ms, sys: 81.6 ms, total: 572 ms
Wall time: 5min 43s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features
0,chr1,12227,12612,+,1,['GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374...,[]
1,chr1,12721,13220,+,1,['GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=1404...,[]
2,chr1,12057,12178,+,1,"[None, -1]",[]
3,chr1,12697,12974,+,1,['GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG' 21],[]
4,chr1,13052,13220,+,1,['GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:...,[]
...,...,...,...,...,...,...,...
517420,chrX,151403680,151404937,+,0,"[None, -1]","[Dust, Tandem repeats, Type I Transposons/SINE]"
517421,chrX,151409211,151456968,+,0,"[None, -1]","[Dust, Low complexity regions, LTRs, Simple re..."
517422,chrX,153906578,153906694,-,0,"[None, -1]",[]
517423,chrX,153906410,153906520,-,0,"[None, -1]",[]


In [8]:
# Write the annotated frame to disk as a tab-separated file without the row index.
# NOTE(review): the frame displayed above carries an "Unnamed: 0" column, which
# will be written out as a regular column -- confirm that is intended.
df_merged.to_csv(
    "data/3-final-output.tsv",
    index=False,
    sep="\t",
)