In [1]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel

In [2]:
# Load the tab-separated training set into a DataFrame.
df_merged = pd.read_csv("data/2.2-chr_renamed_train_set.tsv", sep="\t")
# NOTE(review): the inline note below says 402857, but the recorded output shows
# index labels running up to 499141 — confirm the expected row count.
df_merged #402857

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
499138,chrY,24883840,24886132,+,0
499139,chrY,24888605,24889352,+,0
499140,chrY,24889386,24901111,+,0
499141,chrY,24833970,24840730,+,0


In [3]:
# Read the headerless recount3 BED file, then index it by (Strand, Start, End)
# and sort that index so rows can be looked up by an exact coordinate tuple.
df_recount3 = (
    pd.read_csv(
        "ftp-data/recount3.bed",
        sep="\t",
        names=["Chrom", "Start", "End", "Features", "Score", "Strand"],
    )
    .set_index(["Strand", "Start", "End"])
    .sort_index()
)
df_recount3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Chrom,Features,Score
Strand,Start,End,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
+,12,16498,chrM,GTExv2=15:69;TCGAv2=0:0;SRAv3h=3:7;AT:AC,76
+,15,355,chrUn_KI270303v1,GTExv2=0:0;TCGAv2=0:0;SRAv3h=33:36;GT:AG,36
+,17,65645,chr22_KI270736v1_random,GTExv2=0:0;TCGAv2=0:0;SRAv3h=7:36;GT:AG,36
+,20,608,chrUn_GL000224v1,GTExv2=0:0;TCGAv2=1:1;SRAv3h=27:65;GT:AG,66
+,20,801,chrUn_KI270539v1,GTExv2=0:0;TCGAv2=0:0;SRAv3h=33:35;GT:AG,35
...,...,...,...,...,...
-,248937793,248937887,chr1,GTExv2=3:3;TCGAv2=1:2;SRAv3h=15:16;GT:AG,21
-,248937886,248938073,chr1,GTExv2=2:2;TCGAv2=1:2;SRAv3h=26:27;GT:AG,31
-,248937979,248938073,chr1,GTExv2=4:4;TCGAv2=1:2;SRAv3h=19:20;GT:AG,26
-,248942862,248945072,chr1,GTExv2=2:2;TCGAv2=1:1;SRAv3h=11:29;GT:AG,32


In [4]:
# chrom_uniques = list(df_recount3.Chrom.unique())  # alternative: every chromosome name present in recount3
# Restrict the analysis to chr1..chr22 plus chrX, chrY and chrM.
ordinary_chr_list = [f"chr{label}" for label in [*range(1, 23), "X", "Y", "M"]]
# Filled later with one recount3 sub-frame per chromosome name.
df_chrom = dict()

In [5]:
# Show the chromosome names that will be kept.
ordinary_chr_list

['chr1',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr20',
 'chr21',
 'chr22',
 'chrX',
 'chrY',
 'chrM']

In [6]:
# Build the chromosome -> rows mapping: for every chromosome of interest, store
# the slice of the recount3 frame whose Chrom column equals that chromosome.
for chrom in tqdm(ordinary_chr_list):  # alternative iterable: chrom_uniques
    df_chrom[chrom] = df_recount3.loc[df_recount3["Chrom"] == chrom]


100%|██████████| 25/25 [00:13<00:00,  1.79it/s]


In [7]:
def match_recount(row):
    """
    Look up the recount3 entry that exactly matches one interval.

    Parameters
    ----------
    row : object with ``chr``, ``strand``, ``start`` and ``end`` attributes
        (for example a row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The ``["Features", "Score"]`` values of the entry of ``df_chrom[row.chr]``
    whose (Strand, Start, End) index equals ``(row.strand, row.start, row.end)``,
    as returned by ``.values``.  When the chromosome is not a key of
    ``df_chrom`` or the coordinate tuple is absent from the index, the
    placeholder ``[None, -1]`` is returned instead.
    """
    try:
        score = df_chrom[row.chr].loc[(row.strand, row.start, row.end), ["Features", "Score"]].values
    except KeyError:
        # Only a missing chromosome / missing index label means "no match".
        # Anything else (including KeyboardInterrupt) must propagate rather
        # than be silently turned into the placeholder.
        score = [None, -1]
    return score

In [8]:
# Start pandarallel with 12 workers; the parallel_apply call below relies on it.
pandarallel.initialize(nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [9]:
# %%time
# df_test = df_merged.copy().head(10)
# df_test["RC3"] = df_test.apply(match_recount, axis=1)
# df_test

In [10]:
%%time
# Row-wise recount3 lookup over the whole train set, spread across the pandarallel workers.
# The recorded run of this cell took about 53 minutes of wall time.
df_merged["RC3"] = df_merged.parallel_apply(match_recount, axis=1)

CPU times: user 1.27 s, sys: 251 ms, total: 1.52 s
Wall time: 53min 19s


In [11]:
# Inspect the train set with the new RC3 column.
df_merged

Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...
2,chr1,12057,12178,+,1,"[None, -1]"
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]"
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...
...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,0,"[None, -1]"
499139,chrY,24888605,24889352,+,0,"[None, -1]"
499140,chrY,24889386,24901111,+,0,"[None, -1]"
499141,chrY,24833970,24840730,+,0,"[None, -1]"


In [12]:
# Optional backup: write the frame (now including RC3) to a TSV checkpoint.
df_merged.to_csv("data/3-temp-output-1.tsv", sep="\t", index=False)
# To resume from the checkpoint instead of recomputing RC3, run the line below.
# The checksum recorded next to it is that of the checkpoint file from the original run.
# df_merged = pd.read_csv("data/3-temp-output-1.tsv", sep="\t")  # sha512sum ad0f3a08b8c7986d341543e75a19d5335c8b65ddeb4d28dd63016b4f79eabad0fe244b26b8031475cf5e6f212ea0ae9a2f0bdb200b692a77736de84c777a5b10
df_merged

Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...
2,chr1,12057,12178,+,1,"[None, -1]"
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]"
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...
...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,0,"[None, -1]"
499139,chrY,24888605,24889352,+,0,"[None, -1]"
499140,chrY,24889386,24901111,+,0,"[None, -1]"
499141,chrY,24833970,24840730,+,0,"[None, -1]"


# Adding repeat features to the merged train set


In [14]:
def match_repeat_features_1(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.start+2}), we also look at the small-region that precedes (2 nt in the exon; row.start-2) the splice-site.
    """
    matched_rows = !tabix ftp-data/repeat_features.bed.gz {row.chr}:{row.start-2}-{row.start+2} | cut -f4
    return list(set(matched_rows + []))

In [15]:
def match_repeat_features_2(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.end-2}), we also look at the small-region that proceeds (2 nt in the exon; {row.end+2}) the splice-site.
    """
    matched_rows2 = !tabix ftp-data/repeat_features.bed.gz {row.chr}:{row.end-2}-{row.end+2} | cut -f4
        # | cut -f4 | sort -u
    return list(set([] + matched_rows2))

In [16]:
# Re-initialise pandarallel with the progress bar turned off (no explicit worker
# count here; the recorded run reports 12 workers).
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [17]:
%%time
# Test cell
df_draft = df_merged.copy().head(100)
df_draft["Repeat_overlap"] = df_draft.apply(match_repeat_features_1, axis=1)
df_draft

CPU times: user 63.7 ms, sys: 20.9 ms, total: 84.7 ms
Wall time: 531 ms


Unnamed: 0,chr,start,end,strand,class,RC3,Repeat_overlap
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...,[]
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...,[]
2,chr1,12057,12178,+,1,"[None, -1]",[]
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]",[]
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...,[]
...,...,...,...,...,...,...,...
95,chr1,498456,501555,-,1,[GTExv2=12929:52155;TCGAv2=4637:16478;SRAv3h=6...,[Type II Transposons]
96,chr1,501620,502186,-,1,[GTExv2=17319:212570;TCGAv2=5829:20238;SRAv3h=...,[]
97,chr1,502243,502464,-,1,[GTExv2=14987:88036;TCGAv2=4894:11018;SRAv3h=6...,[Type II Transposons]
98,chr1,497299,498046,-,1,[GTExv2=4516:12484;TCGAv2=800:1497;SRAv3h=2282...,[Type I Transposons/SINE]


In [18]:
# Drop the 100-row test frame; it is not used again.
del df_draft

In [19]:
%%time
# Annotate every row with the repeat features found around its start coordinate,
# using the pandarallel workers.
df_merged["repeat_features_start_site"] = df_merged.parallel_apply(match_repeat_features_1, axis=1)
df_merged

CPU times: user 2.29 s, sys: 217 ms, total: 2.51 s
Wall time: 5min 25s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features_start_site
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...,[]
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...,[]
2,chr1,12057,12178,+,1,"[None, -1]",[]
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]",[]
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...,[]
...,...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,0,"[None, -1]",[]
499139,chrY,24888605,24889352,+,0,"[None, -1]",[]
499140,chrY,24889386,24901111,+,0,"[None, -1]",[]
499141,chrY,24833970,24840730,+,0,"[None, -1]",[]


In [20]:
%%time
# Annotate every row with the repeat features found around its end coordinate,
# using the pandarallel workers.
df_merged["repeat_features_end_site"] = df_merged.parallel_apply(match_repeat_features_2, axis=1)
df_merged

CPU times: user 1.96 s, sys: 255 ms, total: 2.21 s
Wall time: 5min 25s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features_start_site,repeat_features_end_site
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...,[],[]
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...,[],[]
2,chr1,12057,12178,+,1,"[None, -1]",[],[]
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]",[],[]
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...,[],[]
...,...,...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,0,"[None, -1]",[],[]
499139,chrY,24888605,24889352,+,0,"[None, -1]",[],[]
499140,chrY,24889386,24901111,+,0,"[None, -1]",[],[]
499141,chrY,24833970,24840730,+,0,"[None, -1]",[],[]


In [21]:
# Checkpoint after the repeat-feature columns were added.
# NOTE(review): the write is commented out while the read is active, so this cell
# replaces the in-memory frame with whatever is already stored on disk. On a fresh
# "Run All" the freshly computed columns are discarded (or the read fails if the
# file does not exist). Re-enable the write below when recomputing.
# df_merged.to_csv("data/3-temp-output-2.tsv", sep="\t", index=False)
df_merged = pd.read_csv("data/3-temp-output-2.tsv", sep="\t")

# Splice-site Antisense feature

In [22]:
# Load the GENCODE v44 GTF annotation (headerless, tab-separated, '#' comment
# lines skipped) and name its nine columns.
# The path is relative to the notebook's working directory, like the other
# ftp-data/ inputs used in this notebook, instead of a machine-specific
# absolute home-directory path.
df_exons = pd.read_csv(
    "ftp-data/gencode.v44.annotation.gtf",
    sep="\t",
    comment="#",
    header=None,
    names=["chr", "annotator", "type", "start", "end", "x1", "strand", "x2", "features"],
)
# Keep exon records only.
df_exons = df_exons[df_exons["type"] == "exon"]
# The same exon can be listed once per transcript; keep one row per distinct
# (chr, start, end, strand).
df_exons = df_exons.drop_duplicates(subset=["chr", "start", "end", "strand"])
df_exons

Unnamed: 0,chr,annotator,type,start,end,x1,strand,x2,features
2,chr1,HAVANA,exon,11869,12227,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
3,chr1,HAVANA,exon,12613,12721,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
4,chr1,HAVANA,exon,13221,14409,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
7,chr1,HAVANA,exon,12010,12057,.,+,.,"gene_id ""ENSG00000223972.6""; transcript_id ""EN..."
8,chr1,HAVANA,exon,12179,12227,.,+,.,"gene_id ""ENSG00000223972.6""; transcript_id ""EN..."
...,...,...,...,...,...,...,...,...,...
3424172,chrM,ENSEMBL,exon,14149,14673,.,-,.,"gene_id ""ENSG00000198695.2""; transcript_id ""EN..."
3424177,chrM,ENSEMBL,exon,14674,14742,.,-,.,"gene_id ""ENSG00000210194.1""; transcript_id ""EN..."
3424180,chrM,ENSEMBL,exon,14747,15887,.,+,.,"gene_id ""ENSG00000198727.2""; transcript_id ""EN..."
3424185,chrM,ENSEMBL,exon,15888,15953,.,+,.,"gene_id ""ENSG00000210195.2""; transcript_id ""EN..."


In [23]:
def df_to_bed(df, bed_file):
    """
    Write the rows of a GTF-style DataFrame to a six-column BED file.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns are read by position: 0 = chromosome, 3 = 1-based start,
        4 = end, 6 = strand (the column order used for ``df_exons``).
    bed_file : str or path-like
        Destination file; it is created or overwritten.

    Each output line is ``chrom<TAB>start<TAB>end<TAB>.<TAB>.<TAB>strand``.
    The start is converted from 1-based to 0-based exactly once
    (``start - 1``); the end is written unchanged.  An empty frame produces
    an empty file.
    """
    # Positional selection keeps the function independent of column labels and
    # avoids integer keys being interpreted as labels on a row Series.
    coordinate_columns = df.iloc[:, [0, 3, 4, 6]]

    with open(bed_file, 'w') as bed:
        for chrom, gtf_start, gtf_end, strand in coordinate_columns.itertuples(index=False, name=None):
            # 1-based inclusive start -> 0-based start. This is the only
            # adjustment; subtracting again when formatting the line would
            # shift every interval one base too far upstream.
            start = int(gtf_start) - 1
            end = int(gtf_end)
            bed.write(f"{chrom}\t{start}\t{end}\t.\t.\t{strand}\n")

# Convert the exon DataFrame to a BED file in the current working directory.
# NOTE(review): the antisense lookups later in this notebook query
# 'sorted_exon.bed.gz', which is not created by any cell here — presumably
# exon.bed is sorted, bgzip-compressed and tabix-indexed outside the notebook; confirm.
bed_file_path = 'exon.bed'
df_to_bed(df_exons, bed_file_path)

In [24]:
# Start pandarallel with 12 workers for the antisense-exon lookups below.
pandarallel.initialize(nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [25]:
def antisense_exon_start(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.start+2}), we also look at the small-region that precedes (2 nt in the exon; row.start-2) the splice-site.
    """
    opposite_strand = "+" if row.strand == "-" else "-"
    # print(opposite_strand)
    matched_rows = !tabix sorted_exon.bed.gz {row.chr}:{row.start}-{row.start+2} | cut -f6
    # if matched_rows != []:
    #     print(matched_rows)
    return opposite_strand in matched_rows

def antisense_exon_stop(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.start+2}), we also look at the small-region that precedes (2 nt in the exon; row.start-2) the splice-site.
    """
    opposite_strand = "+" if row.strand == "-" else "-"
    # print(opposite_strand)
    matched_rows = !tabix sorted_exon.bed.gz {row.chr}:{row.end-2}-{row.end} | cut -f6
    # if matched_rows != []:
    #     print(matched_rows)
    return opposite_strand in matched_rows

In [26]:
# Inspect the frame reloaded from the checkpoint before adding the antisense columns.
df_merged

Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features_start_site,repeat_features_end_site
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...,[],[]
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...,[],[]
2,chr1,12057,12178,+,1,"[None, -1]",[],[]
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]",[],[]
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...,[],[]
...,...,...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,0,"[None, -1]",[],[]
499139,chrY,24888605,24889352,+,0,"[None, -1]",[],[]
499140,chrY,24889386,24901111,+,0,"[None, -1]",[],[]
499141,chrY,24833970,24840730,+,0,"[None, -1]",[],[]


In [27]:
%%time
# Flag rows whose start coordinate overlaps an exon reported on the opposite strand.
df_merged["ss_antisense_start_site"] = df_merged.parallel_apply(antisense_exon_start, axis=1)
df_merged

CPU times: user 1.79 s, sys: 327 ms, total: 2.12 s
Wall time: 12min 29s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features_start_site,repeat_features_end_site,ss_antisense_start_site
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...,[],[],False
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...,[],[],False
2,chr1,12057,12178,+,1,"[None, -1]",[],[],False
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]",[],[],False
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...,[],[],False
...,...,...,...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,0,"[None, -1]",[],[],False
499139,chrY,24888605,24889352,+,0,"[None, -1]",[],[],False
499140,chrY,24889386,24901111,+,0,"[None, -1]",[],[],False
499141,chrY,24833970,24840730,+,0,"[None, -1]",[],[],False


In [28]:
%%time
# Flag rows whose end coordinate overlaps an exon reported on the opposite strand.
df_merged["ss_antisense_end_site"] = df_merged.parallel_apply(antisense_exon_stop, axis=1)
df_merged

CPU times: user 1.79 s, sys: 390 ms, total: 2.18 s
Wall time: 12min 30s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features_start_site,repeat_features_end_site,ss_antisense_start_site,ss_antisense_end_site
0,chr1,12227,12612,+,1,[GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374:...,[],[],False,False
1,chr1,12721,13220,+,1,[GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=14048...,[],[],False,False
2,chr1,12057,12178,+,1,"[None, -1]",[],[],False,False
3,chr1,12697,12974,+,1,"[GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG, 21]",[],[],False,False
4,chr1,13052,13220,+,1,[GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:A...,[],[],False,False
...,...,...,...,...,...,...,...,...,...,...
499138,chrY,24883840,24886132,+,0,"[None, -1]",[],[],False,False
499139,chrY,24888605,24889352,+,0,"[None, -1]",[],[],False,False
499140,chrY,24889386,24901111,+,0,"[None, -1]",[],[],False,False
499141,chrY,24833970,24840730,+,0,"[None, -1]",[],[],False,False


In [29]:
# Write the train set with all added feature columns to a TSV (row index not written).
df_merged.to_csv("data/3_all_features.tsv", sep="\t", index=False)