In [1]:
import subprocess

import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm

In [2]:
# Load the tab-separated training-set table; the rendered output shows the
# columns chr / start / end / strand / class (plus a saved index column).
df_trainset = pd.read_csv("data/04_train_set.tsv", sep="\t")
df_trainset

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0
519030,chrY,25513173,25513588,-,0
519031,chrY,25513745,25516715,-,0
519032,chrY,25525288,25527646,-,0


In [3]:
# Read the header-less recount3 BED file, name its six columns, and index the
# table by (Strand, Start, End) with that index sorted.
df_recount3 = (
    pd.read_csv(
        "./data/resources/recount3.bed",
        sep="\t",
        names=["Chrom", "Start", "End", "Features", "Score", "Strand"],
    )
    .set_index(["Strand", "Start", "End"])
    .sort_index()
)
df_recount3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Chrom,Features,Score
Strand,Start,End,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
+,12,16498,chrM,GTExv2=15:69;TCGAv2=0:0;SRAv3h=3:7;AT:AC,76
+,15,355,chrUn_KI270303v1,GTExv2=0:0;TCGAv2=0:0;SRAv3h=33:36;GT:AG,36
+,17,65645,chr22_KI270736v1_random,GTExv2=0:0;TCGAv2=0:0;SRAv3h=7:36;GT:AG,36
+,20,608,chrUn_GL000224v1,GTExv2=0:0;TCGAv2=1:1;SRAv3h=27:65;GT:AG,66
+,20,801,chrUn_KI270539v1,GTExv2=0:0;TCGAv2=0:0;SRAv3h=33:35;GT:AG,35
...,...,...,...,...,...
-,248937793,248937887,chr1,GTExv2=3:3;TCGAv2=1:2;SRAv3h=15:16;GT:AG,21
-,248937886,248938073,chr1,GTExv2=2:2;TCGAv2=1:2;SRAv3h=26:27;GT:AG,31
-,248937979,248938073,chr1,GTExv2=4:4;TCGAv2=1:2;SRAv3h=19:20;GT:AG,26
-,248942862,248945072,chr1,GTExv2=2:2;TCGAv2=1:1;SRAv3h=11:29;GT:AG,32


In [4]:
# recount3 also contains entries on alternate haplotypes and unplaced/random
# scaffolds; only the primary-assembly chromosomes (1-22, X, Y, M) are needed.
ordinary_chr_list = [f"chr{label}" for label in [*range(1, 23), "X", "Y", "M"]]
ordinary_chr_list

['chr1',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chr20',
 'chr21',
 'chr22',
 'chrX',
 'chrY',
 'chrM']

In [7]:
def matchrc(row):
    """Return the recount3 score of the record whose coordinates equal ``row``'s.

    Runs ``tabix`` on ``data/resources/recount3.bed.gz`` for the region
    ``row.chr:row.start-row.end`` and scans the overlapping records for one
    whose BED start and end (columns 2 and 3) are exactly ``row.start`` and
    ``row.end``.

    Parameters
    ----------
    row : pandas.Series
        A row providing ``chr``, ``start`` and ``end``.

    Returns
    -------
    int
        The score (BED column 5) of the first record with identical start and
        end, or ``0`` when no overlapping record matches exactly.

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status (e.g. missing file or index).
    FileNotFoundError
        If the ``tabix`` executable is not on ``PATH``.

    Notes
    -----
    NOTE(review): only start and end are compared; the strand column of the
    BED file is never read, so a record on the opposite strand with the same
    coordinates would also match. Confirm that this is intended.
    """
    region = f"{row.chr}:{row.start}-{row.end}"
    # Call tabix directly (no shell, no IPython magic) so the function also
    # works in plain-Python worker processes and tabix failures raise clearly
    # instead of being parsed as data lines.
    result = subprocess.run(
        ["tabix", "data/resources/recount3.bed.gz", region],
        capture_output=True,
        text=True,
        check=True,
    )

    for line in result.stdout.splitlines():
        fields = line.split("\t")
        # tabix returns every overlapping record; keep only the exact interval.
        if int(fields[1]) == row.start and int(fields[2]) == row.end:
            return int(fields[4])
    return 0

In [8]:
# Start pandarallel with 12 worker processes; this is what provides the
# `parallel_apply` method used on DataFrames in the following cells.
pandarallel.initialize(nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [9]:
%%time
df_test = df_trainset.copy().head(10)
df_test["RC3"] = df_test.apply(matchrc, axis=1)
df_test

CPU times: user 6.87 ms, sys: 11.4 ms, total: 18.3 ms
Wall time: 100 ms


Unnamed: 0,chr,start,end,strand,class,RC3
0,chr1,12227,12612,+,1,59151
1,chr1,12721,13220,+,1,61021
2,chr1,12057,12178,+,1,0
3,chr1,12697,12974,+,1,21
4,chr1,13052,13220,+,1,526
5,chr1,13374,13452,+,1,0
6,chr1,14501,15004,-,1,0
7,chr1,15038,15795,-,1,7437205
8,chr1,15947,16606,-,1,2667727
9,chr1,16765,16857,-,1,7006744


In [10]:
%%time
# Run matchrc on every training-set row across the pandarallel workers and
# store the returned score in a new "recount3_score" column.
df_trainset["recount3_score"] = df_trainset.parallel_apply(matchrc, axis=1)
df_trainset

CPU times: user 25.3 ms, sys: 343 ms, total: 368 ms
Wall time: 5min 24s


Unnamed: 0,chr,start,end,strand,class,recount3_score
0,chr1,12227,12612,+,1,59151
1,chr1,12721,13220,+,1,61021
2,chr1,12057,12178,+,1,0
3,chr1,12697,12974,+,1,21
4,chr1,13052,13220,+,1,526
...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,0
519030,chrY,25513173,25513588,-,0,31374
519031,chrY,25513745,25516715,-,0,67
519032,chrY,25525288,25527646,-,0,18958


In [11]:
# Optional backup: write the training set, now including the recount3_score
# column, to a tab-separated file (the DataFrame index is not written).
df_trainset.to_csv("data/05_trainset_recount3_feature.tsv", sep="\t", index=False)

In [12]:
# Print the SHA-512 checksum of the exported file so it can be verified later.
! sha512sum data/05_trainset_recount3_feature.tsv

2edc6e69902481e6fb62fb03ea084cd35baba20583c62e9a459d9656010c4591ceef9d0af187359f967ecad876ad58b71521bdf891fb60e4e7ee5b0704a5ca2c  data/05_trainset_recount3_feature.tsv


# Adding repeat features to the merged train set


In [None]:
def match_repeat_features_1(row):
    """Return the repeat features overlapping the window around ``row.start``.

    Runs ``tabix`` on ``ftp-data/repeat_features.bed.gz`` for the region
    ``row.chr:(row.start - 2)-(row.start + 2)``. Besides the splice site
    (up to ``row.start + 2``), the window also covers the 2 nt that precede
    it in the exon (``row.start - 2``).

    Parameters
    ----------
    row : pandas.Series
        A row (an extracted intron) providing ``chr`` and ``start``.

    Returns
    -------
    list of str
        The distinct values of BED column 4 among the overlapping records,
        sorted so the result is reproducible across processes and runs.
        Empty when nothing overlaps.

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status.
    """
    # tabix regions are 1-based, so never ask for a position below 1.
    window_start = max(1, row.start - 2)
    region = f"{row.chr}:{window_start}-{row.start + 2}"
    result = subprocess.run(
        ["tabix", "ftp-data/repeat_features.bed.gz", region],
        capture_output=True,
        text=True,
        check=True,
    )

    names = set()
    for line in result.stdout.splitlines():
        fields = line.split("\t")
        # Mirror `cut -f4`: a record without a fourth column yields "".
        names.add(fields[3] if len(fields) > 3 else "")
    return sorted(names)

In [None]:
def match_repeat_features_2(row):
    """Return the repeat features overlapping the window around ``row.end``.

    Runs ``tabix`` on ``ftp-data/repeat_features.bed.gz`` for the region
    ``row.chr:(row.end - 2)-(row.end + 2)``. Besides the splice site
    (from ``row.end - 2``), the window also covers the 2 nt that follow it
    in the exon (``row.end + 2``).

    Parameters
    ----------
    row : pandas.Series
        A row (an extracted intron) providing ``chr`` and ``end``.

    Returns
    -------
    list of str
        The distinct values of BED column 4 among the overlapping records,
        sorted so the result is reproducible across processes and runs.
        Empty when nothing overlaps.

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status.
    """
    # tabix regions are 1-based, so never ask for a position below 1.
    window_start = max(1, row.end - 2)
    region = f"{row.chr}:{window_start}-{row.end + 2}"
    result = subprocess.run(
        ["tabix", "ftp-data/repeat_features.bed.gz", region],
        capture_output=True,
        text=True,
        check=True,
    )

    names = set()
    for line in result.stdout.splitlines():
        fields = line.split("\t")
        # Mirror `cut -f4`: a record without a fourth column yields "".
        names.add(fields[3] if len(fields) > 3 else "")
    return sorted(names)

In [None]:
# Re-initialise pandarallel without an explicit worker count (library default)
# and with the progress bar turned off.
pandarallel.initialize(progress_bar=False)

In [None]:
%%time
# Test cell
df_draft = df_merged.copy().head(100)
df_draft["Repeat_overlap"] = df_draft.apply(match_repeat_features_1, axis=1)
df_draft

In [None]:
# Discard the 100-row scratch frame; it is not needed for the full run.
del df_draft

In [None]:
%%time
# For every row, collect the repeat features overlapping the window around
# the start coordinate (match_repeat_features_1), using the pandarallel workers.
df_merged["repeat_features_start_site"] = df_merged.parallel_apply(match_repeat_features_1, axis=1)
df_merged

In [None]:
%%time
# For every row, collect the repeat features overlapping the window around
# the end coordinate (match_repeat_features_2), using the pandarallel workers.
df_merged["repeat_features_end_site"] = df_merged.parallel_apply(match_repeat_features_2, axis=1)
df_merged

In [None]:
# Checkpoint: the commented-out line writes the frame to a temporary TSV;
# the active line reloads df_merged from that file instead of recomputing.
# NOTE(review): the repeat-feature columns hold Python lists; after a TSV
# round-trip they are presumably read back as plain strings — confirm that
# downstream code expects that.
# df_merged.to_csv("data/3-temp-output-2.tsv", sep="\t", index=False)
df_merged = pd.read_csv("data/3-temp-output-2.tsv", sep="\t")

# Splice-site Antisense feature

In [None]:
# Load the GENCODE v44 GTF annotation, keep only the "exon" records, and drop
# exons that share chromosome, start, end and strand.
# The file is read from the project-relative `ftp-data/` directory (the same
# one the repeat-feature cells use) instead of a user-specific absolute path,
# so the notebook runs on other machines.
df_exons = (
    pd.read_csv(
        "ftp-data/gencode.v44.annotation.gtf",
        sep="\t",
        comment="#",  # skip the GTF header lines
        header=None,
        names=["chr", "annotator", "type", "start", "end", "x1", "strand", "x2", "features"],
    )
    .loc[lambda gtf: gtf["type"] == "exon"]
    .drop_duplicates(subset=["chr", "start", "end", "strand"])
)
df_exons

In [None]:
def df_to_bed(df, bed_file):
    """Write the intervals in ``df`` to ``bed_file`` as a six-column BED file.

    Columns are read by position: 0 = chromosome, 3 = start, 4 = end and
    6 = strand. The name and score columns of the output are filled with ".".

    The start is converted from 1-based to the 0-based BED convention by
    subtracting 1 exactly once; the end is written unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least seven columns in the positional layout above.
    bed_file : str or path-like
        Destination path; an existing file is overwritten.

    Returns
    -------
    None
    """
    # Select by position so the function does not depend on column labels and
    # avoids the deprecated positional `row[0]` lookup on a labelled Series.
    intervals = df.iloc[:, [0, 3, 4, 6]]
    with open(bed_file, 'w') as bed:
        for chrom, start, end, strand in intervals.itertuples(index=False, name=None):
            bed_start = int(start) - 1  # 1-based inclusive -> 0-based BED start
            bed.write(f"{chrom}\t{bed_start}\t{int(end)}\t.\t.\t{strand}\n")

# Convert the exon DataFrame to a BED file in the working directory.
# NOTE(review): later cells query "sorted_exon.bed.gz", presumably derived from
# this file; the sort / bgzip / tabix-index step is not shown in this notebook.
bed_file_path = 'exon.bed'
df_to_bed(df_exons, bed_file_path)

In [None]:
# Re-initialise pandarallel with 12 worker processes for the antisense lookups.
pandarallel.initialize(nb_workers=12)

In [None]:
def antisense_exon_start(row):
    """Tell whether an opposite-strand record overlaps the start of ``row``.

    Runs ``tabix`` on ``sorted_exon.bed.gz`` for the region
    ``row.chr:row.start-(row.start + 2)`` and checks whether any overlapping
    record carries the strand opposite to ``row.strand`` in BED column 6.

    Parameters
    ----------
    row : pandas.Series
        A row providing ``chr``, ``start`` and ``strand``.

    Returns
    -------
    bool
        ``True`` if at least one overlapping record is on the opposite
        strand, ``False`` otherwise (including when nothing overlaps).

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status.
    """
    # Any strand other than "-" is treated as "+", i.e. its opposite is "-".
    opposite_strand = "+" if row.strand == "-" else "-"
    region = f"{row.chr}:{row.start}-{row.start + 2}"
    result = subprocess.run(
        ["tabix", "sorted_exon.bed.gz", region],
        capture_output=True,
        text=True,
        check=True,
    )

    strands = set()
    for line in result.stdout.splitlines():
        fields = line.split("\t")
        if len(fields) > 5:
            strands.add(fields[5])
    return opposite_strand in strands

def antisense_exon_stop(row):
    """Tell whether an opposite-strand record overlaps the end of ``row``.

    Runs ``tabix`` on ``sorted_exon.bed.gz`` for the region
    ``row.chr:(row.end - 2)-row.end`` and checks whether any overlapping
    record carries the strand opposite to ``row.strand`` in BED column 6.

    Parameters
    ----------
    row : pandas.Series
        A row providing ``chr``, ``end`` and ``strand``.

    Returns
    -------
    bool
        ``True`` if at least one overlapping record is on the opposite
        strand, ``False`` otherwise (including when nothing overlaps).

    Raises
    ------
    subprocess.CalledProcessError
        If ``tabix`` exits with a non-zero status.
    """
    # Any strand other than "-" is treated as "+", i.e. its opposite is "-".
    opposite_strand = "+" if row.strand == "-" else "-"
    region = f"{row.chr}:{row.end - 2}-{row.end}"
    result = subprocess.run(
        ["tabix", "sorted_exon.bed.gz", region],
        capture_output=True,
        text=True,
        check=True,
    )

    strands = set()
    for line in result.stdout.splitlines():
        fields = line.split("\t")
        if len(fields) > 5:
            strands.add(fields[5])
    return opposite_strand in strands

In [None]:
# Display the merged frame before the splice-site antisense columns are added.
df_merged

In [None]:
%%time
# Flag, for every row, whether an opposite-strand record overlaps the start
# of the interval (antisense_exon_start), using the pandarallel workers.
df_merged["ss_antisense_start_site"] = df_merged.parallel_apply(antisense_exon_start, axis=1)
df_merged

In [None]:
%%time
# Flag, for every row, whether an opposite-strand record overlaps the end
# of the interval (antisense_exon_stop), using the pandarallel workers.
df_merged["ss_antisense_end_site"] = df_merged.parallel_apply(antisense_exon_stop, axis=1)
df_merged

In [None]:
# Write the frame with all feature columns to a tab-separated file
# (the DataFrame index is not written).
df_merged.to_csv("data/3_all_features.tsv", sep="\t", index=False)