In [1]:
import pandas as pd
from pandarallel import pandarallel

In [2]:
df_trainset = pd.read_csv("data/04_train_set.tsv", sep="\t")
df_trainset

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0
519030,chrY,25513173,25513588,-,0
519031,chrY,25513745,25516715,-,0
519032,chrY,25525288,25527646,-,0


# Generating splice-site antisense feature

In [3]:
df_exons = pd.read_csv("data/resources/gencode.v44.annotation.gtf", sep="\t", comment="#", header=None, names=["chr", "annotator", "type", "start", "end", "x1", "strand", "x2", "features"])
df_exons = df_exons[df_exons["type"] == "exon"]
df_exons = df_exons.drop_duplicates(subset=["chr", "start", "end", "strand"])
df_exons

Unnamed: 0,chr,annotator,type,start,end,x1,strand,x2,features
2,chr1,HAVANA,exon,11869,12227,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
3,chr1,HAVANA,exon,12613,12721,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
4,chr1,HAVANA,exon,13221,14409,.,+,.,"gene_id ""ENSG00000290825.1""; transcript_id ""EN..."
7,chr1,HAVANA,exon,12010,12057,.,+,.,"gene_id ""ENSG00000223972.6""; transcript_id ""EN..."
8,chr1,HAVANA,exon,12179,12227,.,+,.,"gene_id ""ENSG00000223972.6""; transcript_id ""EN..."
...,...,...,...,...,...,...,...,...,...
3424172,chrM,ENSEMBL,exon,14149,14673,.,-,.,"gene_id ""ENSG00000198695.2""; transcript_id ""EN..."
3424177,chrM,ENSEMBL,exon,14674,14742,.,-,.,"gene_id ""ENSG00000210194.1""; transcript_id ""EN..."
3424180,chrM,ENSEMBL,exon,14747,15887,.,+,.,"gene_id ""ENSG00000198727.2""; transcript_id ""EN..."
3424185,chrM,ENSEMBL,exon,15888,15953,.,+,.,"gene_id ""ENSG00000210195.2""; transcript_id ""EN..."


In [4]:
def df_to_bed(df, bed_file):
    with open(bed_file, 'w') as bed:
        for index, row in df.iterrows():

            chrom = row[0]
            start = int(row[3]) - 1  # Convert 1-based to 0-based (BED format)
            end = int(row[4])
            strand = row[6]


            bed_line = f"{chrom}\t{start-1}\t{end}\t.\t.\t{strand}\n"
            bed.write(bed_line)

# Convert the DataFrame to a BED file
bed_file_path = 'data/gencode_exon.bed'
df_to_bed(df_exons, bed_file_path)

In [5]:
!sort -k 1,1 -k2,2n data/gencode_exon.bed > data/gencode_exon_sorted.bed

In [6]:
!bgzip data/gencode_exon_sorted.bed

In [7]:
!tabix -p bed data/gencode_exon_sorted.bed.gz

In [8]:
pandarallel.initialize(nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [9]:
def antisense_exon_start(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.start+2}), we also look at the small-region that precedes (2 nt in the exon; row.start-2) the splice-site.
    """

    opposite_strand = "+" if row.strand == "-" else "-"
    # print(opposite_strand)
    matched_rows = !tabix sorted_exon.bed.gz {row.chr}:{row.start}-{row.start+2} | cut -f6
    # if matched_rows != []:
    #     print(matched_rows)
    return opposite_strand in matched_rows

def antisense_exon_stop(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.start+2}), we also look at the small-region that precedes (2 nt in the exon; row.start-2) the splice-site.
    """
    opposite_strand = "+" if row.strand == "-" else "-"
    # print(opposite_strand)
    matched_rows = !tabix sorted_exon.bed.gz {row.chr}:{row.end-2}-{row.end} | cut -f6
    # if matched_rows != []:
    #     print(matched_rows)
    return opposite_strand in matched_rows

In [10]:
# Test Cell
# %%time
# df_test = df_trainset.copy().head(100)
# df_test["test_antisense"] = df_test.apply(antisense_exon_start, axis=1)
# df_test

In [11]:
%%time
df_trainset["antisense_exon_start_ss"] = df_trainset.parallel_apply(antisense_exon_start, axis=1)
df_trainset

CPU times: user 67.7 ms, sys: 139 ms, total: 207 ms
Wall time: 13min 28s


Unnamed: 0,chr,start,end,strand,class,antisense_exon_start_ss
0,chr1,12227,12612,+,1,False
1,chr1,12721,13220,+,1,False
2,chr1,12057,12178,+,1,False
3,chr1,12697,12974,+,1,False
4,chr1,13052,13220,+,1,False
...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,False
519030,chrY,25513173,25513588,-,0,False
519031,chrY,25513745,25516715,-,0,False
519032,chrY,25525288,25527646,-,0,False


In [12]:
%%time
df_trainset["antisense_exon_end_ss"] = df_trainset.parallel_apply(antisense_exon_stop, axis=1)
df_trainset

CPU times: user 44.6 ms, sys: 135 ms, total: 179 ms
Wall time: 13min 22s


Unnamed: 0,chr,start,end,strand,class,antisense_exon_start_ss,antisense_exon_end_ss
0,chr1,12227,12612,+,1,False,False
1,chr1,12721,13220,+,1,False,False
2,chr1,12057,12178,+,1,False,False
3,chr1,12697,12974,+,1,False,False
4,chr1,13052,13220,+,1,False,False
...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,False,False
519030,chrY,25513173,25513588,-,0,False,False
519031,chrY,25513745,25516715,-,0,False,False
519032,chrY,25525288,25527646,-,0,False,False


In [13]:
# Optional backup:
df_trainset.to_csv("data/07_trainset_antisense_feature.tsv", sep="\t", index=False)

In [14]:
!sha512sum data/07_trainset_antisense_feature.tsv

d4fc65e1a51d71c5983e667617640cdfb4c478fe768625d248059e68d469b61d672fa0a735c3e01832388a1f181957aaf422714e0022b614fc6b508c97a9cb33  data/07_trainset_antisense_feature.tsv
