In [None]:
import pandas as pd
from pandarallel import pandarallel

In [None]:
sha512sum = !sha512sum data/04_train_set.tsv
assert("5006dfaea420edf91d4e86a72bc2428a57ad3b184a8e3b9cc55ec048d17e001247a36322b18b48fa19b24a784b8bd7fe33eb2cfa074d4625138f4728b56d3324" in sha512sum[0])
df_trainset = pd.read_csv("data/04_train_set.tsv", sep="\t")
df_trainset

# Generating splice-site antisense feature

In [None]:
# Read the GENCODE v44 GTF (tab-separated, no header row, '#' lines skipped),
# keep only the "exon" records, and collapse records that share the same
# chromosome, start, end and strand.
df_exons = (
    pd.read_csv(
        "data/resources/gencode.v44.annotation.gtf",
        sep="\t",
        comment="#",
        header=None,
        names=[
            "chr",
            "annotator",
            "type",
            "start",
            "end",
            "x1",
            "strand",
            "x2",
            "features",
        ],
    )
    .loc[lambda gtf: gtf["type"] == "exon"]
    .drop_duplicates(subset=["chr", "start", "end", "strand"])
)
df_exons

In [None]:
def df_to_bed(df, bed_file):
    """Write GTF-style rows to a 6-column BED file.

    Columns are read by position, matching the GTF layout used in this
    notebook: column 0 = chromosome, 3 = start, 4 = end, 6 = strand.

    GTF coordinates are 1-based and inclusive, BED coordinates are 0-based and
    half-open, so the start is decremented exactly once and the end is kept
    as is. The BED name and score fields are filled with ".".

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least 7 columns in GTF order.
    bed_file : str or path-like
        Destination path; an existing file is overwritten.
    """
    # Positional access via .iloc avoids the deprecated integer lookup on a
    # string-labelled row Series.
    chroms = df.iloc[:, 0]
    starts = df.iloc[:, 3].astype(int) - 1  # 1-based inclusive -> 0-based
    ends = df.iloc[:, 4].astype(int)
    strands = df.iloc[:, 6]

    with open(bed_file, 'w') as bed:
        bed.writelines(
            f"{chrom}\t{start}\t{end}\t.\t.\t{strand}\n"
            for chrom, start, end, strand in zip(chroms, starts, ends, strands)
        )

# Export the de-duplicated exon table as an (unsorted) BED file; the
# sort / bgzip / tabix cells that follow operate on this file.
bed_file_path = "data/gencode_exon.bed"
df_to_bed(df=df_exons, bed_file=bed_file_path)

In [None]:
# Sort the BED by chromosome (column 1), then numerically by start (column 2),
# writing the result to a new file; tabix indexing needs position-sorted input.
!sort -k 1,1 -k2,2n data/gencode_exon.bed > data/gencode_exon_sorted.bed

In [None]:
!bgzip data/gencode_exon_sorted.bed

In [None]:
!tabix -p bed data/gencode_exon_sorted.bed.gz

In [None]:
# Initialise pandarallel with 12 workers; this is required before the
# `parallel_apply` calls used further down in this notebook.
pandarallel.initialize(nb_workers=12)

In [None]:
# bgzip-compressed, tabix-indexed exon BED built by the sort/bgzip/tabix cells.
EXON_BED_GZ = "data/gencode_exon_sorted.bed.gz"


def _has_antisense_exon(chrom, region_start, region_end, strand, bed_gz=EXON_BED_GZ):
    """Return True if an exon on the strand opposite to `strand` overlaps the region.

    Runs `tabix <bed_gz> <chrom>:<region_start>-<region_end>` and inspects the
    strand field (6th BED column) of every record that is returned.

    Raises
    ------
    subprocess.CalledProcessError
        If tabix exits with a non-zero status (e.g. the BED or its index is
        missing). Failing loudly prevents a broken path from silently turning
        the whole feature into False.
    """
    import subprocess  # local import keeps this cell self-contained

    opposite_strand = "+" if strand == "-" else "-"
    region = f"{chrom}:{region_start}-{region_end}"
    # Argument list (no shell) so chromosome names are never shell-interpreted.
    result = subprocess.run(
        ["tabix", bed_gz, region],
        capture_output=True,
        text=True,
        check=True,
    )
    for line in result.stdout.splitlines():
        fields = line.split("\t")
        if len(fields) >= 6 and fields[5] == opposite_strand:
            return True
    return False


def antisense_exon_start(row):
    """
    Check whether an annotated exon on the opposite strand overlaps the window
    at the start of the row's interval ({row.chr}:{row.start}-{row.start+2}).

    `row` must expose `chr`, `start` and `strand`. The lookup is done with tabix
    against the exon BED built earlier in this notebook. Returns a bool.
    """
    return _has_antisense_exon(row.chr, row.start, row.start + 2, row.strand)


def antisense_exon_stop(row):
    """
    Check whether an annotated exon on the opposite strand overlaps the window
    at the end of the row's interval ({row.chr}:{row.end-2}-{row.end}).

    `row` must expose `chr`, `end` and `strand`. The lookup is done with tabix
    against the exon BED built earlier in this notebook. Returns a bool.
    """
    return _has_antisense_exon(row.chr, row.end - 2, row.end, row.strand)

In [None]:
# Test Cell
# %%time
# df_test = df_trainset.copy().head(100)
# df_test["test_antisense"] = df_test.apply(antisense_exon_start, axis=1)
# df_test

In [None]:
%%time
# Apply antisense_exon_start to every row (axis=1) via pandarallel and store
# its True/False result in a new column of the training set.
df_trainset["antisense_exon_start_ss"] = df_trainset.parallel_apply(antisense_exon_start, axis=1)
df_trainset

In [None]:
%%time
# Apply antisense_exon_stop to every row (axis=1) via pandarallel and store
# its True/False result in a new column of the training set.
df_trainset["antisense_exon_end_ss"] = df_trainset.parallel_apply(antisense_exon_stop, axis=1)
df_trainset

In [None]:
# Optional backup: write the training set, including the antisense columns
# added above, to a tab-separated file without the DataFrame index.
df_trainset.to_csv("data/07_trainset_antisense_feature.tsv", sep="\t", index=False)

In [None]:
# Print the SHA-512 digest of the exported TSV (e.g. to pin it in a later
# integrity check like the one at the top of this notebook).
!sha512sum data/07_trainset_antisense_feature.tsv