In [None]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel

In [None]:
# Load the merged training set (tab-separated) produced by the previous step.
df_merged = pd.read_csv(
    "data/2-merged_train_set",
    sep="\t",
)
# Author's recorded row count: 402857.
# NOTE(review): stored outputs further down show 527,917 rows - confirm this count is current.
df_merged

In [None]:
# Read the recount3 BED file (column names are supplied here), then index it by
# (Strand, Start, End) and sort that index so exact-coordinate lookups through
# .loc can be done per junction.
df_recount3 = (
    pd.read_csv(
        "ftp-data/recount3.bed",
        sep="\t",
        names=["Chrom", "Start", "End", "Features", "Score", "Strand"],
    )
    .set_index(["Strand", "Start", "End"])
    .sort_index()
)
df_recount3

In [None]:
# Distinct chromosome names present in the recount3 table.
chrom_uniques = list(df_recount3["Chrom"].unique())
# Filled by the next cell: chromosome name -> rows of df_recount3 on that chromosome.
df_chrom = dict()

In [None]:
# Split df_recount3 into one sub-frame per chromosome, stored as
# key (chromosome name) -> value (rows) in the df_chrom dictionary.
# A single groupby pass replaces re-scanning the whole table with a boolean
# mask once per chromosome; sort=False keeps chromosomes in first-seen order.
for chrom, chrom_rows in tqdm(df_recount3.groupby("Chrom", sort=False)):
    df_chrom[chrom] = chrom_rows


In [None]:
def match_recount(row):
    """Look up the recount3 entry whose strand/start/end exactly match ``row``.

    Args:
        row: Object exposing ``chr``, ``strand``, ``start`` and ``end``
            attributes (a row of ``df_merged`` when used with ``apply(axis=1)``).

    Returns:
        The ``["Features", "Score"]`` values of the matching recount3 row(s)
        as a NumPy array, or the list ``[None, -1]`` when the chromosome or
        the coordinates are not present in ``df_chrom``.
    """
    try:
        chrom_rows = df_chrom[row.chr]
        score = chrom_rows.loc[(row.strand, row.start, row.end), ["Features", "Score"]].values
    except KeyError:
        # Only "chromosome not loaded" / "coordinates not in the index" mean
        # "no match". Anything else (including KeyboardInterrupt) propagates
        # instead of being silently recorded as a miss.
        score = [None, -1]
    return score

In [None]:
# Start pandarallel with 12 workers and no progress bar; the parallel_apply
# calls in the following cells rely on this.
pandarallel.initialize(progress_bar=False, nb_workers=12)

In [None]:
# %%time
# df_test = df_merged.copy().head(10)
# df_test["RC3"] = df_test.apply(match_recount, axis=1)
# df_test

In [None]:
%%time
# Annotate every row with its recount3 match: the [Features, Score] values, or
# [None, -1] when match_recount finds no entry for the row's coordinates.
df_merged["RC3"] = df_merged.parallel_apply(match_recount, axis=1)

In [None]:
# Inspect the frame with the newly added RC3 column.
df_merged

In [None]:
# Checkpoint of the RC3-annotated frame.
# If the checkpoint file exists it is loaded as-is (delete it to force a
# refresh from the in-memory frame). If it is missing, the in-memory frame is
# written first and then read back, so df_merged always has the dtypes of the
# TSV round trip no matter which path was taken.
# sha512sum of the author's checkpoint: ad0f3a08b8c7986d341543e75a19d5335c8b65ddeb4d28dd63016b4f79eabad0fe244b26b8031475cf5e6f212ea0ae9a2f0bdb200b692a77736de84c777a5b10
try:
    df_merged = pd.read_csv("data/3-temp-output.tsv", sep="\t")
except FileNotFoundError:
    df_merged.to_csv("data/3-temp-output.tsv", sep="\t", index=False)
    df_merged = pd.read_csv("data/3-temp-output.tsv", sep="\t")
df_merged

# Adding repeat features to the merged train set


In [None]:
def match_repeat_features_1(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.start+2}), we also look at the small-region that precedes (2 nt in the exon; row.start-2) the splice-site.
    """
    matched_rows = !tabix ftp-data/repeat_features.bed.gz {row.chr}:{row.start-2}-{row.start+2} | cut -f4
    return list(set(matched_rows + []))

In [None]:
def match_repeat_features_2(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.end-2}), we also look at the small-region that proceeds (2 nt in the exon; {row.end+2}) the splice-site.
    """
    matched_rows2 = !tabix ftp-data/repeat_features.bed.gz {row.chr}:{row.end-2}-{row.end+2} | cut -f4
        # | cut -f4 | sort -u
    return list(set([] + matched_rows2))

In [None]:
# Re-initialise pandarallel before the repeat-feature lookups.
# NOTE(review): nb_workers is not set here, unlike the other initialize calls
# in this notebook (nb_workers=12) - confirm the library default is intended.
pandarallel.initialize(progress_bar=False)

In [None]:
%%time
# Test cell
df_draft = df_merged.copy().head(100)
df_draft["Repeat_overlap"] = df_draft.apply(match_repeat_features_1, axis=1)
df_draft

In [None]:
# Drop the scratch frame created by the test cell.
del df_draft

In [None]:
%%time
# Repeat features overlapping the window around each row's start coordinate
# (one tabix call per row, spread over the pandarallel workers).
df_merged["repeat_features_start_site"] = df_merged.parallel_apply(match_repeat_features_1, axis=1)
df_merged

In [None]:
%%time
# Repeat features overlapping the window around each row's end coordinate
# (one tabix call per row, spread over the pandarallel workers).
df_merged["repeat_features_end_site"] = df_merged.parallel_apply(match_repeat_features_2, axis=1)
df_merged

In [2]:
# Checkpoint of the frame with the repeat-feature columns.
# If the checkpoint file exists it is loaded as-is (delete it to force a
# refresh from the in-memory frame). If it is missing, the in-memory frame is
# written first and then read back, so df_merged always has the dtypes of the
# TSV round trip no matter which path was taken.
try:
    df_merged = pd.read_csv("data/3_repeat_features_output.tsv", sep="\t")
except FileNotFoundError:
    df_merged.to_csv("data/3_repeat_features_output.tsv", sep="\t", index=False)
    df_merged = pd.read_csv("data/3_repeat_features_output.tsv", sep="\t")

# Splice-site Antisense feature

In [None]:
# Load the GENCODE v44 GTF annotation: lines starting with '#' are skipped and
# the nine columns are named here. Only "exon" records are kept, de-duplicated
# on (chr, start, end, strand).
# The path is relative to the working directory, like the other ftp-data/
# inputs in this notebook, rather than an absolute path on one machine.
df_exons = (
    pd.read_csv(
        "ftp-data/gencode.v44.annotation.gtf",
        sep="\t",
        comment="#",
        header=None,
        names=["chr", "annotator", "type", "start", "end", "x1", "strand", "x2", "features"],
    )
    .loc[lambda gtf: gtf["type"] == "exon"]
    .drop_duplicates(subset=["chr", "start", "end", "strand"])
)
df_exons

In [None]:
def df_to_bed(df, bed_file):
    """Write the rows of a GTF-style DataFrame to ``bed_file`` as 6-column BED.

    Columns are read by position, matching the frame built from the GTF in
    this notebook: 0 = chromosome, 3 = start, 4 = end, 6 = strand.

    Args:
        df: DataFrame with 1-based, inclusive start/end coordinates.
        bed_file: Path of the BED file to create (overwritten if present).

    Each output line is ``chrom<TAB>start<TAB>end<TAB>.<TAB>.<TAB>strand``.
    The start is converted from 1-based to 0-based exactly once; the end is
    written unchanged.
    """
    # Select by position with iloc: integer keys on a label-indexed row
    # (row[0], row[3], ...) are deprecated in recent pandas.
    gtf_fields = df.iloc[:, [0, 3, 4, 6]]
    with open(bed_file, 'w') as bed:
        for chrom, gtf_start, gtf_end, strand in gtf_fields.itertuples(index=False, name=None):
            # 1-based inclusive start -> 0-based BED start. The previous code
            # subtracted 1 here and again when formatting the line.
            bed_start = int(gtf_start) - 1
            bed_end = int(gtf_end)
            bed.write(f"{chrom}\t{bed_start}\t{bed_end}\t.\t.\t{strand}\n")

# Write the de-duplicated exon table out as a BED file.
# NOTE(review): the antisense lookups further down query sorted_exon.bed.gz;
# the sort / bgzip / tabix-index step that would turn exon.bed into that file
# is not shown in this notebook.
bed_file_path = "exon.bed"
df_to_bed(df_exons, bed_file=bed_file_path)

In [3]:
# Initialise pandarallel (12 workers, no progress bar) for the antisense
# lookups below; needed again here because this part was run in a fresh session.
pandarallel.initialize(progress_bar=False, nb_workers=12)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
def antisense_exon_start(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.start+2}), we also look at the small-region that precedes (2 nt in the exon; row.start-2) the splice-site.
    """
    opposite_strand = "+" if row.strand == "-" else "-"
    # print(opposite_strand)
    matched_rows = !tabix sorted_exon.bed.gz {row.chr}:{row.start}-{row.start+2} | cut -f6
    # if matched_rows != []:
    #     print(matched_rows)
    return opposite_strand in matched_rows

def antisense_exon_stop(row):
    """
    We call the tabix command, which look at the bed file to see if the row (which are entries in the intron that we extracted) overlaps with the repeat
    regions detailed in the repeat_features.bed.gz, apart from the splice-site ({row.start+2}), we also look at the small-region that precedes (2 nt in the exon; row.start-2) the splice-site.
    """
    opposite_strand = "+" if row.strand == "-" else "-"
    # print(opposite_strand)
    matched_rows = !tabix sorted_exon.bed.gz {row.chr}:{row.end-2}-{row.end} | cut -f6
    # if matched_rows != []:
    #     print(matched_rows)
    return opposite_strand in matched_rows

In [None]:
# Inspect the frame before adding the antisense columns.
df_merged

In [5]:
%%time
# Flag rows whose start splice site is overlapped by an exon on the opposite
# strand (one tabix call per row; about 13 minutes in the stored run).
df_merged["ss_antisense_start_site"] = df_merged.parallel_apply(antisense_exon_start, axis=1)
df_merged

CPU times: user 111 ms, sys: 48.9 ms, total: 160 ms
Wall time: 13min 22s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features_start_site,repeat_features_end_site,ss_antisense_start_site
0,chr1,12227,12612,+,1,['GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374...,[],[],False
1,chr1,12721,13220,+,1,['GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=1404...,[],[],False
2,chr1,12057,12178,+,1,"[None, -1]",[],[],False
3,chr1,12697,12974,+,1,['GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG' 21],[],[],False
4,chr1,13052,13220,+,1,['GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:...,[],[],False
...,...,...,...,...,...,...,...,...,...
527913,chrY,24883840,24886132,+,0,"[None, -1]",[],[],False
527914,chrY,24888605,24889352,+,0,"[None, -1]",[],[],False
527915,chrY,24889386,24901111,+,0,"[None, -1]",[],[],False
527916,chrY,24833970,24840730,+,0,"[None, -1]",[],[],False


In [6]:
%%time
# Flag rows whose end splice site is overlapped by an exon on the opposite
# strand (one tabix call per row; about 13 minutes in the stored run).
df_merged["ss_antisense_end_site"] = df_merged.parallel_apply(antisense_exon_stop, axis=1)
df_merged

CPU times: user 83.4 ms, sys: 53.9 ms, total: 137 ms
Wall time: 13min 16s


Unnamed: 0,chr,start,end,strand,class,RC3,repeat_features_start_site,repeat_features_end_site,ss_antisense_start_site,ss_antisense_end_site
0,chr1,12227,12612,+,1,['GTExv2=1122:3199;TCGAv2=583:1460;SRAv3h=9374...,[],[],False,False
1,chr1,12721,13220,+,1,['GTExv2=1791:3198;TCGAv2=783:1104;SRAv3h=1404...,[],[],False,False
2,chr1,12057,12178,+,1,"[None, -1]",[],[],False,False
3,chr1,12697,12974,+,1,['GTExv2=1:1;TCGAv2=1:1;SRAv3h=15:19;GT:AG' 21],[],[],False,False
4,chr1,13052,13220,+,1,['GTExv2=22:24;TCGAv2=17:18;SRAv3h=433:484;GC:...,[],[],False,False
...,...,...,...,...,...,...,...,...,...,...
527913,chrY,24883840,24886132,+,0,"[None, -1]",[],[],False,False
527914,chrY,24888605,24889352,+,0,"[None, -1]",[],[],False,False
527915,chrY,24889386,24901111,+,0,"[None, -1]",[],[],False,False
527916,chrY,24833970,24840730,+,0,"[None, -1]",[],[],False,False


In [7]:
# Persist the fully annotated table (RC3, repeat-feature and antisense columns)
# as a tab-separated file, without writing the DataFrame index.
df_merged.to_csv("data/3_all_features.tsv", sep="\t", index=False)