In [1]:
import pandas as pd
from pyfaidx import Fasta
from Bio.Seq import Seq
from pandarallel import pandarallel

In [2]:
# Read the tab-separated feature table into `df` and display it.
features_path = "data/4.1_populated_features.tsv"
df = pd.read_csv(features_path, sep="\t")
df

Unnamed: 0,chr,start,end,strand,class,ss_antisense_start_site,ss_antisense_end_site,GTExv2,TCGAv2,SRAv3h,...,repeat_features_end_site:Type I Transposons/LINE,repeat_features_end_site:LTRs,repeat_features_end_site:Dust,repeat_features_end_site:Unknown,repeat_features_end_site:RNA repeats,repeat_features_end_site:Satellite repeats,repeat_features_end_site:Tandem repeats,repeat_features_end_site:Low complexity regions,RC3 - Acceptor splice site,RC3 - Donor splice site
0,chr1,12227,12612,+,1,False,False,1122:3199,583:1460,9374:54492,...,0,0,0,0,0,0,0,0,GT,AG
1,chr1,12721,13220,+,1,False,False,1791:3198,783:1104,14048:56719,...,0,0,0,0,0,0,0,0,GT,AG
2,chr1,12057,12178,+,1,False,False,,,,...,0,0,0,0,0,0,0,0,,
3,chr1,12697,12974,+,1,False,False,1:1,1:1,15:19,...,0,0,0,0,0,0,0,0,GT,AG
4,chr1,13052,13220,+,1,False,False,22:24,17:18,433:484,...,0,0,0,0,0,0,0,0,GC,AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527913,chrY,24883840,24886132,+,0,False,False,,,,...,0,0,0,0,0,0,0,0,,
527914,chrY,24888605,24889352,+,0,False,False,,,,...,0,0,0,0,0,0,0,0,,
527915,chrY,24889386,24901111,+,0,False,False,,,,...,0,0,0,0,0,0,0,0,,
527916,chrY,24833970,24840730,+,0,False,False,,,,...,0,0,0,0,0,0,0,0,,


In [3]:
# The hg38 FASTA is downloaded from https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/
# sequence_always_upper=True asks pyfaidx to return upper-case bases.
genome = Fasta('release-109-hg38/hg38.fa', sequence_always_upper=True)


def coords_to_dna(start_c, end_c, chr_c):
    """Fetch the bases of chromosome ``chr_c`` between ``start_c`` and ``end_c``.

    The coordinates are treated as 1-based and inclusive: the lookup slices
    ``genome[chr_c]`` with ``[start_c - 1:end_c]``.

    Parameters
    ----------
    start_c : int
        First position to include (1-based).
    end_c : int
        Last position to include (1-based).
    chr_c : str
        Record name in the FASTA, e.g. ``"chr1"``.

    Returns
    -------
    The object produced by slicing ``genome[chr_c]``; callers in this
    notebook wrap it in ``str()`` to obtain the plain sequence.
    """
    return genome[chr_c][start_c - 1:end_c]


In [4]:
def find_sequence(row):
    """Return the genomic sequence for one row of the feature table, as a string.

    Reads ``row["chr"]``, ``row["start"]``, ``row["end"]`` and ``row["strand"]``.
    The bases are fetched with ``coords_to_dna(start + 1, end, chr)``; when the
    strand is ``"-"`` the result is reverse-complemented so it reads 5'->3' on
    that strand.
    """
    chrom, begin, finish, strand = (row[key] for key in ("chr", "start", "end", "strand"))

    bases = str(coords_to_dna(int(begin) + 1, int(finish), chrom))

    # Any strand other than "-" is returned exactly as fetched.
    if strand != "-":
        return bases

    # Biopython's Seq provides the reverse complement for minus-strand rows.
    return str(Seq(bases).reverse_complement())

def find_sequence_maxentscan(row):
    """Return the row's sequence padded with three lower-cased bases on each side.

    Reads ``row["chr"]``, ``row["start"]``, ``row["end"]`` and ``row["strand"]``.
    The fetched window is the one used by ``find_sequence`` widened by three
    bases at both ends; for strand ``"-"`` it is reverse-complemented first.
    The first and last three characters of the result are lower-cased and the
    rest is left as fetched.

    NOTE(review): the lower-case flanks are presumably the layout expected by
    MaxEntScan -- confirm against the consumer of this column.
    """
    chrom, begin, finish, strand = (row[key] for key in ("chr", "start", "end", "strand"))

    # Same lookup as find_sequence, extended by 3 bp upstream and downstream.
    padded = str(coords_to_dna(int(begin) + 1 - 3, int(finish) + 3, chrom))

    if strand == "-":
        padded = str(Seq(padded).reverse_complement())

    # Lower-case only the three flanking bases at each end.
    return padded[:3].lower() + padded[3:-3] + padded[-3:].lower()

In [None]:
# df_temp = df.copy().head(5000)
# df_temp["sequence"] = df_temp.apply(find_sequence, axis=1)
# df_temp["sequence_maxentscan"] = df_temp.apply(find_sequence_maxentscan, axis=1)
# df_temp

In [5]:
# find_sequence only reads the four coordinate columns, so apply it over that
# narrow view rather than building a Series of every feature column per row.
df["sequence"] = df[["chr", "start", "end", "strand"]].apply(find_sequence, axis=1)

In [6]:
# find_sequence_maxentscan only reads the four coordinate columns, so apply it
# over that narrow view rather than building a Series of every feature column per row.
df["sequence_maxentscan"] = df[["chr", "start", "end", "strand"]].apply(
    find_sequence_maxentscan, axis=1
)

In [7]:
# Record the first two and the last two bases of every extracted sequence.
# NOTE(review): by the usual convention the 5' end of an intron (e.g. GT) is the
# donor site and the 3' end (e.g. AG) the acceptor, so these labels look swapped;
# they do mirror the existing "RC3 - ..." columns -- confirm before renaming.
intron_seq = df["sequence"]
df["hg38 - Acceptor splice site"] = intron_seq.str[:2]
df["hg38 - Donor splice site"] = intron_seq.str[-2:]
df

Unnamed: 0,chr,start,end,strand,class,ss_antisense_start_site,ss_antisense_end_site,GTExv2,TCGAv2,SRAv3h,...,repeat_features_end_site:RNA repeats,repeat_features_end_site:Satellite repeats,repeat_features_end_site:Tandem repeats,repeat_features_end_site:Low complexity regions,RC3 - Acceptor splice site,RC3 - Donor splice site,sequence,sequence_maxentscan,hg38 - Acceptor splice site,hg38 - Donor splice site
0,chr1,12227,12612,+,1,False,False,1122:3199,583:1460,9374:54492,...,0,0,0,0,GT,AG,GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...,ccaGTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGC...,GT,AG
1,chr1,12721,13220,+,1,False,False,1791:3198,783:1104,14048:56719,...,0,0,0,0,GT,AG,GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...,gagGTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTA...,GT,AG
2,chr1,12057,12178,+,1,False,False,,,,...,0,0,0,0,,,CACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGG...,gagCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG...,CA,GA
3,chr1,12697,12974,+,1,False,False,1:1,1:1,15:19,...,0,0,0,0,GT,AG,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...,cttGTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAG...,GT,AG
4,chr1,13052,13220,+,1,False,False,22:24,17:18,433:484,...,0,0,0,0,GC,AG,GCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGA...,tagGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGG...,GC,AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527913,chrY,24883840,24886132,+,0,False,False,,,,...,0,0,0,0,,,GTAAGAAGGAGTAAAATTATTTGCTTTCAGGTATTATTGAGGCCTT...,aatGTAAGAAGGAGTAAAATTATTTGCTTTCAGGTATTATTGAGGC...,GT,GC
527914,chrY,24888605,24889352,+,0,False,False,,,,...,0,0,0,0,,,GTAATGTAAGAAGGAGTAAAATTATTTGCTTTCAGGTATTATTGAG...,cagGTAATGTAAGAAGGAGTAAAATTATTTGCTTTCAGGTATTATT...,GT,AG
527915,chrY,24889386,24901111,+,0,False,False,,,,...,0,0,0,0,,,ATTATGTTTTCCTTGATGTTAAGTGAATTAGCCAAACATAGACTTC...,gggATTATGTTTTCCTTGATGTTAAGTGAATTAGCCAAACATAGAC...,AT,AG
527916,chrY,24833970,24840730,+,0,False,False,,,,...,0,0,0,0,,,CCTTGGTTTTCCTTACACCTTAGCCTTTGGCTCCTTTGACCACTCG...,cgcCCTTGGTTTTCCTTACACCTTAGCCTTTGGCTCCTTTGACCAC...,CC,AG


In [8]:
# Remove the two RC3 splice-site columns from the table.
rc3_columns = ["RC3 - Acceptor splice site", "RC3 - Donor splice site"]
df = df.drop(columns=rc3_columns)

In [9]:
# Write the table as a tab-separated file, without the DataFrame index.
output_path = "data/4.2_hg38_paired_introns.tsv"
df.to_csv(output_path, sep="\t", index=False)

 # Experimental Test (Please ignore)

In [None]:
# Reverse-strand spot check: fetch chr7 150677661-150677813 and reverse-complement it.
seq = Seq(str(coords_to_dna(150677660 + 1, 150677813, "chr7"))).reverse_complement()
seq

In [None]:
# Reverse-strand spot check: fetch chr1 14830-14969 and reverse-complement it.
seq = Seq(str(coords_to_dna(14829 + 1, 14969, "chr1"))).reverse_complement()
seq

In [None]:
# pandarallel is not suitable for use with pyfaidx because it wrongly matches the sequences
#pandarallel.initialize(progress_bar=False, nb_workers=8)
# df["sequence-parallel"] = df.parallel_apply(find_sequence, axis=1)

In [None]:
# Reverse-strand spot check: fetch chrX 20016326-20024950 and reverse-complement it.
seq = Seq(str(coords_to_dna(20016325 + 1, 20024950, "chrX"))).reverse_complement()
seq

In [None]:
# Show the current `seq` (set by an earlier experimental cell) as a plain string.
str(seq)