In [1]:
import pandas as pd
from pyfaidx import Fasta
from Bio.Seq import Seq
from pandarallel import pandarallel

In [2]:
# Load the tab-separated featurized intron table and display it.
df = pd.read_csv(
    "data/4-featurized_introns_data.tsv",
    sep="\t",
)
df

Unnamed: 0,chr,start,end,strand,class,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score,...,repeat_features_end_site:Centromere,repeat_features_end_site:RNA repeats,repeat_features_end_site:Unknown,repeat_features_end_site:Type I Transposons/LINE,repeat_features_end_site:Type II Transposons,repeat_features_end_site:Low complexity regions,repeat_features_end_site:LTRs,repeat_features_end_site:Type I Transposons/SINE,RC3 - Acceptor splice site,RC3 - Donor splice site
0,chr1,12227,12612,+,1,1122:3199,583:1460,9374:54492,GT:AG,59151,...,0,0,0,0,0,0,0,0,GT,AG
1,chr1,12721,13220,+,1,1791:3198,783:1104,14048:56719,GT:AG,61021,...,0,0,0,0,0,0,0,0,GT,AG
2,chr1,12057,12178,+,1,,,,,-1,...,0,0,0,0,0,0,0,0,,
3,chr1,12697,12974,+,1,1:1,1:1,15:19,GT:AG,21,...,0,0,0,0,0,0,0,0,GT,AG
4,chr1,13052,13220,+,1,22:24,17:18,433:484,GC:AG,526,...,0,0,0,0,0,0,0,0,GC,AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449562,chrY,25986748,25987810,+,0,297:602,93:135,708:1741,GT:AG,2478,...,0,0,0,0,0,0,0,0,GT,AG
449563,chrY,26335719,26337372,-,0,,,,,-1,...,0,0,0,0,0,0,0,0,,
449564,chrY,26337521,26354287,-,0,2:3,0:0,64:89,GT:AG,92,...,0,0,0,0,0,0,0,0,GT,AG
449565,chrY,56954656,56960285,+,0,,,,,-1,...,0,0,0,0,0,0,0,0,,


In [3]:
# The hg38 FASTA was downloaded from https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/
# sequence_always_upper=True asks pyfaidx to return sequences in upper case.
genome = Fasta('release-109-hg38/hg38.fa', sequence_always_upper=True)


def coords_to_dna(start_c, end_c, chr_c):
    """Fetch the bases of record ``chr_c`` between ``start_c`` and ``end_c``.

    ``start_c`` is treated as 1-based and ``end_c`` as inclusive: the slice
    taken from the FASTA record is ``[start_c - 1:end_c]``.

    Args:
        start_c: 1-based position of the first base to return.
        end_c: 1-based position of the last base to return.
        chr_c: Key of the record in ``genome`` (e.g. ``"chr1"``).

    Returns:
        The object produced by slicing the ``genome`` record; wrap it in
        ``str()`` to obtain the plain sequence string.
    """
    return genome[chr_c][start_c - 1:end_c]

In [4]:
seq_lst = []  # NOTE(review): not referenced by any cell visible here


def find_sequence(row):
    """Return the strand-aware genomic sequence for one row of the table.

    The row must provide ``"chr"``, ``"start"``, ``"end"`` and ``"strand"``.
    The sequence is looked up in the local FASTA via ``coords_to_dna`` and is
    reverse-complemented when the strand is ``"-"``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the four keys above.

    Returns:
        The sequence as a plain ``str``.
    """
    chrom, begin, finish, orientation = (
        row[key] for key in ("chr", "start", "end", "strand")
    )

    # coords_to_dna subtracts 1 from its start argument, so passing start + 1
    # yields the slice [start:end] of the chromosome.
    forward = str(coords_to_dna(int(begin) + 1, int(finish), chrom))

    if orientation != "-":
        return forward

    # Minus-strand rows are reported as the reverse complement of the lookup.
    return str(Seq(forward).reverse_complement())

In [None]:
# df_temp = df.copy().head(5000)
# df_temp["sequence"] = df_temp.apply(find_sequence, axis=1)
# df_temp

In [5]:
# Look up the sequence for every row (row-wise apply) and store it in a new column.
df["sequence"] = df.apply(find_sequence, axis="columns")

In [6]:
# Preview the extracted sequences as the cell output.
df["sequence"]

0         GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...
1         GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...
2         CACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGG...
3         GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...
4         GCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGA...
                                ...                        
449562    GTAACAGAGTGTTTTACAGAGTAGACATTTCAACTTTCAATAAAGT...
449563    GTGAGCCTGAACCTCTAAAAGAACAACGAGATTGGTGGCTGGGTCT...
449564    GTGAGTGAGTCTCTGTGTGTGTAGCTTTGCAGTATTTGTAGGAATG...
449565    GTAAGCCATAGAAATTAGCTCTTTAAAAACCCAGAATTCTTTCTAT...
449566    GTGAGCGGGCCCTGGAGCCTGCGGTCGGAGGGCCTTGGGCAAGATC...
Name: sequence, Length: 449567, dtype: object

In [7]:
# Take the first and last two bases of each extracted sequence.
# NOTE(review): the first two intron bases are conventionally called the donor
# site and the last two the acceptor site, so these labels look swapped. They
# mirror the "RC3 - ..." columns of the input table and are left unchanged to
# keep the output schema stable -- confirm the intended naming.
df["hg38 - Acceptor splice site"] = df["sequence"].str.slice(stop=2)
df["hg38 - Donor splice site"] = df["sequence"].str.slice(start=-2)
df

Unnamed: 0,chr,start,end,strand,class,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score,...,repeat_features_end_site:Type I Transposons/LINE,repeat_features_end_site:Type II Transposons,repeat_features_end_site:Low complexity regions,repeat_features_end_site:LTRs,repeat_features_end_site:Type I Transposons/SINE,RC3 - Acceptor splice site,RC3 - Donor splice site,sequence,hg38 - Acceptor splice site,hg38 - Donor splice site
0,chr1,12227,12612,+,1,1122:3199,583:1460,9374:54492,GT:AG,59151,...,0,0,0,0,0,GT,AG,GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...,GT,AG
1,chr1,12721,13220,+,1,1791:3198,783:1104,14048:56719,GT:AG,61021,...,0,0,0,0,0,GT,AG,GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...,GT,AG
2,chr1,12057,12178,+,1,,,,,-1,...,0,0,0,0,0,,,CACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGG...,CA,GA
3,chr1,12697,12974,+,1,1:1,1:1,15:19,GT:AG,21,...,0,0,0,0,0,GT,AG,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...,GT,AG
4,chr1,13052,13220,+,1,22:24,17:18,433:484,GC:AG,526,...,0,0,0,0,0,GC,AG,GCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGA...,GC,AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449562,chrY,25986748,25987810,+,0,297:602,93:135,708:1741,GT:AG,2478,...,0,0,0,0,0,GT,AG,GTAACAGAGTGTTTTACAGAGTAGACATTTCAACTTTCAATAAAGT...,GT,AG
449563,chrY,26335719,26337372,-,0,,,,,-1,...,0,0,0,0,0,,,GTGAGCCTGAACCTCTAAAAGAACAACGAGATTGGTGGCTGGGTCT...,GT,AG
449564,chrY,26337521,26354287,-,0,2:3,0:0,64:89,GT:AG,92,...,0,0,0,0,0,GT,AG,GTGAGTGAGTCTCTGTGTGTGTAGCTTTGCAGTATTTGTAGGAATG...,GT,AG
449565,chrY,56954656,56960285,+,0,,,,,-1,...,0,0,0,0,0,,,GTAAGCCATAGAAATTAGCTCTTTAAAAACCCAGAATTCTTTCTAT...,GT,AG


In [9]:
# Remove the RC3 splice-site columns from the table.
rc3_site_columns = ["RC3 - Acceptor splice site", "RC3 - Donor splice site"]
df = df.drop(rc3_site_columns, axis="columns")

In [10]:
# Write the table as a tab-separated file; index=False leaves the row index out.
df.to_csv(
    "data/4.2-hg38-paired-introns.tsv",
    sep="\t",
    index=False,
)

 # Experimental Test (Ignore)

In [None]:
# Spot check: fetch an interval on chr7 and reverse-complement it (reverse strand).
seq = Seq(str(coords_to_dna(150677660 + 1, 150677813, "chr7"))).reverse_complement()
seq

In [None]:
# Spot check: fetch an interval on chr1 and reverse-complement it (reverse strand).
seq = Seq(str(coords_to_dna(14829 + 1, 14969, "chr1"))).reverse_complement()
seq

In [None]:
# pandarallel is not suitable for use with pyfaidx because it matches the sequences incorrectly
#pandarallel.initialize(progress_bar=False, nb_workers=8)
# df["sequence-parallel"] = df.parallel_apply(find_sequence, axis=1)