In [1]:
import pandas as pd
from pyfaidx import Fasta
from Bio.Seq import Seq
from pandarallel import pandarallel

In [2]:
# Load the tab-separated featurized-introns table and display it.
df = pd.read_csv("data/4-featurized_introns_data.tsv", delimiter="\t")
df

Unnamed: 0,chr,start,end,strand,class,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score,...,repeat_features_end_site:LTRs,repeat_features_end_site:Unknown,repeat_features_end_site:Centromere,repeat_features_end_site:Satellite repeats,repeat_features_end_site:Dust,repeat_features_end_site:Tandem repeats,repeat_features_end_site:Type I Transposons/LINE,repeat_features_end_site:Simple repeats,RC3 - Acceptor splice site,RC3 - Donor splice site
0,chr1,12227,12612,+,1,1122:3199,583:1460,9374:54492,GT:AG,59151,...,0,0,0,0,0,0,0,0,GT,AG
1,chr1,12721,13220,+,1,1791:3198,783:1104,14048:56719,GT:AG,61021,...,0,0,0,0,0,0,0,0,GT,AG
2,chr1,12057,12178,+,1,,,,,-1,...,0,0,0,0,0,0,0,0,,
3,chr1,12697,12974,+,1,1:1,1:1,15:19,GT:AG,21,...,0,0,0,0,0,0,0,0,GT,AG
4,chr1,13052,13220,+,1,22:24,17:18,433:484,GC:AG,526,...,0,0,0,0,0,0,0,0,GC,AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402852,chrX,151403679,151404937,+,0,,,,,-1,...,0,0,0,0,0,0,0,0,,
402853,chrX,151409210,151456968,+,0,,,,,-1,...,0,0,0,0,0,1,0,0,,
402854,chrX,153906577,153906694,-,0,,,,,-1,...,0,0,0,0,0,0,0,0,,
402855,chrX,153906409,153906520,-,0,,,,,-1,...,0,0,0,0,0,0,0,0,,


In [3]:
# The hg38 FASTA was downloaded from https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/
# sequence_always_upper=True asks pyfaidx to return upper-case bases.
genome = Fasta('release-109-hg38/hg38.fa', sequence_always_upper=True)


def coords_to_dna(start_c, end_c, chr_c):
    """Slice the reference genome for one interval.

    Parameters
    ----------
    start_c : int
        Start position; it is shifted down by one before slicing, so the
        caller passes a 1-based start.
    end_c : int
        End position, used directly as the slice stop (so it is inclusive
        in 1-based terms).
    chr_c : str
        Record name to look up in ``genome`` (e.g. "chr1").

    Returns
    -------
    The object produced by slicing ``genome[chr_c]``; callers convert it
    to text with ``str()``.
    """
    return genome[chr_c][start_c - 1:end_c]

In [4]:
# NOTE(review): this list is not referenced by any other visible cell.
seq_lst = []


def find_sequence(row):
    """Return the strand-aware genomic sequence for one intron row.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys "chr", "start", "end" and "strand".

    Returns
    -------
    str
        The bases looked up through ``coords_to_dna`` for the row's
        interval; reverse-complemented when ``strand`` is "-".
    """
    chrom, begin, finish, orientation = (
        row[key] for key in ("chr", "start", "end", "strand")
    )

    # Local FASTA lookup (no network call). The +1 here cancels the -1
    # applied inside coords_to_dna, so the slice taken is [begin:finish].
    bases = str(coords_to_dna(int(begin) + 1, int(finish), chrom))

    # Any strand other than "-" is returned as read.
    if orientation != "-":
        return bases

    # Minus strand: report the reverse complement of the looked-up bases.
    return str(Seq(bases).reverse_complement())

In [5]:
# df_temp = df.copy().head(5000)
# df_temp["sequence"] = df_temp.apply(find_sequence, axis=1)
# df_temp

In [6]:
# Set up pandarallel with 8 workers and the progress bar switched off.
pandarallel.initialize(nb_workers=8, progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [7]:
# Serial pass: call find_sequence once per row and store the result.
df["sequence"] = df.apply(find_sequence, axis="columns")

In [8]:
# Same row-wise extraction, run through pandarallel for comparison with
# the serial "sequence" column.
parallel_result = df.parallel_apply(find_sequence, axis=1)
df["sequence-parallel"] = parallel_result

In [9]:
# Show the serial and parallel results side by side.
df.loc[:, ["sequence", "sequence-parallel"]]

Unnamed: 0,sequence,sequence-parallel
0,GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...,GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...
1,GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...,GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...
2,CACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGG...,CACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGG...
3,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...
4,GCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGA...,GCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGA...
...,...,...
402852,GCTTTAATGATCACTGTTCCTATTGGGTTATATTTCACAACTAAAT...,GTGAGTCTGGAACTAAATTTTGATAGCTTTTCCAATAGCACAGAAG...
402853,GCTTGAGTTCTTTAGCTACTTGAATCCGATTTACTTCTGTTAAGTG...,TCATTTCTTCTAGATTTTCTAGTTTATTTGCATAGAGGTGTTTACA...
402854,GCGGTGGTGCAGGACTCATCTTGGGGACCCAGGCTGGGTGGGGTGC...,GCGGTGGTGCAGGACTCATCTTGGGGACCCAGGCTGGGTGGGGTGC...
402855,GTGGAGGTGGAGGATCTAGGTTGGGTTCAGGATGGCTGGTGGGGCA...,GTGGAGGTGGAGGATCTAGGTTGGGTTCAGGATGGCTGGTGGGGCA...


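We can see that pandarallel is not suitable here: the `sequence-parallel` column does not match the serially computed `sequence` column for some rows (e.g. rows 402852 and 402853 above). A possible cause (not verified) is that the worker processes share the single open `Fasta` file handle.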

In [15]:
# Record the first two and the last two bases of every extracted sequence.
# NOTE(review): by the usual convention the first two bases of an intron
# (e.g. GT) are the donor site and the last two (e.g. AG) the acceptor site;
# the labels below are the other way round, mirroring the existing
# "RC3 - ..." columns. Confirm before renaming, since the exported file
# carries these column names.
df["hg38 - Acceptor splice site"] = df["sequence"].str.slice(stop=2)
df["hg38 - Donor splice site"] = df["sequence"].str.slice(start=-2)
df

Unnamed: 0,chr,start,end,strand,class,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score,...,repeat_features_end_site:Satellite repeats,repeat_features_end_site:Dust,repeat_features_end_site:Tandem repeats,repeat_features_end_site:Type I Transposons/LINE,repeat_features_end_site:Simple repeats,RC3 - Acceptor splice site,RC3 - Donor splice site,sequence,hg38 - Acceptor splice site,hg38 - Donor splice site
0,chr1,12227,12612,+,1,1122:3199,583:1460,9374:54492,GT:AG,59151,...,0,0,0,0,0,GT,AG,GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...,GT,AG
1,chr1,12721,13220,+,1,1791:3198,783:1104,14048:56719,GT:AG,61021,...,0,0,0,0,0,GT,AG,GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...,GT,AG
2,chr1,12057,12178,+,1,,,,,-1,...,0,0,0,0,0,,,CACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAGTGG...,CA,GA
3,chr1,12697,12974,+,1,1:1,1:1,15:19,GT:AG,21,...,0,0,0,0,0,GT,AG,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...,GT,AG
4,chr1,13052,13220,+,1,22:24,17:18,433:484,GC:AG,526,...,0,0,0,0,0,GC,AG,GCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGGAGA...,GC,AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402852,chrX,151403679,151404937,+,0,,,,,-1,...,0,0,0,0,0,,,GCTTTAATGATCACTGTTCCTATTGGGTTATATTTCACAACTAAAT...,GC,AG
402853,chrX,151409210,151456968,+,0,,,,,-1,...,0,0,1,0,0,,,GCTTGAGTTCTTTAGCTACTTGAATCCGATTTACTTCTGTTAAGTG...,GC,AG
402854,chrX,153906577,153906694,-,0,,,,,-1,...,0,0,0,0,0,,,GCGGTGGTGCAGGACTCATCTTGGGGACCCAGGCTGGGTGGGGTGC...,GC,AG
402855,chrX,153906409,153906520,-,0,,,,,-1,...,0,0,0,0,0,,,GTGGAGGTGGAGGATCTAGGTTGGGTTCAGGATGGCTGGTGGGGCA...,GT,AG


In [17]:
# Remove the parallel-run comparison column and the two RC3 dinucleotide
# columns before export.
df = df.drop(
    ["sequence-parallel", "RC3 - Acceptor splice site", "RC3 - Donor splice site"],
    axis=1,
)

In [18]:
# Write the table as a tab-separated file without the index column.
df.to_csv("data/4.2-hg38-paired-introns.tsv", index=False, sep="\t")

# Experimental Test (Ignore)

In [None]:
# seq = str(coords_to_dna(int(150677660)+1, int(150677813), "chr7"))  # rev strand
# seq = Seq(seq)  # Wrap the sequence in a Bio.Seq object
# seq = seq.reverse_complement()
# seq