In [1]:
import pandas as pd
from pyfaidx import Fasta
from tqdm import tqdm
from Bio.Seq import Seq
from pandarallel import pandarallel

In [2]:
# Read the tab-separated featurized-introns table into `df` and display it.
df = pd.read_csv(
    "data/4-featurized_introns_data.tsv",
    delimiter="\t",
)
df

Unnamed: 0,chr,start,end,strand,class,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score,...,repeat_features:Low complexity regions,repeat_features:Tandem repeats,repeat_features:Unknown,repeat_features:Satellite repeats,repeat_features:Centromere,repeat_features:RNA repeats,repeat_features:Type II Transposons,repeat_features:LTRs,Acceptor splice site,Donor splice site
0,chr1,12227,12612,+,1,1122:3199,583:1460,9374:54492,GT:AG,59151,...,0,0,0,0,0,0,0,0,GT,AG
1,chr1,12721,13220,+,1,1791:3198,783:1104,14048:56719,GT:AG,61021,...,0,0,0,0,0,0,0,0,GT,AG
2,chr1,12057,12178,+,1,,,,,-1,...,0,0,0,0,0,0,0,0,,
3,chr1,12697,12974,+,1,1:1,1:1,15:19,GT:AG,21,...,0,0,0,0,0,0,0,0,GT,AG
4,chr1,13052,13220,+,1,22:24,17:18,433:484,GC:AG,526,...,0,0,0,0,0,0,0,0,GC,AG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0,,,,,-1,...,0,1,0,0,0,0,0,0,,
517421,chrX,151409210,151456968,+,0,,,,,-1,...,6,22,1,0,0,0,8,17,,
517422,chrX,153906577,153906694,-,0,,,,,-1,...,0,0,0,0,0,0,0,0,,
517423,chrX,153906409,153906520,-,0,,,,,-1,...,0,0,0,0,0,0,0,0,,


In [3]:
# Reference genome FASTA. The hg38 assembly is downloaded from
# https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/
# `sequence_always_upper=True` asks pyfaidx to return upper-case sequence.
genome = Fasta('release-109-hg38/hg38.fa', sequence_always_upper=True)


def coords_to_dna(start_c, end_c, chr_c):
    """Look up a stretch of the reference genome by chromosome and coordinates.

    Parameters
    ----------
    start_c : int
        First position to include. It is treated as 1-based: 1 is subtracted
        to obtain the 0-based slice start.
    end_c : int
        Last position to include (used directly as the exclusive slice end,
        which makes it inclusive in 1-based terms).
    chr_c : str
        Key of the sequence record in ``genome`` (e.g. ``"chr1"``).

    Returns
    -------
    The object produced by slicing ``genome[chr_c]``; callers convert it to
    text with ``str()``.
    """
    return genome[chr_c][start_c - 1:end_c]

In [4]:
seq_lst = []

def find_sequence(row):
    """Return the DNA sequence for one row of the intron table as a string.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide the keys ``"chr"``, ``"start"``, ``"end"`` and
        ``"strand"``. ``start`` and ``end`` are converted with ``int()``.

    Returns
    -------
    str
        The sequence returned by ``coords_to_dna`` (defined in another cell)
        for ``(int(start) + 1, int(end), chr)``. When ``strand`` is ``"-"``
        the reverse complement of that sequence is returned instead.

    NOTE(review): the ``+ 1`` on ``start`` suggests the stored ``start`` is the
    position just before the first base of interest -- confirm against the
    coordinate convention of the input table.
    """
    chrom, begin, finish, orientation = (
        row[key] for key in ("chr", "start", "end", "strand")
    )

    # Fetch the sequence exactly as it appears on the reference strand.
    dna = str(coords_to_dna(int(begin) + 1, int(finish), chrom))

    # Features on the minus strand are reported as the reverse complement.
    if orientation == "-":
        return str(Seq(dna).reverse_complement())
    return dna

In [5]:
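# Serial dry run of find_sequence on the first 5,000 rows (left disabled;
# uncomment to inspect the output before the full parallel run below).
# df_temp = df.copy().head(5000)
# df_temp["sequence"] = df_temp.apply(find_sequence, axis=1)
# df_temp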

In [6]:
# Set up pandarallel with 8 workers and the progress bar turned off.
pandarallel.initialize(
    nb_workers=8,
    progress_bar=False,
)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [7]:
# Apply find_sequence to every row (axis=1) through pandarallel's
# parallel_apply and store the returned strings in the "sequence" column.
df["sequence"] = df.parallel_apply(find_sequence, axis=1)

In [8]:
# Display the table, which now includes the "sequence" column.
df

Unnamed: 0,chr,start,end,strand,class,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score,...,repeat_features:Tandem repeats,repeat_features:Unknown,repeat_features:Satellite repeats,repeat_features:Centromere,repeat_features:RNA repeats,repeat_features:Type II Transposons,repeat_features:LTRs,Acceptor splice site,Donor splice site,sequence
0,chr1,12227,12612,+,1,1122:3199,583:1460,9374:54492,GT:AG,59151,...,0,0,0,0,0,0,0,GT,AG,GTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGCCGG...
1,chr1,12721,13220,+,1,1791:3198,783:1104,14048:56719,GT:AG,61021,...,0,0,0,0,0,0,0,GT,AG,GTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTAGGG...
2,chr1,12057,12178,+,1,,,,,-1,...,0,0,0,0,0,0,0,,,GTATTTTGGAATACACCTTTAATGTAATGTTCGATCAAATAGAAGA...
3,chr1,12697,12974,+,1,1:1,1:1,15:19,GT:AG,21,...,0,0,0,0,0,0,0,GT,AG,GCACACTGACTCGTTACTCCTCTTTGTTACTGTTAGGCATCAGAGA...
4,chr1,13052,13220,+,1,22:24,17:18,433:484,GC:AG,526,...,0,0,0,0,0,0,0,GC,AG,TGAGGCTAATATATTATCCTTTGGTGCCATGAATGGATGAAGAAAT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0,,,,,-1,...,1,0,0,0,0,0,0,,,GCTTTAATGATCACTGTTCCTATTGGGTTATATTTCACAACTAAAT...
517421,chrX,151409210,151456968,+,0,,,,,-1,...,22,1,0,0,0,8,17,,,GCTTGAGTTCTTTAGCTACTTGAATCCGATTTACTTCTGTTAAGTG...
517422,chrX,153906577,153906694,-,0,,,,,-1,...,0,0,0,0,0,0,0,,,GCGGTGGTGCAGGACTCATCTTGGGGACCCAGGCTGGGTGGGGTGC...
517423,chrX,153906409,153906520,-,0,,,,,-1,...,0,0,0,0,0,0,0,,,CTAGCGAGGGCTGCCCCAGGCACAGGACCCTGAGCTGACAGGTTCC...
