In [1]:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Load the tab-separated training set (read_table defaults to a tab delimiter).
df_trainset = pd.read_table("data/08_trainset_with_seq.tsv")
# Bare last expression so the notebook renders the frame.
df_trainset

Unnamed: 0,chr,start,end,strand,class,maxentscan_sequence,start_ss,end_ss
0,chr1,12227,12612,+,1,ccaGTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGC...,GT,AG
1,chr1,12721,13220,+,1,gagGTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTA...,GT,AG
2,chr1,12057,12178,+,1,gagCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG...,CA,GA
3,chr1,12697,12974,+,1,cttGTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAG...,GT,AG
4,chr1,13052,13220,+,1,tagGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGG...,GC,AG
...,...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,cacGTTGAGGCGCCCAGTGGCGGCCTCACGGGGCAGGGCGAGGGCG...,GT,AG
519030,chrY,25513173,25513588,-,0,aggGTAAGATCAGTGCTATTGTCAGAGGAAAAACTCCTGGCCATCA...,GT,AG
519031,chrY,25513745,25516715,-,0,gctGTAAGTTCCACATTGATTATCATAGGCTAACCATGGGCCAGGC...,GT,AG
519032,chrY,25525288,25527646,-,0,tgtGTGTATAAATATCTGGACTTTTTGGTTAAGTAATTATAGTTAA...,GT,AG


In [3]:
# Flag rows whose maxentscan_sequence is 24 characters long or shorter.
is_too_short = df_trainset["maxentscan_sequence"].str.len() <= 24
# Keep everything that is NOT flagged.
# NOTE(review): a missing sequence compares as False above, so such rows are
# kept by the negation — confirm that is intended.
df_trainset = df_trainset[~is_too_short]
df_trainset

Unnamed: 0,chr,start,end,strand,class,maxentscan_sequence,start_ss,end_ss
0,chr1,12227,12612,+,1,ccaGTAAGTAGTGCTTGTGCTCATCTCCTTGGCTGTGATACGTGGC...,GT,AG
1,chr1,12721,13220,+,1,gagGTGAGAGGAGAGTAGACAGTGAGTGGGAGTGGCGTCGCCCCTA...,GT,AG
2,chr1,12057,12178,+,1,gagCACTGGAGTGGAGTTTTCCTGTGGAGAGGAGCCATGCCTAGAG...,CA,GA
3,chr1,12697,12974,+,1,cttGTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAG...,GT,AG
4,chr1,13052,13220,+,1,tagGCAAGCCTGGCTGCCTCCAGCTGGGTCGACAGACAGGGGCTGG...,GC,AG
...,...,...,...,...,...,...,...,...
519029,chrY,25464577,25465486,+,0,cacGTTGAGGCGCCCAGTGGCGGCCTCACGGGGCAGGGCGAGGGCG...,GT,AG
519030,chrY,25513173,25513588,-,0,aggGTAAGATCAGTGCTATTGTCAGAGGAAAAACTCCTGGCCATCA...,GT,AG
519031,chrY,25513745,25516715,-,0,gctGTAAGTTCCACATTGATTATCATAGGCTAACCATGGGCCAGGC...,GT,AG
519032,chrY,25525288,25527646,-,0,tgtGTGTATAAATATCTGGACTTTTTGGTTAAGTAATTATAGTTAA...,GT,AG


In [4]:
# Build one SeqRecord per row of df_trainset and export them as a FASTA file.
#
# Each record's id is "<chr>;<start>;<end>;<strand>" (parsed back apart later
# in this notebook) and its sequence is `maxentscan_sequence` with the first
# and last 3 nucleotides removed.
#
# The columns are zipped directly instead of calling DataFrame.iterrows():
# iterrows() builds a pandas Series for every row, which is needlessly slow
# for a frame of this size. The ids and sequences are built from the same
# cell values, so the resulting FASTA is expected to be unchanged.
seq_records = [
    SeqRecord(
        # Trim the 3 leading and 3 trailing nucleotides.
        Seq(sequence[3:-3]),
        id=f"{chr_name};{start};{end};{strand}",
        # Empty description keeps the FASTA header to just the id.
        description="",
    )
    for chr_name, start, end, strand, sequence in zip(
        df_trainset["chr"],
        df_trainset["start"],
        df_trainset["end"],
        df_trainset["strand"],
        df_trainset["maxentscan_sequence"],
    )
]

# Export the list of SeqRecord objects as a FASTA file
with open("data/BPP/introns.fasta", "w") as output_handle:
    SeqIO.write(seq_records, output_handle, "fasta")

In [5]:
# Print the MD5 checksum of the FASTA file written by the previous cell.
!md5sum data/BPP/introns.fasta

10ef58a976ed3d8f9b8629afd4f53138  data/BPP/introns.fasta


In [6]:
# Run BP_PPT.py on the intron FASTA (-i) and redirect its stdout to the TSV.
# NOTE(review): -b / -p presumably select the branch-point PWM and the PPT
# score table, judging by the file names — confirm against the BPP docs.
!python3 data/BPP/BP_PPT.py -b data/BPP/pwmBP_human.txt -p data/BPP/scPPT_human.txt -i data/BPP/introns.fasta > data/14_trainset_BPP_score_feature.tsv

In [7]:
# Number of rows pandas parses from the raw BPP output; the first line of the
# file is consumed as the header and is therefore not counted.
df_BPP_len = pd.read_csv(
    "data/14_trainset_BPP_score_feature.tsv",
    sep="\t",
).shape[0]

In [8]:
# Load the BPP output, skipping the extra header lines it contains
# (in this example they sit on rows 2, 4, 6, 8, 10, etc.).
#
# Rather than assuming that every second line is a header, the file is scanned
# once and only the lines that are exact copies of the first (header) line are
# skipped. With strictly alternating header/data lines this selects the same
# rows as range(2, n, 2). If the file does not alternate (for example because
# it was already cleaned and written back to this same path), no data row is
# silently discarded.
bpp_output_path = "data/14_trainset_BPP_score_feature.tsv"

with open(bpp_output_path) as bpp_output_file:
    header_line = bpp_output_file.readline()
    # enumerate starts at 1 because line 0 (the real header) is already consumed;
    # the indices are 0-based physical line numbers, as read_csv expects.
    skip_rows = [
        line_number
        for line_number, line in enumerate(bpp_output_file, start=1)
        if line == header_line
    ]

df_BPP = pd.read_csv(bpp_output_path, sep="\t", skiprows=skip_rows)

In [9]:
# Remove every ">" from the "#id" values ...
bpp_ids = df_BPP["#id"].str.replace(">", "")
# ... then break each id on ";" into a list of its fields.
id_col_split = bpp_ids.str.split(";")

In [10]:
# Spread the ";"-separated id fields into their own columns, in field order.
# The values are taken straight from the split strings (no type conversion).
for field_pos, column_name in enumerate(("chr", "start", "end", "strand")):
    df_BPP[column_name] = id_col_split.str[field_pos]

In [11]:
# The combined "#id" column is no longer needed once its fields are split out.
df_BPP = df_BPP.drop("#id", axis="columns")

In [12]:
# Write the cleaned table as TSV without the index column.
# NOTE: this path is the same file the BP_PPT.py run redirects its output to,
# so the raw output is overwritten here; regenerate it before re-running the
# cells that parse the raw file.
df_BPP.to_csv(
    "data/14_trainset_BPP_score_feature.tsv",
    index=False,
    sep="\t",
)

In [13]:
# Print the MD5 checksum of the TSV written by the previous cell.
!md5sum data/14_trainset_BPP_score_feature.tsv

18cf299d030ab9707fd2e9d995a939fd  data/14_trainset_BPP_score_feature.tsv
