In [1]:
import io
import os

import numpy as np
import pandas as pd
from Bio import SeqIO
#WangComputing website: http://wangcomputing.com/assp/index_multiseq.html

In [2]:
def fasta_to_df(filename):
    """Read a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    filename : path or handle accepted by ``SeqIO.parse``
        FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order:
        ``Seq_id`` (the record's ``id``), ``Sequences`` (the sequence text,
        upper-cased) and ``Sequence_length`` (number of characters in the
        sequence).
    """
    # Materialise the parser output once so the records can be walked
    # several times below.
    records = list(SeqIO.parse(filename, "fasta"))

    identifiers = [rec.id for rec in records]
    raw_sequences = [str(rec.seq) for rec in records]
    lengths = [len(text) for text in raw_sequences]

    table = pd.DataFrame()
    table["Seq_id"] = identifiers
    table["Sequences"] = raw_sequences
    table["Sequences"] = table["Sequences"].str.upper()
    table["Sequence_length"] = lengths
    return table

def _split_raw_output(filename):
    """Split a multi-sequence raw result file into per-sequence line lists.

    A line whose first whitespace-separated token is ``Sequence:`` starts a
    new section; the section name is the text from column 10 to the end of
    that line. Every following line (until the next ``Sequence:`` line) is
    stored under that name. Lines that appear before the first ``Sequence:``
    line are discarded. If a name occurs twice, the later section wins.

    Returns a dict mapping section name -> list of raw text lines.
    """
    sections = {}
    current = None
    with open(filename, 'r') as inputfile:
        for line in inputfile:
            tokens = line.split()
            # `tokens` is empty for blank lines; guard before indexing.
            if tokens and tokens[0] == "Sequence:":
                current = []
                # rstrip instead of [:-1] so a final line without a trailing
                # newline does not lose its last character.
                sections[line[10:].rstrip("\n")] = current
            elif current is not None:
                current.append(line)
    return sections


def _load_site_table(sections, Fasta):
    """Parse each sequence's section into one combined splice-site table.

    For every row of ``Fasta`` the section named after ``Seq_id`` is parsed as
    ``|``-separated text (its first line is skipped) and tagged with the
    sequence id (``fasta_name``) and the sequence text (``original_seq``).
    """
    columns = ['position', 'Putative_splice_site', 'Sequence', 'Score', 'Intron_GC',
               'Activation_alt_cryptic', 'Activation constitutive', 'Confidence']
    frames = []
    for _, row in Fasta.iterrows():
        seq_id = row['Seq_id']
        if seq_id not in sections:
            raise ValueError(
                f"Sequence {seq_id!r} is listed in the FASTA table but has no "
                f"'Sequence:' section in the raw output file"
            )
        current = pd.read_csv(io.StringIO("".join(sections[seq_id])),
                              sep='|', skiprows=1, names=columns)
        current['fasta_name'] = seq_id
        current['original_seq'] = row['Sequences']
        frames.append(current)
    if not frames:
        return pd.DataFrame(columns=columns + ['fasta_name', 'original_seq'])
    # One concat at the end keeps the numeric dtypes inferred by read_csv.
    return pd.concat(frames, ignore_index=True)


def _pick_ranked_codons(candidates, ranking):
    """Keep, for each site of each sequence, the candidate at rank ``ranking``.

    Candidates are ordered by ``Codon_score_x`` (highest first); ties keep
    their incoming order because the sort is stable. A site is identified by
    ``(fasta_name, codon_start)`` so that equal codon indices in different
    sequences are ranked independently. ``ranking`` is 1-based; a value below
    1 counts from the lowest-scoring end (0 -> last, -1 -> second to last),
    matching what ``GroupBy.nth(ranking - 1)`` would select.
    """
    ranked = candidates.sort_values('Codon_score_x', ascending=False, kind='mergesort')
    per_site = ranked.groupby(['fasta_name', 'codon_start'])
    offset = ranking - 1
    if offset >= 0:
        selected = per_site.cumcount() == offset
    else:
        selected = per_site.cumcount(ascending=False) == (-offset - 1)
    return (ranked[selected]
            .sort_values(['codon_start', 'fasta_name'])
            .reset_index(drop=True))


def Process_rawoutputs(filename, Fasta, ranking, resultname,
                       exchange_path="./source_files_do_not_touch/exchange_sorted.csv"):
    """Replace codons at flagged splice sites with a ranked alternative codon.

    Steps:

    1. Read the raw multi-sequence result file ``filename`` and build one
       table of putative splice sites for the sequences listed in ``Fasta``.
    2. Map each site position to the codon containing it (1-based
       ``codon_start``/``codon_end``, reading frame starting at position 1).
    3. Keep sites with ``Score > 8`` and ``Confidence > 0.5``, plus every
       "Constitutive acceptor" / "Constitutive donor" site.
    4. Look up the codon found at each kept site in the exchange table and
       collect the other codons listed for the same ``AA`` value.
    5. For each site pick the alternative at position ``ranking`` when sorted
       by its score (highest first) and write it into the sequence.
    6. Save the edited sequences to ``{resultname}.fasta`` and the table of
       applied swaps to ``{resultname}.csv``.

    Sites whose codon is ``TGG`` are reported via ``print`` (count and table)
    but are never changed here, because only alternatives that differ from
    the original codon are considered.

    Parameters
    ----------
    filename : str or path
        Raw result file containing ``Sequence: <name>`` lines, each followed
        by a header line and ``|``-separated rows with eight fields.
    Fasta : pandas.DataFrame
        Needs ``Seq_id`` and ``Sequences`` columns (as produced by
        ``fasta_to_df``). ``Seq_id`` must match the names in ``filename``.
    ranking : int
        1-based rank of the alternative codon to use (1 = highest score).
        Sites with fewer alternatives than ``ranking`` are left unchanged.
    resultname : str
        Prefix of the two output files.
    exchange_path : str, optional
        CSV with the codon exchange table. It must provide ``Codon`` and
        ``AA`` columns and a ``Codon_score`` column (the code relies on the
        ``Codon_score_x`` / ``Codon_score_y`` names produced by merging).

    Returns
    -------
    pandas.DataFrame
        One row per applied swap, columns sorted alphabetically.
        ``Replaced_codon`` is the codon written into the sequence and
        ``Original_Codon`` the codon that was found there.

    Raises
    ------
    ValueError
        If a ``Seq_id`` from ``Fasta`` has no section in ``filename``.

    Notes
    -----
    The raw file is split in memory; no per-sequence scratch files are
    created in (or deleted from) the working directory.
    """
    df = _load_site_table(_split_raw_output(filename), Fasta)
    exchange = pd.read_csv(exchange_path)

    # Round the site position up to the last base of its codon (multiples of 3).
    positions = df['position'].astype(int)
    df['codon_end'] = ((positions + 2) // 3) * 3
    df['codon_start'] = df['codon_end'] - 2

    strong_site = (df['Score'] > 8) & (df['Confidence'] > 0.5)
    constitutive_site = df['Putative_splice_site'].isin(
        ["Constitutive acceptor", "Constitutive donor"])
    dfa = df[strong_site | constitutive_site].drop_duplicates().copy()

    # Plain slicing instead of DataFrame.apply(axis=1): the latter does not
    # return a Series when no site passed the filter.
    codons = [seq[start - 1:start + 2]
              for seq, start in zip(dfa['original_seq'], dfa['codon_start'])]
    # Use the exchange table's key dtype so the merge key types agree even
    # when `codons` is empty.
    dfa['Codon'] = pd.Series(codons, index=dfa.index).astype(exchange['Codon'].dtype)

    # After these merges: *_x columns come from the exchange table (candidate
    # codon), *_y columns describe the codon currently in the sequence.
    df2 = dfa.merge(exchange, how='left', on='Codon')
    df3 = exchange.merge(df2, how='inner', on='AA')

    dfW = df3.loc[df3['Codon_x'] == 'TGG']
    print("Number of tryptophan sites detected for alternative splicing: ", len(dfW) )
    dfW = (dfW
           .drop(columns=['Codon_x', 'Codon_score_x', 'original_seq'])
           .rename(columns={"Codon_y": "Codon", "Codon_score_y": "Codon_Score"}))
    print(dfW)

    df4 = df3[df3['Codon_x'] != df3['Codon_y']]
    df5 = _pick_ranked_codons(df4, ranking)

    Result_fasta = []
    for _, row in Fasta.iterrows():
        fastaname = row['Seq_id']
        Sequence = row['Sequences']
        picks = df5.loc[df5['fasta_name'] == fastaname]
        codon_start = picks['codon_start'].tolist()
        replacement = picks['Codon_x'].tolist()
        NewSequence = list(Sequence)
        for start, codon in zip(codon_start, replacement):
            # Assigning a 3-letter string to a 3-element slice swaps the codon
            # in place without changing the sequence length.
            NewSequence[start - 1:start + 2] = codon
        Finished = "".join(NewSequence)
        print("Fasta name: ", row['Seq_id'])
        print("Original length: ", len(Sequence), "Finished length: ", len(Finished), "Ranking setting: ", ranking, "Codon changed: ", len(replacement))
        Result_fasta.append(">" + row['Seq_id'])
        Result_fasta.append(Finished)

    with open(f"{resultname}.fasta", 'w') as fp:
        for item in Result_fasta:
            # write each item on a new line
            fp.write("%s\n" % item)

    df5 = df5.rename(columns={"Codon_x": "Replaced_codon", "Codon_y": "Original_Codon", "Codon_score_y": "Original_Codon_Score", "Codon_score_x": "Replaced_Codon_Score"})
    df5 = df5.reindex(sorted(df5.columns), axis=1)
    df5.to_csv(f"{resultname}.csv", index=False)
    print('Fasta result is saved as: ', f"{resultname}.fasta")
    print('Fasta exchange summary datasheet is saved as: ', f"{resultname}.csv")
    return df5

In [3]:
# Load the starting sequences (Seq_id, Sequences, Sequence_length).
Fasta = fasta_to_df(filename="combined_fasta.txt")

In [4]:
# Swap each flagged codon for its rank-1 alternative using the predictions in
# Result.txt; writes Result1a.fasta and Result1a.csv.
Process_rawoutputs(filename="Result.txt", Fasta=Fasta, ranking=1, resultname="Result1a")

Number of tryptophan sites detected for alternative splicing:  4
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       170  Alt. isoform/cryptic donor  AGTTCAATTGgtacgtcgat   
1   Trp (W)       941  Alt. isoform/cryptic donor  AATTTAACTGgtacgtagac   
2   Trp (W)      1742  Alt. isoform/cryptic donor  AGTTCAATTGgtacgtcgat   
3   Trp (W)      2513  Alt. isoform/cryptic donor  AATTTAACTGgtacgtagac   

    Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0   8.816     0.529                  0.884                   0.084      0.905   
1  10.113     0.457                  0.822                   0.135      0.836   
2   8.816     0.529                  0.884                   0.084      0.905   
3  10.113     0.457                  0.822                   0.135      0.836   

                                        fasta_name codon_end codon_start  \
0  Alternative_splicing_replace_example-2_(1452bp)       171         169   
1

Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position
0,Leu (L),0.178,0.778,0.771,0.557,CTC,0.2,Alt. isoform/cryptic donor,0.4,CTG,8.044,TAGTAACCTCgtgagtggga,174,172,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,174
1,Gly (G),0.174,0.77,0.774,0.457,GGT,0.16,Alt. isoform/cryptic donor,0.34,GGC,9.993,AATTACAACGgtaaatttaa,564,562,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,562
2,Gly (G),0.04,0.941,0.957,0.5,GGT,0.16,Alt. isoform/cryptic donor,0.34,GGC,9.372,AAATTTAAGGgtaaagctac,576,574,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,574
3,Arg (R),0.192,0.738,0.739,0.529,AGG,0.21,Alt. isoform/cryptic donor,0.211,AGA,8.642,CGGTGGGGAGgtactattat,696,694,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,695
4,Gly (G),0.157,0.796,0.803,0.429,GGT,0.16,Alt. isoform/cryptic donor,0.34,GGC,9.347,CAACGGCCCGgtcagggcct,891,889,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,889
5,Asp (D),0.685,0.291,0.576,0.529,GAC,0.54,Constitutive acceptor,0.46,GAT,3.674,tctgcaccagGACTGGCTGA,1047,1045,Alternative_splicing_replace_example-2_(1452bp),GATAAAACACATACTTGCCCGCCCTGCCCAGCCCCCGAATTGCTGG...,1045
6,Lys (K),0.045,0.932,0.951,0.471,AAG,0.57,Alt. isoform/cryptic donor,0.43,AAA,8.34,TAAGTGTAAGgtatcaaaca,1077,1075,Alternative_splicing_replace_example-2_(1452bp),GATAAAACACATACTTGCCCGCCCTGCCCAGCCCCCGAATTGCTGG...,1077
7,Ser (S),0.214,0.734,0.708,0.443,TCC,0.22,Alt. isoform/cryptic donor,0.24,AGC,9.633,TTCTAGCTCCgtgagttaca,1260,1258,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,1260
8,Asp (D),0.685,0.291,0.576,0.529,GAC,0.54,Constitutive acceptor,0.46,GAT,3.674,tctgcaccagGACTGGCTGA,2619,2617,Liz_Constitutive,GCCACCATGGTGCTGCAGACCCAGGTGTTCATCAGCCTGCTGCTGT...,2617
9,Lys (K),0.045,0.932,0.951,0.471,AAG,0.57,Alt. isoform/cryptic donor,0.43,AAA,8.34,TAAGTGTAAGgtatcaaaca,2649,2647,Liz_Constitutive,GCCACCATGGTGCTGCAGACCCAGGTGTTCATCAGCCTGCTGCTGT...,2649


In [5]:
# Reload the sequences written by the previous round.
Fasta = fasta_to_df(filename="Result1a.fasta")

In [6]:
# Second rank-1 pass; writes Result1b.fasta and Result1b.csv.
# NOTE(review): Result1a.txt is presumably the predictor output for
# Result1a.fasta, generated outside this notebook - confirm.
Process_rawoutputs(filename="Result1a.txt", Fasta=Fasta, ranking=1, resultname="Result1b")

Number of tryptophan sites detected for alternative splicing:  4
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       170  Alt. isoform/cryptic donor  AGTTCAATTGgtacgtcgat   
1   Trp (W)       941  Alt. isoform/cryptic donor  AATTTAACTGgtacgtagac   
2   Trp (W)      1742  Alt. isoform/cryptic donor  AGTTCAATTGgtacgtcgat   
3   Trp (W)      2513  Alt. isoform/cryptic donor  AATTTAACTGgtacgtagac   

    Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0   8.816     0.529                  0.884                   0.084      0.905   
1  10.113     0.457                  0.822                   0.135      0.836   
2   8.816     0.529                  0.884                   0.084      0.905   
3  10.113     0.457                  0.822                   0.135      0.836   

                                        fasta_name codon_end codon_start  \
0  Alternative_splicing_replace_example-2_(1452bp)       171         169   
1

Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position
0,Leu (L),0.21,0.728,0.712,0.557,CTG,0.4,Alt. isoform/cryptic donor,0.2,CTC,12.019,TAGTAACCTGgtgagtggga,174,172,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,174
1,Arg (R),0.508,0.402,0.209,0.457,AGG,0.21,Constitutive donor,0.211,AGA,11.049,CAACCTACAGgtgcgtctcc,1014,1012,Alternative_splicing_replace_example-2_(1452bp),GATAAAACACATACTTGCCCGCCCTGCCCAGCCCCCGAATTGCTGG...,1013
2,Asp (D),0.655,0.322,0.509,0.529,GAT,0.46,Constitutive acceptor,0.54,GAC,3.674,tctgcaccagGATTGGCTGA,1047,1045,Alternative_splicing_replace_example-2_(1452bp),GATAAAACACATACTTGCCCGCCCTGCCCAGCCCCCGAATTGCTGG...,1045
3,Ser (S),0.16,0.794,0.798,0.443,AGC,0.24,Alt. isoform/cryptic donor,0.22,TCC,9.804,TTCTAGCAGCgtgagttaca,1260,1258,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,1260
4,Arg (R),0.508,0.402,0.209,0.457,AGG,0.21,Constitutive donor,0.211,AGA,11.049,CAACCTACAGgtgcgtctcc,2586,2584,Liz_Constitutive,GCCACCATGGTGCTGCAGACCCAGGTGTTCATCAGCCTGCTGCTGT...,2585
5,Asp (D),0.655,0.322,0.509,0.529,GAT,0.46,Constitutive acceptor,0.54,GAC,3.674,tctgcaccagGATTGGCTGA,2619,2617,Liz_Constitutive,GCCACCATGGTGCTGCAGACCCAGGTGTTCATCAGCCTGCTGCTGT...,2617


In [7]:
# Reload the sequences written by the previous round.
Fasta = fasta_to_df(filename="Result1b.fasta")

In [8]:
# Rank-2 pass: use the second-best alternative codon at each flagged site;
# writes Result2.fasta and Result2.csv.
Process_rawoutputs(filename="Result1b.txt", Fasta=Fasta, ranking=2, resultname="Result2")

Number of tryptophan sites detected for alternative splicing:  1
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       221  Alt. isoform/cryptic donor  AATTCAACTGgtacgtggat   

   Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0  9.633     0.386                   0.67                   0.258      0.616   

       fasta_name codon_end codon_start Codon  Codon_Score  
0  FcLALA_(822bp)       222         220   TGG          1.0  
Fasta name:  FcLALA_(822bp)
Original length:  822 Finished length:  822 Ranking setting:  2 Codon changed:  0
Fasta result is saved as:  Result2.fasta
Fasta exchange summary datasheet is saved as:  Result2.csv


Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position


In [9]:
# Reload the sequences written by the previous round.
Fasta = fasta_to_df(filename="Result2.fasta")

In [10]:
# Rank-3 pass; writes Result3.fasta and Result3.csv.
Process_rawoutputs(filename="Result2.txt", Fasta=Fasta, ranking=3, resultname="Result3")

Number of tryptophan sites detected for alternative splicing:  1
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       221  Alt. isoform/cryptic donor  AATTCAACTGgtacgtggat   

   Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0  9.633     0.386                   0.67                   0.258      0.616   

       fasta_name codon_end codon_start Codon  Codon_Score  
0  FcLALA_(822bp)       222         220   TGG          1.0  
Fasta name:  FcLALA_(822bp)
Original length:  822 Finished length:  822 Ranking setting:  3 Codon changed:  0
Fasta result is saved as:  Result3.fasta
Fasta exchange summary datasheet is saved as:  Result3.csv


Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position


In [11]:
# Reload the sequences written by the previous round.
Fasta = fasta_to_df(filename="Result3.fasta")

In [12]:
# Rank-4 pass; writes Result4.fasta and Result4.csv.
Process_rawoutputs(filename="Result3.txt", Fasta=Fasta, ranking=4, resultname="Result4")

Number of tryptophan sites detected for alternative splicing:  1
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       221  Alt. isoform/cryptic donor  AATTCAACTGgtacgtggat   

   Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0  9.633     0.386                   0.67                   0.258      0.616   

       fasta_name codon_end codon_start Codon  Codon_Score  
0  FcLALA_(822bp)       222         220   TGG          1.0  
Fasta name:  FcLALA_(822bp)
Original length:  822 Finished length:  822 Ranking setting:  4 Codon changed:  0
Fasta result is saved as:  Result4.fasta
Fasta exchange summary datasheet is saved as:  Result4.csv


Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position


In [13]:
# Reload the sequences written by the previous round.
Fasta = fasta_to_df(filename="Result4.fasta")

In [14]:
# Rank-5 pass; writes Result5.fasta and Result5.csv.
Process_rawoutputs(filename="Result4.txt", Fasta=Fasta, ranking=5, resultname="Result5")

Number of tryptophan sites detected for alternative splicing:  1
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       221  Alt. isoform/cryptic donor  AATTCAACTGgtacgtggat   

   Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0  9.633     0.386                   0.67                   0.258      0.616   

       fasta_name codon_end codon_start Codon  Codon_Score  
0  FcLALA_(822bp)       222         220   TGG          1.0  
Fasta name:  FcLALA_(822bp)
Original length:  822 Finished length:  822 Ranking setting:  5 Codon changed:  0
Fasta result is saved as:  Result5.fasta
Fasta exchange summary datasheet is saved as:  Result5.csv


Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position


In [15]:
# Reload the sequences written by the previous round.
Fasta = fasta_to_df(filename="Result5.fasta")

In [16]:
# Back to rank 1 on the rank-5 result; writes Result5to1.fasta and
# Result5to1.csv.
Process_rawoutputs(filename="Result5.txt", Fasta=Fasta, ranking=1, resultname="Result5to1")

Number of tryptophan sites detected for alternative splicing:  1
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       221  Alt. isoform/cryptic donor  AATTCAACTGgtacgtggat   

   Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0  9.633     0.386                   0.67                   0.258      0.616   

       fasta_name codon_end codon_start Codon  Codon_Score  
0  FcLALA_(822bp)       222         220   TGG          1.0  
Fasta name:  FcLALA_(822bp)
Original length:  822 Finished length:  822 Ranking setting:  1 Codon changed:  0
Fasta result is saved as:  Result5to1.fasta
Fasta exchange summary datasheet is saved as:  Result5to1.csv


Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position


In [17]:
# Reload the sequences written by the previous round.
Fasta = fasta_to_df(filename="Result5to1.fasta")

In [24]:
# Another rank-1 pass; writes Result5to1b.fasta and Result5to1b.csv.
Process_rawoutputs(filename="Result5to1.txt", Fasta=Fasta, ranking=1, resultname="Result5to1b")

Number of tryptophan sites detected for alternative splicing:  4
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       170  Alt. isoform/cryptic donor  AGTTCAATTGgtacgtcgat   
1   Trp (W)       941  Alt. isoform/cryptic donor  AATTTAACTGgtacgtagac   
2   Trp (W)      1742  Alt. isoform/cryptic donor  AGTTCAATTGgtacgtcgat   
3   Trp (W)      2513  Alt. isoform/cryptic donor  AATTTAACTGgtacgtagac   

    Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0   8.816     0.529                  0.884                   0.084      0.905   
1  10.113     0.457                  0.822                   0.135      0.836   
2   8.816     0.529                  0.884                   0.084      0.905   
3  10.113     0.457                  0.822                   0.135      0.836   

                                        fasta_name codon_end codon_start  \
0  Alternative_splicing_replace_example-2_(1452bp)       171         169   
1

Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position
0,Leu (L),0.21,0.728,0.712,0.557,CTG,0.4,Alt. isoform/cryptic donor,0.2,CTC,12.019,TAGTAACCTGgtgagtggga,174,172,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,174
1,Asp (D),0.634,0.341,0.462,0.514,GAT,0.46,Constitutive acceptor,0.54,GAC,3.674,tctgcaccagGATTGGCTGA,1047,1045,Alternative_splicing_replace_example-2_(1452bp),GATAAAACACATACTTGCCCGCCCTGCCCAGCCCCCGAATTGCTGG...,1045
2,Asp (D),0.634,0.341,0.462,0.514,GAT,0.46,Constitutive acceptor,0.54,GAC,3.674,tctgcaccagGATTGGCTGA,2619,2617,Liz_Constitutive,GCCACCATGGTGCTGCAGACCCAGGTGTTCATCAGCCTGCTGCTGT...,2617


In [25]:
# Reload the sequences written by the previous round.
Fasta = fasta_to_df(filename="Result5to1b.fasta")

In [26]:
# Rank-2 pass on the latest result; writes Result5to2.fasta and
# Result5to2.csv.
Process_rawoutputs(filename="Result5to1b.txt", Fasta=Fasta, ranking=2, resultname="Result5to2")

Number of tryptophan sites detected for alternative splicing:  4
          AA position        Putative_splice_site              Sequence  \
0   Trp (W)       170  Alt. isoform/cryptic donor  AGTTCAATTGgtacgtcgat   
1   Trp (W)       941  Alt. isoform/cryptic donor  AATTTAACTGgtacgtagac   
2   Trp (W)      1742  Alt. isoform/cryptic donor  AGTTCAATTGgtacgtcgat   
3   Trp (W)      2513  Alt. isoform/cryptic donor  AATTTAACTGgtacgtagac   

    Score Intron_GC Activation_alt_cryptic Activation constitutive Confidence  \
0   8.816     0.529                  0.884                   0.084      0.905   
1  10.113     0.457                  0.822                   0.135      0.836   
2   8.816     0.529                  0.884                   0.084      0.905   
3  10.113     0.457                  0.822                   0.135      0.836   

                                        fasta_name codon_end codon_start  \
0  Alternative_splicing_replace_example-2_(1452bp)       171         169   
1

Unnamed: 0,AA,Activation constitutive,Activation_alt_cryptic,Confidence,Intron_GC,Original_Codon,Original_Codon_Score,Putative_splice_site,Replaced_Codon_Score,Replaced_codon,Score,Sequence,codon_end,codon_start,fasta_name,original_seq,position
0,Leu (L),0.178,0.778,0.771,0.557,CTC,0.2,Alt. isoform/cryptic donor,0.131,CTT,8.044,TAGTAACCTCgtgagtggga,174,172,Alternative_splicing_replace_example_(1494bp),GACATACAACTGACTCAAAGTCCTGCCTCACTGGCCGTCAGTCTGG...,174
