In [None]:
# Imports first, then the interactive upload step.
from google.colab import files
import pandas as pd

# Prompts for files through the Colab upload widget; the later cells open the
# uploaded files by their bare file names.
uploaded = files.upload()

Saving training_filtered.fasta to training_filtered.fasta
Saving pos_cleaned.fasta to pos_cleaned.fasta
Saving tr_id.txt to tr_id.txt
Saving training.fasta to training.fasta
Saving neg_dss.tsv to neg_dss.tsv
Saving pos_dss.tsv to pos_dss.tsv


In [None]:
# Column names applied to the two TSV files. Because `names=` is given without
# `header=`, pandas reads the first line of each file as data.
neg_labels = ["UniProt_ID", "Species", "Kingdom", "Seq_Length", "TransM_Helix_90", "T or B"]
pos_labels = ["UniProt_ID", "Species", "Kingdom", "Seq_Length", "Cleavage", "T or B"]

unf_neg = pd.read_csv("neg_dss.tsv", sep="\t", names=neg_labels)
unf_pos = pd.read_csv("pos_dss.tsv", sep="\t", names=pos_labels)

# Bring both tables to the same column layout:
# UniProt_ID, Species, Kingdom, Seq_Length, Cleavage, TransM_Helix_90, T or B, Class
# Negatives have no cleavage value (filled with 0); positives have no
# TransM_Helix_90 value (filled with "NIL").
neg = unf_neg[["UniProt_ID", "Species", "Kingdom", "Seq_Length"]].assign(Cleavage=0)
neg["TransM_Helix_90"] = unf_neg["TransM_Helix_90"]
neg["T or B"] = unf_neg["T or B"]
neg["Class"] = 0

pos = unf_pos[["UniProt_ID", "Species", "Kingdom", "Seq_Length", "Cleavage"]].assign(
    TransM_Helix_90="NIL"
)
pos["T or B"] = unf_pos["T or B"]
pos["Class"] = 1

# Any kingdom outside this list is collapsed into the single label "Other".
keep_labels = ["Metazoa", "Viridiplantae", "Fungi"]

# Vectorised relabelling: keep the value where it is in keep_labels, otherwise
# substitute "Other" (missing values are not in the list, so they become "Other").
neg["Kingdom"] = neg["Kingdom"].where(neg["Kingdom"].isin(keep_labels), "Other")
pos["Kingdom"] = pos["Kingdom"].where(pos["Kingdom"].isin(keep_labels), "Other")

# Positives first, then negatives. The index is not reset, so the row labels of
# `pos` and `neg` both appear in `data`; the file is written without the index.
data = pd.concat([pos, neg])
data.to_csv("data.tsv", sep="\t", index=False, header=True)

In [None]:
# Training subset: every row of `data` whose "T or B" flag is non-zero
t = data[data["T or B"].ne(0)]
print(t.head(5))

# Reduce to ID / class / flag and expose the flag under the name "Label"
# (the other two columns keep their names)
t = t[["UniProt_ID", "Class", "T or B"]].rename(columns={"T or B": "Label"})

print(t.head(5))

# Persist the table as TSV, then show its dimensions as the cell result
t.to_csv("training.tsv", sep="\t", index=False, header=True)
t.shape

  UniProt_ID          Species  Kingdom  Seq_Length  Cleavage TransM_Helix_90  \
0     P52798     Homo sapiens  Metazoa         201        25             NIL   
1     P17948     Homo sapiens  Metazoa        1338        26             NIL   
2     Q8WWA0     Homo sapiens  Metazoa         313        18             NIL   
3     O75629     Homo sapiens  Metazoa         220        31             NIL   
4     P19069  Cavia porcellus  Metazoa        1742        19             NIL   

   T or B  Class  
0       1      1  
1       1      1  
2       1      1  
3       1      1  
4       1      1  
  UniProt_ID  Class  Label
0     P52798      1      1
1     P17948      1      1
2     Q8WWA0      1      1
3     O75629      1      1
4     P19069      1      1


(8021, 3)

In [None]:
def reorder_clean_fasta(fasta_file, ordered_ids_file, output_fasta):
    """Write selected FASTA records to a new file in a prescribed order.

    Each record of ``fasta_file`` is keyed by an ID taken from its header:
    the second ``|``-separated field when the header contains ``|``,
    otherwise the whole header without the leading ``>``. Multi-line
    sequences are joined into one line. If an ID occurs more than once, the
    last record wins.

    The IDs listed in ``ordered_ids_file`` (one per line, blank lines
    ignored) are then written to ``output_fasta`` in that order, each as
    ``>ID`` followed by the sequence on a single line. IDs that are absent
    from ``fasta_file`` are skipped.

    ``output_fasta`` is opened once in write mode, so an existing file is
    replaced and re-running the cell does not duplicate records.

    Args:
        fasta_file: Path of the input FASTA file.
        ordered_ids_file: Path of a text file with one ID per line.
        output_fasta: Path of the FASTA file to create.

    Returns:
        None. The result is the file written to ``output_fasta``.

    Raises:
        ValueError: If sequence text appears before the first ``>`` header.
    """
    with open(ordered_ids_file) as f:
        ordered_ids = [stripped for stripped in (line.strip() for line in f) if stripped]

    # ID -> list of sequence lines; joined only when written.
    fasta_dict = {}
    seq_id = None
    with open(fasta_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                seq_id = line.split("|")[1] if "|" in line else line[1:]
                fasta_dict[seq_id] = []
            elif seq_id is None:
                raise ValueError(
                    f"{fasta_file}: sequence data found before the first '>' header"
                )
            else:
                fasta_dict[seq_id].append(line)

    # The input has been fully read at this point, so the output can be opened
    # for writing once instead of being re-opened in append mode per record.
    with open(output_fasta, "w") as out:
        for uid in ordered_ids:
            if uid in fasta_dict:
                out.write(f">{uid}\n")
                out.write("".join(fasta_dict[uid]) + "\n")


# Writes the records of training.fasta whose IDs are listed in tr_id.txt, in
# that order, to training_filtered.fasta. reorder_clean_fasta has no return
# statement, so `training_filtered` is bound to None; the result is the file.
training_filtered = reorder_clean_fasta(
    fasta_file="training.fasta",
    ordered_ids_file="tr_id.txt",
    output_fasta="training_filtered.fasta",
)


In [None]:
# Number of leading entries of `pos` / pos_cleaned.fasta that are used here.
# The same count is passed as n_pos to training_df_maker further down.
# NOTE(review): presumably the number of positive training examples - confirm.
N_POS_TRAIN = 874

# Cleavage value of every positive entry, in the row order of `pos`.
cleavages = pos["Cleavage"].tolist()
cleavages_train = cleavages[:N_POS_TRAIN]

# Read the sequences of pos_cleaned.fasta, joining wrapped lines so that one
# list element corresponds to one FASTA record. Lines are stripped so that no
# newline can end up inside an extracted window.
pos_seqs = []
chunks = []
with open("pos_cleaned.fasta", "r") as reader:
    for line in reader:
        line = line.strip()
        if line.startswith(">"):
            if chunks:
                pos_seqs.append("".join(chunks))
                chunks = []
        elif line:
            chunks.append(line)
if chunks:
    pos_seqs.append("".join(chunks))

# 15-residue window per sequence: 13 residues before the cleavage value and 2
# after it. The start is clamped at 0 because a negative slice start would
# count from the end of the string and return the wrong residues.
# zip() pairs sequences with cleavage values and stops at the shorter of the
# two, so at most N_POS_TRAIN windows are produced.
# Assumes the records of pos_cleaned.fasta are in the same order as the rows
# of `pos` - TODO confirm.
sps_train = [
    seq[max(0, site - 13):site + 2]
    for seq, site in zip(pos_seqs, cleavages_train)
]

def training_df_maker(t_df_initial,fasta_file,SP_15_list,n_pos):
    """Return a copy of ``t_df_initial`` with ``Frag_90`` and ``SP_15`` columns added.

    Args:
        t_df_initial: DataFrame with one row per sequence in ``fasta_file``;
            it is not modified.
        fasta_file: Path of a FASTA file. Wrapped sequence lines are joined;
            records with an empty sequence are not counted.
        SP_15_list: Values placed in ``SP_15`` for the first ``n_pos`` rows,
            taken by position.
        n_pos: Number of leading rows that receive a value from
            ``SP_15_list``; every later row gets the string ``"NIL"``.

    Returns:
        A new DataFrame with the original columns plus ``Frag_90`` (the first
        90 characters of each sequence) and ``SP_15``.

    Raises:
        AssertionError: If the number of sequences read differs from the
            number of rows in ``t_df_initial``.
    """
    # Collect one joined sequence per FASTA record
    seqs = []
    pieces = []
    with open(fasta_file, "r") as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith(">"):
                pieces.append(text)
                continue
            # A header closes the record collected so far (if it has content)
            joined = "".join(pieces)
            if joined:
                seqs.append(joined)
            pieces = []
    tail = "".join(pieces)
    if tail:
        seqs.append(tail)

    # Rows and sequences are matched by position, so the counts must agree
    assert len(seqs) == len(t_df_initial)

    # First 90 characters of every sequence
    frag_90 = [whole[:90] for whole in seqs]

    # Leading n_pos rows take their value from SP_15_list, the rest get "NIL"
    SP_15 = [
        SP_15_list[row] if row < n_pos else "NIL"
        for row in range(len(t_df_initial))
    ]

    # assign() works on a copy, leaving the caller's frame untouched
    return t_df_initial.assign(Frag_90=frag_90, SP_15=SP_15)

# Attach the 90-residue fragments and the 15-residue windows to the training
# table; the first 874 rows take their SP_15 value from sps_train.
training_df = training_df_maker(
    t_df_initial=t,
    fasta_file="training_filtered.fasta",
    SP_15_list=sps_train,
    n_pos=874,
)
training_df.to_csv("training_df.tsv", sep="\t", index=False, header=True)
print(training_df.head(15))

   UniProt_ID  Class  Label  \
0      P52798      1      1   
1      P17948      1      1   
2      Q8WWA0      1      1   
3      O75629      1      1   
4      P19069      1      1   
5      P25236      1      1   
6      P55955      1      1   
7      P0DPU9      1      1   
8      P79263      1      1   
9      P19320      1      1   
10     P00719      1      1   
11     P02809      1      1   
12     P0C8E8      1      1   
13     Q9P2K2      1      1   
14     Q42502      1      1   

                                              Frag_90            SP_15  
0   MRLLPLLRTVLWAAFLGSPLRGGSSLRHVVYWNSSNPRLLRGDAVV...  AAFLGSPLRGGSSLR  
1   MVSYWDTGVLLCALLSCLLLTGSSSGSKLKDPELSLKGTQHIMQAG...  LLSCLLLTGSSSGSK  
2   MNQLSFLLFLIATTRGWSTDEANTYFKEWTCSSSPSLPRSCKEIKD...  FLLFLIATTRGWSTD  
3   MAGLSRGSARALLAALLASTLLALLVSPARGRGGRDHGDWDEASRL...  STLLALLVSPARGRG  
4   MRLLWGLLWAFGLFASSLQKPRLLLFSPSVVNLGVPLSVGVQLQDA...  LLWAFGLFASSLQKP  
5   MWRSLGLALALCLLPYGGAESQGQSPACKQAPPWNIGDQNPMLNSE...  LALALCLLPY