In [10]:
import pandas as pd
from Bio.Align import Applications as AlignApp
import numpy as np
import tensorflow as tf

In [2]:
# Load the processed peptide/marker MIC table. Later cells read its
# "Target", "Marker", "Peptide" and "Log2MIC" columns.
data = pd.read_csv("data/processed_data/AMP/PeptideMarkerMIC.csv")

In [3]:
# Keep a single row per (Target, Peptide) pair, then, for every Target, the 10 rows
# with the lowest Log2MIC. The group-wise apply yields a (Target, row) MultiIndex,
# which is discarded in favour of a plain 0..n-1 index.
TheSmallest = (
    data.drop_duplicates(["Target", "Peptide"])
    .groupby("Target")[["Target", "Marker", "Peptide", "Log2MIC"]]
    .apply(lambda x: x.nsmallest(10, columns=["Log2MIC"]))
    .reset_index(drop=True)
)

# One row per distinct peptide, labelled pep.0, pep.1, ... in order of first appearance.
peptides = TheSmallest[["Peptide"]].drop_duplicates().reset_index(drop=True)
peptides = peptides.assign(PepID=[f"pep.{i}" for i in range(len(peptides))])

# Attach the PepID of each peptide to the selected rows.
TheSmallest = TheSmallest.merge(peptides, on="Peptide")

In [4]:
# Write the unique peptides as FASTA: the PepID is the header line and the peptide
# string is the sequence line.
in_filename = 'data/processed_data/Baseline/peptides.fasta'
# The context manager flushes and closes the file even if a write raises, so the
# handle is never left open for the cell that reads `in_filename` afterwards.
with open(in_filename, 'w') as file:
    # Columns are addressed by name rather than by position so the output does not
    # depend on the column order of `peptides`.
    for peptide, pep_id in zip(peptides["Peptide"], peptides["PepID"]):
        file.write(f">{pep_id}\n{peptide}\n")

In [5]:
# Align the peptide FASTA at `in_filename` with MAFFT and save the alignment text.
out_filename = 'data/processed_data/Baseline/aln_peptides.fasta'
# NOTE(review): Biopython's command-line wrappers are reportedly deprecated in newer
# releases - confirm the installed Biopython still provides MafftCommandline.
mafft_cline = AlignApp.MafftCommandline(input=in_filename)
# Calling the wrapper object runs the command and returns its stdout and stderr.
# Presumably this needs a `mafft` executable on PATH - verify in the target environment.
stdout, stderr = mafft_cline()
# Only stdout is written to `out_filename`; stderr is captured but unused in this cell.
with open(out_filename, "w") as handle:
    handle.write(stdout)

In [6]:
class Preprocessing():
    """Stateless collection of helpers for reading sequence files."""

    def __init__(self):
        return

    def ReadFASTA(self,filename,as_pd=True):
        """Read a FASTA file and return its sequences with their identifiers.

        Sequence lines that belong to the same record are concatenated. Every
        header produces exactly one sequence, even when the record has no
        sequence lines, so identifiers and sequences always stay aligned.

        Parameters
        ----------
        filename : str
            Path to the file. Its extension must be "fasta", "fna" or "fa".
        as_pd : bool, default True
            If True, return a DataFrame indexed by sequence id with a single
            "AlnPep" column. If False, return ``(seqs, seqid)`` as two lists.

        Returns
        -------
        pandas.DataFrame or tuple[list[str], list[str]]
            See ``as_pd``. An empty file yields an empty frame / two empty lists.

        Raises
        ------
        ValueError
            If the extension is not accepted, or if non-blank text appears
            before the first ">" header.
        """
        if filename.split(".")[-1] not in ["fasta","fna","fa"]:
            raise ValueError('Invalid file format. Expected formats are ["fasta","fna","fa"].')
        seqs = []
        seqid = []
        # Lines of the record currently being read; None until the first header is seen.
        chunks = None
        # `with` guarantees the handle is closed even if parsing raises.
        with open(filename,"r") as file_handle:
            for line in file_handle:
                text = line.rstrip("\n")
                if text.startswith(">"):
                    # Close the previous record unconditionally: skipping empty
                    # sequences would shift every later sequence onto the wrong id.
                    if chunks is not None:
                        seqs.append("".join(chunks))
                    seqid.append(text[1:])
                    chunks = []
                elif chunks is not None:
                    chunks.append(text)
                elif text.strip():
                    # Sequence text with no header cannot be attributed to any id.
                    raise ValueError('Invalid FASTA content: sequence data found before the first ">" header.')
        if chunks is not None:
            seqs.append("".join(chunks))
        if as_pd:
            # Built as a one-row frame keyed by id and transposed, so ids become the index.
            fasta = dict(zip(seqid, seqs))
            return pd.DataFrame(fasta,index=["AlnPep"]).transpose()
        else:
            return seqs, seqid

In [7]:
# Load the MAFFT alignment; ReadFASTA returns one row per record, indexed by the
# FASTA header, with the aligned sequence in the "AlnPep" column.
pp = Preprocessing()
aln = pp.ReadFASTA('data/processed_data/Baseline/aln_peptides.fasta')
# Move the header ids into a regular "PepID" column and switch to a 0..n-1 index.
aln = aln.assign(PepID=aln.index).reset_index(drop=True)

In [52]:
# Vocabulary array saved alongside the model weights.
aa_vocal = np.load("model_weights/PepTV_vocal.npy")

# Character-level tokenizer producing integer ids, with no text standardization.
# NOTE(review): 194 is presumably the sequence length the downstream model expects
# - confirm against the model definition.
PepTV = tf.keras.layers.TextVectorization(
    standardize=None,
    split="character",
    output_sequence_length=194,
    output_mode="int",
    name="PepTextVectorizer",
)
PepTV.set_vocabulary(aa_vocal)

In [59]:
# Encode every aligned peptide string with the PepTV layer and convert the result
# to a NumPy array. Rows follow the row order of `aln`.
# NOTE(review): downstream code presumably relies on this order matching the PepID
# numbering - confirm that the alignment step preserved the input order.
enc_aln_pep = PepTV(aln['AlnPep'].values).numpy()

In [61]:
# Persist the encoded alignment as a compressed archive under the key "data".
# Per the NumPy docs, ".npz" is appended to the name when it is not already present.
np.savez_compressed("data/processed_data/Baseline/enc_aln_pep", data=enc_aln_pep)

In [62]:
# (Target, PepID) pairs for the selected rows of TheSmallest.
keys = TheSmallest[["Target", "PepID"]]

In [63]:
# Write the Target/PepID pairs to CSV without the DataFrame index column.
keys.to_csv('data/processed_data/Baseline/keys.csv', index=False)

In [65]:
# Distinct (Target, Marker) combinations present in the selected rows.
targets = TheSmallest[["Target", "Marker"]].drop_duplicates()

In [67]:
# Write the Target/Marker table to CSV without the DataFrame index column.
targets.to_csv('data/processed_data/Baseline/targets.csv', index=False)