Notebook for cleaning the peptide data

In [1]:
# Suppress pytorch pickle load warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Logging
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle

# Library imports
import gdiffusion as gd
import util
import util.chem as chem
import util.visualization as vis
import util.stats as gdstats


import gdiffusion.bayesopt as bayesopt
from gdiffusion.classifier.logp_predictor import LogPPredictor

device = util.util.get_device()
print(f"device: {device}")

DIFFUSION_PATH = "saved_models/diffusion/molecule-diffusion-v1.pt"
SELFIES_VAE_PATH = "saved_models/selfies_vae/selfies-vae.ckpt"
PEPTIDE_VAE_PATH = "saved_models/peptide_vae/peptide-vae.ckpt"
LOGP_PREDICTOR_PATH = "saved_models/logp/model-logp"

import h5py
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


device: cuda


In [1]:
# Peek at the first 100 lines of the raw 10M CSV to inspect its layout.
with open(file="data/raw_peptide/peptide_raw_10M.csv") as f:
    for i in range(100):
        # readline() keeps the trailing newline and print() adds another,
        # which is why the printed rows are double-spaced.
        print(f.readline())

# Columns are `text,labels`: the peptide sequence and its extinct flag (0 or 1).

text,labels

FLPQGTPSPLIPMLILIETISLFIQPMALAVRLTANITAGHLLIHL,1

VMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGPTLVEWIW,0

QDIRKMGGMMYTLPFTSSCLMIGTLALTGMPFMTGFYSKDHII,1

AFMGYVLPWGQMSFWGATVITNLLSAIPYIGTTLVEW,0

MLTMIPILMKTTNPRSTEAATKYFMTQATASMMLMMALTINLVYS,1

LLVLFIMFQLKVSNHMYPMNPELIKPKLKEQKTPWE,1

QCPKPTLQQISHIAQQLGLEKDVVRVWFCNRRQKGKRSSSDYSQREDF,0

LASATNTWEIQQL,0

IQQAFSHTQAPTLPLLGLILAATGKSAQ,0

MAIAMLSLLSLFFYLRLAYHSTIILPPNSSNH,1

DVIRESTFQGHHTTTVQKGLRYGMVLFIVSEVFFFLGFFW,1

MISHIVTYYSGKKEPFGYMGMVWAMVSIGFLGFIVWA,0

PILIAMAFLMLTERKILGYMQLRKGPNVVGPYGL,0

IPMITNSLT,1

PWASQTSKLPTMLITALL,0

PPLSGFLPKWMIIQEMTKNSLIIMPTMMAI,1

ALMVALAICSLVLYLLTLMLTEKLSS,0

ADAIKLFTKEPLKPSTS,0

LLILVLFLPDLLGDPDNYTPANPLN,1

IIMYNPTLMALNLIIYLLMT,1

IPGGPFENLEIRRFDRVKDTEWNDFEYRFIS,0

ASEPYTTKFFYYLLMFLI,1

NFTPANPLATPPHIKPEWYFLFAYAILRSIPNKLGG,0

SMLPIILLVFAAC,0

NVFGFKALRALRLEDLRIPTAYVKTFQGPPHGIQVERDKLNKYGRPLLGC,0

AKMPLYGLHLWLPKA,0

KDAIFSVSIAYFGIFIASF,0

NMSFWLLPPSFLLLLASSTVEAGAGTGWTVYPPLAGNMAH,0

KILGYMQLRKGPNIVGPLGLLQPMAD

In [3]:
# Peek at the header and the first few rows of the raw 4.5M CSV.
with open("data/raw_peptide/peptide_raw_4p5.csv") as f:
    for _ in range(5):
        print(f.readline())

# This file has a single `sequence` column and no label column;
# every peptide in it is treated as non-extinct (0).

sequence

QRSTPYCRQSIPKGTIV

STPYCRQSIPKGTIVPLKGP

GIISHIWALARHTLFTNTFQDDER

TGTGNALRRRATSVATSVGTD



In [9]:
# dataset lengths:
def get_num_lines(file_name):
    """Return the number of non-blank lines in the text file `file_name`.

    Lines that are empty or contain only whitespace are skipped. A header
    row, if present, is counted like any other non-blank line.
    """
    with open(file_name, 'r') as handle:
        return sum(1 for row in handle if row.strip())

# Non-blank line counts of the two raw CSVs; each count includes that file's header row.
peptide_10m_len = get_num_lines("data/raw_peptide/peptide_raw_10M.csv")
peptide_4p5_len = get_num_lines("data/raw_peptide/peptide_raw_4p5.csv")

In [10]:
print(f"10 Million Peptide Len: {peptide_10m_len}")
print(f"4.5 Million Peptide Len: {peptide_4p5_len}")

10 Million Peptide Len: 10274724
4.5 Million Peptide Len: 4500001


In [None]:
# Total number of peptide rows across both CSVs. Each file's line count
# includes one header row, so subtract 2 (one header per file).
total_len = peptide_10m_len + peptide_4p5_len - 2

In [16]:
peptide_latent_dim = 256

In [None]:
# # Create h5py file, do not run again!

# # data source is if its from the 10M dataset (0) or the 4.5M dataset (1)
# dataset_file = "data/peptide_dataset.h5"
# with h5py.File(dataset_file, 'w') as h5file:
#     peptide_dataset = h5file.create_dataset('PEPTIDES', (total_len), dtype=h5py.string_dtype())
#     extinct_dataset = h5file.create_dataset('EXTINCT', (total_len), dtype=bool)
#     data_source = h5file.create_dataset('DATA_SOURCE', (total_len), dtype=np.int8)
#     latents = h5file.create_dataset('LATENTS', (total_len, peptide_latent_dim), dtype=np.float32)
    

In [2]:
# Read from peptide latent data:
def read_peptide_dataset_raw(i: int, data_path="data/peptide_dataset.h5"):
    """Read entry `i` from the peptide HDF5 file without any decoding.

    Returns a 4-tuple in the order (PEPTIDES, EXTINCT, DATA_SOURCE, LATENTS),
    each element being whatever h5py yields for index `i` of that dataset.
    """
    column_names = ('PEPTIDES', 'EXTINCT', 'DATA_SOURCE', 'LATENTS')
    with h5py.File(data_path, 'r') as h5file:
        return tuple(h5file[name][i] for name in column_names)
    
def read_peptide_dataset(i: int, data_path="data/peptide_dataset.h5"):
    ''' Read entry `i` from the peptide HDF5 file in decoded form.

    Returns (peptide, latent, extinct, datasource):
      - peptide: the stored bytes decoded as UTF-8
      - latent: the LATENTS row exactly as h5py returns it
      - extinct: the EXTINCT value converted to a Python bool
      - datasource: 'peptide_10M' when DATA_SOURCE is 0, otherwise 'peptide_4.5M'
    '''
    # Pull the four raw values while the file is open; decoding happens afterwards.
    with h5py.File(data_path, 'r') as h5file:
        raw_peptide = h5file['PEPTIDES'][i]
        raw_extinct = h5file['EXTINCT'][i]
        raw_datasource = h5file['DATA_SOURCE'][i]
        latent = h5file['LATENTS'][i]

    if raw_datasource == 0:
        datasource = 'peptide_10M'
    else:
        datasource = 'peptide_4.5M'

    return raw_peptide.decode('utf-8'), latent, bool(raw_extinct), datasource

In [19]:
peptide_10m_len = 10274724

In [29]:
RAW_DATA_PATH = "data/raw_peptide/peptide_raw_10M.csv"
PEPTIDE_DATASET_PATH = "data/peptide_dataset.h5"

In [30]:
RAW_DATA_PATH_4P5 = "data/raw_peptide/peptide_raw_4p5.csv"

In [None]:
# def write_peptide_10M_dataset(start_idx: int = 0, start_line_num: int = 1):
#     # line_num is 0-indexed, so line_num 0 is the first line!
    
#     with open(RAW_DATA_PATH, 'r') as infile, h5py.File(PEPTIDE_DATASET_PATH, 'r+') as outfile:
#         peptide_ds = outfile['PEPTIDES']
#         extinct_ds = outfile['EXTINCT']
#         data_source_ds = outfile['DATA_SOURCE']

#         # skip first start_line_num lines in infile csv
#         for _ in range(start_line_num):
#             next(infile)

#         idx = start_idx
#         for line_num, raw_line in tqdm(enumerate(infile, start=start_line_num), total=peptide_10m_len, desc='Reading Peptide10M CSV'):
#             try:
#                 raw_line = raw_line.strip()
#                 peptide, extinct = raw_line.split(',')

#                 if peptide is None or extinct is None:
#                     raise ValueError(f"peptide, extinct is wrong: peptide={peptide} extinct={extinct}")

#                 if peptide_ds[idx] != b'':
#                     print(f"Warning, overriding peptide data: {peptide_ds[idx]}! Aborting")
#                     raise ValueError("See above.")
                
#                 peptide_ds[idx] = peptide
#                 extinct_ds[idx] = extinct
#                 data_source_ds[idx] = 0 # this corresponds to the 10M dataset
#                 idx += 1

#             except Exception as e:
#                 print(f"Encountered an error while processing line_num {line_num}")
#                 print(f"Line was {raw_line} ")
#                 print(f"peptide = {peptide} ")
#                 print(f"extinct = {extinct} ")
#                 print(f"Attempted to index into idx={idx}")
#                 print("Error MSG:")
#                 print(e)

#                 print(f"Removing idx: {idx} data")
#                 peptide_ds[idx] = ''
#                 extinct_ds[idx] = False
#                 data_source_ds[idx] = 0
#                 return

# # write_peptide_10M_dataset()

Reading Peptide10M CSV: 100%|█████████▉| 10274723/10274724 [16:58<00:00, 10089.92it/s]


In [None]:
# # TODO: Determine start_idx
# def write_peptide_4p5M_dataset(start_idx: int = None, start_line_num: int = 1):
#     # line_num is 0-indexed, so line_num 0 is the first line!
    
#     with open(RAW_DATA_PATH_4P5, 'r') as infile, h5py.File(PEPTIDE_DATASET_PATH, 'r+') as outfile:
#         peptide_ds = outfile['PEPTIDES']
#         extinct_ds = outfile['EXTINCT']
#         data_source_ds = outfile['DATA_SOURCE']

#         # skip first start_line_num lines in infile csv
#         for _ in range(start_line_num):
#             next(infile)

#         idx = start_idx
#         for line_num, raw_line in tqdm(enumerate(infile, start=start_line_num), total=peptide_4p5_len, desc='Reading Peptide 4.5M CSV'):
#             try:
#                 peptide = raw_line.strip()

#                 if peptide is None:
#                     raise ValueError(f"peptide, extinct is wrong: peptide={peptide}")

#                 if peptide_ds[idx] != b'':
#                     print(f"Warning, overriding peptide data: {peptide_ds[idx]}! Aborting")
#                     raise ValueError("See above.")
                
#                 peptide_ds[idx] = peptide
#                 extinct_ds[idx] = False # these are all modern peptides, so they are all NOT extinct
#                 data_source_ds[idx] = 1 # this corresponds to the 4.5M dataset
#                 idx += 1

#             except Exception as e:
#                 print(f"Encountered an error while processing line_num {line_num}")
#                 print(f"Line was {raw_line} ")
#                 print(f"peptide = {peptide} ")
#                 print(f"Attempted to index into idx={idx}")
#                 print("Error MSG:")
#                 print(e)

#                 print(f"Removing idx: {idx} data")
#                 peptide_ds[idx] = ''
#                 extinct_ds[idx] = False
#                 data_source_ds[idx] = 0
#                 return

# peptide_start_index = peptide_10m_len - 1
# # write_peptide_4p5M_dataset(start_idx=peptide_start_index)

Reading Peptide 4.5M CSV: 100%|█████████▉| 4500000/4500001 [07:29<00:00, 10001.05it/s]


In [None]:
# Confirming data: scan every stored peptide and report entries that look suspicious
# (very short, not uppercase, or containing characters outside the 20 standard amino acids).
peptide_amino_acids = set('ACDEFGHIKLMNPQRSTVWY')
with h5py.File("data/peptide_dataset.h5", mode='r') as f:
    peptide_ds = f['PEPTIDES']
    extinct_ds = f['EXTINCT']
    data_source_ds = f['DATA_SOURCE']

    for idx in tqdm(range(len(peptide_ds))):
        peptide = peptide_ds[idx].decode('utf-8')
        extinct = extinct_ds[idx]
        # Read from DATA_SOURCE (this previously re-read EXTINCT by mistake).
        data_source = data_source_ds[idx]

        if len(peptide) <= 4:
            print(f"Small Peptide peptide at index {idx}: {peptide}")

        # Note: str.isupper() is False for an empty string, so empty entries are reported here too.
        if not peptide.isupper():
            print(f"Peptide at index {idx} is not uppercase: {peptide}")

        if not all(c in peptide_amino_acids for c in peptide):
            print(f"Peptide at index {idx} is not a valid peptide! {peptide}")

  5%|▍         | 684970/14774723 [00:32<11:14, 20879.79it/s]


KeyboardInterrupt: 

In [None]:
from difflib import SequenceMatcher

def peptides_similar(seq1, seq2, max_changes=5):
    """Return True when `seq1` and `seq2` differ in at most `max_changes` edit regions.

    An "edit region" is one non-'equal' opcode reported by
    difflib.SequenceMatcher (a replace, insert, or delete span). A single
    region may cover several characters, so this is not a per-character
    edit distance.
    """
    edit_regions = 0
    for opcode in SequenceMatcher(None, seq1, seq2).get_opcodes():
        if opcode[0] != 'equal':
            edit_regions += 1
    return edit_regions <= max_changes

In [67]:
# Spot-check the final row of the combined dataset (index total_len - 1);
# the stored latent is ignored here.
peptide, _, extinct, data_source = read_peptide_dataset(total_len-1)
string_print = f"""
    Peptide: {peptide}
    Extinct: {str(extinct)}
    Data Source: {data_source}
    """
print(string_print)


    Peptide: ALAPRHADVVAPRLMAITRAGVTALVLTAFLGVRGLNPGADLL
    Extinct: False
    Data Source: peptide_4.5M
    


In [70]:
# attach latents to dataset (this will take a long, long time)

vae = gd.load_vae_peptides()

loading model from saved_models/peptide_vae/peptide-vae.ckpt
Enc params: 2,675,904
Dec params: 360,349


In [None]:
def attatch_latents(start_idx: int = 576000, vae_batch_size=64):
    """Encode stored peptides with the VAE and write the latents into the HDF5 file.

    Work proceeds in blocks of `1000 * vae_batch_size` peptides; each block is
    read once and then encoded in batches of `vae_batch_size`.

    Args:
        start_idx: Index to resume from. It is rounded DOWN to the start of the
            block that contains it, so part of that block may be re-encoded.
        vae_batch_size: Number of peptides passed to the VAE per call.

    Relies on notebook globals: `PEPTIDE_DATASET_PATH`, `total_len`, `vae`, `gd`.

    Any `Exception` is caught and reported (including the index range that may
    be partially written) rather than re-raised. `KeyboardInterrupt` is not an
    `Exception`, so it still propagates. Returns None.
    """
    num_batches_per_block = 1000
    block_size = num_batches_per_block * vae_batch_size

    with h5py.File(PEPTIDE_DATASET_PATH, 'r+') as f:
        peptide_ds = f['PEPTIDES']
        latents_ds = f['LATENTS']

        block_num = start_idx // block_size
        block_start_index = block_num * block_size

        # Pre-bind every name the error report below reads, so a failure that
        # happens before the inner loop starts cannot raise UnboundLocalError
        # from inside the handler and hide the original error.
        block_idx = block_start_index
        i = 0
        peptide_block = []
        vae_batch = []

        try:
            for block_idx in range(block_start_index, total_len, block_size):
                # Reset per-block state so a failure while reading this block
                # does not report stale values left over from the previous one.
                i, peptide_block, vae_batch = 0, [], []

                print(f"Block Number: {block_idx // block_size} --- Start Idx: {block_idx}")
                peptide_block = peptide_ds[block_idx:block_idx + block_size]

                # tqdm's total is the number of batches (ceil division), not the number of items.
                num_batches = (len(peptide_block) + vae_batch_size - 1) // vae_batch_size

                for i in tqdm(range(0, len(peptide_block), vae_batch_size),
                              total=num_batches, desc='VAE Block'):
                    vae_batch = peptide_block[i:i + vae_batch_size]
                    vae_batch = [vae_ele.decode('utf-8') for vae_ele in vae_batch]
                    latents = gd.peptides_to_latent(vae_batch, vae=vae).cpu()
                    # The last batch of the last block can be short, hence len(vae_batch).
                    latents_ds[block_idx + i:block_idx + i + len(vae_batch)] = latents

        except Exception as e:
            print(f"Encountered error: {e}")
            print(f"block_idx: {block_idx}")
            print(f"block_num: {block_idx // block_size}")
            print(f"i: {i}")
            print(f"len(peptide_block): {len(peptide_block)}")
            print(f"vae_batch_size: {vae_batch_size}")
            print(f"corrupted: {block_idx + i} to {block_idx + i + len(vae_batch)}")

# vb = attatch_latents(start_idx=576000)

Block Number: 0 --- Start Idx: 0


VAE Block:   0%|          | 0/1000 [00:00<?, ?it/s]

VAE Block: 100%|██████████| 1000/1000 [04:36<00:00,  3.61it/s]


Block Number: 1 --- Start Idx: 64000


VAE Block: 100%|██████████| 1000/1000 [04:31<00:00,  3.68it/s]


Block Number: 2 --- Start Idx: 128000


VAE Block: 100%|██████████| 1000/1000 [04:34<00:00,  3.64it/s]


Block Number: 3 --- Start Idx: 192000


VAE Block: 100%|██████████| 1000/1000 [04:34<00:00,  3.64it/s]


Block Number: 4 --- Start Idx: 256000


VAE Block: 100%|██████████| 1000/1000 [04:36<00:00,  3.62it/s]


Block Number: 5 --- Start Idx: 320000


VAE Block: 100%|██████████| 1000/1000 [04:35<00:00,  3.64it/s]


Block Number: 6 --- Start Idx: 384000


VAE Block: 100%|██████████| 1000/1000 [04:34<00:00,  3.64it/s]


Block Number: 7 --- Start Idx: 448000


VAE Block: 100%|██████████| 1000/1000 [04:42<00:00,  3.54it/s]


Block Number: 8 --- Start Idx: 512000


VAE Block: 100%|██████████| 1000/1000 [04:39<00:00,  3.58it/s]


Block Number: 9 --- Start Idx: 576000


VAE Block:  11%|█         | 110/1000 [00:30<04:08,  3.58it/s]


KeyboardInterrupt: 

[ 2.68168569e+00  4.36633170e-01  9.88466144e-01 -1.96448401e-01
 -2.22259223e-01 -9.87164736e-01  1.19704413e+00  6.30952358e-01
 -1.67972040e+00  9.32729840e-01 -7.07652092e-01 -1.29501843e+00
 -1.82481498e-01 -7.71961927e-01  1.76283073e+00  6.80858016e-01
 -1.60144761e-01 -1.15161479e+00 -2.11487770e+00 -1.61603570e+00
  6.48729503e-04  2.86051452e-01  3.95024627e-01  1.51311064e+00
 -1.32089305e+00  3.64589095e-02 -2.96812952e-01  5.98318279e-01
 -8.72410178e-01 -9.16960716e-01 -5.62300444e-01  1.57823324e+00
 -5.04084945e-01  9.50967252e-01 -1.31093264e+00  1.34911346e+00
  2.95524269e-01 -2.94118762e-01 -4.89745229e-01 -1.24121118e+00
  1.99785769e-01  1.03533983e+00  7.79707670e-01  2.41386205e-01
 -5.03413618e-01 -3.80905360e-01 -2.55689216e+00 -8.02521288e-01
  1.21050131e+00  4.75242376e-01 -1.31748855e+00  1.59162807e+00
  3.91494453e-01  5.66990316e-01  1.40956819e+00  7.56166041e-01
 -2.00776860e-01 -1.17270410e-01  9.90917444e-01  8.68309379e-01
 -4.29015279e-01 -5.51364

In [None]:
# from torch.utils.data import Dataset, DataLoader
# import h5py

# class PeptideDataset(Dataset):
#     """Dataset for the Peptides"""

#     def __init__(self, file_loc:str ="data/peptide_dataset.h5", transform=None):
#         """
#         Arguments:
#             file_loc (string): Path to the peptide dataset
#             transform: transform to be applied on a sample
#         """
        
#         # keep file open
#         self.file = h5py.File(file_loc, 'r')
#         self.latent_dataset = self.file['LATENTS']
#         self.peptide_dataset = self.file['PEPTIDES']
#         self.extinct_dataset = self.file['EXTINCT']
#         self.datasource_dataset = self.file['DATA_SOURCE']
#         self._cached_len = len(self.peptide_dataset[:])
#         self.transform = transform

#     def __len__(self, use_cached=True):
#         if use_cached:
#             return self._cached_len
#         else:
#             return len(self.peptide_dataset[:])
    
#     def __getitem__(self, idx):
#         peptide = self.get_peptide(idx)
#         latent = self.get_latent(idx)
#         extinct = self.get_extinct(idx)
#         datasource = self.get_datasource(idx)

#         out = (peptide, latent, extinct, datasource)
        
#         if self.transform:
#             out = self.transform(out)
#         return out
      

#         out = ()
#         raw_peptide, raw_extinct, raw_datasource, raw_latent = f['PEPTIDES'][i], f['EXTINCT'][i], f['DATA_SOURCE'][i], f['LATENTS'][i]

#         peptide = raw_peptide.decode('utf-8')
#         extinct = bool(raw_extinct)
#         datasource = 'peptide_10M' if raw_datasource == 0 else 'peptide_4.5M'
#         latent = raw_latent
#         out = (self.smiles_dataset[idx], self.selfies_dataset[idx], self.latent_dataset[idx])
#         if self.transform:
#             out = self.transform(out)
#         return out
    
#     def get_peptide(self, idx, raw=True):
#         peptide = self.peptide_dataset[idx]

#         if not raw:
#             peptide = self.transform_peptide(peptide)
#         return peptide
    
#     def transform_peptide(peptide):
#         return [ptd.decode('utf-8') for ptd in peptide] if isinstance(peptide, list) else peptide.decode('utf-8')
    
#     def transform_extinct(extinct):


#     def transform_datasource(datasource):
        

#         extinct = self.extinct_dataset[idx]
#         if isinstance(extinct, list):
#             extinct = [bool(ext) for ext in extinct]
#         else:
#             extinct = bool(extinct)

#         datasource = self.datasource_dataset[idx]
#         if isinstance(datasource, list):
#             datasource = ['peptide_10M' if ds == 0 else 'peptide_4.5M' for ds in datasource]
#         else:
#             datasource = 'peptide_10M' if datasource == 0 else 'peptide_4.5M'

#         latent = self.latent_dataset[idx]
#     def get_peptide(self, idx, raw=True):
#         return self.smiles_dataset[idx]
#     def get_peptide(self, idx, raw=True):
#         return self.smiles_dataset[idx]
#     def get_peptide(self, idx, raw=True):
#         return self.smiles_dataset[idx]
    

1

In [7]:
# Open the dataset read-only and look at the first five EXTINCT flags.
# NOTE(review): the dataset handles bound below are presumably only usable while
# the file is open, i.e. inside this `with` block — confirm before reusing them later.
with h5py.File(PEPTIDE_DATASET_PATH, 'r') as file:
    latent_dataset = file['LATENTS']
    peptide_dataset = file['PEPTIDES']
    extinct_dataset = file['EXTINCT']
    datasource_dataset = file['DATA_SOURCE']

    print(extinct_dataset[0:5])


[ True  True  True  True  True]


In [None]:
# Inspect the row at index peptide_10m_len - 2, which is taken below to be the
# last row that came from the 10M CSV (the line count includes one header row).
peptide, latent, extinct, source = read_peptide_dataset(i=peptide_10m_len-2)
print(extinct)
print(source)

# Index range of the 10M-sourced rows inside the HDF5 datasets; both ends are inclusive!
peptide_10m_start_idx = 0
peptide_10m_end_idx = peptide_10m_len - 2

True
peptide_10M


In [41]:
# Fix error where I accidentally set all of the extinct flags to True in the dataset:

def fix_extinct_dataset(start_idx: int = 0, end_idx: int = peptide_10m_len - 2, start_line_num: int = 1):
    """Rewrite the EXTINCT flags of the 10M-sourced rows from the raw CSV labels.

    Each CSV row `peptide,label` is parsed, and `EXTINCT[idx]` is set to True
    when the label is the string '1' and to False otherwise.

    Args:
        start_idx: First HDF5 index to write.
        end_idx: Last HDF5 index to write (inclusive). Rows beyond it are not read.
        start_line_num: Number of leading CSV lines to skip. Line numbers are
            0-indexed, so the default of 1 skips only the header.

    Stops at the first problem (malformed row, or a target row whose
    DATA_SOURCE is not 0), prints a diagnostic report and returns None.
    Rows written before the failure stay written.
    """
    # NOTE(review): this reads from "artifacts/..." while RAW_DATA_PATH points at
    # "data/raw_peptide/peptide_raw_10M.csv" — confirm which location is current.
    with open("artifacts/raw_peptide/peptide_raw_10M.csv", 'r') as infile, h5py.File(PEPTIDE_DATASET_PATH, 'r+') as outfile:
        extinct_ds = outfile['EXTINCT']
        data_source_ds = outfile['DATA_SOURCE']

        # skip first start_line_num lines in infile csv
        for _ in range(start_line_num):
            next(infile)

        idx = start_idx
        # Progress total is the number of rows in [start_idx, end_idx], not the raw line count.
        expected_rows = max(end_idx - start_idx + 1, 0)
        for line_num, raw_line in tqdm(enumerate(infile, start=start_line_num), total=expected_rows, desc='Reading Peptide10M CSV'):
            # Honor the inclusive upper bound so rows outside the 10M range are never touched.
            if idx > end_idx:
                break

            # Bind these before parsing so the error report can always print them,
            # even when the split below fails on this row.
            peptide = extinct = None
            try:
                raw_line = raw_line.strip()
                peptide, extinct = raw_line.split(',')

                extinct = extinct == '1'

                # Guard against writing into rows that belong to the 4.5M dataset.
                if data_source_ds[idx] != 0:
                    raise ValueError(f"overriding wrong data source with peptide={peptide} extinct={extinct} at idx = {idx}")

                extinct_ds[idx] = extinct
                idx += 1

            except Exception as e:
                print(f"Encountered an error while processing line_num {line_num}")
                print(f"Line was {raw_line} ")
                print(f"peptide = {peptide} ")
                print(f"extinct = {extinct} ")
                print(f"Attempted to index into idx={idx}")
                print("Error MSG:")
                print(e)
                return

# fix_extinct_dataset()

Reading Peptide10M CSV: 100%|█████████▉| 10274723/10274724 [05:51<00:00, 29190.98it/s]


In [None]:
# # Apply it to the new dataset

# def transfer_dataset():
#     # line_num is 0-indexed, so line_num 0 is the first line!

#     with h5py.File("data/peptide_dataset.h5", 'r') as infile, h5py.File("data/peptide_data_latents.h5", 'r+') as outfile:
#         extinct_ds_correct = infile['EXTINCT'][:]
#         outfile['EXTINCT'][:] = extinct_ds_correct
        
# transfer_dataset()

In [3]:
vae = gd.load_vae_peptides()
import util.visualization as vis
import torch

loading model from saved_models/peptide_vae/peptide-vae.ckpt
Enc params: 2,675,904
Dec params: 360,349


In [81]:
# Round-trip sanity check for one stored entry: compare the latent saved in the
# HDF5 file with a fresh VAE encoding, then decode the stored latent back to a peptide.
idx = 14543
peptide, latent, extinct, _ = read_peptide_dataset(idx)
latent = torch.tensor(latent, dtype=torch.float32)
print(f"Peptide: {peptide}")
print(f"Latent: Shape={latent.shape}, first 5 elements: {latent[:5]}")

# Re-encode the peptide string.
# NOTE(review): the re-encoded values differ from the stored ones in the recorded
# output — presumably the encoder samples stochastically; confirm.
latent_hat = gd.peptides_to_latent([peptide], vae=vae)
print(f"Latent Predicted: Shape={latent_hat.shape}, first 5 elements: {latent_hat[0][:5]}")

# Decode the stored latent; the result is indexed with [0], so a sequence of peptides is returned.
peptide_hat = gd.latent_to_peptides(latent, vae=vae)
print(f"Predicted Peptide from latent: {peptide_hat[0]}")
# print(latent_hat)
# # latent = torch.tensor(latent, dtype=torch.float32)
# # print(z)

Peptide: AYHMVNPSPWPLTGALSALLMTSGLIMWFHYNSMSLLMLGFTTNL
Latent: Shape=torch.Size([256]), first 5 elements: tensor([ 1.2213,  0.8944,  1.1083, -0.5544, -0.3610])
Latent Predicted: Shape=torch.Size([1, 256]), first 5 elements: tensor([ 1.0926,  1.2618,  1.4820, -0.9541, -0.7224], device='cuda:0')
Predicted Peptide from latent: AYHMVNPSPWPLTGALSALLMTSGLIMWFHYNSMSLLMLGFTTNL
