In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

import numpy as np
import pandas as pd
from typing import Tuple, Dict, List
import random
import re

torch.__version__

'2.6.0+cu124'

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [3]:
from pathlib import Path

embeddings_path = Path.cwd().parent.parent / "esm" / "embeds"
print(embeddings_path.resolve())

/mnt/tank/scratch/azaikina/esm/embeds


In [5]:
import os
def walk_through_dir(dir_path):
  """
  Walks through dir_path returning its contents.
  Args:
    dir_path (str or pathlib.Path): target directory
  
  Returns:
    A print out of:
      number of subdiretories in dir_path
      number of images (files) in each subdirectory
      name of each subdirectory
  """
  for dirpath, dirnames, filenames in os.walk(dir_path):
    print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")

walk_through_dir(embeddings_path)

There are 0 directories and 16691 images in '/mnt/tank/scratch/azaikina/esm/embeds'.


In [None]:
numpy_embed_path = list(Path(embeddings_path).glob("*.npy"))[0]
numpy_embed = np.load(numpy_embed_path)
tensor = torch.from_numpy(numpy_embed).type(torch.float32)
tensor.dtype

torch.float32

In [7]:
class EmbeddingsDataset(Dataset):
    def __init__(self, embeddings_path: str, with_name=False):
        self.paths = list(Path(embeddings_path).glob("*.npy"))
        self.with_name = with_name
    def load_embedding(self, index: int, dtype = torch.float32) -> torch.Tensor:
        "Transforms numpy to torch.Tensor."
        npy_path = self.paths[index]
        npy_embed = np.load(npy_path)
        tensor = torch.from_numpy(npy_embed).type(dtype)
        return tensor
    def __len__(self) -> int:
        "Returns the total number of embeddings."
        return len(self.paths)
    def __getitem__(self, index: int):
        "Returns one sample of data and original file name."
        embedding = self.load_embedding(index)
        name = str(self.paths[index])

        if self.with_name:
            return embedding, name # return tensor, file name
        else:
            return embedding # return tensor

In [8]:
data_custom = EmbeddingsDataset(embeddings_path=embeddings_path, with_name=True)
data_custom

<__main__.EmbeddingsDataset at 0x7f96acc810f0>

In [9]:
print(len(data_custom))
random_samples_idx = random.sample(range(len(data_custom)), k=8)
print('random samples')

for i, targ_sample in enumerate(random_samples_idx):
        targ_emb = data_custom[targ_sample]
        print(targ_emb)

16690
random samples
(tensor([ 0.0103, -0.1574, -0.0748,  ..., -0.0995, -0.0991,  0.0440]), '/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_19630.npy')
(tensor([ 0.0789, -0.0314, -0.0532,  ..., -0.0026,  0.0069,  0.0122]), '/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_6826.npy')
(tensor([ 0.0103, -0.1574, -0.0748,  ..., -0.0995, -0.0991,  0.0440]), '/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_4472.npy')
(tensor([ 0.0687, -0.0285, -0.0512,  ..., -0.0046, -0.0021,  0.0105]), '/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_9102.npy')
(tensor([ 0.0564, -0.0473, -0.0747,  ..., -0.1364, -0.0472,  0.0989]), '/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_13857.npy')
(tensor([ 0.0822, -0.0282, -0.0428,  ..., -0.0012, -0.0089,  0.0079]), '/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_11822.npy')
(tensor([ 0.0103, -0.1574, -0.0748,  ..., -0.0995, -0.0991,  0.0440]), '/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_4734.npy')
(tensor([ 0.0172,  0.0108, -0.0695,  ..., -0.0100, -0.0675,

In [10]:
text = '/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_3214.npy'

text.find('*/*.npy')

s = 'esm/embeds/esm_embed_3214.npy'
match = re.search(r"(\d+)(?=\.npy$)", s)
if match:
    print(match.group(1)) 

3214


In [11]:
train_dataloader_custom = DataLoader(dataset=data_custom,
                                     batch_size=5,
                                     shuffle=True)

test_dataloader_custom = DataLoader(dataset=data_custom,
                                    batch_size=1,
                                    shuffle=False)

train_dataloader_custom, test_dataloader_custom

(<torch.utils.data.dataloader.DataLoader at 0x7f96acc82740>,
 <torch.utils.data.dataloader.DataLoader at 0x7f96acc80ee0>)

In [12]:
# Get image and label from custom DataLoader
img_custom = next(iter(train_dataloader_custom))

# Batch size will now be 1, try changing the batch_size parameter above and see what happens
print(f"Image shape: {img_custom.shape} -> [batch_size, color_channels, height, width]")

AttributeError: 'list' object has no attribute 'shape'

# Aptamers dataset

In [6]:
df_path = data_path / '3_checked_intersections.csv'
df = pd.read_csv(df_path, index_col = 0)

In [7]:
#original class
class AptamersDataset(Dataset):
    def __init__(self, df: pd.DataFrame, embeddings_path: str,
                 ab_name_column: str = 'Name of Antibody', apt_name_column: str = 'Name of Aptamer', tg_name_column: str = 'Target_ab',
                 ab_seq_column: str = 'Antibody Sequence', apt_seq_column: str = 'Aptamer Sequence', tg_seq_column: str = 'target_seq_ab',
                 with_names: bool =True):
        self.df = df
        self.embeddings_path = embeddings_path

        self.ab_name_column = ab_name_column
        self.apt_name_column = apt_name_column
        self.tg_name_column = tg_name_column
        self.ab_seq_column = ab_seq_column
        self.apt_seq_column = apt_seq_column
        self.tg_seq_column = tg_seq_column

        self.with_names = with_names
        self.embeddings_path = list(Path(embeddings_path).glob("*.npy"))

    def load_embedding(self, index: int, dtype = torch.float32) -> torch.Tensor:
        "Transforms numpy to torch.Tensor."
        npy_path = self.embeddings_path[index]
        npy_embed = np.load(npy_path)
        tensor = torch.from_numpy(npy_embed).type(dtype)
        return tensor

    def __len__(self) -> int:
        "Returns the total number of samples."
        return len(df)
    
    def __getitem__(self, index: int):
        embedding = self.load_embedding(index)
        emb_path = str(self.embeddings_path[index])
        match = re.search(r"(\d+)(?=\.npy$)", emb_path)
        if match:
            emb_index = match.group(1)   # path to index; '/esm/embeds/esm_embed_3214.npy' -> 3214   

        ab_name = df.loc[emb_index, self.ab_name_column]
        apt_name = df.loc[emb_index, self.apt_name_column]
        tg_name = df.loc[emb_index, self.tg_name_column]
        
        ab_seq = df.loc[emb_index, self.ab_seq_column]
        apt_seq = df.loc[emb_index, self.apt_seq_column]
        tg_seq = df.loc[emb_index, self.tg_seq_column]


        if self.with_names:
            return embedding, ab_name, apt_name, tg_name, ab_seq, apt_seq, tg_seq
        else:
            return embedding # return tensor

In [None]:
#modification of AptamerDataset, 
#!! need to check if the returned tensor is the right one
class AptamersDataset(Dataset):
    """    
    Returns:
    A tuple out of:
        embedding (type torch.Tensor), 
        ab_name, apt_name, tg_name, 
        ab_seq, apt_seq, tg_seq
    """
    def __init__(self, df: pd.DataFrame, embeddings_path: str,
                 ab_name_column: str = 'Name of Antibody', apt_name_column: str = 'Name of Aptamer', tg_name_column: str = 'Target_ab',
                 ab_seq_column: str = 'Antibody Sequence', apt_seq_column: str = 'Aptamer Sequence', tg_seq_column: str = 'target_seq_ab',
                 with_names: bool =True):
        self.df = df
        self.embeddings_path = embeddings_path

        self.ab_name_column = ab_name_column
        self.apt_name_column = apt_name_column
        self.tg_name_column = tg_name_column
        self.ab_seq_column = ab_seq_column
        self.apt_seq_column = apt_seq_column
        self.tg_seq_column = tg_seq_column

        self.with_names = with_names
        self.embeddings_path = list(Path(embeddings_path).glob("*.npy"))

    def load_embedding(self, index: int, dtype = torch.float32) -> torch.Tensor:
        "Transforms numpy to torch.Tensor."
        npy_path = self.embeddings_path[index]
        npy_embed = np.load(npy_path)
        tensor = torch.from_numpy(npy_embed).type(dtype)
        return tensor

    def __len__(self) -> int:
        "Returns the total number of samples."
        return len(df)
    
    def __getitem__(self, index: int):
        embedding = self.load_embedding(index)
        # emb_path = str(self.embeddings_path[index])
        # match = re.search(r"(\d+)(?=\.npy$)", emb_path)
        # if match:
        #     emb_index = match.group(1)   # path to index; '/esm/embeds/esm_embed_3214.npy' -> 3214   
        emb_index = index
        ab_name = df.loc[emb_index, self.ab_name_column]
        apt_name = df.loc[emb_index, self.apt_name_column]
        tg_name = df.loc[emb_index, self.tg_name_column]
        
        ab_seq = df.loc[emb_index, self.ab_seq_column]
        apt_seq = df.loc[emb_index, self.apt_seq_column]
        tg_seq = df.loc[emb_index, self.tg_seq_column]


        if self.with_names:
            return embedding, ab_name, apt_name, tg_name, ab_seq, apt_seq, tg_seq
        else:
            return embedding # return tensor


apt_dataset = AptamersDataset(df=df, embeddings_path = embeddings_path)
len(apt_dataset)

print(len(apt_dataset))
samples_idx = [15831, 8998, 624, 1677, 16380, 1009]
print('random samples')

for id in samples_idx:
    print(id, apt_dataset[id])

16693
random samples
15831 (tensor([ 0.0750, -0.0339, -0.0551,  ..., -0.0058, -0.0065,  0.0141]), '6iuv_H_L_A', 'APIPred_1934', 'hemagglutinin', 'QVQLVQSGAEVKETGESLNISCKVSGNNFPSYYISWVRQMPGNGLEWMGRIDPSDSDTNYRPSFQGHVTISADKSTSTAYLQWRSLKASDTAMYYCARRATYYYGSGSYFDAFDIWGQGTMVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSC|EIVMTQSPLTLPVTPGAPASISCRSSQSLLHSDGYNYLDWYLQKPGQSPQLLIYLGSHRASGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCMQALQTPDFGQGTRLEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC', 'GAATGAGGAATAATCTAGCTCCTTCGCTGA', 'NGVKPLILRDCSVAGWLLGNPMCDEFINVPEWSYIVEKASPANDLCYPGNFNDYEELKHLLSRINHFEKIQIIPKSSWSNHDASSGVSSACPYLGRSSFFRNVVWLIKKNSAYPTIKRSYNNTNQEDLLVLWGIHHPNDAAEQTKLYQNPTTYISVGTSTLNQRLVPEIATRPKVNGQSGRMEFFWTILKPNDAINFESNGNFIAPEYAYKIVKKGDSTIMKSE')
8998 (tensor([ 0.0399, -0.0264, -0.0521,  ..., -0.1692, -0.0243,  0.0457]), '7u9p_H_L_A', 'C9', 'spike glycoprotein', 'VQLLEESGGGAVQPGRSLRLSC

In [None]:
#set embeddings and sequences paths 
data_path = Path("../data/")
#embeddings_path = Path("../../esm/embeds")
embeddings_path = Path.cwd().parent.parent / "esm" / "embeds"  #/mnt/tank/scratch/azaikina/esm/embeds


In [77]:
%%writefile data_setup.py
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

import numpy as np
import pandas as pd
from typing import Tuple, Dict, List
import random
import re

#set embeddings and sequences paths 
data_path = Path("../data/")
#embeddings_path = Path("../../esm/embeds")
embeddings_path = Path.cwd().parent.parent / "esm" / "embeds"  #/mnt/tank/scratch/azaikina/esm/embeds

df_path = data_path / '3_checked_intersections.csv'
df = pd.read_csv(df_path, index_col = 0)

class AptamersDataset(Dataset):
    """    
    Returns:
    A tuple out of:
        embedding (type torch.Tensor), 
        ab_name, apt_name, tg_name, 
        ab_seq, apt_seq, tg_seq
    """
    def __init__(self, df: pd.DataFrame, embeddings_path: str,
                 ab_name_column: str = 'Name of Antibody', apt_name_column: str = 'Name of Aptamer', tg_name_column: str = 'Target_ab',
                 ab_seq_column: str = 'Antibody Sequence', apt_seq_column: str = 'Aptamer Sequence', tg_seq_column: str = 'target_seq_ab',
                 with_names: bool =True):
        self.df = df
        self.embeddings_path = embeddings_path

        self.ab_name_column = ab_name_column
        self.apt_name_column = apt_name_column
        self.tg_name_column = tg_name_column
        self.ab_seq_column = ab_seq_column
        self.apt_seq_column = apt_seq_column
        self.tg_seq_column = tg_seq_column
        self.with_names = with_names

        #Mapping {embedding_number: file_path}
        all_files = list(Path(embeddings_path).glob("*.npy"))
        self.id_to_path = {}
        for f in all_files:
            match = re.search(r"(\d+)(?=\.npy$)", f.name)
            if match:
                emb_id = int(match.group(1))
                self.id_to_path[emb_id] = f

        # --- Keep only DataFrame rows that have an embedding ---
        self.valid_indices = [i for i in self.df.index if i in self.id_to_path]
        if len(self.valid_indices) < len(self.df):
            print(f"⚠️ {len(self.df) - len(self.valid_indices)} rows skipped (no embedding file found).")

    def load_embedding(self, emb_index: int, dtype = torch.float32) -> torch.Tensor:
        "Transforms numpy to torch.Tensor."
        npy_path = self.id_to_path[emb_index]
        
        npy_embed = np.load(npy_path)
        tensor = torch.from_numpy(npy_embed).type(dtype)
        return tensor

    def __len__(self) -> int:
        "Returns the total number of samples."
        return len(self.valid_indices)
    
    def __getitem__(self, index: int):
        emb_index = self.valid_indices[index]
        embedding = self.load_embedding(emb_index)
        embedding_path = self.id_to_path[emb_index]


        ab_name = self.df.loc[emb_index, self.ab_name_column]
        apt_name = self.df.loc[emb_index, self.apt_name_column]
        tg_name = self.df.loc[emb_index, self.tg_name_column]
        
        ab_seq = self.df.loc[emb_index, self.ab_seq_column]
        apt_seq = self.df.loc[emb_index, self.apt_seq_column]
        tg_seq = self.df.loc[emb_index, self.tg_seq_column]

        if self.with_names:
            return embedding, embedding_path, ab_name, apt_name, tg_name, ab_seq, apt_seq, tg_seq
        else:
            return embedding # return tensor
    
    def get_by_id(self, emb_id: int):
        """Fetch a sample by its true embedding ID (DataFrame index)"""
        if emb_id not in self.id_to_path:
            raise KeyError(f"No embedding found for ID {emb_id}")

        embedding = self.load_embedding(emb_id)
        embedding_path = self.id_to_path[emb_id]


        ab_name = self.df.loc[emb_id, self.ab_name_column]
        apt_name = self.df.loc[emb_id, self.apt_name_column]
        tg_name = self.df.loc[emb_id, self.tg_name_column]
        
        ab_seq = self.df.loc[emb_id, self.ab_seq_column]
        apt_seq = self.df.loc[emb_id, self.apt_seq_column]
        tg_seq = self.df.loc[emb_id, self.tg_seq_column]

        if self.with_names:
            return embedding, embedding_path, ab_name, apt_name, tg_name, ab_seq, apt_seq, tg_seq
        else:
            return embedding
        
def collate_embeddings(batch):
    """
    DataLoader cannot batch Path objects or strings by default. 
    Collate function to batch embeddings (torch.Tensor) and keep other fields as lists.
    """
    embeddings = torch.stack([item[0] for item in batch])  # batch tensor
    paths = [item[1] for item in batch]                   # list of Paths
    ab_names = [item[2] for item in batch]
    apt_names = [item[3] for item in batch]
    tg_names = [item[4] for item in batch]
    ab_seqs = [item[5] for item in batch]
    apt_seqs = [item[6] for item in batch]
    tg_seqs = [item[7] for item in batch]

    return embeddings, paths, ab_names, apt_names, tg_names, ab_seqs, apt_seqs, tg_seqs



Overwriting data_setup.py


In [69]:
#the last modification of AptamerDataset, 
class AptamersDataset(Dataset):
    """    
    Returns:
    A tuple out of:
        embedding (type torch.Tensor), 
        ab_name, apt_name, tg_name, 
        ab_seq, apt_seq, tg_seq
    """
    def __init__(self, df: pd.DataFrame, embeddings_path: str,
                 ab_name_column: str = 'Name of Antibody', apt_name_column: str = 'Name of Aptamer', tg_name_column: str = 'Target_ab',
                 ab_seq_column: str = 'Antibody Sequence', apt_seq_column: str = 'Aptamer Sequence', tg_seq_column: str = 'target_seq_ab',
                 with_names: bool =True):
        self.df = df
        self.embeddings_path = embeddings_path

        self.ab_name_column = ab_name_column
        self.apt_name_column = apt_name_column
        self.tg_name_column = tg_name_column
        self.ab_seq_column = ab_seq_column
        self.apt_seq_column = apt_seq_column
        self.tg_seq_column = tg_seq_column
        self.with_names = with_names

        #Mapping {embedding_number: file_path}
        all_files = list(Path(embeddings_path).glob("*.npy"))
        self.id_to_path = {}
        for f in all_files:
            match = re.search(r"(\d+)(?=\.npy$)", f.name)
            if match:
                emb_id = int(match.group(1))
                self.id_to_path[emb_id] = f

        # --- Keep only DataFrame rows that have an embedding ---
        self.valid_indices = [i for i in self.df.index if i in self.id_to_path]
        if len(self.valid_indices) < len(self.df):
            print(f"⚠️ {len(self.df) - len(self.valid_indices)} rows skipped (no embedding file found).")

    def load_embedding(self, emb_index: int, dtype = torch.float32) -> torch.Tensor:
        "Transforms numpy to torch.Tensor."
        npy_path = self.id_to_path[emb_index]
        
        npy_embed = np.load(npy_path)
        tensor = torch.from_numpy(npy_embed).type(dtype)
        return tensor

    def __len__(self) -> int:
        "Returns the total number of samples."
        return len(self.valid_indices)
    
    def __getitem__(self, index: int):
        emb_index = self.valid_indices[index]
        embedding = self.load_embedding(emb_index)
        embedding_path = self.id_to_path[emb_index]


        ab_name = self.df.loc[emb_index, self.ab_name_column]
        apt_name = self.df.loc[emb_index, self.apt_name_column]
        tg_name = self.df.loc[emb_index, self.tg_name_column]
        
        ab_seq = self.df.loc[emb_index, self.ab_seq_column]
        apt_seq = self.df.loc[emb_index, self.apt_seq_column]
        tg_seq = self.df.loc[emb_index, self.tg_seq_column]

        if self.with_names:
            return embedding, embedding_path, ab_name, apt_name, tg_name, ab_seq, apt_seq, tg_seq
        else:
            return embedding # return tensor
    
    def get_by_id(self, emb_id: int):
        """Fetch a sample by its true embedding ID (DataFrame index)"""
        if emb_id not in self.id_to_path:
            raise KeyError(f"No embedding found for ID {emb_id}")

        embedding = self.load_embedding(emb_id)
        embedding_path = self.id_to_path[emb_id]


        ab_name = self.df.loc[emb_id, self.ab_name_column]
        apt_name = self.df.loc[emb_id, self.apt_name_column]
        tg_name = self.df.loc[emb_id, self.tg_name_column]
        
        ab_seq = self.df.loc[emb_id, self.ab_seq_column]
        apt_seq = self.df.loc[emb_id, self.apt_seq_column]
        tg_seq = self.df.loc[emb_id, self.tg_seq_column]

        if self.with_names:
            return embedding, embedding_path, ab_name, apt_name, tg_name, ab_seq, apt_seq, tg_seq
        else:
            return embedding
        
def collate_embeddings(batch):
    """
    DataLoader cannot batch Path objects or strings by default. 
    Collate function to batch embeddings (torch.Tensor) and keep other fields as lists.
    """
    embeddings = torch.stack([item[0] for item in batch])  # batch tensor
    paths = [item[1] for item in batch]                   # list of Paths
    ab_names = [item[2] for item in batch]
    apt_names = [item[3] for item in batch]
    tg_names = [item[4] for item in batch]
    ab_seqs = [item[5] for item in batch]
    apt_seqs = [item[6] for item in batch]
    tg_seqs = [item[7] for item in batch]

    return embeddings, paths, ab_names, apt_names, tg_names, ab_seqs, apt_seqs, tg_seqs

apt_dataset = AptamersDataset(df=df, embeddings_path = embeddings_path)
len(apt_dataset)

print(len(apt_dataset))
samples_idx = [4, 1009]
print('random samples')

for id in samples_idx:
    print(id, apt_dataset[id])

⚠️ 4 rows skipped (no embedding file found).
16689
random samples
4 (tensor([ 0.0511, -0.0160, -0.0115,  ..., -0.0082, -0.0421,  0.0080]), PosixPath('/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_4.npy'), '2nz9_D_C_A', 'aptamer_limited_ds_482', 'botulinum neurotoxin type a', 'QVQLQESGGGLVQPGGSLRLSCAASGFTFSDHYMYWVRQAPGKGLEWVATISDGGSYTYYSDSVEGRFTTSRDNSKNTLYLQMNSLRAEDTAIYYCSRYRYDDAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKT|EIVLTQSPATLSLSPGERATISCRASESVDSYGHSFMQWYQQKPGQAPRLLIYRASNLEPGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQGNEVPFTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC', 'ATACCAGCTTATTCAATTGACATGACTGGGATTTTTGGCGAAATCGAAGGAAGCGGAGAGATAGTAAGTGCAATCT', 'PFVNKQFNYKDPVNGVDIAYIKIPNVGQMQPVKAFKIHNKIWVIPERDTFTNPEEGDLNPPPEAKQVPVSYYDSTYLSTDNEKDNYLKGVTKLFERIYSTDLGRMLLTSIVRGIPFWGGSTIDTELKVIDTNCINVIQPDGSYRSEELNLVIIGPSADIIQFECKSFGHEVLNLTRNGYGSTQYIRFSPDFTFG

In [71]:
df.loc[1161]

Target_ab                         vascular endothelial growth factor a
Target_apt                          Vascular_endothelial_growth_factor
Name of Aptamer                                           APIPred_1872
Name of Antibody                                            2qr0_L_K_I
Aptamer Sequence                                GAAUCAUACACAAGUUGUGGAG
Antibody Sequence    EVQLVESGGGLVQPGGSLRLSCAASGFNFSSSSIHWVRQAPGKGLE...
target_seq_ab        EVVKFMDVYQRSYCHPIETLVDIFQEYPDEIEYIFKPSCVPLMRCG...
target_seq_apt       MNFLLSWVHWSLALLLYLHHAKWSQAAPMAEGGGQNHHEVVKFMDV...
Name: 1161, dtype: object

In [72]:
apt_dataset.get_by_id(1161)

(tensor([ 0.0572, -0.0718, -0.0457,  ..., -0.1947, -0.0906,  0.0224]),
 PosixPath('/mnt/tank/scratch/azaikina/esm/embeds/esm_embed_1161.npy'),
 '2qr0_L_K_I',
 'APIPred_1872',
 'vascular endothelial growth factor a',
 'EVQLVESGGGLVQPGGSLRLSCAASGFNFSSSSIHWVRQAPGKGLEWVAYIYPSYSYTSYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARYYGTGAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSC|DIQMTQSPSSLSASVGDRVTITCRASQSVSSAVAWYQQKPGKAPKLLIYSASSLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQYSYYYYPFTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNR',
 'GAAUCAUACACAAGUUGUGGAG',
 'EVVKFMDVYQRSYCHPIETLVDIFQEYPDEIEYIFKPSCVPLMRCGGCCNDEGLECVPTEESNITMQIMRIKPHQGQHIGEMSFLQHNKCECRPKKD')

In [73]:
train_dataloader_custom = DataLoader(dataset=apt_dataset,
                                     batch_size=5,
                                     shuffle=True,
                                     collate_fn=collate_embeddings)

test_dataloader_custom = DataLoader(dataset=apt_dataset,
                                    batch_size=1,
                                    shuffle=False,
                                    collate_fn=collate_embeddings)

train_dataloader_custom, test_dataloader_custom

(<torch.utils.data.dataloader.DataLoader at 0x7f55462f1c00>,
 <torch.utils.data.dataloader.DataLoader at 0x7f55462f19c0>)

In [75]:
# --- Test one batch ---
batch = next(iter(train_dataloader_custom))

embeddings, paths, ab_names, apt_names, tg_names, ab_seqs, apt_seqs, tg_seqs = batch

print(f"Embeddings shape: {embeddings.shape}")  # torch.Tensor shape [batch_size, embedding_dim]
print(f"First embedding path: {paths[0]}")
print(f"First antibody name: {ab_names[0]}")
print(f"First aptamer name: {apt_names[0]}")
print(f"First target name: {tg_names[0]}")
print(f"First antibody sequence: {ab_seqs[0]}")
print(f"First aptamer sequence: {apt_seqs[0]}")
print(f"First target sequence: {tg_seqs[0]}")

# --- Test one batch ---
batch = next(iter(test_dataloader_custom))

embeddings, paths, ab_names, apt_names, tg_names, ab_seqs, apt_seqs, tg_seqs = batch

print(f"Embeddings shape: {embeddings.shape}")  # torch.Tensor shape [batch_size, embedding_dim]
print(f"First embedding path: {paths[0]}")
print(f"First antibody name: {ab_names[0]}")
print(f"First aptamer name: {apt_names[0]}")
print(f"First target name: {tg_names[0]}")
print(f"First antibody sequence: {ab_seqs[0]}")
print(f"First aptamer sequence: {apt_seqs[0]}")
print(f"First target sequence: {tg_seqs[0]}")


Embeddings shape: torch.Size([5, 1280])
First embedding path: /mnt/tank/scratch/azaikina/esm/embeds/esm_embed_1596.npy
First antibody name: 2qr0_H_G_J
First aptamer name: APIPred_865
First target name: vascular endothelial growth factor a
First antibody sequence: EVQLVESGGGLVQPGGSLRLSCAASGFNFSSSSIHWVRQAPGKGLEWVAYIYPSYSYTSYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARYYGTGAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSC|DIQMTQSPSSLSASVGDRVTITCRASQSVSSAVAWYQQKPGKAPKLLIYSASSLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQYSYYYYPFTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNR
First aptamer sequence: AGCAGCACAGAGGTCAGATGCGCGCAGAATTTTGAGTCATGTACTAAGGAATTGATTGGTCCTATGCGTGCTACCGTGAA
First target sequence: EVVKFMDVYQRSYCHPIETLVDIFQEYPDEIEYIFKPSCVPLMRCGGCCNDEGLECVPTEESNITMQIMRIKPHQGQHIGEMSFLQHNKCECRPKKD
Embeddings shape: torch.Size([1, 1280])
First embedding path: /mnt/tank/sc

In [None]:
'''
📘 Explanation — How this dataset is formed

Dataset name: AptamersDataset
Purpose: Used to feed paired biological sequence data (antibody, aptamer, and target sequences) with their precomputed embeddings into PyTorch models.

📘 Structure

Each dataset entry corresponds to one record in your DataFrame (df) that has an existing .npy embedding file.

Each .npy file (e.g. esm_embed_16380.npy) stores a precomputed embedding vector — usually produced by a protein language model such as ESM, ProtBert, or ProtT5.
The numeric part of the filename (16380) matches the DataFrame index.

📘 Returned tuple

Each call to __getitem__(i) returns:

Element	Type	Description
embedding	torch.Tensor	Loaded ESM embedding vector
embedding_path	pathlib.Path	Path to .npy file
ab_name	str	Antibody name
apt_name	str	Aptamer name
tg_name	str	Target name
ab_seq	str	Antibody sequence
apt_seq	str	Aptamer sequence
tg_seq	str	Target sequence





1️⃣__getitem__(self, index)

Used internally by PyTorch DataLoader.

index is a 0-based position in self.valid_indices, not the actual embedding ID.

Converts the DataLoader’s position index into the corresponding embedding ID:

2️⃣ get_by_id(self, emb_id)

Fetches a sample by its true embedding ID / DataFrame index, regardless of position in the dataset.

Looks up the embedding and the row directly:

📘 Example usage
apt_dataset = AptamersDataset(df=df, embeddings_path="/path/to/embeds")
print(len(apt_dataset))  # number of samples with embeddings

# Access a sample by its DataFrame index
embedding, path, ab, apt, tg, ab_seq, apt_seq, tg_seq = apt_dataset[0]
print(path, embedding.shape)

📘 Embedding details

    Format: .npy files containing numpy arrays (float32 or float64).

    Source: Precomputed embeddings (e.g. from ESM) for each antibody–aptamer–target triplet.

    Shape: Depends on the embedding model (e.g. [1280] for ESM-2, [1024] for ProtT5).

    Conversion: Loaded with np.load → torch.from_numpy → torch.float32.

    




    
'''

