<a href="https://colab.research.google.com/github/Fulmenius/Predicting-antibody-escape-with-ML/blob/main_script/models/utils.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Marker output: printing this message lets whoever runs the file see that
# this cell was executed.
print('utils are loaded')

utils are loaded


In [None]:
import pandas as pd

def vocabulary(series):
    """
    Build the vocabulary of characters that occur in a Series of strings.

    Parameters
    ----------
    series : pandas.Series
        Series of strings; all entries are concatenated with ``.str.cat()``.

    Returns
    -------
    list of str
        Every distinct character found in the concatenated strings, in
        sorted order.
    """
    # Sort the characters: iterating a set of strings has no guaranteed order
    # and may differ between interpreter runs (string hash randomization), so
    # an unsorted vocabulary would make any character -> index mapping built
    # from it irreproducible.
    return sorted(set(series.str.cat()))

In [None]:
def one_hot_encode(series, aa_vocab):
    """
    Turn every sequence into a one-hot matrix of shape
    (len(sequence), len(aa_vocab)).

    Row ``i`` of a matrix holds a single 1 in the column assigned to the i-th
    character of the sequence; a character's column is its position in
    ``aa_vocab``. Sequences are not padded (the original note states that all
    strings in the dataset have a length of 24). A character that is missing
    from ``aa_vocab`` raises ``KeyError``.

    Returns a list with one float NumPy array per sequence.
    """
    # Column assigned to each vocabulary symbol
    column_of = {}
    for column, symbol in enumerate(aa_vocab):
        column_of[symbol] = column
    n_columns = len(aa_vocab)

    def encode(seq):
        # Start from all zeros, then switch on one cell per position
        matrix = np.zeros((len(seq), n_columns))
        columns = np.array([column_of[symbol] for symbol in seq], dtype=int)
        matrix[np.arange(len(seq)), columns] = 1
        return matrix

    return [encode(seq) for seq in series]

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    """
    Dataset that pairs each encoded sequence with its label.

    Indexing yields a ``(sequence, label)`` tuple in which the sequence has
    been given an extra leading dimension of size 1.
    """
    def __init__(self, sequences, labels):
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # The number of labels defines the size of the dataset
        return len(self.labels)

    def __getitem__(self, idx):
        # Insert the new axis at position 0, then attach the matching label
        return torch.unsqueeze(self.sequences[idx], dim=0), self.labels[idx]

def prepare_data(dataset, sample_size, batch_size=64, aa_vocab=None):
    """
    This function prepares a PyTorch DataLoader from a pandas DataFrame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'junction_aa' (sequence strings) and
        'Label'.
    sample_size : int
        Number of rows drawn at random with ``DataFrame.sample``.
    batch_size : int, optional
        Batch size of the returned DataLoader (default 64).
    aa_vocab : sequence of str, optional
        Characters that define the one-hot columns, in column order. Pass the
        same vocabulary to every call (e.g. for train and test loaders) so
        that all loaders share one encoding. When omitted, the vocabulary is
        built from the sampled sequences and sorted.

    Returns
    -------
    torch.utils.data.DataLoader
        Shuffling loader over a SequenceDataset whose items are
        ``(sequence, label)`` pairs of float tensors.

    Raises
    ------
    KeyError
        If a sequence contains a character that is not in ``aa_vocab``.
    """
    # Randomly sample from the dataset
    dataset_sample = dataset.sample(n=sample_size)

    # Extract sequences and labels
    sequences = dataset_sample['junction_aa']
    labels = dataset_sample['Label']

    # Create the vocabulary unless the caller supplied one. Sorting makes the
    # column order deterministic for a given set of characters.
    if aa_vocab is None:
        aa_vocab = sorted(vocabulary(sequences))

    # One-hot encode the sequences (one NumPy array per sequence)
    encoded = one_hot_encode(sequences, aa_vocab)

    # Convert to PyTorch tensors. The list of arrays is first collapsed into
    # a single ndarray: building a tensor directly from a list of ndarrays is
    # very slow and triggers a PyTorch warning.
    sequences = torch.tensor(np.array(encoded)).float()
    labels = torch.tensor(labels.values).float()

    # Create PyTorch Dataset
    data = SequenceDataset(sequences, labels)

    # Create PyTorch DataLoader
    loader = DataLoader(data, batch_size=batch_size, shuffle=True)

    return loader

In [None]:
class EmbeddingDataset(Dataset):
    """
    Dataset that pairs each precomputed embedding with its label.

    Indexing yields the ``(embedding, label)`` tuple stored at that position;
    neither value is modified.
    """
    def __init__(self, embeddings, labels):
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        # The number of labels defines the size of the dataset
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

from ast import literal_eval
import re
import numpy as np

def str_to_array(s):
    """
    Parse the text form of a flat numeric array into a NumPy array.

    Accepts numbers separated by whitespace and/or commas, optionally
    wrapped in one pair of brackets, e.g. ``"[0.1 0.2]"``, ``"[0.1, 0.2]"``
    or ``"[0.1,0.2]"``. Leading and trailing whitespace is ignored.

    Parameters
    ----------
    s : str
        Text representation of a one-dimensional array.

    Returns
    -------
    numpy.ndarray
        1-D float array; empty when the text contains no numbers.

    Raises
    ------
    ValueError
        If a token cannot be converted to ``float``.
    """
    # Drop surrounding whitespace first so the brackets really are the
    # outermost characters, then remove at most one bracket on each side.
    text = s.strip()
    if text[:1] in ('[', '('):
        text = text[1:]
    if text[-1:] in (']', ')'):
        text = text[:-1]

    # Raw-string pattern (a plain '\s' is an invalid escape sequence).
    # Splitting on commas as well as whitespace also handles text such as
    # "1.0,2.0" that has no space after the comma.
    tokens = re.split(r'[\s,]+', text)

    # Splitting leaves empty strings at the edges; skip them
    return np.array([float(token) for token in tokens if token])

def prepare_t5_data(embedded_dataset, sample_size, batch_size=64):
    """
    Build a shuffling PyTorch DataLoader from a random sample of a pandas
    DataFrame that holds T5 embeddings.

    The DataFrame must provide the columns 'embeddings' and 'Labels'.
    ``sample_size`` rows are drawn with ``DataFrame.sample``; the loader
    yields ``(embedding, label)`` batches of float tensors of size
    ``batch_size``.
    """
    sampled = embedded_dataset.sample(n=sample_size)
    raw_embeddings, raw_labels = sampled['embeddings'], sampled['Labels']

    # Embeddings serialized as text are parsed back into arrays; the first
    # sampled entry decides whether the column is treated as text.
    stored_as_text = isinstance(raw_embeddings.iloc[0], str)
    if stored_as_text:
        raw_embeddings = raw_embeddings.apply(str_to_array)

    # Convert to PyTorch tensors: stack the per-row arrays into one array
    embedding_tensor = torch.tensor(np.stack(raw_embeddings.values)).float()
    label_tensor = torch.tensor(raw_labels.values).float()

    # Wrap the tensors in a Dataset and hand it to a shuffling DataLoader
    return DataLoader(
        EmbeddingDataset(embedding_tensor, label_tensor),
        batch_size=batch_size,
        shuffle=True,
    )

In [None]:
from sklearn.metrics import roc_auc_score

def calculate_roc_auc(model, loader, device=None):
    """
    Compute the ROC AUC of a model over every batch of a DataLoader.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate. It is switched to eval mode and is left in that
        mode when the function returns.
    loader : iterable
        Yields ``(inputs, labels)`` batches of tensors.
    device : torch.device or str, optional
        Device the batches are moved to before the forward pass. When
        omitted, the device of the model's first parameter is used; a model
        without parameters receives the batches unmoved.

    Returns
    -------
    float
        Value of ``sklearn.metrics.roc_auc_score`` computed from all labels
        and all model outputs.
    """
    # Resolve the device from the model instead of relying on a global
    # `device` name, which this file never defines.
    if device is None:
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device

    model.eval()
    y_true = []
    y_score = []
    # No gradients are needed for evaluation
    with torch.no_grad():
        for inputs, labels in loader:
            if device is not None:
                inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            # Collect plain Python values so batches can be concatenated
            y_true.extend(labels.tolist())
            y_score.extend(outputs.tolist())
    return roc_auc_score(y_true, y_score)