# In [5]:
# Notebook sanity check: report that the utilities cell has been executed.
print("utils are loaded")

# Output: utils are loaded


# In [14]:
import pandas as pd

def vocabulary(series):
    """
    Build the vocabulary of distinct characters found in a Series of strings.

    All strings in ``series`` are concatenated and reduced to their unique
    characters. The result is sorted so that the vocabulary -- and any index
    mapping derived from it -- is identical from run to run, instead of
    following the arbitrary (hash-randomized) iteration order of a set.

    Args:
        series: pandas Series of strings (e.g. amino-acid sequences).

    Returns:
        Sorted list of the unique single-character strings in ``series``.
    """
    return sorted(set(series.str.cat()))

# In [15]:
def one_hot_encode(series, aa_vocab):
    """
    Turn every sequence into a one-hot matrix of shape
    (len(sequence), len(aa_vocab)).

    Row i of a matrix holds a single 1 in the column assigned to the i-th
    character of the sequence; columns follow the order of ``aa_vocab``.
    Sequences are not padded, so the matrices only share one shape when all
    sequences have the same length (the data is expected to contain strings
    of length 24 only).

    Returns a list with one float NumPy array per sequence, in input order.
    A character that is missing from ``aa_vocab`` raises KeyError.
    """
    # Column assigned to each vocabulary symbol
    column_of = {symbol: position for position, symbol in enumerate(aa_vocab)}
    n_columns = len(aa_vocab)

    matrices = []
    for sequence in series:
        matrix = np.zeros((len(sequence), n_columns))
        # Explicit integer dtype keeps the index array valid for empty sequences
        hot_columns = np.array(
            [column_of[symbol] for symbol in sequence], dtype=np.intp
        )
        # Set one cell per row: (row i, column of the i-th symbol)
        matrix[np.arange(len(sequence)), hot_columns] = 1
        matrices.append(matrix)

    return matrices

# In [20]:
class SequenceDataset(Dataset):
    """
    Map-style PyTorch Dataset that pairs each stored sequence with its label.

    Indexing returns ``(sequence, label)``, where the sequence has been given
    an extra leading dimension of size 1 via ``torch.unsqueeze``
    (presumably a channel axis for the downstream model -- confirm against
    the model definition).
    """

    def __init__(self, sequences, labels):
        # Both containers are indexed with the same position in __getitem__.
        self.labels = labels
        self.sequences = sequences

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Insert a new axis of size 1 in front of the stored sequence.
        expanded = torch.unsqueeze(self.sequences[idx], dim=0)
        return expanded, self.labels[idx]

def prepare_data(dataset, sample_size, batch_size=64, aa_vocab=None):
    """
    Build a shuffled PyTorch DataLoader from a random sample of a DataFrame.

    Args:
        dataset: pandas DataFrame with a 'junction_aa' column (the sequences)
            and a 'Label' column (numeric labels).
        sample_size: number of rows drawn at random with ``DataFrame.sample``.
        batch_size: batch size of the returned DataLoader.
        aa_vocab: optional ordered collection of symbols that defines the
            one-hot columns. Pass the same vocabulary to every call (e.g. for
            the train and the test split) to get a consistent encoding. When
            omitted, the vocabulary is derived from the sampled sequences and
            sorted, so its order does not change between runs.

    Returns:
        DataLoader over a SequenceDataset holding the one-hot encoded
        sequences and the labels, both as float32 tensors.

    Raises:
        KeyError: if a sequence contains a symbol missing from ``aa_vocab``
            (raised by ``one_hot_encode``).
        ValueError: if the sampled sequences do not all have the same length
            and therefore cannot be combined into one array.
    """
    # Local import so this function does not depend on a module-level alias.
    import numpy as np

    # Randomly sample from the dataset
    dataset_sample = dataset.sample(n=sample_size)

    # Extract sequences and labels
    sequences = dataset_sample['junction_aa']
    labels = dataset_sample['Label']

    # Create the vocabulary unless the caller supplied a shared one; sorting
    # pins the column order, which a plain set would leave arbitrary.
    if aa_vocab is None:
        aa_vocab = sorted(vocabulary(sequences))

    # One-hot encode the sequences (list of 2-D arrays, one per sequence)
    encoded = one_hot_encode(sequences, aa_vocab)

    # Convert to PyTorch tensors. Combining the per-sequence arrays into one
    # float32 array first avoids torch.tensor()'s slow element-by-element
    # path for lists of ndarrays (PyTorch emits a UserWarning for that).
    sequences = torch.from_numpy(np.asarray(encoded, dtype=np.float32))
    labels = torch.tensor(labels.values).float()

    # Create PyTorch Dataset
    data = SequenceDataset(sequences, labels)

    # Create PyTorch DataLoader
    loader = DataLoader(data, batch_size=batch_size, shuffle=True)

    return loader

# In [21]:
from sklearn.metrics import roc_auc_score

def calculate_roc_auc(model, loader):
    """
    Compute the ROC AUC of ``model`` over all batches yielded by ``loader``.

    The model is run in eval mode with gradient tracking disabled. If the
    model was in training mode on entry, that mode is restored afterwards
    (also when an error occurs), so calling this between training epochs
    does not leave the model in eval mode for the rest of training.

    Relies on a ``device`` name defined outside this function; inputs and
    labels are moved to it before the forward pass.

    Args:
        model: callable model exposing ``eval()`` (and ``train()`` when it
            is in training mode), e.g. a ``torch.nn.Module``.
        loader: iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        The value of ``sklearn.metrics.roc_auc_score`` for the collected
        labels and model outputs.
    """
    # Remember the current mode; objects without the flag are left in eval.
    was_training = getattr(model, "training", False)
    model.eval()
    y_true = []
    y_score = []
    try:
        with torch.no_grad():
            for inputs, labels in loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                y_true.extend(labels.tolist())
                y_score.extend(outputs.tolist())
    finally:
        # Undo the eval() switch so a surrounding training loop is unaffected.
        if was_training:
            model.train()
    return roc_auc_score(y_true, y_score)