In [1]:
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter

# Encapsulate SMILES and Labels

In [2]:
class SMILESDataset(Dataset):
    """Map-style dataset pairing SMILES strings with integer labels.

    Each item is a character-level encoding of one SMILES string, wrapped in
    ``<SOS>`` / ``<EOS>`` tokens, together with its label.

    Args:
        smiles: Iterable of SMILES strings (list, NumPy array, pandas Series, ...).
        labels: Labels aligned with ``smiles``; stored as a NumPy array.
        tokenizer: Dict holding a ``'stoi'`` mapping (token -> integer id) that
            contains at least ``'<PAD>'``, ``'<SOS>'`` and ``'<EOS>'``.
    """

    def __init__(self, smiles, labels, tokenizer):
        # Materialize as a plain list so __getitem__ is always positional.
        # A pandas Series keeps its own index (shuffled and gappy after a
        # train/test split), and `series[idx]` would then look up by label,
        # raising KeyError or returning a sample that does not match the label.
        self.smiles = list(smiles)
        self.labels = np.array(labels)
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of SMILES strings in the dataset."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the sample at position ``idx``.

        Both elements are ``torch.long`` tensors. Characters missing from the
        vocabulary are encoded with the ``<PAD>`` id.
        """
        stoi = self.tokenizer['stoi']
        pad_id = stoi['<PAD>']

        token_ids = [stoi['<SOS>']]
        token_ids.extend(stoi.get(ch, pad_id) for ch in self.smiles[idx])
        token_ids.append(stoi['<EOS>'])

        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

# Create vocabulary

In [3]:
def create_vocab(smiles_list):
    """Build a character-level vocabulary from a collection of SMILES strings.

    The special tokens ``<PAD>``, ``<SOS>`` and ``<EOS>`` take ids 0, 1 and 2;
    every distinct character then gets the next id in order of first appearance.

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        Dict with ``'stoi'`` (token -> id) and ``'itos'`` (id -> token) mappings.
    """
    # A dict preserves insertion order, giving the unique characters in the
    # order they are first encountered.
    seen_chars = dict.fromkeys(ch for smile in smiles_list for ch in smile)
    tokens = ['<PAD>', '<SOS>', '<EOS>'] + list(seen_chars)

    stoi = {}
    itos = {}
    for index, token in enumerate(tokens):
        stoi[token] = index
        itos[index] = token

    return {'stoi': stoi, 'itos': itos}

# Collate function: pad sequences and batch labels

In [4]:
def collate_fn(batch, padding_value=0):
    """Collate ``(token_ids, label)`` samples into padded batch tensors.

    Args:
        batch: Non-empty sequence of ``(token_ids, label)`` pairs, where
            ``token_ids`` is a 1-D tensor and ``label`` is a tensor or scalar.
            All labels in a batch must share the same shape.
        padding_value: Id used to right-pad shorter sequences. The default 0
            is the id ``create_vocab`` assigns to ``<PAD>``.

    Returns:
        Tuple ``(inputs_padded, labels_tensor)``: the inputs padded to the
        longest sequence in the batch (batch dimension first), and the labels
        stacked along a new leading dimension as ``torch.long``.
    """
    inputs, labels = zip(*batch)
    inputs_padded = pad_sequence(list(inputs), batch_first=True, padding_value=padding_value)
    # Stack rather than re-wrapping with torch.tensor(): stacking also handles
    # vector-valued (multi-label) targets, which torch.tensor() rejects when
    # given a sequence of non-scalar tensors.
    labels_tensor = torch.stack([torch.as_tensor(label) for label in labels]).to(torch.long)

    return inputs_padded, labels_tensor

# Load dataset: build train/test DataLoaders

In [5]:
def load_data_1d(smiles_list, labels, batch_size, test_size=0.2, random_state=42):
    """Split SMILES/label pairs and wrap both parts in DataLoaders.

    The vocabulary is built from the full ``smiles_list`` before splitting, so
    every character in the test split has an id.

    Args:
        smiles_list: Iterable of SMILES strings.
        labels: Labels aligned with ``smiles_list``.
        batch_size: Batch size used for both loaders.
        test_size: Passed to ``train_test_split`` (default 0.2).
        random_state: Seed passed to ``train_test_split`` (default 42).

    Returns:
        Tuple ``(train_loader, test_loader)``. The training loader shuffles
        each epoch; the test loader keeps a fixed order.
    """
    # Use a plain list so the split parts are indexed by position. A pandas
    # Series would keep its original index through the split and break
    # integer lookups in the dataset.
    smiles_list = list(smiles_list)
    tokenizer = create_vocab(smiles_list)
    smiles_train, smiles_test, labels_train, labels_test = train_test_split(
        smiles_list, labels, test_size=test_size, random_state=random_state
    )
    train_dataset = SMILESDataset(smiles_train, labels_train, tokenizer)
    test_dataset = SMILESDataset(smiles_test, labels_test, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    return train_loader, test_loader

In [6]:
def load_data_1d_10fold_cv(smiles_list, labels, train_idx, test_idx, batch_size):
    """Build train/test DataLoaders for one fold given precomputed indices.

    Args:
        smiles_list: Sequence of SMILES strings for the whole dataset.
        labels: Labels aligned with ``smiles_list``.
        train_idx: Indices selecting the training samples (NumPy indexing).
        test_idx: Indices selecting the held-out samples (NumPy indexing).
        batch_size: Batch size used for both loaders.

    Returns:
        Tuple ``(train_loader, test_loader)``. The training loader shuffles;
        the test loader does not.
    """
    all_smiles = np.array(smiles_list)
    all_labels = np.array(labels)

    fold_train_smiles, fold_test_smiles = all_smiles[train_idx], all_smiles[test_idx]
    fold_train_labels, fold_test_labels = all_labels[train_idx], all_labels[test_idx]

    # The vocabulary covers the full dataset, not just this fold's training part.
    vocab = create_vocab(smiles_list)

    fold_train = SMILESDataset(fold_train_smiles, fold_train_labels, vocab)
    fold_test = SMILESDataset(fold_test_smiles, fold_test_labels, vocab)

    return (
        DataLoader(fold_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn),
        DataLoader(fold_test, batch_size=batch_size, shuffle=False, collate_fn=collate_fn),
    )

In [7]:
def pack_smiles_label(smiles_list, labels):
    """Bundle SMILES strings and their labels into a ``(smiles_list, labels)`` tuple."""
    return (smiles_list, labels)

In [8]:
def unpack_smiles_label(data):
    """Split a two-element ``(smiles_list, labels)`` bundle back into its parts.

    Unpacking raises an error if ``data`` does not hold exactly two items.
    """
    smiles_part, label_part = data
    return (smiles_part, label_part)