In [14]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from sklearn.model_selection import train_test_split

# get dadatset and process labels

In [15]:
data_path = '../dataset/canonical_trainset.csv'
df = pd.read_csv(data_path)

smiles = df['SMILES'].tolist()
labels = df['Label'].apply(lambda x: 1 if x == 'Positive' else 0).tolist()

# preprocess SMILES

In [16]:
class SMILESDataset(Dataset):
    def __init__(self, smiles, labels, tokenizer):
        self.smiles = smiles
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        smile = self.smiles[idx]
        label = self.labels[idx]
        encoded_smile = [self.tokenizer['stoi']['<SOS>']] + [self.tokenizer['stoi'].get(c, self.tokenizer['stoi']['<PAD>']) for c in smile] + [self.tokenizer['stoi']['<EOS>']]
        return torch.tensor(encoded_smile, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# create vocabulary
def create_vocab(smiles_list):
    counter = Counter()
    for smile in smiles_list:
        counter.update(smile)
    vocab = ['<PAD>', '<SOS>', '<EOS>'] + list(counter.keys())
    stoi = {s: i for i, s in enumerate(vocab)}
    itos = {i: s for i, s in enumerate(vocab)}
    return {'stoi': stoi, 'itos': itos}

def collate_fn(batch):
    inputs, labels = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    labels_tensor = torch.tensor(labels, dtype=torch.float).unsqueeze(1)  # Reshape for consistency with output
    return inputs_padded, labels_tensor

# load data

In [17]:
tokenizer = create_vocab(smiles)
stoi = tokenizer['stoi']

smiles_train, smiles_test, labels_train, labels_test = train_test_split(smiles, labels, test_size=0.2, random_state=42)

train_dataset = SMILESDataset(smiles_train, labels_train, tokenizer)
test_dataset = SMILESDataset(smiles_test, labels_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)


# Build model and extract feature

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd


class RNNModel(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, text, extract_features=False):
        embedded = self.embedding(text)
        output, (hidden, _) = self.lstm(embedded)
        if extract_features:
            return hidden.squeeze(0) # extract feature
        hidden = hidden.squeeze(0)
        dense_outputs = self.fc(hidden)
        outputs = self.sigmoid(dense_outputs)
        return outputs

input_dim = len(stoi)
embed_dim = 64
hidden_dim = 256
output_dim = 1

model = RNNModel(input_dim, embed_dim, hidden_dim, output_dim)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for texts, labels in iterator:
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')


Epoch 1, Train Loss: 0.6982
Epoch 2, Train Loss: 0.6934
Epoch 3, Train Loss: 0.6932
Epoch 4, Train Loss: 0.6931
Epoch 5, Train Loss: 0.6949


In [19]:
def extract_features(model, loader):
    model.eval()
    features = []
    labels = []
    with torch.no_grad():
        for texts, label in loader:
            feature = model(texts, extract_features=True)
            features.append(feature)
            labels.append(label)
    features = torch.cat(features, 0)
    labels = torch.cat(labels, 0)
    return features, labels

all_features, all_labels = extract_features(model, test_loader)


In [21]:
print(all_features)

tensor([[ 0.0051, -0.0065, -0.0327,  ..., -0.0071, -0.0011, -0.0288],
        [ 0.0051, -0.0065, -0.0327,  ..., -0.0071, -0.0011, -0.0288],
        [ 0.0051, -0.0065, -0.0327,  ..., -0.0071, -0.0011, -0.0288],
        ...,
        [ 0.0051, -0.0065, -0.0327,  ..., -0.0071, -0.0011, -0.0288],
        [ 0.0051, -0.0065, -0.0327,  ..., -0.0071, -0.0011, -0.0288],
        [ 0.0051, -0.0065, -0.0327,  ..., -0.0071, -0.0011, -0.0288]])
