In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from sklearn.model_selection import train_test_split
import numpy as np

# get dadatset and process labels

In [2]:
data_path = '../dataset/canonical_trainset.csv'
df = pd.read_csv(data_path)

smiles = df['SMILES'][:50].tolist()
labels = df['Label'][:50].apply(lambda x: 1 if x == 'Positive' else 0).tolist()

# preprocess SMILES

In [3]:
class SMILESDataset(Dataset):
    def __init__(self, smiles, labels, tokenizer):
        self.smiles = smiles
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        smile = self.smiles[idx]
        label = self.labels[idx]
        encoded_smile = [self.tokenizer['stoi']['<SOS>']] + [self.tokenizer['stoi'].get(c, self.tokenizer['stoi']['<PAD>']) for c in smile] + [self.tokenizer['stoi']['<EOS>']]
        return torch.tensor(encoded_smile, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# create vocabulary
def create_vocab(smiles_list):
    counter = Counter()
    for smile in smiles_list:
        counter.update(smile)
    vocab = ['<PAD>', '<SOS>', '<EOS>'] + list(counter.keys())
    stoi = {s: i for i, s in enumerate(vocab)}
    itos = {i: s for i, s in enumerate(vocab)}
    return {'stoi': stoi, 'itos': itos}

def collate_fn(batch):
    inputs, labels = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    labels_tensor = torch.tensor(labels, dtype=torch.float).unsqueeze(1)  # Reshape for consistency with output
    return inputs_padded, labels_tensor


# load data

In [4]:
tokenizer = create_vocab(smiles)
stoi = tokenizer['stoi']

smiles_train, smiles_test, labels_train, labels_test = train_test_split(smiles, labels, test_size=0.2, random_state=42)

train_dataset = SMILESDataset(smiles_train, labels_train, tokenizer)
test_dataset = SMILESDataset(smiles_test, labels_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)


# Build model

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd

class RNNModel(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, text, extract_features=False):
        embedded = self.embedding(text)
        output, (hidden, _) = self.lstm(embedded)
        if extract_features:
            return hidden.squeeze(0) # extract feature
        hidden = hidden.squeeze(0)
        dense_outputs = self.fc(hidden)
        outputs = self.sigmoid(dense_outputs)
        return outputs

# train and evaluate model

In [6]:
input_dim = len(stoi)
print(input_dim)
embed_dim = 64
hidden_dim = 256
output_dim = 1

model = RNNModel(input_dim, embed_dim, hidden_dim, output_dim)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

def train(model, train_loader, optimizer, criterion):
    model.train()
    total_loss = 0
    total_correct = 0
    total = 0
    
    for texts, labels in train_loader:
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)  
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        predicted = (predictions >= 0.5).float()
        total_correct += (predicted == labels).sum().item()
        total += labels.size(0)
    
    average_loss = total_loss / len(train_loader)
    accuracy = total_correct / total
    return average_loss, accuracy

def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    total = 0
    
    with torch.no_grad():
        for texts, labels in test_loader:
            predictions = model(texts)
            loss = criterion(predictions, labels) 
            total_loss += loss.item()
            predicted = (predictions >= 0.5).float()
            total_correct += (predicted == labels).sum().item() 
            total += labels.size(0)
    
    average_loss = total_loss / len(test_loader)
    accuracy = total_correct / total
    return average_loss, accuracy


best_val_accuracy = 0
num_epochs=10

for epoch in range(num_epochs):
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion)
    val_loss, val_accuracy = evaluate(model, test_loader, criterion)
    
    print(f'Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}')
    
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best1_model.pth')



21
Epoch: 1/10, Train Loss: 0.6257, Train Acc: 0.9500, Val Loss: 0.5005, Val Acc: 1.0000
Epoch: 2/10, Train Loss: 0.4844, Train Acc: 0.9500, Val Loss: 0.3924, Val Acc: 1.0000
Epoch: 3/10, Train Loss: 0.3650, Train Acc: 0.9500, Val Loss: 0.2998, Val Acc: 1.0000
Epoch: 4/10, Train Loss: 0.2638, Train Acc: 0.9500, Val Loss: 0.2268, Val Acc: 1.0000
Epoch: 5/10, Train Loss: 0.1852, Train Acc: 0.9500, Val Loss: 0.1789, Val Acc: 1.0000
Epoch: 6/10, Train Loss: 0.1356, Train Acc: 0.9500, Val Loss: 0.1532, Val Acc: 1.0000
Epoch: 7/10, Train Loss: 0.1113, Train Acc: 0.9500, Val Loss: 0.1409, Val Acc: 1.0000
Epoch: 8/10, Train Loss: 0.1015, Train Acc: 0.9500, Val Loss: 0.1354, Val Acc: 1.0000
Epoch: 9/10, Train Loss: 0.0978, Train Acc: 0.9500, Val Loss: 0.1337, Val Acc: 1.0000
Epoch: 10/10, Train Loss: 0.0961, Train Acc: 0.9500, Val Loss: 0.1347, Val Acc: 1.0000


# extract features

In [7]:
import numpy as np
import torch
from torch.utils.data import DataLoader

def extract_features_1d(model, iterator):
    model.eval()
    features = []
    all_labels = [] 
    with torch.no_grad():
        for texts, labels in iterator:
            feature = model(texts, extract_features=True)
            features.append(feature.cpu().numpy())  
            all_labels.append(labels.cpu().numpy()) 
    features = np.concatenate(features, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    return features, all_labels


In [8]:
model.load_state_dict(torch.load('best1_model.pth'))
all_dataset = SMILESDataset(smiles, labels, tokenizer)
all_data_loader = DataLoader(all_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
all_features, all_labels = extract_features_1d(model, all_data_loader)

In [9]:
print(all_features.shape)
print(all_labels.shape)

(50, 256)
(50, 1)
