# Dataset

## 加载数据文件

In [1]:
from rdkit.Chem import PandasTools # Load the Pandas-Tools module
import pandas as pd
from rdkit import Chem 
from rdkit.Chem import AllChem

In [2]:
file = r'D:\儿童医院ZWT_MS\25博士申请\密西西比大学\tox21_10k_data_all.sdf'

data = PandasTools.LoadSDF(file) # Load the SDF file into a Pandas DataFrame
data['Smiles'] = data.ROMol.map(Chem.MolToSmiles) # Add a column with the SMILES representation of the molecule

X = data['Smiles'].to_list() # Get the SMILES as a list
Y = (~pd.isna(data['NR-PPAR-gamma'])).values  # Get the NR-PPAR-gamma column as a list of booleans
assert len(X)==len(Y) # Check that the number of SMILES and the number of labels are the same

[15:44:30] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 10
[15:44:30] Explicit valence for atom # 3 Cl, 1, is greater than permitted
[15:44:30] ERROR: Could not sanitize molecule ending on line 21572
[15:44:30] ERROR: Explicit valence for atom # 3 Cl, 1, is greater than permitted
[15:44:30] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 10
[15:44:31] Explicit valence for atom # 3 Cl, 1, is greater than permitted
[15:44:31] ERROR: Could not sanitize molecule ending on line 446665
[15:44:31] ERROR: Explicit valence for atom # 3 Cl, 1, is greater than permitted
[15:44:31] The 2 defining bonds for an atropisomer are co-planar - atoms are: 4 10
[15:44:31] Explicit valence for atom # 1 Cl, 1, is greater than permitted
[15:44:31] ERROR: Could not sanitize molecule ending on line 619150
[15:44:31] ERROR: Explicit valence for atom # 1 Cl, 1, is greater than permitted


In [3]:
len(X)

11761

In [4]:
X

['C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.Nc1ccc2cc3ccc(N)cc3nc2c1.[Cl-]',
 'O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(Br)c([O-])c(Br)cc12.[Na+].[Na+]',
 'CO[C@@H]1[C@@H](OC)[C@H](C)[C@@](O)(CC(=O)[O-])O[C@H]1[C@H](C)[C@H]1O[C@@]2(CC[C@@](C)([C@H]3CC[C@@](C)([C@@H]4O[C@@H]([C@H]5O[C@](C)(O)[C@H](C)C[C@@H]5C)C[C@@H]4OC4C[C@H](OC)[C@@H](OC)[C@H](C)O4)O3)O2)C[C@H](O)[C@H]1C.[NH4+]',
 'CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)cc1.CN(C)c1ccc(C(=C2C=CC(=[N+](C)C)C=C2)c2ccccc2)cc1.O=C(O)C(=O)O.O=C([O-])C(=O)O.O=C([O-])C(=O)O',
 'CC(=O)O.CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=O)[C@H](CC(C)C)NC(=O)[C@@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(=O)[C@H](Cc1c[nH]cn1)NC(=O)[C@@H]1CCC(=O)N1',
 'CCCCCCCCNC(C)C(O)c1ccc(SC(C)C)cc1',
 'Cc1ccc([N+](=O)[O-])c2c1O[Hg]2',
 'CCN(CC)C(=S)SSC(=S)N(CC)CC',
 'CCCCCCOc1ccc(C(=N)N(CCCC)CCCC)c2ccccc12.Cl',
 'COCC(=O)O[C@]1(CCN(C)CCCc2nc3ccccc3[nH]2)CCc2cc(F)ccc2[C@@H]1C(C)C.Cl.Cl',
 'CCCCC(CC)CNC(=N)NC(=N)NC

In [5]:
Y

array([False, False, False, ...,  True,  True,  True])

In [6]:
Y.shape

(11761,)

In [7]:
from rdkit import Chem 
from rdkit.Chem import Draw 

smiles = "Cl.Cc1cc(N)c(N)c(C)c1Nc1cc(C)c(N)c(N)c1C" # SMILES of the molecule

mol = Chem.MolFromSmiles(smiles) # Create a molecule from the SMILES


img = Draw.MolToImage(mol) 

# 显示绘制的分子结构图像
img.show()

## Smiles to One-hot Encoding

In [8]:
import re
import torch 
from collections import Counter

class SmilesTokenizer(object):
    """
    A simple regex-based tokenizer adapted from the deepchem smiles_tokenizer package.
    SMILES regex pattern for the tokenization is designed by Schwaller et. al., ACS Cent. Sci 5 (2019)
    """

    def __init__(self):
        self.regex_pattern = (
            r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\."
            r"|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        )
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, smiles):

        tokens = [token for token in self.regex.findall(smiles)]
        return tokens
    
def build_vocab(smiles_list, tokenizer, max_vocab_size):

    tokenized_smiles = [tokenizer.tokenize(s) for s in smiles_list]
    token_counter = Counter(c for s in tokenized_smiles for c in s)
    tokens = [token for token, _ in token_counter.most_common(max_vocab_size)]
    vocab = {token: idx for idx, token in enumerate(tokens)}
    return vocab

def smiles_to_onehot(smiles, tokenizer, vocab):

    unknown_token_id = len(vocab) - 1
    token_ids = [vocab.get(token, unknown_token_id) for token in tokenizer.tokenize(smiles)]
    ohe = torch.eye(len(vocab))[token_ids]
    return ohe

In [9]:
# Example
import numpy as np 


In [10]:
# Example
import numpy as np 
tokenizer = SmilesTokenizer()
smiles = "C==CCCOOOCl"
print("SMILES:\n\t", smiles)
print("SMILESTokenizer:\n\t", ", ".join(tokenizer.tokenize(smiles)))
vocab = build_vocab([smiles], tokenizer, 4)
print("build_vocab:\n\t", vocab)
print("onehot:\n", np.array(smiles_to_onehot(smiles, tokenizer, vocab)).T)

SMILES:
	 C==CCCOOOCl
SMILESTokenizer:
	 C, =, =, C, C, C, O, O, O, Cl
build_vocab:
	 {'C': 0, 'O': 1, '=': 2, 'Cl': 3}
onehot:
 [[1. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]
 [0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [11]:
print("SMILES string:\n\t", smiles)

SMILES string:
	 C==CCCOOOCl


In [12]:
tokenizer = SmilesTokenizer()
print("Tokens:\n\t", ", ".join(tokenizer.tokenize(smiles)))

Tokens:
	 C, =, =, C, C, C, O, O, O, Cl


In [13]:
vocab = build_vocab([smiles], tokenizer, 4)
print("Vocab:\n\t", vocab)

Vocab:
	 {'C': 0, 'O': 1, '=': 2, 'Cl': 3}


In [16]:
print("OHE:\n", np.array(smiles_to_onehot(smiles, tokenizer, vocab)).T)

OHE:
 [[1. 0. 0. 1. 1. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]
 [0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


## 制作数据集Dataset

In [17]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset

# build a vocab using the training data
max_vocab_size = 30
vocab = build_vocab(X, tokenizer, max_vocab_size) 
vocab_size = len(vocab)

# transform smiles to one-hot encoded tensors and apply padding
Xcode = pad_sequence(
    sequences=[smiles_to_onehot(i, tokenizer, vocab) for i in X],
    batch_first=True,
    padding_value=0,
)

dataset = TensorDataset(Xcode, torch.Tensor(Y).unsqueeze(1))

# build dataset
total_length = len(dataset)

# 
train_length = int(0.8 * total_length)  # 0.8 of the data is used for training
test_length = total_length - train_length  # 0.2 of the data is used for testing


all_indices = torch.randperm(total_length) # shuffle the indices

train_indices = all_indices[:train_length] # split the indices
test_indices = all_indices[train_length:] # split the indices

train_dataset = Subset(dataset, train_indices) # creat the train_datset
test_dataset = Subset(dataset, test_indices) # creat the test_datset

# creat DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [18]:
for x, y in train_loader:
    print(x)
    # print(y.size(0))


tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [19]:
Xcode

tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [20]:
Y

array([False, False, False, ...,  True,  True,  True])

In [21]:
torch.Tensor(Y).unsqueeze(1)

tensor([[0.],
        [0.],
        [0.],
        ...,
        [1.],
        [1.],
        [1.]])

In [22]:
dataset

<torch.utils.data.dataset.TensorDataset at 0x1c5549bf4c0>

In [23]:
data = dataset[0]
data 

(tensor([[1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([0.]))

# Model 

In [24]:
import torch.nn as nn

class RNNModel(nn.Module):
    """Vanilla RNN with one recurrent layer"""

    def __init__(self, input_size, hidden_size=32, num_layers=1):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.bn1 = nn.BatchNorm1d(num_features = hidden_size)
        self.fc = nn.Linear(hidden_size, 2)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, hn = self.rnn(x, h0)
        out = out[:, -1]
        out = self.dropout(out)
        out = self.bn1(out)
        out = self.fc(out)
        return out


# Train 

In [25]:
import time

In [27]:
def train():
    model = RNNModel(vocab_size, hidden_size=32)
    model.train()
    epoch_loss = 0
    batch_losses = []
        
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # define loss function
    criterion = nn.CrossEntropyLoss()
    
    # define optimizer
    epoch = 50

    for i in range(epoch):
        sam_num = 0 # number of samples
        total_loss = 0 # total loss
        Start = time.time() # start time
        correct = 0 # number of correct samples
        for x, y in train_loader:

            # forward
            y_pred = model(x)

            # compute loss
            loss = criterion(y_pred, y.view(-1).long())
            total_loss += loss.item()

            # backward
            optimizer.zero_grad() # clear the gradient
            loss.backward() # backward
            optimizer.step() # update the parameters

            # compute the number of correct samples
            correct += (torch.argmax(y_pred.data, -1) == y.squeeze(1)).sum().item()
            # print("arg:",torch.argmax(y_pred, -1))
            # print("len",y.squeeze(1))
            sam_num += len(y)
            total_loss += loss.item()
            accuracy = correct/sam_num
            

        print("Epoch {},Loss:{},Accuracy:{}, Time:{}".format(i, total_loss/sam_num, accuracy, time.time()-Start))


In [28]:
train()

Epoch 0,Loss:0.17783598383875932,Accuracy:0.5502763605442177, Time:18.427626132965088
Epoch 1,Loss:0.1644002953480904,Accuracy:0.6256377551020408, Time:18.48573327064514
Epoch 2,Loss:0.1605632542664198,Accuracy:0.6668792517006803, Time:18.236082792282104
Epoch 3,Loss:0.15757895958590873,Accuracy:0.6846301020408163, Time:18.123430490493774
Epoch 4,Loss:0.156677350889714,Accuracy:0.6899447278911565, Time:18.368205547332764
Epoch 5,Loss:0.1560206001586452,Accuracy:0.6953656462585034, Time:18.311565160751343
Epoch 6,Loss:0.15502565755781267,Accuracy:0.6975977891156463, Time:18.569801092147827
Epoch 7,Loss:0.1547970063921039,Accuracy:0.6978103741496599, Time:18.604939460754395
Epoch 8,Loss:0.15436164458237944,Accuracy:0.6978103741496599, Time:18.947700262069702
Epoch 9,Loss:0.15449612998866102,Accuracy:0.6977040816326531, Time:19.552510738372803
Epoch 10,Loss:0.15417549935277222,Accuracy:0.6975977891156463, Time:19.844913482666016
Epoch 11,Loss:0.1540066413961503,Accuracy:0.6977040816326531