In [1]:
# Install the notebook's dependencies by invoking pip through sys.executable,
# i.e. the Python interpreter that is executing this cell.
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install rdkit


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2023.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.1


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [6]:
# Define a Reaction dataset class
class ReactionDataset(Dataset):
    """Dataset pairing each reaction feature row with its two targets.

    Args:
        X: Indexable collection of per-reaction features.
        y_G_act: Indexable collection of G_act targets, aligned with ``X``.
        y_G_r: Indexable collection of G_r targets, aligned with ``X``.

    Raises:
        ValueError: If the three collections do not have the same length.
    """

    def __init__(self, X, y_G_act, y_G_r):
        # Fail early: mismatched lengths would otherwise only show up later as
        # an index error or as targets silently paired with the wrong rows.
        if not (len(X) == len(y_G_act) == len(y_G_r)):
            raise ValueError(
                "X, y_G_act and y_G_r must have the same length, got "
                f"{len(X)}, {len(y_G_act)} and {len(y_G_r)}"
            )
        self.X = X
        self.y_G_act = y_G_act
        self.y_G_r = y_G_r

    def __len__(self):
        """Return the number of reactions in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, G_act, G_r)`` triple stored at ``idx``."""
        return self.X[idx], self.y_G_act[idx], self.y_G_r[idx]

In [7]:
# Load the full reaction table from disk
df = pd.read_csv('full_dataset.csv')

# Pull out the reaction SMILES column (inputs) and the two target columns
X_smiles, y_G_act, y_G_r = df['rxn_smiles'], df['G_act'], df['G_r']

In [8]:
# Define the encoding function
def encode_smiles(smiles, radius=2, n_bits=1024):
    """Encode one molecule's SMILES string as a Morgan fingerprint tensor.

    Args:
        smiles: SMILES string of a single molecule.
        radius: Radius passed to the Morgan fingerprint (default 2).
        n_bits: Length of the fingerprint bit vector (default 1024).

    Returns:
        A 1-D ``torch.float32`` tensor of length ``n_bits`` whose entries are
        0.0 or 1.0, one per fingerprint bit.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; raise here so
        # the offending string is named instead of failing inside the
        # fingerprint call with an unrelated-looking error.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # The bit string is a sequence of '0'/'1' characters, one per bit.
    features = np.array([int(bit) for bit in fp.ToBitString()], dtype=np.float32)
    return torch.tensor(features)

# Build one list of per-molecule fingerprints for every reaction
X_fingerprints = []

for rxn in X_smiles:
    # A reaction SMILES is "<reactants>>><products>"; molecules on each side
    # are separated by '.'
    lhs, rhs = rxn.split('>>')
    molecules = lhs.split('.') + rhs.split('.')
    X_fingerprints.append([encode_smiles(molecule) for molecule in molecules])

# Concatenate each reaction's fingerprints into one row, then stack the rows
reaction_rows = [torch.cat(fingerprints) for fingerprints in X_fingerprints]
X_tensor = torch.stack(reaction_rows).to(device)

# Show the feature tensor and its shape
print(X_tensor)
print(X_tensor.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([5269, 3072])


In [9]:
# Turn both target columns into float32 tensors created directly on `device`
y_G_act_tensor = torch.tensor(y_G_act.values, dtype=torch.float32, device=device)
y_G_r_tensor = torch.tensor(y_G_r.values, dtype=torch.float32, device=device)

In [10]:
# Hold out 20% of the reactions for testing; the fixed seed keeps the split reproducible
(
    X_train, X_test,
    y_G_act_train, y_G_act_test,
    y_G_r_train, y_G_r_test,
) = train_test_split(
    X_tensor,
    y_G_act_tensor,
    y_G_r_tensor,
    test_size=0.2,
    random_state=42,
)


In [11]:
class NeuralNetwork(nn.Module):
    """Feed-forward network with a shared trunk and two scalar output heads.

    The trunk is three Linear -> ReLU -> Dropout(0.2) stages
    (input_size -> 128 -> 128 -> 64). Two separate Linear(64, 1) heads then
    produce the G_act and G_r predictions from the same hidden features.
    """

    def __init__(self, input_size):
        super().__init__()
        # Stage 1: input_size -> 128
        self.fc1 = nn.Linear(input_size, 128)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        # Stage 2: 128 -> 128
        self.fc2 = nn.Linear(128, 128)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        # Stage 3: 128 -> 64
        self.fc3 = nn.Linear(128, 64)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.2)
        # One output head per target
        self.fc4_act = nn.Linear(64, 1)
        self.fc4_r = nn.Linear(64, 1)

    def forward(self, x):
        """Return ``(output_act, output_r)``, each with one value per input row."""
        hidden = self.dropout1(self.relu1(self.fc1(x)))
        hidden = self.dropout2(self.relu2(self.fc2(hidden)))
        hidden = self.dropout3(self.relu3(self.fc3(hidden)))
        return self.fc4_act(hidden), self.fc4_r(hidden)

In [25]:
# Build the network with one input unit per feature column, then move it to `device`
model = NeuralNetwork(input_size=X_train.shape[1])
model.to(device)

NeuralNetwork(
  (fc1): Linear(in_features=3072, out_features=128, bias=True)
  (relu1): ReLU()
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (relu2): ReLU()
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=128, out_features=64, bias=True)
  (relu3): ReLU()
  (dropout3): Dropout(p=0.2, inplace=False)
  (fc4_act): Linear(in_features=64, out_features=1, bias=True)
  (fc4_r): Linear(in_features=64, out_features=1, bias=True)
)

In [45]:
# Smooth L1 loss for both targets; Adam with weight decay updates all model parameters
criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(
    params=model.parameters(),
    lr=0.0001,
    weight_decay=0.001,
)

In [46]:
# Wrap each split in a ReactionDataset
train_dataset = ReactionDataset(X_train, y_G_act_train, y_G_r_train)
test_dataset = ReactionDataset(X_test, y_G_act_test, y_G_r_test)

# Batches of 32; only the training batches are shuffled
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32)

In [47]:
# Training loop: minimise the sum of the G_act and G_r losses
num_epochs = 1000

for epoch in range(num_epochs):
    # Put the model in training mode every epoch. Other cells in this notebook
    # call model.eval(); without this, re-running the training cell afterwards
    # would train with dropout switched off.
    model.train()
    running_loss = 0.0
    for inputs, targets_G_act, targets_G_r in train_dataloader:
        targets_G_act = targets_G_act.to(device)
        targets_G_r = targets_G_r.to(device)
        inputs = inputs.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs_G_act, outputs_G_r = outputs  # Separate the outputs into two tensors

        # unsqueeze(1) adds a trailing dimension to the targets so they line up
        # with the (batch, 1) outputs of the Linear(64, 1) heads.
        loss_G_act = criterion(outputs_G_act, targets_G_act.unsqueeze(1))
        loss_G_r = criterion(outputs_G_r, targets_G_r.unsqueeze(1))
        loss = loss_G_act + loss_G_r
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    epoch_loss = running_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}")

Epoch 1/1000, Loss: 4.703514603051272
Epoch 2/1000, Loss: 4.525348285833995
Epoch 3/1000, Loss: 4.467112788648317
Epoch 4/1000, Loss: 4.434290223049395
Epoch 5/1000, Loss: 4.379505493424156
Epoch 6/1000, Loss: 4.372170851086125
Epoch 7/1000, Loss: 4.329987495234518
Epoch 8/1000, Loss: 4.377725120746728
Epoch 9/1000, Loss: 4.320981148517493
Epoch 10/1000, Loss: 4.245370138775218
Epoch 11/1000, Loss: 4.192886227911169
Epoch 12/1000, Loss: 4.216735446091854
Epoch 13/1000, Loss: 4.143590708573659
Epoch 14/1000, Loss: 4.145339216246749
Epoch 15/1000, Loss: 4.0558641408429
Epoch 16/1000, Loss: 4.072325966574929
Epoch 17/1000, Loss: 4.140241066614787
Epoch 18/1000, Loss: 4.085428750876225
Epoch 19/1000, Loss: 4.06447248025374
Epoch 20/1000, Loss: 4.148945943875746
Epoch 21/1000, Loss: 4.015003953919266
Epoch 22/1000, Loss: 4.0000294121828945
Epoch 23/1000, Loss: 4.027165402065624
Epoch 24/1000, Loss: 4.056331891002077
Epoch 25/1000, Loss: 3.9512478889841023
Epoch 26/1000, Loss: 3.958374740499

In [49]:
# Evaluate the model on the testing data
model.eval()  # evaluation mode: dropout is disabled

test_loss_G_act = 0.0
test_loss_G_r = 0.0

# Targets and predictions are collected across batches so that MSE/MAE are
# computed once over the whole test set. Averaging per-batch metrics would
# over-weight a final batch that is smaller than the others.
true_G_act_batches, pred_G_act_batches = [], []
true_G_r_batches, pred_G_r_batches = [], []

with torch.no_grad():
    for inputs, targets_G_act, targets_G_r in test_dataloader:
        # Move every tensor to the model's device, as the training loop does
        inputs = inputs.to(device)
        targets_G_act = targets_G_act.to(device)
        targets_G_r = targets_G_r.to(device)
        outputs_G_act, outputs_G_r = model(inputs)  # one tensor per target

        loss_G_act = criterion(outputs_G_act, targets_G_act.unsqueeze(1))
        loss_G_r = criterion(outputs_G_r, targets_G_r.unsqueeze(1))

        test_loss_G_act += loss_G_act.item()
        test_loss_G_r += loss_G_r.item()

        # squeeze(1) drops the trailing output dimension so predictions have
        # the same 1-D layout as the targets
        true_G_act_batches.append(targets_G_act.cpu())
        pred_G_act_batches.append(outputs_G_act.squeeze(1).cpu())
        true_G_r_batches.append(targets_G_r.cpu())
        pred_G_r_batches.append(outputs_G_r.squeeze(1).cpu())

# Mean of the per-batch losses
average_test_loss_G_act = test_loss_G_act / len(test_dataloader)
average_test_loss_G_r = test_loss_G_r / len(test_dataloader)

true_G_act = torch.cat(true_G_act_batches).numpy()
pred_G_act = torch.cat(pred_G_act_batches).numpy()
true_G_r = torch.cat(true_G_r_batches).numpy()
pred_G_r = torch.cat(pred_G_r_batches).numpy()

mse_G_act = mean_squared_error(true_G_act, pred_G_act)
mae_G_act = mean_absolute_error(true_G_act, pred_G_act)
mse_G_r = mean_squared_error(true_G_r, pred_G_r)
mae_G_r = mean_absolute_error(true_G_r, pred_G_r)

# As before, the reported figures are the sum of the two targets' metrics
avg_mse = mse_G_act + mse_G_r
avg_mae = mae_G_act + mae_G_r

print(f"Average Test Loss G_act: {average_test_loss_G_act}")
print(f"Average Test Loss G_r: {average_test_loss_G_r}")

print(f"Mean Squared Error (MSE): {avg_mse:.4f}")
print(f"Mean Absolute Error (MAE): {avg_mae:.4f}")

RuntimeError: ignored

# To demonstrate how the trained model works, we use the following example:

In [50]:

# Read the small example dataset
df = pd.read_csv('debuging_dataset.csv')

# Extract the SMILES strings
smiles = df['rxn_smiles']

# Show the reference target values for the example reactions
print(f"G_act {df['G_act']} G_r {df['G_r']}  ")

# Encode SMILES strings to fingerprints
X_fingerprints = []

# The loop variable has its own name so that the `smiles` Series is not
# overwritten by its last element while it is being iterated.
for rxn_smiles in smiles:
    reactants, products = rxn_smiles.split('>>')
    reactant_fingerprints = [encode_smiles(reactant) for reactant in reactants.split('.')]
    product_fingerprints = [encode_smiles(product) for product in products.split('.')]
    X_fingerprints.append(reactant_fingerprints + product_fingerprints)

# Concatenate each reaction's fingerprints into one row and stack the rows.
# NOTE: this rebinds X_test, replacing the tensor of the same name produced by
# the earlier train_test_split call.
X_test = torch.stack([torch.cat(fingerprints) for fingerprints in X_fingerprints]).to(device)


G_act 0     15.875896
1     15.155477
2     18.013824
3     23.687079
4     23.438094
5     20.969801
6     11.722625
7      7.268044
8      6.289673
9     16.044475
10    13.604005
Name: G_act, dtype: float64 G_r 0    -51.881526
1    -51.398681
2    -66.822349
3    -62.481289
4    -84.836459
5    -57.785046
6    -63.721331
7    -86.996102
8    -86.799671
9    -50.409197
10   -51.008126
Name: G_r, dtype: float64  


In [52]:

# Evaluation mode: dropout is disabled for prediction
model.eval()

# Run the example inputs through the network without tracking gradients
test_inputs = X_test
with torch.no_grad():
    raw_G_act, raw_G_r = model(test_inputs)  # one tensor per target

# Bring both prediction tensors back to the CPU as numpy arrays
predictions_G_act = raw_G_act.detach().cpu().numpy()
predictions_G_r = raw_G_r.detach().cpu().numpy()

# Print one value per example, G_act first and then G_r
for heading, predictions in (
    ("Predictions for G_act:", predictions_G_act),
    ("Predictions for G_r:", predictions_G_r),
):
    print(heading)
    for row in predictions:
        print(row[0])

Predictions for G_act:
15.528451
15.004355
18.296814
22.711151
23.070389
20.372826
11.329659
7.5581665
7.8600636
15.880939
13.679186
Predictions for G_r:
-50.93228
-51.38302
-65.1774
-61.78781
-82.55413
-56.837776
-64.16272
-86.10514
-77.14561
-48.911022
-48.26027
