In [None]:
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install rdkit


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2023.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.1


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.feature_selection import SelectFromModel

In [None]:
# Tensors placed on this device are later handed to scikit-learn estimators,
# so everything is kept on the CPU.
DEVICE_TYPE = "cpu"
device = torch.device(DEVICE_TYPE)

In [None]:
# Define a Reaction dataset class
class ReactionDataset(Dataset):
    """Map-style dataset pairing each reaction's features with its two targets.

    Args:
        X: Sized, indexable collection of per-reaction features.
        y_G_act: Sized, indexable collection of ``G_act`` targets aligned with ``X``.
        y_G_r: Sized, indexable collection of ``G_r`` targets aligned with ``X``.

    Raises:
        ValueError: If the three collections do not all have the same length.
    """

    def __init__(self, X, y_G_act, y_G_r):
        # The dataset length is taken from X alone, so misaligned targets would
        # otherwise fail late with an IndexError or leave trailing targets unread.
        if not (len(X) == len(y_G_act) == len(y_G_r)):
            raise ValueError(
                "X, y_G_act and y_G_r must have the same length, got "
                f"{len(X)}, {len(y_G_act)} and {len(y_G_r)}"
            )
        self.X = X
        self.y_G_act = y_G_act
        self.y_G_r = y_G_r

    def __len__(self):
        """Return the number of reactions (the length of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, G_act, G_r)`` triple stored at position ``idx``."""
        return self.X[idx], self.y_G_act[idx], self.y_G_r[idx]

In [None]:
# Load the full reaction dataset from disk
df = pd.read_csv('full_dataset.csv')

# Pull out the reaction SMILES column (model input) and the two target columns
X_smiles, y_G_act, y_G_r = (df[column] for column in ('rxn_smiles', 'G_act', 'G_r'))

In [None]:
# Define the encoding function
def encode_smiles(smiles, radius=2, n_bits=1024):
    """Encode a single molecule's SMILES string as a Morgan fingerprint tensor.

    Args:
        smiles: SMILES string describing one molecule.
        radius: Radius passed to the Morgan fingerprint (default 2).
        n_bits: Number of bits in the fingerprint (default 1024).

    Returns:
        A 1-D ``torch.float32`` tensor with one 0.0/1.0 entry per fingerprint bit.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; stop here and
        # name the offending string rather than failing inside the fingerprint call.
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    bitstring = fp.ToBitString()
    # The bit string is a sequence of '0'/'1' characters; turn each into a float.
    features = np.array([int(bit) for bit in bitstring], dtype=np.float32)
    return torch.tensor(features)

# Encode each reaction SMILES ("reactants>>products") as the list of its
# molecule fingerprints: reactants first, then products.
X_fingerprints = []

for rxn_smiles in X_smiles:
    reactants, products = rxn_smiles.split('>>')
    components = reactants.split('.') + products.split('.')
    X_fingerprints.append([encode_smiles(component) for component in components])

# Each row is the concatenation of its molecules' fingerprints, so rows only
# stack into one matrix when every reaction has the same number of molecules.
component_counts = {len(fingerprints) for fingerprints in X_fingerprints}
if len(component_counts) > 1:
    raise ValueError(
        "Reactions contain differing numbers of molecules "
        f"({sorted(component_counts)}); their concatenated fingerprints "
        "cannot be stacked into a single feature matrix."
    )

# Convert the per-reaction fingerprint lists into one feature tensor
X_tensor = torch.stack([torch.cat(fingerprints) for fingerprints in X_fingerprints]).to(device)

# Print the fingerprint tensor
print(X_tensor)
print(X_tensor.shape)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([5269, 3072])


In [None]:
# Build float32 target tensors on `device` from the two pandas target columns
y_G_act_tensor = torch.tensor(y_G_act.to_numpy(), dtype=torch.float32, device=device)
y_G_r_tensor = torch.tensor(y_G_r.to_numpy(), dtype=torch.float32, device=device)

In [None]:
# Hold out 20% of the reactions for testing; features and both targets are
# split together so rows stay aligned, and the fixed seed makes the split repeatable.
(
    X_train,
    X_test,
    y_G_act_train,
    y_G_act_test,
    y_G_r_train,
    y_G_r_test,
) = train_test_split(
    X_tensor,
    y_G_act_tensor,
    y_G_r_tensor,
    test_size=0.2,
    random_state=42,
)


In [None]:
class RandomForestModel(nn.Module):
    """Two independent random forests, one per target, behind an ``nn.Module`` interface.

    The forests are scikit-learn estimators stored as plain attributes. They
    must be trained with :meth:`fit` before :meth:`forward` is called.

    Args:
        input_size: Not used by this class; kept so existing constructor calls
            keep working.
    """

    def __init__(self, input_size):
        super(RandomForestModel, self).__init__()
        self.random_forest_act = RandomForestRegressor(n_estimators=100)  # Random Forest for G_act
        self.random_forest_r = RandomForestRegressor(n_estimators=100)  # Random Forest for G_r

    def fit(self, x, y_G_act, y_G_r):
        """Train both forests on the same features.

        Args:
            x: Feature matrix shared by both forests.
            y_G_act: Targets for the ``G_act`` forest.
            y_G_r: Targets for the ``G_r`` forest.

        Returns:
            ``self``, so calls can be chained.
        """
        self.random_forest_act.fit(x, y_G_act)
        self.random_forest_r.fit(x, y_G_r)
        return self

    def forward(self, x):
        """Predict both targets for ``x``.

        Args:
            x: Feature matrix passed unchanged to both forests.

        Returns:
            Tuple ``(output_act, output_r)`` holding whatever each forest's
            ``predict`` returns for ``x``.
        """
        output_act = self.random_forest_act.predict(x)  # Output for G_act
        output_r = self.random_forest_r.predict(x)  # Output for G_r
        return output_act, output_r

In [None]:

# Three random forests of increasing size (100/200/300 trees) per target,
# all created with the same fixed seed.
rf_model_act1, rf_model_act2, rf_model_act3 = (
    RandomForestRegressor(n_estimators=n_trees, random_state=42)
    for n_trees in (100, 200, 300)
)

rf_model_r1, rf_model_r2, rf_model_r3 = (
    RandomForestRegressor(n_estimators=n_trees, random_state=42)
    for n_trees in (100, 200, 300)
)

# One voting ensemble per target, each combining that target's three forests.
ensemble_model_G_act = VotingRegressor([
    ('rf_model_act1', rf_model_act1),
    ('rf_model_act2', rf_model_act2),
    ('rf_model_act3', rf_model_act3),
])
ensemble_model_G_r = VotingRegressor([
    ('rf_model_r1', rf_model_r1),
    ('rf_model_r2', rf_model_r2),
    ('rf_model_r3', rf_model_r3),
])



In [None]:
# Fit each target's voting ensemble on the training split: both ensembles see
# the same features (X_train) but learn different targets (G_act vs. G_r).
ensemble_model_G_act.fit(X_train, y_G_act_train)
ensemble_model_G_r.fit(X_train, y_G_r_train)

# To demonstrate the fitted models, the next cell predicts on a small example dataset:

In [None]:

# Read the small demonstration dataset. It gets its own names so the full
# dataset (`df`) and the held-out split (`X_test`) from earlier cells are not
# overwritten.
debug_df = pd.read_csv('debuging_dataset.csv')

# Extract the SMILES strings
debug_smiles = debug_df['rxn_smiles']

print(f"G_act {debug_df['G_act']} G_r {debug_df['G_r']}  ")

# Encode SMILES strings to fingerprints (reactants first, then products).
# The loop variable has its own name so it does not shadow the Series it iterates.
debug_fingerprints = []

for rxn_smiles in debug_smiles:
    reactants, products = rxn_smiles.split('>>')
    components = reactants.split('.') + products.split('.')
    debug_fingerprints.append([encode_smiles(component) for component in components])

# Convert the list of fingerprints to a single feature tensor
X_debug = torch.stack([torch.cat(fingerprints) for fingerprints in debug_fingerprints])

# Predict both targets for the demonstration reactions
predictions_G_act = ensemble_model_G_act.predict(X_debug)
predictions_G_r = ensemble_model_G_r.predict(X_debug)

# Print the predictions
print("Predictions for G_act:")
for pred in predictions_G_act:
    print(pred)

print("Predictions for G_r:")
for pred in predictions_G_r:
    print(pred)

G_act 0     15.875896
1     15.155477
2     18.013824
3     23.687079
4     23.438094
5     20.969801
6     11.722625
7      7.268044
8      6.289673
9     16.044475
10    13.604005
Name: G_act, dtype: float64 G_r 0    -51.881526
1    -51.398681
2    -66.822349
3    -62.481289
4    -84.836459
5    -57.785046
6    -63.721331
7    -86.996102
8    -86.799671
9    -50.409197
10   -51.008126
Name: G_r, dtype: float64  
Predictions for G_act:
15.24899767793991
15.198802469394826
18.07200829346975
22.02099354143496
22.0973322418001
17.597455409102967
13.671995250648921
8.612696935247492
7.606726751106758
15.846369749704998
14.730198397901324
Predictions for G_r:
-50.09631182140774
-49.97316423027604
-63.17122292518616
-62.095371262232455
-72.03792874477527
-59.25069643047121
-62.123618178235155
-74.71369118266635
-71.37308301862082
-48.084038217332626
-48.307733727031284
