In [2]:
!pip install rdkit pysmiles prettytable pybel;



In [3]:
# Import required packages

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from prettytable import PrettyTable
import multiprocessing
from scipy import stats
import os, sys
import time

In [3]:
# Wall-clock reference; the elapsed time is reported at the end of the notebook.
start = time.time()

# Make the parent directory importable so the project package `anima` resolves.
file_dir = os.path.dirname("../")
sys.path.append(file_dir)

from anima.smiles import SMILES

# Project SMILES helper used below for vocabulary building and encoding.
sml = SMILES()

# Hardware summary: CPU count (reused as n_jobs later) and visible GPUs.
cores = int(multiprocessing.cpu_count())
n_gpus = torch.cuda.device_count()
print(f"Cores: {cores}, GPUs: {n_gpus}")

# Read the database (only the first 30000 rows of the CSV are loaded)
print("\n Reading the database \n")
database = pd.read_csv(
    "../anima-master/databases/OMEAD_41801.csv",
    nrows=30000,
)
# Derived target column: LUMO minus HOMO.
database["homo_lumo_gap"] = database["lumo"] - database["homo"]

print(f" \n Database shape {database.shape} \n")

Cores: 24, GPUs: 1

 Reading the database 

 
 Database shape (30000, 21) 



In [4]:
# Perform database cleaning
#
# A molecule is removed when its SMILES string contains any of the tokens in
# `clean`, or when it has more than 30 alphabetic characters.
#
# The previous nested loop reached the same rows, but its `continue` only
# advanced the token loop, the letter count was recomputed for every token,
# and each hit scanned the whole frame again. One predicate per row followed
# by a single drop gives the same result in linear time.
clean = ["Se", "Zn", "se", "zn", "6", "-"]


def _is_rejected(sm, tokens=tuple(clean), max_letters=30):
    """Return True when SMILES string `sm` must be removed from the database."""
    if any(tok in sm for tok in tokens):
        return True
    return sum(1 for ch in sm if ch.isalpha()) > max_letters


rejected = database.smiles.map(_is_rejected).astype(bool)
database.drop(index=database.index[rejected.to_numpy()], inplace=True)

# Remove every row whose `oxidation` value is <= 6.0.
indx = database[database.oxidation <= 6.0].index
database.drop(index=indx, inplace=True)

print(f"\n New database shape {database.shape} \n")

# Identifying redox unstable structures
unstable_mask = database.redox_stable == "no"
print(f"Number of redox unstable molecules = {len(database[unstable_mask])}")

# Only rows flagged redox_stable == "yes" are used from here on.
print("\n Selecting redox stable structures \n")
stable_mask = database.redox_stable == "yes"
train_db = database.loc[stable_mask]

print(f"\n Trainable structures = {len(train_db)}")

print("\n Data preprocessing \n")

# Model inputs (SMILES strings) and regression target (HOMO-LUMO gap),
# both taken from the same filtered frame so they stay row-aligned.
smiles = np.array(train_db["smiles"])
gap = np.array(train_db["homo_lumo_gap"])

# Defining SMILES vocabulary via the project helper, using all CPU cores.
print("\n Defining SMILES vocabulary \n")
vocab = sml.smilesVOC(smiles, n_jobs=cores)
vocab_size = len(vocab)

print(f"\n Vocab size: {vocab_size} \n")

# counting elements / smiles participation
selected_smiles = smiles

# Vtotal[token] = number of SMILES strings that contain the token, either as
# written in the vocabulary or in its lower-case form.
Vtotal = dict.fromkeys(vocab, 0)

for candidate in selected_smiles:
    for token in vocab:
        if token in candidate or token.lower() in candidate:
            Vtotal[token] += 1

# Reduced vocabulary: everything except the entries at positions 6, 15 and 16.
# NOTE(review): these positions are hard-coded, so the result depends on the
# order in which the vocabulary was produced - confirm before reusing.
voc_temp = [*vocab[0:6], *vocab[7:15], *vocab[17:]]

# Keep the untouched vocabulary under a separate name, then switch to the
# reduced one for everything that follows.
full_vocab = vocab
vocab = voc_temp

vocab_size = len(vocab)
print(f"\n New vocab size after cleaning: {vocab_size} \n")

print("\n Saving vocabulary \n")

# Saving the (reduced) vocabulary, one entry per line
with open("vocab.dat", "w") as f:
    f.writelines(str(entry) + "\n" for entry in vocab)

# Saving the full vocabulary, one entry per line
with open("full_vocab.dat", "w") as f:
    f.writelines(str(entry) + "\n" for entry in full_vocab)


 New database shape (28242, 21) 

Number of redox unstable molecules = 6938

 Selecting redox stable structures 


 Trainable structures = 21304

 Data preprocessing 


 Defining SMILES vocabulary 



[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.4s
[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:    0.7s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:    1.0s
[Parallel(n_jobs=24)]: Done 1202 tasks      | elapsed:    1.5s
[Parallel(n_jobs=24)]: Done 1752 tasks      | elapsed:    2.9s
[Parallel(n_jobs=24)]: Done 2402 tasks      | elapsed:    5.0s
[Parallel(n_jobs=24)]: Done 3152 tasks      | elapsed:    7.6s
[Parallel(n_jobs=24)]: Done 4002 tasks      | elapsed:   10.5s
[Parallel(n_jobs=24)]: Done 4952 tasks      | elapsed:   13.8s
[Parallel(n_jobs=24)]: Done 6002 tasks      | elapsed:   17.4s
[Parallel(n_jobs=24)]: Done 7152 tasks      | elapsed:   21.6s
[Parallel(n_jobs=24)]: Done 8402 tasks      | elapsed:   25.9s
[Parallel(n_jobs=24)]: Done 9752 tasks      | elapsed:   30.7s
[Parallel(n_jobs=24)]: Done 11202 tasks 


 Vocab size: 22 


 New vocab size after cleaning: 19 


 Saving vocabulary 



In [1]:
print("\n Preparing SMILES for PyTorch \n")

# Converting SMILES into sequences or 1h tensors

# for embeddings: one tensor per molecule, encoded with the project helper
all_sequences = []

for position, smi in enumerate(smiles, start=1):
    # Running counter, rewritten in place on a single output line.
    print(position, end="\r")
    encoded = sml.smilesToSequence(smi, vocab)
    all_sequences.append(torch.tensor(encoded))

# for embedding: pack the variable-length sequences, then pad them back to a
# single batch-first tensor
rnn_utils = torch.nn.utils.rnn
packing = rnn_utils.pack_sequence(all_sequences, enforce_sorted=False)
packing_padding = rnn_utils.pad_packed_sequence(packing, batch_first=True)

# Check dimensions

# for embeddings
# BATCH x SEQUENCE x INFO
print(f"\n Check packing shape {packing_padding[0][:, :, :].size()}")

print("\n Train / test split \n")

# Final data format for NN
# for embedding: keep entry 0 of the last axis, giving a 2-D
# (batch, sequence) tensor of padded values
temp = packing_padding[0][:, :, 0]

random_state = 1
test_size = 0.12

# Inputs and targets are split in ONE call so both are indexed with the same
# shuffle and their lengths are checked against each other. This produces the
# same partition as two separate calls with an identical random_state.
x_train, x_test, y_train_gap, y_test_gap = train_test_split(
    temp.numpy(),
    gap,
    test_size=test_size,
    random_state=random_state,
)
del temp

# max length of sequence in batch
print(f"\n Max length of tensor sequences: {x_train.shape}")

# for embeddings: padded sequence length, later used to size the decoder layer
max_length = x_train.shape[-1]
# Fixed: this line used the undefined name `max_leng` and raised NameError.
print(f"\nDebugging ### max_length: {max_length}")

print(f"\n Test size: {x_test.shape} \n")
print(f"\n Train size: {x_train.shape} \n")

# ---
# # Networks
# ## Defining the NN class and tools
print("\n Starting the NN configs \n")

# Checking CUDA
use_cuda = True
print(f"Cuda available: {torch.cuda.is_available()}")
device = torch.device(
    "cuda" if (use_cuda and torch.cuda.is_available()) else "cpu"
)
# The memory cap is a CUDA-only call and raises when no CUDA device can be
# initialised, so it is applied only when the GPU is actually selected. This
# keeps the CPU fallback above usable.
if device.type == "cuda":
    torch.cuda.set_per_process_memory_fraction(0.7, device=0) # Limits GPU usage to 70%


 Preparing SMILES for PyTorch 



NameError: name 'smiles' is not defined

In [6]:
# Tools
def count_parameters(model):
    """Print a per-parameter size table and the list of sub-modules of `model`.

    Only parameters with ``requires_grad=True`` are listed and summed.

    Args:
        model: a ``torch.nn.Module``.

    Returns:
        None. Everything is written to stdout.
    """
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        param = parameter.numel()
        table.add_row([name, param])
        total_params += param
    print(table)
    print(f"Total Trainable Params: {total_params}")
    print('\n\nModel Details\n')
    # The first entry of named_modules() is the model itself, so it is skipped.
    # The (name, module) pairs are iterated directly: converting them to a
    # NumPy array and slicing with [:, :] raised IndexError for a model
    # without sub-modules and is unreliable for container modules that
    # behave like sequences.
    for name, module in list(model.named_modules())[1:]:
        print(f"{name}: {module}")

def model_evaluation(model, x_test, y_test, device, batch_size=64):
    """Evaluate `model` on a held-out set and return (MAE, Huber loss).

    Both values are exact per-sample means over the whole set: losses are
    summed over every sample and divided by the sample count. Averaging the
    per-batch means instead would over-weight a shorter final batch.

    The model is switched to eval mode for the pass and then restored to
    whatever mode (train or eval) it was in when the function was called.

    Args:
        model: ``torch.nn.Module`` mapping a batch of inputs to one value per
            sample.
        x_test: array-like or tensor of inputs; first axis indexes samples.
        y_test: array-like or tensor of targets with the same number of samples.
        device: ``torch.device`` on which the evaluation runs.
        batch_size: number of samples per forward pass (default 64).

    Returns:
        Tuple ``(mae, huber)`` of floats; ``(nan, nan)`` for an empty set.
    """
    test_data = TensorDataset(
        torch.as_tensor(x_test),
        torch.as_tensor(y_test),
    )
    test_loader = DataLoader(
        test_data,
        shuffle=False,
        batch_size=batch_size,
        drop_last=False
    )

    # reduction="sum" so the totals can be normalised by the true sample count.
    mae = torch.nn.L1Loss(reduction="sum")
    huber = torch.nn.SmoothL1Loss(reduction="sum")

    was_training = model.training
    model.eval().to(device)

    total_mae = 0.0
    total_huber = 0.0
    n_samples = 0

    try:
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs = inputs.to(device)
                # Targets are compared in double precision, as before.
                targets = targets.to(device).double()

                output = model(inputs)

                total_mae += mae(output, targets).item()
                total_huber += huber(output, targets).item()
                n_samples += targets.numel()
    finally:
        # Restore the caller's mode instead of forcing training mode.
        model.train(was_training)

    if n_samples == 0:
        return float("nan"), float("nan")
    return total_mae / n_samples, total_huber / n_samples

def model_predictions(model, x, device, batch_size = 64):
    """Run `model` over `x` in batches and return the predictions as a flat array.

    The model is put in eval mode for the pass and restored afterwards to the
    mode it had on entry.

    Args:
        model: ``torch.nn.Module`` mapping a batch of inputs to predictions.
        x: array-like or tensor of inputs; first axis indexes samples.
        device: ``torch.device`` on which inference runs.
        batch_size: number of samples per forward pass (default 64).

    Returns:
        1-D ``numpy.ndarray`` with all batch outputs flattened and joined in
        input order. An empty array is returned when `x` has no samples
        (previously this raised UnboundLocalError).
    """
    was_training = model.training
    model.eval().to(device)

    pred_data = TensorDataset(torch.as_tensor(x))
    pred_loader = DataLoader(
        pred_data,
        shuffle=False,
        batch_size=batch_size,
        drop_last=False,
    )

    # Real number of batches, including a shorter final one. The old progress
    # line showed len(x) / batch_size as a fractional count.
    n_batches = len(pred_loader)

    # Outputs are collected per batch and joined once at the end, which avoids
    # re-copying the growing result on every iteration.
    chunks = []
    try:
        with torch.no_grad():
            for batch_idx, data in enumerate(pred_loader):
                print(f"Batch: {batch_idx + 1:05d} of {n_batches:05d}", end="\r")
                inputs = data[0].to(device)

                output = model(inputs)
                chunks.append(output.cpu().numpy().reshape(-1))
                del inputs, output
    finally:
        model.train(was_training)

    if not chunks:
        return np.array([])
    return np.concatenate(chunks)

In [5]:
# Defining the NN model
class NN(torch.nn.Module):
    """Embedding -> four parallel LSTM branches -> dense decoder regressor.

    Each branch is an LSTM followed by a per-time-step linear layer. The four
    branch outputs are concatenated, flattened over the sequence axis and fed
    through a dense decoder to the output layer.

    Args:
        hidden_dim: hidden size of every LSTM.
        output_dim: width of the final linear layer (``forward`` returns only
            its first column).
        n_layers: number of stacked layers in every LSTM.
        decoder_in: output width of each per-branch linear layer.
        decoder_out: width of the dense decoder layer.
        vocab_size: number of tokens; the embedding table has
            ``vocab_size + 1`` rows, with index 0 as the padding index.
        emb_dim: embedding dimension.
        max_length: sequence length the decoder is sized for; inputs must have
            exactly this many time steps.
        dropout: dropout probability used inside the LSTMs and after the decoder.
    """

    def __init__(self, hidden_dim, output_dim, n_layers, decoder_in, decoder_out, vocab_size, emb_dim, max_length, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.lstmA = torch.nn.LSTM(emb_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.lstmB = torch.nn.LSTM(emb_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.lstmC = torch.nn.LSTM(emb_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)
        self.lstmD = torch.nn.LSTM(emb_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout)

        self.fcA = torch.nn.Linear(hidden_dim, decoder_in)
        self.fcB = torch.nn.Linear(hidden_dim, decoder_in)
        self.fcC = torch.nn.Linear(hidden_dim, decoder_in)
        self.fcD = torch.nn.Linear(hidden_dim, decoder_in)

        # The decoder consumes all four branches at every time step, flattened.
        self.decoder = torch.nn.Linear(max_length * 4 * decoder_in, decoder_out)
        # Not used by forward(); kept so the parameter set and state_dict keys
        # stay compatible with already saved models.
        self.pre_output = torch.nn.Linear(decoder_out, 1)
        self.output = torch.nn.Linear(decoder_out, output_dim)

        self.activation = torch.nn.Mish()
        self.embeddings = torch.nn.Embedding(vocab_size + 1, emb_dim, max_norm=1.0, padding_idx=0)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one prediction per sample.

        Args:
            inputs: integer tensor of token indices, shape (batch, max_length).

        Returns:
            Tensor of shape (batch,): first column of the output layer.
        """
        batch = len(inputs)
        inputs = self.embeddings(inputs)

        branches = (
            (self.lstmA, self.fcA),
            (self.lstmB, self.fcB),
            (self.lstmC, self.fcC),
            (self.lstmD, self.fcD),
        )
        projected = []
        for lstm, fc in branches:
            # Every branch starts from a fresh all-zero state.
            hidden, cell = self.initHidden(batch, self.hidden_dim)
            sequence_out, _ = lstm(inputs, (hidden, cell))
            projected.append(fc(sequence_out))

        cat = torch.cat(projected, -1)
        cat = cat.reshape(batch, -1)
        cat = self.activation(cat)

        decoder = self.decoder(cat)
        decoder = self.activation(decoder)
        decoder = self.dropout(decoder)

        output = self.output(decoder)

        return output[:, 0]

    def initHidden(self, batch_size, hidden_dim):
        """Return zero-filled ``(hidden, cell)`` LSTM states.

        The states are created on the device, and with the dtype, of the
        model's own parameters. The class therefore no longer needs a
        module-level ``device`` variable and keeps working after the model is
        moved or reloaded elsewhere.

        Args:
            batch_size: number of sequences in the batch.
            hidden_dim: LSTM hidden size.

        Returns:
            Tuple of two tensors, each of shape
            (n_layers, batch_size, hidden_dim).
        """
        reference = self.embeddings.weight
        shape = (self.n_layers, batch_size, hidden_dim)
        return (torch.zeros(shape, dtype=reference.dtype, device=reference.device),
                torch.zeros(shape, dtype=reference.dtype, device=reference.device))

In [8]:
print("\n Gap NN \n")

# Training schedule
epochs = 2000
all_data = len(x_train)
log = 10  # progress line is refreshed every `log` batches

# Architecture and optimisation hyper-parameters
decoder_in = 32
decoder_out = 64
hidden_dim = 128
n_layers = 3
emb_dim = 256
output_dim = 1
batch_size = 256
dropout = 0.15
weight_decay = 1e-5

# Build the network in single precision and move it to the selected device.
nn_gap = NN(
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    decoder_in=decoder_in,
    decoder_out=decoder_out,
    vocab_size=vocab_size,
    emb_dim=emb_dim,
    max_length=max_length,
    dropout=dropout,
).float().to(device)

# loss (reported as "Metric" during training) and optimizer
metric = torch.nn.SmoothL1Loss().double()
optimizer = torch.optim.Adam(
    nn_gap.parameters(),
    lr=0.001,
    weight_decay=weight_decay,
)

# setting the dataloader
train_data = TensorDataset(
    torch.tensor(x_train),
    torch.tensor(y_train_gap),
)
train_loader = DataLoader(
    train_data,
    shuffle=True,
    batch_size=batch_size,
    drop_last=False,
    pin_memory=True,
)


 Gap NN 



In [None]:
# Per-epoch history: mean training loss and held-out MAE.
epoch_loss = []
test_loss = []
for epoch in range(epochs):

    running_loss = []
    running_metric = []
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Each batch is an [inputs, labels] pair; targets are compared in double.
        inputs = inputs.to(device)
        targets = targets.to(device).double()

        optimizer.zero_grad()

        output = nn_gap(inputs)

        # Optimised quantity: mean absolute error of the batch.
        loss = (targets - output).abs().mean()
        loss.backward()
        optimizer.step()

        # Print statistics every log
        running_loss.append(loss.item())
        running_metric.append(metric(output, targets).item())
        if batch_idx % log == 0:
            print(f"Epoch {epoch+1:04d} [{batch_idx * len(inputs):05d}/{all_data:05d} ({100 * batch_idx * batch_size / len(train_data):04.1f})%] \t\t Loss: {np.mean(running_loss):02.3f}", end="\r")

    # End of epoch: record the training loss and evaluate on the held-out set.
    train_loss_mean = np.mean(running_loss)
    train_metric_mean = np.mean(running_metric)
    epoch_loss.append(train_loss_mean)
    test_mae, test_mse = model_evaluation(nn_gap, x_test, y_test_gap, device)
    test_loss.append(test_mae)

    print(f"Epoch: {epoch + 1:04d} [] Train (Loss={train_loss_mean:02.3f} | Metric={train_metric_mean:02.3f})\tVal. (MAE={test_mae:02.3f} | HL={test_mse:02.3f})", end="\n")

print("\n")
print(f"\n Train Loss: {round(epoch_loss[-1], 2)}")
print(f"\n Test MAE: {round(test_loss[-1], 2)}")
print("\n Finished training")

print("\n Saving model \n")

# save: first the weights only, then the whole pickled module object
torch.save(nn_gap.state_dict(), "nn_gap.pt")
torch.save(nn_gap, "nn_gap.pt_full")

Epoch: 0001 [] Train (Loss=0.045 | Metric=0.002)	Val. (MAE=0.024 | HL=0.000)
Epoch: 0002 [] Train (Loss=0.023 | Metric=0.000)	Val. (MAE=0.023 | HL=0.000)
Epoch: 0003 [] Train (Loss=0.022 | Metric=0.000)	Val. (MAE=0.018 | HL=0.000)
Epoch: 0004 [] Train (Loss=0.022 | Metric=0.000)	Val. (MAE=0.017 | HL=0.000)
Epoch: 0005 [] Train (Loss=0.020 | Metric=0.000)	Val. (MAE=0.017 | HL=0.000)
Epoch: 0006 [] Train (Loss=0.020 | Metric=0.000)	Val. (MAE=0.017 | HL=0.000)
Epoch: 0007 [] Train (Loss=0.020 | Metric=0.000)	Val. (MAE=0.022 | HL=0.000)
Epoch: 0008 [] Train (Loss=0.019 | Metric=0.000)	Val. (MAE=0.017 | HL=0.000)
Epoch: 0009 [] Train (Loss=0.019 | Metric=0.000)	Val. (MAE=0.017 | HL=0.000)
Epoch: 0010 [] Train (Loss=0.018 | Metric=0.000)	Val. (MAE=0.017 | HL=0.000)
Epoch: 0011 [] Train (Loss=0.019 | Metric=0.000)	Val. (MAE=0.017 | HL=0.000)
Epoch: 0012 [] Train (Loss=0.018 | Metric=0.000)	Val. (MAE=0.017 | HL=0.000)
Epoch: 0013 [] Train (Loss=0.018 | Metric=0.000)	Val. (MAE=0.020 | HL=0.000)

In [None]:
# Plot training and test errors
# Training curves. The x-axis is built from the length of each recorded
# history rather than from the leaked loop variable `epoch`: if training
# stopped part-way through an epoch, `epoch` is one ahead of the histories and
# plotting raised a shape-mismatch error.
plt.plot(range(len(test_loss)), test_loss, 'o', c='black', ms=5, label='Test')
plt.plot(range(len(epoch_loss)), epoch_loss, 'o', c='red', ms=5, label='Training')
plt.xlabel("n of epochs", size=14)
plt.ylabel("MAE", size=14)
plt.legend(fontsize=14)
plt.tight_layout()
plt.savefig("gap_model_training.png", dpi=200)

# Predicted vs. target values on the held-out set.
predictions = model_predictions(nn_gap, x_test, device, batch_size=64)

px = predictions.reshape(-1)
py = y_test_gap.reshape(-1)

# Generate linear fit
slope, intercept, r_value, p_value, std_err = stats.linregress(px, py)
line = slope * px + intercept
# 2.58 standard errors of the slope (99% interval); not used further in this cell.
conf = std_err * 2.58  # 99% confidence interval

fig, axs = plt.subplots(1, 1, dpi=80)
axs.tick_params(axis="both", which="major", labelsize=12)

axs.plot(px, py, "o", color="xkcd:tomato", aa=True, alpha=0.6)
axs.plot(px, line, color="xkcd:azure", aa=True, alpha=0.6)

axs.set_ylabel(r"Target Values (Hartree)", fontsize=14)
axs.set_xlabel("Predicted Values (Hartree)", fontsize=14)
fig.text(0.13, 0.90, "R$^2$ = " + str(round(r_value**2, 1)), ha="left", fontsize=13)

fig.tight_layout()
plt.savefig("gap_model_correlation.png", dpi=200)

# Total wall-clock time since `start` was taken.
end = time.time()
print(f"\n Time: {end-start}")

In [6]:
# Reload the fully pickled model object from "nn_gap.pt_full".
# NOTE(review): the variable is named `nn_homo` although the file name refers
# to the gap model - confirm which is intended before renaming.
# weights_only=False unpickles arbitrary Python objects, so load trusted files
# only; presumably the model class must already be defined in the session.
nn_homo = torch.load("nn_gap.pt_full", weights_only=False)

In [7]:
# Total number of parameters
# Total number of parameters (every parameter tensor is counted, whether or
# not it requires gradients).
total_params = 0
for tensor in nn_homo.parameters():
    total_params += tensor.numel()
print(f"Total parameters: {total_params:,}")

Total parameters: 2,311,490
