In [1]:
from csv import DictReader

# Column names in the master CSV for the two properties combined later in the
# notebook (research octane number and motor octane number).
PROPERTY_1 = "properties.research_octane_number.value"
PROPERTY_2 = "properties.motor_octane_number.value"
# Short tag used when building model output paths further down the notebook.
PROP_SHORTHAND = "os"

# newline="" is the csv module's recommended way to open a file so that quoted
# fields containing line breaks are parsed correctly. The with-block closes the
# file on exit, so no explicit close() call is needed afterwards.
with open("data/combustdb_master.csv", "r", newline="") as csv_file:
    rows = list(DictReader(csv_file))

# Keep only rows whose property value is not the "-" placeholder.
compounds_1 = [r for r in rows if r[PROPERTY_1] != "-"]
compounds_2 = [r for r in rows if r[PROPERTY_2] != "-"]
print(len(compounds_1))
print(len(compounds_2))

278
278


In [2]:
from sklearn.model_selection import train_test_split

# The target needs BOTH properties. compounds_1 is only filtered on PROPERTY_1,
# so also drop rows whose PROPERTY_2 is the "-" placeholder; otherwise
# float("-") below would raise ValueError for any compound missing it.
compounds_os = [c for c in compounds_1 if c[PROPERTY_2] != "-"]

smiles = [c["canonical_smiles"] for c in compounds_os]
# One single-element row per compound, i.e. a column of targets.
prop = [[float(c[PROPERTY_1]) - float(c[PROPERTY_2])] for c in compounds_os]  # octane sensitivity = RON - MON

# Fixed random_state keeps the 80/20 split identical between runs.
smiles_train, smiles_test, prop_train, prop_test = train_test_split(smiles, prop, test_size=0.2, random_state=42)

print(len(smiles_train), len(prop_train))
print(len(smiles_test), len(prop_test))

222 222
56 56


In [3]:
from ecnet.datasets import QSPRDataset
from sklearn.decomposition import PCA
import os
import pickle
import torch

# Build descriptor datasets for the train and test SMILES with the alvaDesc backend.
ds_train = QSPRDataset(smiles_train, prop_train, backend="alvadesc")
ds_test = QSPRDataset(smiles_test, prop_test, backend="alvadesc")
# Tensor.type(dtype) returns the converted tensor instead of converting in
# place, so the result has to be assigned back for the cast to take effect.
ds_train.target_vals = ds_train.target_vals.type(torch.float32)
ds_test.target_vals = ds_test.target_vals.type(torch.float32)

print("Pre-PCA data dim:")
print(ds_train.desc_vals.shape, ds_test.desc_vals.shape)

# Fit PCA on the training descriptors only, then project both sets with it so
# the test set never influences the transform. One component per training sample.
desc_train_np = ds_train.desc_vals.detach().numpy()
desc_test_np = ds_test.desc_vals.detach().numpy()
pca = PCA(n_components=len(ds_train))
pca.fit(desc_train_np)
ds_train.desc_vals = torch.tensor(pca.transform(desc_train_np)).type(torch.float32)
ds_test.desc_vals = torch.tensor(pca.transform(desc_test_np)).type(torch.float32)

# Persist the fitted transform; create the output directory first so a fresh
# checkout does not fail with FileNotFoundError.
os.makedirs(f"models/{PROP_SHORTHAND}", exist_ok=True)
with open(f"models/{PROP_SHORTHAND}/trf.pca", "wb") as f:
    pickle.dump(pca, f)

print("Post-PCA data dim:")
print(ds_train.desc_vals.shape, ds_test.desc_vals.shape)

print("Target dim:")
print(ds_train.target_vals.shape, ds_test.target_vals.shape)

Pre-PCA data dim:
torch.Size([222, 5305]) torch.Size([56, 5305])
Post-PCA data dim:
torch.Size([222, 222]) torch.Size([56, 222])
Target dim:
torch.Size([222, 1]) torch.Size([56, 1])


In [4]:
from ecnet import ECNet
from sklearn.metrics import median_absolute_error, r2_score
import os
import numpy as np
import torch

# Train several independently initialised models on the same data and record
# per-model train/test metrics so the spread can be summarised afterwards.
N_MODELS = 10
BASE_SEED = 42

# Make sure the save location exists before the first _model.save() call.
os.makedirs(f"models/{PROP_SHORTHAND}", exist_ok=True)

# Note: the "maes" lists hold the MEDIAN absolute error (median_absolute_error).
maes_train, maes_test = [], []
r2s_train, r2s_test = [], []

for i in range(N_MODELS):
    print(f"Model: {i}")
    # Seed per model so a re-run repeats the same ensemble while each member
    # still differs from the others.
    # NOTE(review): assumes ECNet draws its randomness (weight init, validation
    # shuffle) from the global torch / NumPy generators -- confirm against ECNet.
    torch.manual_seed(BASE_SEED + i)
    np.random.seed(BASE_SEED + i)
    # Input/output widths come from the datasets; 512 and 2 are the remaining
    # positional ECNet arguments (presumably hidden width and hidden-layer
    # count -- TODO confirm against the ECNet signature).
    _model = ECNet(ds_train.desc_vals.shape[1], ds_train.target_vals.shape[1], 512, 2)
    _model.fit(dataset=ds_train, valid_size=0.33, shuffle=True, patience=16, epochs=512, lr=0.001)
    # Inference only: skip autograd bookkeeping while predicting.
    with torch.no_grad():
        pred_train = _model(ds_train.desc_vals).detach().numpy()
        pred_test = _model(ds_test.desc_vals).detach().numpy()
    maes_train.append(median_absolute_error(prop_train, pred_train))
    maes_test.append(median_absolute_error(prop_test, pred_test))
    r2s_train.append(r2_score(prop_train, pred_train))
    r2s_test.append(r2_score(prop_test, pred_test))
    _model.save(f"models/{PROP_SHORTHAND}/{PROP_SHORTHAND}_{i}.pt")

Model: 0
Model: 1
Model: 2
Model: 3
Model: 4
Model: 5
Model: 6
Model: 7
Model: 8
Model: 9


In [5]:
import numpy as np

# Report mean +/- standard deviation of each metric across the trained models.
# Each entry pairs an output template with the per-model scores it summarises.
summary = (
    ('Training MAE: {:.4f} +/- {:.4f}', maes_train),
    ('Training R2: {:.4f} +/- {:.4f}', r2s_train),
    ('Test MAE: {:.4f} +/- {:.4f}', maes_test),
    ('Test R2: {:.4f} +/- {:.4f}', r2s_test),
)
for template, scores in summary:
    print(template.format(np.mean(scores), np.std(scores)))

Training MAE: 0.3906 +/- 0.0759
Training R2: 0.9919 +/- 0.0027
Test MAE: 2.2653 +/- 0.2774
Test R2: 0.5600 +/- 0.0280
