In [1]:
from csv import DictReader

# Read every row of the octane database into a list of dicts keyed by the
# CSV header. newline='' is the csv module's recommended way to open a file
# handed to a reader (it lets the reader handle embedded newlines itself).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('octane_database.csv', 'r', newline='') as csv_file:
    reader = DictReader(csv_file)
    compounds = list(reader)
print(len(compounds))

278


In [2]:
# Pull the SMILES string and both octane numbers out of each record.
# Every target is wrapped in a one-element list, giving one row per compound.
smiles = [record['canonical_smiles'] for record in compounds]
mon = [[float(record['properties.motor_octane_number.value'])] for record in compounds]
ron = [[float(record['properties.research_octane_number.value'])] for record in compounds]
# Per-compound difference RON - MON (presumably "octane sensitivity", OS).
# NOTE(review): the name `os` would shadow the stdlib module if it were ever
# imported in this notebook; it is kept because later cells reference it.
os = [[ron_row[0] - mon_row[0]] for ron_row, mon_row in zip(ron, mon)]

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the compounds as a test set for each target.
# All three calls pass the same `smiles` list and the same random_state, so
# the three partitions are expected to contain the same compounds in the same
# order; the later RON - MON comparison on the test sets depends on that.
mon_smiles_train, mon_smiles_test, mon_train, mon_test = train_test_split(
    smiles, mon, test_size=0.1, random_state=42
)
ron_smiles_train, ron_smiles_test, ron_train, ron_test = train_test_split(
    smiles, ron, test_size=0.1, random_state=42
)
os_smiles_train, os_smiles_test, os_train, os_test = train_test_split(
    smiles, os, test_size=0.1, random_state=42
)

In [4]:
from ecnet.datasets import QSPRDataset

# Build one QSPRDataset per (target, partition) pair using the 'alvadesc'
# backend. Datasets are constructed in the order listed below.
(
    dataset_mon_train, dataset_mon_test,
    dataset_ron_train, dataset_ron_test,
    dataset_os_train, dataset_os_test,
) = [
    QSPRDataset(smiles_part, target_part, backend='alvadesc')
    for smiles_part, target_part in (
        (mon_smiles_train, mon_train),
        (mon_smiles_test, mon_test),
        (ron_smiles_train, ron_train),
        (ron_smiles_test, ron_test),
        (os_smiles_train, os_train),
        (os_smiles_test, os_test),
    )
]

In [5]:
from ecnet.tasks import select_rfr

# Select descriptor indices for each target from its *training* set only.
# select_rfr returns a pair; only the first element (the indices) is kept.
# Each index list is then capped at its first 250 entries.
# NOTE(review): presumably the indices come back ordered by importance, so
# the slice keeps the top 250 - confirm against the ecnet documentation.
desc_idx_mon, _ = select_rfr(
    dataset_mon_train,
    total_importance=0.9999,
    n_estimators=50,
    n_jobs=4,
)
desc_idx_mon = desc_idx_mon[:250]

desc_idx_ron, _ = select_rfr(
    dataset_ron_train,
    total_importance=0.9999,
    n_estimators=50,
    n_jobs=4,
)
desc_idx_ron = desc_idx_ron[:250]

desc_idx_os, _ = select_rfr(
    dataset_os_train,
    total_importance=0.9999,
    n_estimators=50,
    n_jobs=4,
)
desc_idx_os = desc_idx_os[:250]

In [6]:
# Restrict each train/test pair to the descriptor indices selected for its
# target, so both partitions expose the same descriptor columns.
# NOTE(review): if set_desc_index slices the stored descriptors in place,
# re-running this cell would index already-reduced data - confirm.
for train_set, test_set, desc_idx in (
    (dataset_mon_train, dataset_mon_test, desc_idx_mon),
    (dataset_ron_train, dataset_ron_test, desc_idx_ron),
    (dataset_os_train, dataset_os_test, desc_idx_os),
):
    train_set.set_desc_index(desc_idx)
    test_set.set_desc_index(desc_idx)

In [7]:
from ecnet import ECNet
from sklearn.metrics import mean_squared_error, r2_score
from time import time

# Per-iteration test-set metrics, one entry appended per training run.
# "dos" holds metrics for the RON - MON difference derived from the separate
# RON and MON models, as opposed to "os", which is predicted directly.
mon_test_rmses = []
mon_test_r2s = []
ron_test_rmses = []
ron_test_r2s = []
os_test_rmses = []
os_test_r2s = []
dos_test_rmses = []
dos_test_r2s = []

t_start = time()

# Train fresh MON, RON and OS models 25 times and score each on its test set.
for i in range(25):

    print(f'Iteration: {i} | Time: {time() - t_start}')

    # Input/output widths are taken from the training data; the trailing
    # (128, 2) are presumably hidden-layer width and count - TODO confirm.
    model_mon = ECNet(dataset_mon_train.desc_vals.shape[1], dataset_mon_train.target_vals.shape[1], 128, 2)
    model_ron = ECNet(dataset_ron_train.desc_vals.shape[1], dataset_ron_train.target_vals.shape[1], 128, 2)
    model_os = ECNet(dataset_os_train.desc_vals.shape[1], dataset_os_train.target_vals.shape[1], 128, 2)

    # NOTE(review): random_state is identical on every iteration; if it
    # controls the train/validation split, all 25 runs share one split and
    # differ only in weight initialisation - confirm this is intended.
    _, _ = model_mon.fit(
        dataset=dataset_mon_train, valid_size=0.112, verbose=0,
        patience=32, epochs=512, random_state=24, lr=0.001
    )

    _, _ = model_ron.fit(
        dataset=dataset_ron_train, valid_size=0.112, verbose=0,
        patience=32, epochs=512, random_state=24, lr=0.001
    )

    # Was `_, _, model_os.fit(...)`: a typo that built a throwaway tuple
    # instead of unpacking the return value like the two calls above.
    _, _ = model_os.fit(
        dataset=dataset_os_train, valid_size=0.112, verbose=0,
        patience=32, epochs=512, random_state=24, lr=0.001
    )

    # Flatten the single-column targets/predictions into flat sequences.
    mon_test_exp = [t[0] for t in dataset_mon_test.target_vals]
    mon_test_pred = model_mon(dataset_mon_test.desc_vals).detach().numpy()
    mon_test_pred = [t[0] for t in mon_test_pred]

    ron_test_exp = [t[0] for t in dataset_ron_test.target_vals]
    ron_test_pred = model_ron(dataset_ron_test.desc_vals).detach().numpy()
    ron_test_pred = [t[0] for t in ron_test_pred]

    os_test_exp = [t[0] for t in dataset_os_test.target_vals]
    os_test_pred = model_os(dataset_os_test.desc_vals).detach().numpy()
    os_test_pred = [t[0] for t in os_test_pred]

    # Derived difference: RON minus MON, element by element. This assumes
    # the RON and MON test sets list the same compounds in the same order.
    # zip avoids reusing `i`, which previously shadowed the loop counter
    # inside the comprehensions.
    dos_test_exp = [r - m for r, m in zip(ron_test_exp, mon_test_exp)]
    dos_test_pred = [r - m for r, m in zip(ron_test_pred, mon_test_pred)]

    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, which
    # is deprecated in scikit-learn 1.4 and removed in 1.6.
    mon_test_rmses.append(mean_squared_error(mon_test_exp, mon_test_pred) ** 0.5)
    mon_test_r2s.append(r2_score(mon_test_exp, mon_test_pred))

    ron_test_rmses.append(mean_squared_error(ron_test_exp, ron_test_pred) ** 0.5)
    ron_test_r2s.append(r2_score(ron_test_exp, ron_test_pred))

    os_test_rmses.append(mean_squared_error(os_test_exp, os_test_pred) ** 0.5)
    os_test_r2s.append(r2_score(os_test_exp, os_test_pred))

    dos_test_rmses.append(mean_squared_error(dos_test_exp, dos_test_pred) ** 0.5)
    dos_test_r2s.append(r2_score(dos_test_exp, dos_test_pred))

Iteration: 0 | Time: 0.0002620220184326172
Iteration: 1 | Time: 5.773781061172485
Iteration: 2 | Time: 12.227968215942383
Iteration: 3 | Time: 17.57283592224121
Iteration: 4 | Time: 21.300493955612183
Iteration: 5 | Time: 27.736977100372314
Iteration: 6 | Time: 33.560230016708374
Iteration: 7 | Time: 37.654128074645996
Iteration: 8 | Time: 44.07474207878113
Iteration: 9 | Time: 48.01907515525818
Iteration: 10 | Time: 52.77380013465881
Iteration: 11 | Time: 56.01689600944519
Iteration: 12 | Time: 62.206191062927246
Iteration: 13 | Time: 67.74489212036133
Iteration: 14 | Time: 72.9430980682373
Iteration: 15 | Time: 79.20378398895264
Iteration: 16 | Time: 83.99540114402771
Iteration: 17 | Time: 88.10066413879395
Iteration: 18 | Time: 92.90237307548523
Iteration: 19 | Time: 99.01235318183899
Iteration: 20 | Time: 104.90335607528687
Iteration: 21 | Time: 111.0126621723175
Iteration: 22 | Time: 116.35843992233276
Iteration: 23 | Time: 119.65638613700867
Iteration: 24 | Time: 125.854383945465

In [8]:
import numpy as np

# Report mean +/- standard deviation of each metric across all iterations,
# one group per target, each followed by a blank line.
for heading, rmses, r2s in (
    ('MON:', mon_test_rmses, mon_test_r2s),
    ('RON:', ron_test_rmses, ron_test_r2s),
    ('OS:', os_test_rmses, os_test_r2s),
    ('dOS:', dos_test_rmses, dos_test_r2s),
):
    print(heading)
    print(f'RMSE: {np.mean(rmses)} +/- {np.std(rmses)}')
    print(f'R2: {np.mean(r2s)} +/- {np.std(r2s)}')
    print()

MON:
RMSE: 7.616631984710693 +/- 0.791519820690155
R2: 0.7688152112790544 +/- 0.04845783208786662

RON:
RMSE: 8.676424026489258 +/- 1.6391263008117676
R2: 0.8105279815777808 +/- 0.0744763112970065

OS:
RMSE: 5.771915912628174 +/- 0.4058595597743988
R2: 0.514358310218067 +/- 0.06794389653676955

dOS:
RMSE: 8.976502418518066 +/- 1.9798811674118042
R2: -0.22568220765416144 +/- 0.5697310570361092

