In [1]:
from ecnet.datasets import load_cn, load_ysi

# Load the CN and YSI datasets; each loader returns (SMILES list, target list).
smiles_cn, cn = load_cn()
smiles_ysi, ysi = load_ysi()

# Index of the FIRST occurrence of every SMILES string in the YSI set.
# A single dict lookup per CN entry replaces a full scan of `smiles_ysi`,
# while selecting exactly the same row a first-match linear search would.
ysi_first_index = {}
for idx, smi in enumerate(smiles_ysi):
    ysi_first_index.setdefault(smi, idx)

data_cn = []
data_ysi = []
smiles = []

# Keep only the molecules present in both datasets, in CN order.
# NOTE: matching is by exact string equality, so two different spellings of
# the same molecule will not be paired.
for idx_cn, smi_cn in enumerate(smiles_cn):
    idx = ysi_first_index.get(smi_cn)
    if idx is None:
        continue
    data_cn.append(cn[idx_cn])
    data_ysi.append(ysi[idx])
    smiles.append(smi_cn)

# Sanity check: the three lists must stay aligned row-for-row.
print(len(data_cn), len(data_ysi), len(smiles))

132 132 132


In [2]:
from sklearn.preprocessing import StandardScaler

# Standardise each target with its own scaler; the fitted scalers are kept
# around so predictions can be mapped back to original units if needed.
# NOTE: the statistics are computed over every matched sample, i.e. before
# the train/test splits performed in the cells further down.
scaler_cn = StandardScaler().fit(data_cn)
scaler_ysi = StandardScaler().fit(data_ysi)
data_cn, data_ysi = scaler_cn.transform(data_cn), scaler_ysi.transform(data_ysi)

# Show the range of each scaled target.
print(min(data_cn), max(data_cn))
print(min(data_ysi), max(data_ysi))

[-1.38752046] [3.31798372]
[-0.74680261] [3.87871805]


In [3]:
from ecnet.datasets import QSPRDataset

# Build one dataset per target. Both share the same SMILES list (and the same
# 'alvadesc' backend) and differ only in the target values attached to them.
ds_cn, ds_ysi = (
    QSPRDataset(smiles, scaled_targets, backend='alvadesc')
    for scaled_targets in (data_cn, data_ysi)
)

In [4]:
###############
### JUST CN ###
###############

from sklearn.model_selection import train_test_split
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.decomposition import PCA
from ecnet.datasets import QSPRDatasetFromValues
from ecnet import ECNet
import numpy as np
import torch

# Per-split averages of each metric (one entry per outer iteration).
# NOTE: "MAE" throughout this cell is the *median* absolute error
# (sklearn's median_absolute_error), not the mean absolute error.
maes_train = []
maes_test = []
r2s_train = []
r2s_test = []

# 10 different 75/25 train/test splits, seeded by the loop index.
for i in range(10):

    print(f'Model {i + 1}')

    desc_train, desc_test, targets_train, targets_test = train_test_split(
        ds_cn.desc_vals.detach().numpy(), ds_cn.target_vals.detach().numpy(), test_size=0.25, random_state=i
    )
    ds_train = QSPRDatasetFromValues(desc_train, targets_train)
    ds_test = QSPRDatasetFromValues(desc_test, targets_test)

    # Fit PCA on the training descriptors only, keeping as many components as
    # there are training samples, then project both subsets with that fit.
    pca = PCA(n_components=len(ds_train))
    pca.fit(ds_train.desc_vals.detach().numpy())
    ds_train.desc_vals = torch.tensor(pca.transform(ds_train.desc_vals.detach().numpy())).type(torch.float32)
    ds_test.desc_vals = torch.tensor(pca.transform(ds_test.desc_vals.detach().numpy())).type(torch.float32)

    # Cast each subset's OWN targets to float32. The test set previously
    # received the training targets here, leaving it with wrong values and a
    # row count that did not match its descriptors.
    ds_train.target_vals = ds_train.target_vals.type(torch.float32)
    ds_test.target_vals = ds_test.target_vals.type(torch.float32)

    # Metrics of the individual models trained on this split.
    _maes_train = []
    _maes_test = []
    _r2s_train = []
    _r2s_test = []

    # Train 5 models per split to average out training randomness.
    for _ in range(5):

        # Input width = number of PCA components, output width = number of
        # target columns. 512 and 2 are presumably the hidden-layer width and
        # count -- TODO confirm against the ECNet constructor signature.
        model = ECNet(ds_train.desc_vals.shape[1], ds_train.target_vals.shape[1], 512, 2)
        model.fit(dataset=ds_train, epochs=512, valid_size=0.25, shuffle=True, patience=64, lr=0.001)
        pred_train = model(ds_train.desc_vals).detach().numpy()
        pred_test = model(ds_test.desc_vals).detach().numpy()
        # Scores are computed against the numpy targets returned by the split.
        _maes_train.append(median_absolute_error(targets_train, pred_train))
        _maes_test.append(median_absolute_error(targets_test, pred_test))
        _r2s_train.append(r2_score(targets_train, pred_train))
        _r2s_test.append(r2_score(targets_test, pred_test))

    maes_train.append(np.mean(_maes_train))
    maes_test.append(np.mean(_maes_test))
    r2s_train.append(np.mean(_r2s_train))
    r2s_test.append(np.mean(_r2s_test))

# Mean and standard deviation across the 10 splits (values are in scaled units).
print('Training MAE: {:.3f} +/- {:.3f}'.format(np.mean(maes_train), np.std(maes_train)))
print('Training R2: {:.3f} +/- {:.3f}'.format(np.mean(r2s_train), np.std(r2s_train)))
print('Test MAE: {:.3f} +/- {:.3f}'.format(np.mean(maes_test), np.std(maes_test)))
print('Test R2: {:.3f} +/- {:.3f}'.format(np.mean(r2s_test), np.std(r2s_test)))

Model 1
Model 2
Model 3
Model 4
Model 5
Model 6
Model 7
Model 8
Model 9
Model 10
Training MAE: 0.065 +/- 0.037
Training R2: 0.928 +/- 0.116
Test MAE: 0.261 +/- 0.048
Test R2: 0.621 +/- 0.208


In [5]:
################
### JUST YSI ###
################

# Per-split averages of each metric (one entry per outer iteration).
# NOTE: "MAE" throughout this cell is the *median* absolute error
# (sklearn's median_absolute_error), not the mean absolute error.
maes_train = []
maes_test = []
r2s_train = []
r2s_test = []

# 10 different 75/25 train/test splits, seeded by the loop index.
for i in range(10):

    print(f'Model {i + 1}')

    desc_train, desc_test, targets_train, targets_test = train_test_split(
        ds_ysi.desc_vals.detach().numpy(), ds_ysi.target_vals.detach().numpy(), test_size=0.25, random_state=i
    )
    ds_train = QSPRDatasetFromValues(desc_train, targets_train)
    ds_test = QSPRDatasetFromValues(desc_test, targets_test)

    # Fit PCA on the training descriptors only, keeping as many components as
    # there are training samples, then project both subsets with that fit.
    pca = PCA(n_components=len(ds_train))
    pca.fit(ds_train.desc_vals.detach().numpy())
    ds_train.desc_vals = torch.tensor(pca.transform(ds_train.desc_vals.detach().numpy())).type(torch.float32)
    ds_test.desc_vals = torch.tensor(pca.transform(ds_test.desc_vals.detach().numpy())).type(torch.float32)

    # Cast each subset's OWN targets to float32. The test set previously
    # received the training targets here, leaving it with wrong values and a
    # row count that did not match its descriptors.
    ds_train.target_vals = ds_train.target_vals.type(torch.float32)
    ds_test.target_vals = ds_test.target_vals.type(torch.float32)

    # Metrics of the individual models trained on this split.
    _maes_train = []
    _maes_test = []
    _r2s_train = []
    _r2s_test = []

    # Train 5 models per split to average out training randomness.
    for _ in range(5):

        # Input width = number of PCA components, output width = number of
        # target columns. 512 and 2 are presumably the hidden-layer width and
        # count -- TODO confirm against the ECNet constructor signature.
        model = ECNet(ds_train.desc_vals.shape[1], ds_train.target_vals.shape[1], 512, 2)
        model.fit(dataset=ds_train, epochs=512, valid_size=0.25, shuffle=True, patience=64, lr=0.001)
        pred_train = model(ds_train.desc_vals).detach().numpy()
        pred_test = model(ds_test.desc_vals).detach().numpy()
        # Scores are computed against the numpy targets returned by the split.
        _maes_train.append(median_absolute_error(targets_train, pred_train))
        _maes_test.append(median_absolute_error(targets_test, pred_test))
        _r2s_train.append(r2_score(targets_train, pred_train))
        _r2s_test.append(r2_score(targets_test, pred_test))

    maes_train.append(np.mean(_maes_train))
    maes_test.append(np.mean(_maes_test))
    r2s_train.append(np.mean(_r2s_train))
    r2s_test.append(np.mean(_r2s_test))

# Mean and standard deviation across the 10 splits (values are in scaled units).
print('Training MAE: {:.3f} +/- {:.3f}'.format(np.mean(maes_train), np.std(maes_train)))
print('Training R2: {:.3f} +/- {:.3f}'.format(np.mean(r2s_train), np.std(r2s_train)))
print('Test MAE: {:.3f} +/- {:.3f}'.format(np.mean(maes_test), np.std(maes_test)))
print('Test R2: {:.3f} +/- {:.3f}'.format(np.mean(r2s_test), np.std(r2s_test)))

Model 1
Model 2
Model 3
Model 4
Model 5
Model 6
Model 7
Model 8
Model 9
Model 10
Training MAE: 0.049 +/- 0.026
Training R2: 0.931 +/- 0.058
Test MAE: 0.090 +/- 0.026
Test R2: 0.644 +/- 0.579


In [6]:
##################
### CN AND YSI ###
##################

# Per-split averages of each metric, tracked separately for the two targets.
# NOTE: "MAE" throughout this cell is the *median* absolute error
# (sklearn's median_absolute_error), not the mean absolute error.
maes_train_cn = []
maes_train_ysi = []
maes_test_cn = []
maes_test_ysi = []
r2s_train_cn = []
r2s_train_ysi = []
r2s_test_cn = []
r2s_test_ysi = []

# Two-column target matrix: column 0 is scaled CN, column 1 is scaled YSI.
target_vals = np.concatenate((data_cn, data_ysi), axis=1)

# 10 different 75/25 train/test splits, seeded by the loop index.
for i in range(10):

    print(f'Model {i + 1}')

    # Descriptors are taken from ds_ysi; ds_cn was built from the same SMILES.
    desc_train, desc_test, targets_train, targets_test = train_test_split(
        ds_ysi.desc_vals.detach().numpy(), target_vals, test_size=0.25, random_state=i
    )
    ds_train = QSPRDatasetFromValues(desc_train, targets_train)
    ds_test = QSPRDatasetFromValues(desc_test, targets_test)

    # Fit PCA on the training descriptors only, keeping as many components as
    # there are training samples, then project both subsets with that fit.
    pca = PCA(n_components=len(ds_train))
    pca.fit(ds_train.desc_vals.detach().numpy())
    ds_train.desc_vals = torch.tensor(pca.transform(ds_train.desc_vals.detach().numpy())).type(torch.float32)
    ds_test.desc_vals = torch.tensor(pca.transform(ds_test.desc_vals.detach().numpy())).type(torch.float32)

    # Cast each subset's OWN targets to float32. The test set previously
    # received the training targets here, leaving it with wrong values and a
    # row count that did not match its descriptors.
    ds_train.target_vals = ds_train.target_vals.type(torch.float32)
    ds_test.target_vals = ds_test.target_vals.type(torch.float32)

    # Reference values per target; constant for this split, so sliced once.
    true_train_cn, true_train_ysi = targets_train[:, 0], targets_train[:, 1]
    true_test_cn, true_test_ysi = targets_test[:, 0], targets_test[:, 1]

    # Metrics of the individual models trained on this split.
    _maes_train_cn = []
    _maes_train_ysi = []
    _maes_test_cn = []
    _maes_test_ysi = []
    _r2s_train_cn = []
    _r2s_train_ysi = []
    _r2s_test_cn = []
    _r2s_test_ysi = []

    # Train 5 two-output models per split to average out training randomness.
    for _ in range(5):

        # Input width = number of PCA components, output width = number of
        # target columns (2). 512 and 2 are presumably the hidden-layer width
        # and count -- TODO confirm against the ECNet constructor signature.
        model = ECNet(ds_train.desc_vals.shape[1], ds_train.target_vals.shape[1], 512, 2)
        model.fit(dataset=ds_train, epochs=512, valid_size=0.25, shuffle=True, patience=64, lr=0.001)
        pred_train = model(ds_train.desc_vals).detach().numpy()
        pred_test = model(ds_test.desc_vals).detach().numpy()

        # Split the two-column predictions into their per-target columns.
        pred_train_cn, pred_train_ysi = pred_train[:, 0], pred_train[:, 1]
        pred_test_cn, pred_test_ysi = pred_test[:, 0], pred_test[:, 1]

        _maes_train_cn.append(median_absolute_error(true_train_cn, pred_train_cn))
        _maes_train_ysi.append(median_absolute_error(true_train_ysi, pred_train_ysi))
        _maes_test_cn.append(median_absolute_error(true_test_cn, pred_test_cn))
        _maes_test_ysi.append(median_absolute_error(true_test_ysi, pred_test_ysi))
        _r2s_train_cn.append(r2_score(true_train_cn, pred_train_cn))
        _r2s_train_ysi.append(r2_score(true_train_ysi, pred_train_ysi))
        _r2s_test_cn.append(r2_score(true_test_cn, pred_test_cn))
        _r2s_test_ysi.append(r2_score(true_test_ysi, pred_test_ysi))

    maes_train_cn.append(np.mean(_maes_train_cn))
    maes_train_ysi.append(np.mean(_maes_train_ysi))
    maes_test_cn.append(np.mean(_maes_test_cn))
    maes_test_ysi.append(np.mean(_maes_test_ysi))
    r2s_train_cn.append(np.mean(_r2s_train_cn))
    r2s_train_ysi.append(np.mean(_r2s_train_ysi))
    r2s_test_cn.append(np.mean(_r2s_test_cn))
    r2s_test_ysi.append(np.mean(_r2s_test_ysi))

# Mean and standard deviation across the 10 splits (values are in scaled units).
print('CN:')
print('Training MAE: {:.3f} +/- {:.3f}'.format(np.mean(maes_train_cn), np.std(maes_train_cn)))
print('Training R2: {:.3f} +/- {:.3f}'.format(np.mean(r2s_train_cn), np.std(r2s_train_cn)))
print('Test MAE: {:.3f} +/- {:.3f}'.format(np.mean(maes_test_cn), np.std(maes_test_cn)))
print('Test R2: {:.3f} +/- {:.3f}'.format(np.mean(r2s_test_cn), np.std(r2s_test_cn)))
# The second target is YSI; this section was previously mislabelled "LHV".
print('\nYSI:')
print('Training MAE: {:.3f} +/- {:.3f}'.format(np.mean(maes_train_ysi), np.std(maes_train_ysi)))
print('Training R2: {:.3f} +/- {:.3f}'.format(np.mean(r2s_train_ysi), np.std(r2s_train_ysi)))
print('Test MAE: {:.3f} +/- {:.3f}'.format(np.mean(maes_test_ysi), np.std(maes_test_ysi)))
print('Test R2: {:.3f} +/- {:.3f}'.format(np.mean(r2s_test_ysi), np.std(r2s_test_ysi)))

Model 1
Model 2
Model 3
Model 4
Model 5
Model 6
Model 7
Model 8
Model 9
Model 10
CN:
Training MAE: 0.054 +/- 0.015
Training R2: 0.965 +/- 0.030
Test MAE: 0.260 +/- 0.047
Test R2: 0.580 +/- 0.310

LHV:
Training MAE: 0.046 +/- 0.020
Training R2: 0.917 +/- 0.113
Test MAE: 0.092 +/- 0.035
Test R2: 0.697 +/- 0.381
