In [1]:
from csv import DictReader

# Read every row of the master CSV into memory as a dict keyed by the header row.
# newline='' is how the csv module asks for files to be opened, so that line
# breaks embedded in quoted fields are handed to the parser untouched.
with open('data/combustdb_master.csv', 'r', newline='') as csv_file:
    reader = DictReader(csv_file)
    rows = list(reader)
# The `with` statement closes the file on exit; no explicit close() is needed.

# Drop rows whose cetane-number field is the literal '-'
# (presumably the marker for "no measurement" - confirm against the data source).
compounds = [r for r in rows if r['properties.cetane_number.value'] != '-']
print(len(compounds))

408


In [2]:
from alvadescpy import smiles_to_descriptors

# Run alvadescpy's smiles_to_descriptors on each compound's canonical SMILES
# string; results are stored in the same order as `compounds`.
descriptors = []
for compound in compounds:
    descriptors.append(smiles_to_descriptors(compound['canonical_smiles']))

In [3]:
import pandas as pd
import numpy as np

# Tabulate the descriptor records: one row per compound, one column per key.
df = pd.DataFrame(descriptors)

# Take an explicit object-dtype copy so the DataFrame's own data is never
# modified and the comparison below is evaluated element by element.
X = df.to_numpy(dtype=object, copy=True)
# Entries equal to the string 'na' are replaced by 0.0 with one boolean-mask
# assignment instead of a Python loop over every cell.
X[X == 'na'] = 0.0
X = X.astype('float32')

# Targets as a float32 column with one row per compound, in the same order
# as the rows of X (both are built by iterating `compounds`/`descriptors`).
y = np.array(
    [[float(c['properties.cetane_number.value'])] for c in compounds],
    dtype='float32'
)

print(X.shape, y.shape)

(408, 5305) (408, 1)


In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the full feature matrix and apply it to that same matrix
# in a single call; `scaler` stays available for later use.
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
print(X_norm.shape)

(408, 5305)


In [5]:
from sklearn.decomposition import PCA

# No n_components is passed, so the number of retained components is left to
# scikit-learn's default; the full SVD solver is requested explicitly.
pca = PCA(svd_solver='full', random_state=0).fit(X_norm)

# Project the scaled features onto the fitted principal axes.
X_pca = pca.transform(X_norm)
print(X_pca.shape)

(408, 408)


In [6]:
from sklearn.model_selection import LeaveOneOut
from ecnet import ECNet
from ecnet.datasets import QSPRDatasetFromValues
import torch

# Leave-one-out cross-validation: every sample is held out once and predicted
# by models trained on all the remaining samples.
loo = LeaveOneOut()
print(f'Number of splits: {loo.get_n_splits(X)}')

# NOTE(review): X_pca was produced by a scaler and a PCA that were both fit on
# every row, including whichever row is held out in a given split - confirm
# that this is acceptable for the intended evaluation.

# Per-split mean prediction over the repeated models.
preds = []
# Per-split spread of the repeated predictions. NOTE: this name shadows the
# built-in `vars`, and the values are standard deviations (np.std), not
# variances; the name is kept because the results-writing cell reads it.
vars = []
for i, (train_index, test_index) in enumerate(loo.split(X), start=1):

    print(f'SPLIT: {i}')

    X_train = X_pca[train_index]
    y_train = y[train_index]
    X_test = torch.tensor(X_pca[test_index], dtype=torch.float32)

    train_set = QSPRDatasetFromValues(X_train, y_train)

    # Take the network's input width from the data rather than hard-coding
    # 408, so the loop keeps working if the number of PCA columns changes.
    n_inputs = X_train.shape[1]

    # Train 10 independently initialised models on the same training set and
    # collect each one's prediction for the held-out sample.
    _pred = []
    for _ in range(10):
        # Positional arguments are presumably (input width, output width,
        # hidden width, number of hidden layers) - confirm against ECNet docs.
        _model = ECNet(n_inputs, 1, 256, 1)
        # The value returned by fit() is not used here.
        _model.fit(dataset=train_set, epochs=100)
        _pred.append(_model(X_test).detach().numpy()[0])
    preds.append(np.mean(_pred))
    vars.append(np.std(_pred))

Number of splits: 408
SPLIT: 1
SPLIT: 2
SPLIT: 3
SPLIT: 4
SPLIT: 5
SPLIT: 6
SPLIT: 7
SPLIT: 8
SPLIT: 9
SPLIT: 10
SPLIT: 11
SPLIT: 12
SPLIT: 13
SPLIT: 14
SPLIT: 15
SPLIT: 16
SPLIT: 17
SPLIT: 18
SPLIT: 19
SPLIT: 20
SPLIT: 21
SPLIT: 22
SPLIT: 23
SPLIT: 24
SPLIT: 25
SPLIT: 26
SPLIT: 27
SPLIT: 28
SPLIT: 29
SPLIT: 30
SPLIT: 31
SPLIT: 32
SPLIT: 33
SPLIT: 34
SPLIT: 35
SPLIT: 36
SPLIT: 37
SPLIT: 38
SPLIT: 39
SPLIT: 40
SPLIT: 41
SPLIT: 42
SPLIT: 43
SPLIT: 44
SPLIT: 45
SPLIT: 46
SPLIT: 47
SPLIT: 48
SPLIT: 49
SPLIT: 50
SPLIT: 51
SPLIT: 52
SPLIT: 53
SPLIT: 54
SPLIT: 55
SPLIT: 56
SPLIT: 57
SPLIT: 58
SPLIT: 59
SPLIT: 60
SPLIT: 61
SPLIT: 62
SPLIT: 63
SPLIT: 64
SPLIT: 65
SPLIT: 66
SPLIT: 67
SPLIT: 68
SPLIT: 69
SPLIT: 70
SPLIT: 71
SPLIT: 72
SPLIT: 73
SPLIT: 74
SPLIT: 75
SPLIT: 76
SPLIT: 77
SPLIT: 78
SPLIT: 79
SPLIT: 80
SPLIT: 81
SPLIT: 82
SPLIT: 83
SPLIT: 84
SPLIT: 85
SPLIT: 86
SPLIT: 87
SPLIT: 88
SPLIT: 89
SPLIT: 90
SPLIT: 91
SPLIT: 92
SPLIT: 93
SPLIT: 94
SPLIT: 95
SPLIT: 96
SPLIT: 97
SPLIT: 98
SPLIT: 

In [7]:
from csv import DictWriter

# Write one CSV row per completed split: the mean prediction, the measured
# target value, and the spread of the repeated predictions.
# NOTE: the 'var' column is filled from `vars`, which holds np.std values.
headers = ['pred', 'actual', 'var']
# newline='' disables platform newline translation, so the explicit
# lineterminator='\n' below is what actually ends up in the file.
with open('pca_results.csv', 'w', encoding='utf8', newline='') as csv_file:
    writer = DictWriter(csv_file, headers, delimiter=',', lineterminator='\n')
    writer.writeheader()
    for i, pred in enumerate(preds):
        writer.writerow({
            'pred': pred,
            'actual': y[i][0],
            'var': vars[i]
        })