In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
random_state = 13
seeds = np.arange(100)
#seeds = [42]

In [3]:
data = pd.read_csv('..\\Data\\ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
col_names = data.columns
x_cols = ['P_VSA_MR_5', 'Mor04m', 'E1p', 'Mor22s', 'LUMO / eV']
#x_cols = ['VE2_G/D', 'Eig14_EA(dm)', 'Mor31m', 'TDB04u', 'HATS1e']
X = data[col_names[3:]]
y = data[col_names[1]]

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=random_state)
[X_train, X_valid, y_train, y_valid] = [pd.DataFrame(x) for x in [X_train, X_valid, y_train, y_valid]]

In [6]:
scalex = MinMaxScaler(feature_range=(-1,1))
scalex.fit(X_train)
[X_train_sc, X_valid_sc] = [pd.DataFrame(scalex.transform(x), columns=X.columns) for x in [X_train, X_valid]]

In [7]:
scaley = MinMaxScaler(feature_range=(0, 1))
scaley.fit(y_train)
[y_train_sc, y_valid_sc] = [pd.DataFrame(scaley.transform(y), columns=y.columns) for y in [y_train, y_valid]]

In [8]:
def get_model():
    model = keras.models.Sequential([
        keras.layers.GaussianNoise(stddev=0.1, input_shape=(len(x_cols),)),
        keras.layers.Dense(50, activation='relu'),
        keras.layers.Dense(20, activation='relu'),
        keras.layers.Dense(10, activation='relu'),
        keras.layers.Dense(1)
        ])
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.01),
        loss='mean_squared_error')
    return model

In [9]:
losses = []
predictions = pd.DataFrame(y_valid)

for seed in seeds:
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed)
    idxs = np.random.default_rng(seed=seed).choice(len(col_names)-3, size=len(x_cols), replace=False)
    model = get_model()
    X_train_sel = X_train_sc[X_train_sc.columns[idxs]]
    X_val_sel = X_valid_sc[X_valid.columns[idxs]]
    history = model.fit(X_train_sel, y_train_sc, validation_data=(X_val_sel, y_valid_sc), epochs=25, verbose=0)
    losses.append(history.history['val_loss'][-1])
    y_pred = model.predict(tf.convert_to_tensor(X_val_sel))
    predictions[seed] = scaley.inverse_transform(y_pred)
    if seed%10 == 9:
        print('Training {:.2f}% done'.format((seed+1)*100/len(seeds)))

Training 10.00% done


Training 20.00% done


Training 30.00% done


Training 40.00% done


Training 50.00% done


Training 60.00% done


Training 70.00% done
Training 80.00% done


Training 90.00% done


Training 100.00% done


In [10]:
np.mean(losses)

0.25533721342682836

In [11]:
np.std(losses)

0.9267275489306924

In [12]:
means = predictions[predictions.columns[1:]].mean(axis=1).to_numpy()

In [13]:
stds = predictions[predictions.columns[1:]].std(axis=1)

In [14]:
def rmse(x, y):
    return np.sqrt(((x-y)**2).mean())

In [15]:
yv = y_valid['inhibition efficiency ZE41 / %'].to_numpy()

In [16]:
from scipy.stats import linregress

In [17]:
linregress(means, yv)

LinregressResult(slope=0.5446015725565521, intercept=-28.269050021280417, rvalue=0.2059284813731311, pvalue=0.5208053848181026, stderr=0.8183763306122817, intercept_stderr=41.35437797120769)

In [18]:
r,p = linregress(means, yv)[2:4]

In [19]:
print('{:.2f}, {:.0f}, {:.2f}, {:.3f}'.format(r**2, rmse(yv, means), r, p))

0.04, 91, 0.21, 0.521


In [20]:
means2 = means[:4] + means[5]
yv2 = yv[:4] + yv[5]

In [21]:
r2,p2 = linregress(means2, yv2)[2:4]

In [22]:
print('{:.2f}, {:.0f}, {:.2f}, {:.3f}'.format(r2**2, rmse(yv2, means2), r2, p2))

0.22, 129, -0.47, 0.527
