In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Seed for the train/validation split, fixed so the split is repeatable.
random_state = 42
# One seed per repetition of the experiment (0..99); each seed drives both the
# TensorFlow RNG and the random feature draw in the training loop.
# For a quick single-run check, use `seeds = [42]` instead.
seeds = np.arange(0, 100)

In [3]:
# Load the descriptor table: semicolon-separated, decimal commas, header in row 0.
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike; backslashes are only a path separator on Windows.
data = pd.read_csv('../Data/ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
col_names = data.columns
# Reference descriptor subset. In the cells shown here only its length is used
# (network input width and number of randomly drawn features).
x_cols = ['P_VSA_MR_5', 'Mor04m', 'E1p', 'Mor22s', 'LUMO / eV']
# Alternative subset: ['VE2_G/D', 'Eig14_EA(dm)', 'Mor31m', 'TDB04u', 'HATS1e']

# Candidate features: every column from the fourth one onwards.
X = data.loc[:, col_names[3:]]
# Target: the second column of the table.
y = data.loc[:, col_names[1]]

In [5]:
# Hold out 10% of the rows for validation (split fixed by random_state) and wrap
# every part in a DataFrame so the target parts are 2-D frames as well.
X_train, X_valid, y_train, y_valid = (
    pd.DataFrame(part)
    for part in train_test_split(X, y, test_size=0.1, random_state=random_state)
)

In [6]:
# Scale every feature to [-1, 1]; the scaler is fitted on the training rows only.
scalex = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
# Rebuild labelled frames from the transformed values; the row index restarts at 0.
X_train_sc = pd.DataFrame(scalex.transform(X_train), columns=X.columns)
X_valid_sc = pd.DataFrame(scalex.transform(X_valid), columns=X.columns)

In [7]:
# Scale the target to [0, 1], again fitting on the training rows only.
scaley = MinMaxScaler(feature_range=(0, 1)).fit(y_train)
# Rebuild labelled frames from the transformed values; the row index restarts at 0.
y_train_sc = pd.DataFrame(scaley.transform(y_train), columns=y_train.columns)
y_valid_sc = pd.DataFrame(scaley.transform(y_valid), columns=y_valid.columns)

In [8]:
def get_model(n_features=None, noise_stddev=0.1, learning_rate=0.01):
    """Build and compile a small fully connected regression network.

    Architecture: GaussianNoise on the inputs -> Dense(50, relu) ->
    Dense(20, relu) -> Dense(10, relu) -> Dense(1), compiled with the Adam
    optimizer and a mean-squared-error loss.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. When omitted it falls back to
        ``len(x_cols)``, read from the notebook's global at call time, so
        ``get_model()`` behaves exactly as before.
    noise_stddev : float, default 0.1
        Standard deviation passed to the GaussianNoise input layer.
    learning_rate : float, default 0.01
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    if n_features is None:
        # Backward-compatible default: use the size of the global feature list.
        n_features = len(x_cols)

    model = keras.models.Sequential([
        keras.layers.GaussianNoise(stddev=noise_stddev, input_shape=(n_features,)),
        keras.layers.Dense(50, activation='relu'),
        keras.layers.Dense(20, activation='relu'),
        keras.layers.Dense(10, activation='relu'),
        # Single linear output unit for the regression target.
        keras.layers.Dense(1)
        ])
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss='mean_squared_error')
    return model

In [9]:
# Repeat the experiment once per seed: draw a random feature subset of size
# len(x_cols), train a fresh model on it, and record the final validation loss
# and the validation predictions (mapped back to the original target scale).
losses = []
# Per-seed predictions are collected here and assembled into one frame after
# the loop; inserting many columns one at a time fragments the DataFrame.
pred_by_seed = {}

# Loop invariants: number of candidate features and size of each random draw.
n_candidates = X_train_sc.shape[1]
n_selected = len(x_cols)

# NOTE(review): only TensorFlow's global seed is set per run. If bit-for-bit
# reproducibility matters, confirm whether the installed Keras version also
# draws from the Python/NumPy global RNGs.
for run_no, seed in enumerate(seeds, start=1):
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed)

    # Random feature subset for this run, reproducible via the seed.
    idxs = np.random.default_rng(seed=seed).choice(n_candidates, size=n_selected, replace=False)
    selected_cols = X_train_sc.columns[idxs]
    X_train_sel = X_train_sc[selected_cols]
    X_val_sel = X_valid_sc[selected_cols]

    model = get_model()
    history = model.fit(X_train_sel, y_train_sc, validation_data=(X_val_sel, y_valid_sc), epochs=25, verbose=0)
    # Keep only the validation loss of the last epoch.
    losses.append(history.history['val_loss'][-1])

    # verbose=0 keeps predict's progress output out of the loop's log.
    y_pred = model.predict(tf.convert_to_tensor(X_val_sel), verbose=0)
    pred_by_seed[seed] = scaley.inverse_transform(y_pred).ravel()

    # Progress is based on the iteration count, not the seed value, so it stays
    # correct for any list of seeds.
    if run_no % 10 == 0:
        print('Training {:.2f}% done'.format(run_no*100/len(seeds)))

# First column: true validation targets; then one prediction column per seed.
predictions = pd.concat(
    [y_valid, pd.DataFrame(pred_by_seed, index=y_valid.index)],
    axis=1,
)

Training 10.00% done


Training 20.00% done


Training 30.00% done


Training 40.00% done


Training 50.00% done


Training 60.00% done


Training 70.00% done
Training 80.00% done


Training 90.00% done


Training 100.00% done


In [10]:
# Mean of the final-epoch validation losses over all seeds.
np.asarray(losses).mean()

0.05153697581961751

In [11]:
# Spread of the final-epoch validation losses over all seeds
# (NumPy default: population standard deviation, ddof=0).
np.asarray(losses).std()

0.06514222976274076

In [12]:
# Per-sample mean prediction across seeds (column 0 is the true target, so skip it).
predictions.iloc[:, 1:].mean(axis=1)

0    -41.964741
5    -16.331856
36   -23.956989
45   -26.986404
13    28.683794
54   -27.655865
dtype: float32

In [13]:
# Per-sample spread of the predictions across seeds, skipping the true-target
# column (pandas default: sample standard deviation, ddof=1).
predictions.iloc[:, 1:].std(axis=1)

0      48.060986
5      30.418253
36     43.225697
45     32.819363
13    105.863937
54     37.101299
dtype: float32