In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Experiment configuration.
# `random_state` fixes the train/validation split; `seeds` holds one TensorFlow seed
# per repeated training run (0 .. N_SEEDS-1).
random_state = 42
N_SEEDS = 100
seeds = np.arange(N_SEEDS)

In [3]:
# Load the descriptor table (semicolon-separated, decimal comma, header in the first row).
# The relative path uses forward slashes: Python accepts them on Windows as well as on
# POSIX systems, whereas the backslash form only resolves on Windows.
data = pd.read_csv('../Data/ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
# Pick the modelling columns by position: the column at position 1 becomes the target,
# everything from position 3 onwards becomes the feature matrix. The columns at
# positions 0 and 2 are not selected here.
col_names = data.columns
target_name = col_names[1]
feature_names = col_names[3:]
y = data[target_name]
X = data[feature_names]

In [5]:
# Hold out 10 % of the rows for validation (the split is fixed by `random_state`), then
# wrap every part in a DataFrame so the target parts are 2-D like the feature parts.
splits = train_test_split(X, y, test_size=0.1, random_state=random_state)
X_train, X_valid, y_train, y_valid = (pd.DataFrame(part) for part in splits)

In [6]:
# Scale the features to [-1, 1]. The scaler is fitted on the training part only and
# then applied to both parts, so the validation rows do not influence the scaling.
# The scaled frames are rebuilt from the transformed values with the original column
# names; no index is passed, so they do not carry over the row labels of the inputs.
scalex = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
X_train_sc = pd.DataFrame(scalex.transform(X_train), columns=X.columns)
X_valid_sc = pd.DataFrame(scalex.transform(X_valid), columns=X.columns)

In [7]:
# Scale the target to [0, 1], again fitting on the training part only. `scaley` is
# kept so that predictions can later be mapped back with `inverse_transform`.
scaley = MinMaxScaler(feature_range=(0, 1)).fit(y_train)
y_train_sc = pd.DataFrame(scaley.transform(y_train), columns=y_train.columns)
y_valid_sc = pd.DataFrame(scaley.transform(y_valid), columns=y_valid.columns)

In [8]:
def get_model(n_features=None, noise_stddev=0.1, learning_rate=0.005):
    """Build and compile a fresh feed-forward regression network.

    Layers, in order: Gaussian noise on the inputs, three ReLU dense layers
    (100, 50 and 10 units) and a single linear output unit. The model is
    compiled with the Adam optimizer and mean-squared-error loss.

    Parameters
    ----------
    n_features : int, optional
        Number of input features. Defaults to the column count of the
        notebook-level ``X_train`` so that ``get_model()`` keeps working
        exactly as before.
    noise_stddev : float, default 0.1
        Standard deviation passed to the ``GaussianNoise`` input layer.
    learning_rate : float, default 0.005
        Learning rate passed to Adam.

    Returns
    -------
    keras.models.Sequential
        A newly created, compiled (untrained) model.
    """
    if n_features is None:
        # Fall back to the notebook's training frame, as the original code did.
        n_features = X_train.shape[1]

    model = keras.models.Sequential([
        keras.layers.GaussianNoise(stddev=noise_stddev, input_shape=(n_features,)),
        keras.layers.Dense(100, activation='relu'),
        keras.layers.Dense(50, activation='relu'),
        keras.layers.Dense(10, activation='relu'),
        keras.layers.Dense(1)
    ])
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss='mean_squared_error')
    return model

In [9]:
# Train one model per seed on the same split and record, for every run,
#   * the validation loss after the last epoch (`losses`), and
#   * the validation predictions mapped back to the original target scale.
# Predictions are collected in a dict and joined to `y_valid` once at the end:
# inserting 100 columns one by one fragments the frame, and building `predictions`
# with `pd.DataFrame(y_valid)` does not copy, so the inserts could leak into `y_valid`.
losses = []
seed_predictions = {}

for run, seed in enumerate(seeds, start=1):
    # Reset Keras' global state so models from earlier runs do not pile up.
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed)
    model = get_model()
    history = model.fit(X_train_sc, y_train_sc, validation_data=(X_valid_sc, y_valid_sc), epochs=25, verbose=0)
    losses.append(history.history['val_loss'][-1])
    # verbose=0 keeps predict as quiet as fit, so only the progress lines are printed.
    y_pred = model.predict(tf.convert_to_tensor(X_valid_sc), verbose=0)
    # inverse_transform returns one column per target; flatten it to a 1-D column.
    seed_predictions[seed] = scaley.inverse_transform(y_pred).ravel()
    # Progress is counted in completed runs, so it stays correct for any seed values.
    if run % 10 == 0:
        print('Training {:.2f}% done'.format(run*100/len(seeds)))

# First column: true validation targets; remaining columns: one per seed.
predictions = pd.concat(
    [y_valid, pd.DataFrame(seed_predictions, index=y_valid.index)],
    axis=1,
)

Training 10.00% done


Training 20.00% done


Training 30.00% done


Training 40.00% done


Training 50.00% done


Training 60.00% done


Training 70.00% done
Training 80.00% done


Training 90.00% done


Training 100.00% done


In [10]:
# Average final-epoch validation loss over all seeds (in scaled target units).
np.asarray(losses).mean()

0.037154702260158955

In [11]:
# Spread of the final-epoch validation loss over all seeds.
# NOTE(review): NumPy's default here is the population std (ddof=0), whereas the
# pandas `.std` used for the predictions further down defaults to ddof=1.
np.asarray(losses).std()

0.05322825759373066

In [13]:
# Mean prediction per validation sample across all seeds
# (the first column holds the true target and is skipped).
predictions.iloc[:, 1:].mean(axis=1)

0    -77.539284
5     14.464326
36     8.626293
45   -19.814734
13    64.303955
54    -4.009440
dtype: float32

In [15]:
# Seed-to-seed standard deviation of the prediction for each validation sample
# (the first column holds the true target and is skipped).
predictions.iloc[:, 1:].std(axis=1)

0     48.667503
5     41.444981
36    45.997875
45    37.874989
13    97.764748
54    39.345806
dtype: float32