In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
seed = 42
tf.random.set_seed(seed)
rng = np.random.default_rng(seed=seed)

In [3]:
data = pd.read_csv('..\\Data\\ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
col_names = data.columns
#x_cols = ['P_VSA_MR_5', 'Mor04m', 'E1p', 'Mor22s', 'LUMO / eV']
x_cols = ['VE2_G/D', 'Eig14_EA(dm)', 'Mor31m', 'TDB04u', 'HATS1e']
X_full = data[col_names[3:]]
y = data[col_names[2]]

In [5]:
X_scaled = MinMaxScaler(feature_range=(0,1)).fit_transform(X_full)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.1, random_state=seed)

In [7]:
def get_model():
    model = keras.models.Sequential([
        keras.layers.Dense(50, activation='relu', input_shape=(len(x_cols),)),
        keras.layers.Dense(20, activation='relu'),
        keras.layers.Dense(10, activation='relu'),
        keras.layers.Dense(1)
        ])
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.01),
        loss='mean_squared_error')
    return model

In [8]:
model = get_model()

In [9]:
weights = model.get_weights()

In [10]:
results = []

In [11]:
x_col_idxs = [col_names.tolist().index(x) for x in x_cols]
x_col_idxs

[534, 460, 682, 538, 850]

In [12]:
X_train_sel = X_train[:, x_col_idxs]
X_val_sel = X_valid[:, x_col_idxs]
history = model.fit(X_train_sel, y_train, validation_data=(X_val_sel, y_valid), verbose=0, epochs=25)
results.append({'idxs': x_col_idxs, 'loss': history.history['loss'][-1], 'val loss': history.history['val_loss'][-1]})

In [13]:
for _ in range(25):
    model = get_model()
    model.set_weights(weights)
    idxs = rng.choice(len(col_names)-3, size=len(x_cols), replace=False)
    X_train_sel = X_train[:, idxs]
    X_val_sel = X_valid[:, idxs]
    history = model.fit(X_train_sel, y_train, validation_data=(X_val_sel, y_valid), verbose=0, epochs=25)
    results.append({'idxs': idxs, 'loss': history.history['loss'][-1], 'val loss': history.history['val_loss'][-1]})

In [14]:
results

[{'idxs': [534, 460, 682, 538, 850],
  'loss': 0.060117561370134354,
  'val loss': 0.0277117732912302},
 {'idxs': array([972, 552, 823, 112, 545], dtype=int64),
  'loss': 0.05426488071680069,
  'val loss': 0.018380610272288322},
 {'idxs': array([1227,  118,  661,  959,  926], dtype=int64),
  'loss': 0.05677643418312073,
  'val loss': 0.040437936782836914},
 {'idxs': array([ 629, 1054,  566,  466,  230], dtype=int64),
  'loss': 0.052160825580358505,
  'val loss': 0.027218176051974297},
 {'idxs': array([ 685,  286,  567,  557, 1033], dtype=int64),
  'loss': 0.051707673817873,
  'val loss': 0.02054666168987751},
 {'idxs': array([ 208, 1078, 1040,  348,  795], dtype=int64),
  'loss': 0.05945942550897598,
  'val loss': 0.03363513574004173},
 {'idxs': array([1123,  980,  560, 1219,  853], dtype=int64),
  'loss': 0.0623667873442173,
  'val loss': 0.031627316027879715},
 {'idxs': array([ 55, 625, 687, 936, 194], dtype=int64),
  'loss': 0.0488118939101696,
  'val loss': 0.03139013424515724},
 {

In [15]:
y_valid

0     0.328
5     0.896
36    0.817
45    0.765
13    0.893
54    0.733
Name: LinIE ZE41, dtype: float64

In [16]:
val_losses = [r['val loss'] for r in results]

In [17]:
val_losses = sorted(val_losses)

In [18]:
val_losses.index(results[0]['val loss'])

12

In [19]:
len(val_losses)

26

In [20]:
val_losses

[0.007308709900826216,
 0.007840313948690891,
 0.012577741406857967,
 0.016792016103863716,
 0.018380610272288322,
 0.02054666168987751,
 0.02238452434539795,
 0.022664234042167664,
 0.024170612916350365,
 0.02543029375374317,
 0.02622690051794052,
 0.027218176051974297,
 0.0277117732912302,
 0.03139013424515724,
 0.031627316027879715,
 0.031837183982133865,
 0.03363513574004173,
 0.03391077741980553,
 0.03537534177303314,
 0.03884335607290268,
 0.040437936782836914,
 0.04283299669623375,
 0.04534071311354637,
 0.04692922532558441,
 0.05191443860530853,
 0.0619671493768692]

In [21]:
np.mean(val_losses)

0.030203625900097765

In [22]:
results[0]['val loss']

0.0277117732912302