In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
random_state = 42
#seeds = np.arange(100)
seeds = [42]

In [3]:
data = pd.read_csv('..\\Data\\ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
col_names = data.columns
x_cols = ['P_VSA_MR_5', 'Mor04m', 'E1p', 'Mor22s', 'LUMO / eV']
#x_cols = ['VE2_G/D', 'Eig14_EA(dm)', 'Mor31m', 'TDB04u', 'HATS1e']
X_full = data[col_names[3:]]
y = data[col_names[2]]

In [5]:
X_scaled = MinMaxScaler(feature_range=(0,1)).fit_transform(X_full)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.1, random_state=random_state)

In [7]:
def get_model():
    model = keras.models.Sequential([
        keras.layers.GaussianNoise(stddev=0.1, input_shape=(len(x_cols),)),
        keras.layers.Dense(50, activation='relu'),
        keras.layers.Dense(20, activation='relu'),
        keras.layers.Dense(10, activation='relu'),
        keras.layers.Dense(1)
        ])
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.01),
        loss='mean_squared_error')
    return model

In [8]:
model = get_model()
weights = model.get_weights()

In [9]:
x_col_idxs = [col_names.tolist().index(x) for x in x_cols]
X_train_sel = X_train[:, x_col_idxs]
X_val_sel = X_valid[:, x_col_idxs]
history = model.fit(X_train_sel, y_train, validation_data=(X_val_sel, y_valid), verbose=0, epochs=25)

In [10]:
result = {tuple(sorted(x_col_idxs)) +(-1 ,-1): history.history['val_loss'][-1]}

In [11]:
for seed in seeds:
    print(seed+1)
    rng = np.random.default_rng(seed=seed)
    for i in range(100):
        model = get_model()
        model.set_weights(weights)
        idxs = rng.choice(len(col_names)-3, size=len(x_cols), replace=False)
        X_train_sel = X_train[:, idxs]
        X_val_sel = X_valid[:, idxs]
        history = model.fit(X_train_sel, y_train, validation_data=(X_val_sel, y_valid), verbose=0, epochs=25)
        result[tuple(sorted(idxs)) + (seed, i)] = history.history['val_loss'][-1]

43


In [12]:
len(result)

101

In [13]:
min(result.items(), key=lambda x: x[1])

((409, 466, 516, 1140, 1215, 42, 7), 0.008951899595558643)

In [14]:
val_loss_list = list(result.values())

In [15]:
np.mean(val_loss_list)

0.034060123911367195

In [16]:
np.min(val_loss_list)

0.008951899595558643

In [31]:
best_random = [X_full.columns[i] for i in [409, 466, 516, 1140, 1215]]
sorted(best_random)

['B04[C-C]', 'CATS3D_03_DP', 'Eig11_AEA(ed)', 'J_RG', 'SpMAD_EA(ed)']

In [18]:
#np.save('random_search.npy', result)

In [19]:
res_array = []
for key, value in result.items():
    res_array.append([k for k in key] + [value])

In [20]:
res_array = np.array(res_array)

In [21]:
res_array.shape

(101, 8)

In [22]:
import matplotlib.pyplot as plt

In [23]:
res_array = res_array[:, [5, 6, 0, 1, 2, 3, 4, 7]]

In [24]:
res_df = pd.DataFrame(res_array)

In [25]:
res_df.columns=['seed', 'run', 'idx_0', 'idx_1', 'idx_2', 'idx_3', 'idx_4', 'score']

In [26]:
for col in ['seed', 'run', 'idx_0', 'idx_1', 'idx_2', 'idx_3', 'idx_4']:
    res_df[col] = res_df[col].astype(int)

In [27]:
res_df

Unnamed: 0,seed,run,idx_0,idx_1,idx_2,idx_3,idx_4,score
0,-1,-1,370,657,758,791,1258,0.034692
1,42,0,112,545,552,823,972,0.018517
2,42,1,118,661,926,959,1227,0.043725
3,42,2,230,466,566,629,1054,0.026725
4,42,3,286,557,567,685,1033,0.024174
...,...,...,...,...,...,...,...,...
96,42,95,71,307,842,1009,1066,0.047431
97,42,96,562,877,1031,1055,1062,0.033027
98,42,97,156,634,679,876,938,0.054794
99,42,98,66,195,563,924,1023,0.060844


In [28]:
res_df[res_df['seed']==42].min()

seed      42.000000
run        0.000000
idx_0      6.000000
idx_1    121.000000
idx_2    160.000000
idx_3    215.000000
idx_4    668.000000
score      0.008952
dtype: float64

In [33]:
res_df[(res_df['score']<0.009) & (res_df['score']>0.0089)]

Unnamed: 0,seed,run,idx_0,idx_1,idx_2,idx_3,idx_4,score
8,42,7,409,466,516,1140,1215,0.008952


In [34]:
br42 = [X_full.columns[i] for i in [409, 466, 516, 1140, 1215]]
sorted(br42)

['B04[C-C]', 'CATS3D_03_DP', 'Eig11_AEA(ed)', 'J_RG', 'SpMAD_EA(ed)']