In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
random_state = 42
seeds = np.arange(100)

In [3]:
data = pd.read_csv('..\\Data\\ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
col_names = data.columns
x_cols = ['P_VSA_MR_5', 'Mor04m', 'E1p', 'Mor22s', 'LUMO / eV']
#x_cols = ['VE2_G/D', 'Eig14_EA(dm)', 'Mor31m', 'TDB04u', 'HATS1e']
X_full = data[col_names[3:]]
y = data[col_names[2]]

In [5]:
X_scaled = MinMaxScaler(feature_range=(0,1)).fit_transform(X_full)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.1, random_state=random_state)

In [7]:
def get_model():
    model = keras.models.Sequential([
        keras.layers.Dense(50, activation='relu', input_shape=(len(x_cols),)),
        keras.layers.Dense(20, activation='relu'),
        keras.layers.Dense(10, activation='relu'),
        keras.layers.Dense(1)
        ])
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=0.01),
        loss='mean_squared_error')
    return model

In [8]:
model = get_model()
weights = model.get_weights()

In [9]:
x_col_idxs = [col_names.tolist().index(x) for x in x_cols]
X_train_sel = X_train[:, x_col_idxs]
X_val_sel = X_valid[:, x_col_idxs]
history = model.fit(X_train_sel, y_train, validation_data=(X_val_sel, y_valid), verbose=0, epochs=25)

In [10]:
result = {tuple(sorted(x_col_idxs)) +(-1 ,-1): history.history['val_loss'][-1]}

In [11]:
for seed in seeds:
    print(seed+1)
    rng = np.random.default_rng(seed=seed)
    for i in range(100):
        model = get_model()
        model.set_weights(weights)
        idxs = rng.choice(len(col_names)-3, size=len(x_cols), replace=False)
        X_train_sel = X_train[:, idxs]
        X_val_sel = X_valid[:, idxs]
        history = model.fit(X_train_sel, y_train, validation_data=(X_val_sel, y_valid), verbose=0, epochs=25)
        result[tuple(sorted(idxs)) + (seed, i)] = history.history['val_loss'][-1]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


In [12]:
len(result)

10001

In [13]:
min(result.items(), key=lambda x: x[1])

((176, 361, 384, 1062, 1233, 58, 1), 0.0041495985351502895)

In [16]:
val_loss_list = list(result.values())

In [17]:
np.mean(val_loss_list)

0.03346603691032956

In [18]:
np.min(val_loss_list)

0.0041495985351502895

In [27]:
best_random = [X_full.columns[i] for i in [176, 361, 384, 1062, 1233]]
sorted(best_random)

['CATS2D_04_DP', 'CATS3D_02_AP', 'Eta_betaS', 'P_VSA_LogP_4', 'VE1_B(s)']

In [26]:
np.save('random_search.npy', result)

In [32]:
res_array = []
for key, value in result.items():
    res_array.append([k for k in key] + [value])

In [34]:
res_array = np.array(res_array)

In [36]:
res_array.shape

(10001, 8)

In [37]:
import matplotlib.pyplot as plt

In [41]:
res_array = res_array[:, [5, 6, 0, 1, 2, 3, 4, 7]]

In [45]:
res_df = pd.DataFrame(res_array)

In [47]:
res_df.columns=['seed', 'run', 'idx_0', 'idx_1', 'idx_2', 'idx_3', 'idx_4', 'score']

In [53]:
for col in ['seed', 'run', 'idx_0', 'idx_1', 'idx_2', 'idx_3', 'idx_4']:
    res_df[col] = res_df[col].astype(int)

In [55]:
res_df

Unnamed: 0,seed,run,idx_0,idx_1,idx_2,idx_3,idx_4,score
0,-1,-1,370,657,758,791,1258,0.038656
1,0,0,339,387,643,800,1068,0.029177
2,0,1,634,764,816,1021,1148,0.045479
3,0,2,348,703,845,1027,1175,0.031528
4,0,3,42,221,917,961,1065,0.026154
...,...,...,...,...,...,...,...,...
9996,99,95,17,31,135,453,942,0.028576
9997,99,96,19,103,270,811,975,0.055681
9998,99,97,404,760,996,1094,1143,0.061254
9999,99,98,320,500,567,580,582,0.026083


In [56]:
res_df[res_df['seed']==42].min()

seed      42.000000
run        0.000000
idx_0      6.000000
idx_1    121.000000
idx_2    160.000000
idx_3    215.000000
idx_4    668.000000
score      0.005151
dtype: float64

In [59]:
res_df[(res_df['score']<0.00516) & (res_df['score']>0.00515)]

Unnamed: 0,seed,run,idx_0,idx_1,idx_2,idx_3,idx_4,score
4208,42,7,409,466,516,1140,1215,0.005151


In [60]:
br42 = [X_full.columns[i] for i in [409, 466, 516, 1140, 1215]]
sorted(br42)

['B04[C-C]', 'CATS3D_03_DP', 'Eig11_AEA(ed)', 'J_RG', 'SpMAD_EA(ed)']