In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
from tensorflow import keras
import tensorflow as tf

In [2]:
random_state=42

In [3]:
data = pd.read_csv('..\\Data\\ze41_mol_desc_db_red.csv', header=0, sep=';', decimal=',')

In [4]:
col_names = data.columns
X = data[col_names[3:]]
y = data[col_names[2]]

In [5]:
X_scaled = MinMaxScaler(feature_range=(0,1)).fit_transform(X)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X_scaled, y, test_size=0.1, random_state=random_state)
X_train = pd.DataFrame(X_train)
X_train.columns = X.columns
X_valid = pd.DataFrame(X_valid)
X_valid.columns = X.columns

In [7]:
rf_sel = ['VE2_G/D',
 'Eig14_EA(dm)',
 'Mor31m',
 'TDB04u',
 'HATS1e',
 'HATS3p',
 'RDF015p',
 'Mor17s',
 'N-072',
 'O-057',
 'MATS2m',
 'E2m',
 'RDF035v',
 'Dp',
 'CATS2D_00_AA',
 'CATS3D_03_DL',
 'SpPosA_D/Dt',
 'CATS2D_02_AP',
 'SpMaxA_EA(bo)',
 'F03[C-N]',
 'RDF080e',
 'Chi_RG',
 'HATS0e',
 'CATS2D_06_PP',
 'Eta_beta_A',
 'SaaNH',
 'MATS3v',
 'SpMax6_Bh(s)',
 'Mor09s',
 'CATS2D_02_PL',
 'HATS6p',
 'Chi_Dz(e)',
 'SsOH',
 'Mor19p',
 'H4i',
 'Eta_FL_A',
 'SAacc',
 'R6m+',
 'SpMAD_B(s)',
 'SpMAD_G/D',
 'R5m',
 'TDB04i',
 'nOHs',
 'SM07_EA(bo)',
 'GATS2m',
 'SpMax2_Bh(e)',
 'R4s',
 'GATS6v',
 'Ts',
 'CATS2D_07_DD',
 'CATS2D_04_AL',
 'F06[C-C]',
 'RDF045m',
 'nRNR2',
 'HATS2m',
 'H5v',
 'X3Av',
 'SIC1',
 'MATS6p',
 'RDF055v',
 'B03[O-O]',
 'G2p',
 'SpMAD_EA(bo)']

In [8]:
rfe_sel = ['P_VSA_MR_5',
 'P_VSA_LogP_2',
 'Mor22s',
 'Mor04m',
 'LUMO / eV',
 'E1p',
 'HOMO / eV',
 'MATS5v',
 'Mor14s',
 'Mor29v',
 'Mor14u',
 'GATS5v',
 'GATS2s',
 'MATS5m',
 'Mor32m',
 'H3m',
 'CATS3D_02_AP',
 'TDB04s',
 'R2e+',
 'E2s',
 'R5p+',
 'ISH',
 'DISPm',
 'R5i+',
 'Ds',
 'Mor04i',
 'E2m',
 'Mor28s',
 'TDB03m',
 'Mor19m',
 'Mor11u',
 'VE2sign_G',
 'Mor03s',
 'SpMAD_RG',
 'E2v',
 'R3s+',
 'R5e+',
 'R2u+',
 'Mor15i',
 'H0v',
 'T(N..O)',
 'E1i',
 'Eta_epsi_5',
 'E3e',
 'MATS4s',
 'Mor13u',
 'H1p',
 'X4Av',
 'Mor15s',
 'Hy',
 'HATS0p',
 'Eig03_AEA(dm)',
 'X3Av',
 'VE1sign_G',
 'GATS5m',
 'E2e',
 'Mor10e',
 'MATS8p',
 'TDB01m',
 'GATS4s',
 'TDB04m',
 'PJI3',
 'Mor16m']

In [9]:
training_cols = rf_sel
X_sel_train = X_train[training_cols]
X_sel_valid = X_valid[training_cols]

In [10]:
model = keras.models.Sequential([
    #keras.layers.GaussianNoise(stddev=0.1),
    keras.layers.Dense(50, activation='relu', input_shape=(len(rf_sel),)),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1)
])

In [11]:
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.005),
    loss='mean_squared_error')

In [12]:
history = model.fit(X_sel_train, y_train, validation_data=(X_sel_valid, y_valid), epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [13]:
y_pred = model.predict(X_sel_valid)

In [14]:
for i in range(y_pred.shape[0]):
    print('true: {:.0f}, predicted: {:.0f}'.format(y_valid.iloc[i]*345-270, y_pred[i, 0]*345-270))

true: -157, predicted: -45
true: 39, predicted: -39
true: 12, predicted: -26
true: -6, predicted: -54
true: 38, predicted: 35
true: -17, predicted: -24


In [15]:
y_valid

0     0.328
5     0.896
36    0.817
45    0.765
13    0.893
54    0.733
Name: LinIE ZE41, dtype: float64