In [14]:
import pandas as pd
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn import preprocessing
from keras import models, layers, regularizers
import eli5
from eli5.sklearn import PermutationImportance
from utils import path

In [15]:
# Load the molecular descriptor table. The SMILES column is dropped so that
# only the remaining descriptor columns are left as model inputs.
molecular_descriptor = pd.read_csv(path.molecular_descriptor_train_path)
molecular_descriptor = molecular_descriptor.drop(labels = 'SMILES', axis = 1)

# Descriptor column names, captured before the target column is appended.
head = molecular_descriptor.columns
print(head)

# Load the activity table and drop its SMILES column as well.
era_activity = pd.read_csv(path.era_activity_train_path)
era_activity = era_activity.drop(labels = 'SMILES', axis = 1)

# The target column is attached by index alignment, so a row-count mismatch
# would silently produce NaN targets (or discard rows) instead of failing.
assert len(molecular_descriptor) == len(era_activity), \
    'descriptor and activity tables must have the same number of rows'

# Build the combined frame as a NEW object: `assign` returns a copy, so
# `molecular_descriptor` keeps only descriptor columns and stays in sync
# with `head`.
dataset_df = molecular_descriptor.assign(pIC50 = era_activity['pIC50'])

# Min-max scale every column. The scaler is fitted on the pIC50 column too,
# so `y_train` (and the reported loss / MAE) are in scaled units, not raw pIC50.
min_max_scaler = preprocessing.MinMaxScaler()
dataset = min_max_scaler.fit_transform(dataset_df.values)
print(dataset.shape)

# Last column is the target; everything before it is a feature.
x_train = dataset[:, :-1]
y_train = dataset[:, -1]

print(x_train.shape)
print(y_train.shape)


def build_model(input_dim = None):
    """Build and compile the fully connected regression network.

    The model is constructed inside a function because the same architecture
    needs to be instantiated multiple times.

    Args:
        input_dim: Number of input features. When omitted, the column count
            of the module-level ``x_train`` array is used.

    Returns:
        A compiled ``Sequential`` model with the layers
        Dense(128, relu) -> Dense(64, relu) -> Dense(64, relu) -> Dense(1),
        compiled with the 'adam' optimizer, 'mse' loss and 'mae' metric.
        The layer summary is printed as a side effect.
    """
    if input_dim is None:
        input_dim = x_train.shape[1]

    model = models.Sequential()
    # Only the first layer declares the input shape; every later layer takes
    # its input size from the layer before it.
    model.add(layers.Dense(128, activation = 'relu',
                           input_shape = (input_dim,)))
    model.add(layers.Dense(64, activation = 'relu'))
    model.add(layers.Dense(64, activation = 'relu'))
    # Single output unit with no activation for the scalar regression target.
    model.add(layers.Dense(1))
    model.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae'])
    model.summary()
    return model


model = build_model()
# Both callbacks watch the validation loss rather than the training loss:
# training is run with a validation split, and the training loss keeps
# falling even while the model overfits, so it would not trigger them
# when it matters.
# Stop once val_loss has not improved for 10 consecutive epochs.
early_stop = EarlyStopping(monitor = 'val_loss', patience = 10)
# Halve the learning rate once val_loss has not improved for 3 epochs.
lr_reduce = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.5, patience = 3)

Index(['nAcid', 'ALogP', 'ALogp2', 'AMR', 'apol', 'naAromAtom', 'nAromBond',
       'nAtom', 'nHeavyAtom', 'nH',
       ...
       'MW', 'WTPT-1', 'WTPT-2', 'WTPT-3', 'WTPT-4', 'WTPT-5', 'WPATH', 'WPOL',
       'XLogP', 'Zagreb'],
      dtype='object', length=729)
(1974, 730)
(1974, 729)
(1974,)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 128)               93440     
_________________________________________________________________
dense_12 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_13 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 65        
Total params: 105,921
Trainable params: 105,921
Non-trainable params: 0
_____________________________________

In [16]:
# Train for up to 200 epochs with 20% of the samples held out for validation.
# The returned History object is kept so the per-epoch loss / metric curves
# can be inspected afterwards, instead of being discarded and only shown as a
# bare repr in the cell output.
history = model.fit(x_train, y_train, validation_split = 0.2, epochs = 200, batch_size = 16,
                    callbacks = [early_stop, lr_reduce], verbose = 2)
# Weight saving is currently disabled; uncomment to persist the trained weights.
# model.save_weights('../weights/t1.h5')

Train on 1579 samples, validate on 395 samples
Epoch 1/200
 - 1s - loss: 0.0186 - mean_absolute_error: 0.1042 - val_loss: 0.0231 - val_mean_absolute_error: 0.1263
Epoch 2/200
 - 0s - loss: 0.0110 - mean_absolute_error: 0.0794 - val_loss: 0.0272 - val_mean_absolute_error: 0.1394
Epoch 3/200
 - 0s - loss: 0.0098 - mean_absolute_error: 0.0750 - val_loss: 0.0254 - val_mean_absolute_error: 0.1353
Epoch 4/200
 - 0s - loss: 0.0092 - mean_absolute_error: 0.0723 - val_loss: 0.0285 - val_mean_absolute_error: 0.1425
Epoch 5/200
 - 0s - loss: 0.0089 - mean_absolute_error: 0.0711 - val_loss: 0.0278 - val_mean_absolute_error: 0.1395
Epoch 6/200
 - 0s - loss: 0.0087 - mean_absolute_error: 0.0711 - val_loss: 0.0267 - val_mean_absolute_error: 0.1371
Epoch 7/200
 - 0s - loss: 0.0078 - mean_absolute_error: 0.0666 - val_loss: 0.0252 - val_mean_absolute_error: 0.1288
Epoch 8/200
 - 0s - loss: 0.0082 - mean_absolute_error: 0.0689 - val_loss: 0.0263 - val_mean_absolute_error: 0.1347
Epoch 9/200
 - 0s - loss:

<keras.callbacks.History at 0x7f9eb7b82a10>

In [17]:
# Score the trained network by permutation importance, using negative MAE as
# the metric. Note that the importances are computed on the same
# x_train / y_train arrays the model was fitted on.
perm_estimator = PermutationImportance(
    model,
    scoring = 'neg_mean_absolute_error',
    random_state = 1,
)
perm = perm_estimator.fit(x_train, y_train)

In [18]:
# Render the permutation-importance table, labelling each weight with its
# descriptor column name.
feature_labels = head.tolist()
eli5.show_weights(perm, feature_names = feature_labels)

Weight,Feature
0.0040  ± 0.0003,maxdO
0.0037  ± 0.0005,maxaaN
0.0037  ± 0.0005,minaaN
0.0035  ± 0.0005,mindO
0.0032  ± 0.0003,maxHBint3
0.0030  ± 0.0002,minssO
0.0030  ± 0.0004,maxHBint8
0.0029  ± 0.0007,maxssO
0.0026  ± 0.0002,minsF
0.0025  ± 0.0008,maxHBint2
