In [52]:
import numpy as np
import math

import keras
from keras.models import  Sequential
from keras.layers import Activation, BatchNormalization
from keras.layers.core import Dense
from keras.optimizers import Adam, SGD
from keras.metrics import categorical_crossentropy

from rdkit import Chem, DataStructs
from rdkit.Chem import rdMolDescriptors, Draw
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import MinMaxScaler

In [53]:
'''DATA PREPARE'''
#read data file
def read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the text file at ``path``.

    ``readlines()`` keeps the trailing newline on every entry and turns a
    blank line (typically the last one in the file) into a bogus sample, so
    every line is stripped and empty lines are dropped.

    Args:
        path: location of a text file holding one record per line.

    Returns:
        A list of strings, one per non-blank line, in file order.
    """
    with open(path) as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]

array_SMILES_train = read_nonblank_lines('1k_SMILES_strings.txt')
array_charges_train = read_nonblank_lines('1k_SMILES_strings_charges.txt')
array_SMILES_test = read_nonblank_lines('10_SMILES_strings_test.txt')

# Each training molecule needs exactly one label; catch misaligned files here
# instead of deep inside model.fit.
if len(array_SMILES_train) != len(array_charges_train):
    raise ValueError(
        'training files are misaligned: %d SMILES strings but %d charges'
        % (len(array_SMILES_train), len(array_charges_train))
    )

In [54]:
#convert SMILES strings into molecules
def smiles_to_mols(smiles_strings):
    """Parse SMILES strings into RDKit molecules, failing fast on bad input.

    RDKit's ``MolFromSmiles`` returns ``None`` for a string it cannot parse;
    letting that ``None`` through only fails later, in the fingerprint step,
    with an error that does not say which input was bad.

    Args:
        smiles_strings: iterable of SMILES strings (surrounding whitespace,
            such as a trailing newline, is ignored).

    Returns:
        A list of RDKit molecules, one per input string, in input order.

    Raises:
        ValueError: if any string cannot be parsed; the message gives the
            1-based position and the offending text.
    """
    mols = []
    for position, raw_smiles in enumerate(smiles_strings, start=1):
        smiles = raw_smiles.strip()
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError('entry %d is not a valid SMILES string: %r' % (position, smiles))
        mols.append(mol)
    return mols

mols_train = smiles_to_mols(array_SMILES_train)
mols_test = smiles_to_mols(array_SMILES_test)

In [55]:
#convert molecules into fingerprints
#fingerprints--Morgan bit vectors, 256 bits per molecule
# NOTE(review): per the RDKit docs, `radius` should be how many bonds out from
# each atom an environment may extend, and `bitInfo` a dict that RDKit fills
# with the (atom index, radius) pairs behind each set bit -- confirm there.
# The same `bi` dict is handed to every call (train and test alike) and is not
# read again by any code shown in this notebook.
bi = {}

def _morgan_bits(mol):
    """Return the 256-bit, radius-2 Morgan fingerprint of one molecule."""
    return rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2, bitInfo=bi, nBits=256)

fps_train = list(map(_morgan_bits, mols_train))
fps_test = list(map(_morgan_bits, mols_test))

In [56]:
#convert fingerprints into binary
def fingerprints_to_arrays(fingerprints):
    """Convert RDKit bit-vector fingerprints into numpy integer arrays.

    Args:
        fingerprints: iterable of RDKit fingerprint bit vectors.

    Returns:
        A list with one numpy integer array per fingerprint, in input order.
    """
    arrays = []
    for fingerprint in fingerprints:
        # The one-element array is only a placeholder: ConvertToNumpyArray
        # fills it in place, and the model below consumes 256-wide inputs, so
        # the call evidently resizes it to the fingerprint length.
        bits = np.zeros((1,), dtype=int)
        DataStructs.ConvertToNumpyArray(fingerprint, bits)
        arrays.append(bits)
    return arrays

nparray_fps_train = fingerprints_to_arrays(fps_train)
nparray_fps_test = fingerprints_to_arrays(fps_test)

In [57]:
'''MODEL BUILD'''
#Neural Network Architecture
# A 256-wide ReLU input layer (one unit per fingerprint bit), a funnel of
# sigmoid layers, batch normalisation, then a 4-way softmax over the classes.
hidden_widths = (128, 64, 32, 16)

layers = [Dense(256, input_shape=(256,), activation='relu')]
layers += [Dense(width, activation='sigmoid') for width in hidden_widths]
layers += [
    BatchNormalization(axis=1),
    Dense(4, activation='softmax'),
]

model = Sequential(layers)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_8 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_9 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_10 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_11 (Dense)             (None, 16)                528       
_________________________________________________________________
batch_normalization_2 (Batch (None, 16)                64        
_________________________________________________________________
dense_12 (Dense)             (None, 4)                

In [58]:
#Compile the model
model.compile(optimizer=Adam(lr=0.00001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#Train the model
train_features = np.array(nparray_fps_train)

# The labels come straight from a text file, so they are strings (possibly
# with a trailing newline). Convert them to integer class indices explicitly
# instead of relying on an implicit string -> number cast inside the backend.
train_labels = np.array([int(float(label)) for label in array_charges_train])

# sparse_categorical_crossentropy expects class indices in [0, n_classes);
# check that against the width of the softmax layer before training.
n_classes = model.output_shape[-1]
if train_labels.size and (train_labels.min() < 0 or train_labels.max() >= n_classes):
    raise ValueError(
        'labels must be class indices in [0, %d); found values from %d to %d'
        % (n_classes, train_labels.min(), train_labels.max())
    )

# NOTE(review): Keras documents validation_split as holding out the LAST
# fraction of the samples, taken before shuffling -- confirm the input files
# are not ordered by class.
# Keeping the History object gives access to the per-epoch metrics and stops
# the cell from echoing a bare object repr.
history = model.fit(
    train_features,
    train_labels,
    validation_split=0.1,
    batch_size=10,
    epochs=100,
    shuffle=True,
    verbose=1,
)

Train on 900 samples, validate on 100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x25b50f4b550>

In [59]:
#Predict new molecule charges
# Stack the test fingerprints into one 2-D array and score them one sample
# per batch; each row of `predictions` is the softmax output for one molecule.
test_features = np.array(nparray_fps_test)
predictions = model.predict(test_features, batch_size=1, verbose=1)




In [66]:
#Print the charge predictions
# For every row of class scores keep the index of the largest score (the first
# one on ties) in `values` and that largest score itself in `possib`.
score_rows = [list(row) for row in predictions]
values = [row.index(max(row)) for row in score_rows]
possib = [row[best] for row, best in zip(score_rows, values)]
def switch(value):
    """Translate a predicted class index into its charge label.

    Args:
        value: class index, expected to be one of 0..4.

    Returns:
        The charge as a string ('0', '1', '-1', '-2' or '-3'), or None when
        the index is not one of the known classes.
    """
    charge_by_class = dict(zip(range(5), ('0', '1', '-1', '-2', '-3')))
    if value in charge_by_class:
        return charge_by_class[value]
    return None
# Report, for each test molecule, the predicted charge label and the score of
# the winning class.
for class_index, score in zip(values, possib):
    print('charge:', switch(class_index),' possibility: ',score)

charge: 0  possibility:  0.79426265
charge: 1  possibility:  0.9309378
charge: -1  possibility:  0.94840187
charge: 1  possibility:  0.9655982
charge: 1  possibility:  0.85117674
charge: 1  possibility:  0.8405196
charge: -1  possibility:  0.8367821
charge: 1  possibility:  0.8486453
charge: 0  possibility:  0.8785551
charge: 1  possibility:  0.9571373


In [None]:
# Inert string literal (no code runs here): a copy of the prediction printout
# labelled 'default parameters'. Presumably kept as a baseline for comparing
# later runs with other settings -- confirm with the author.
'''default parameters
charge: 0  possibility:  0.79426265
charge: 1  possibility:  0.9309378
charge: -1  possibility:  0.94840187
charge: 1  possibility:  0.9655982
charge: 1  possibility:  0.85117674
charge: 1  possibility:  0.8405196
charge: -1  possibility:  0.8367821
charge: 1  possibility:  0.8486453
charge: 0  possibility:  0.8785551
charge: 1  possibility:  0.9571373
'''