In [0]:
!pip install git+https://github.com/samoturk/mol2vec;

In [0]:
# Install RDKit. Takes 2-3 minutes
!wget -c https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
!chmod +x Miniconda3-latest-Linux-x86_64.sh
!time bash ./Miniconda3-latest-Linux-x86_64.sh -b -f -p /usr/local
!time conda install -q -y -c conda-forge rdkit

In [0]:
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')

In [0]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import ChemicalFeatures
from rdkit import RDConfig
import os

In [0]:
df= pd.read_csv('/content/drive/My Drive/A5Q3DB/train.csv')
target = df['Binding Affinity']
df.drop(columns='Binding Affinity',inplace=True)
print(df)
testdata=pd.read_csv('/content/drive/My Drive/A5Q3DB/test.csv')
testdata.drop(columns='Binding Affinity',inplace=True)

               SMILES sequence
0         CCNC(C)C(NC)c1ccccc1
1             CONC(=O)c1cncnc1
2        CCNC1CCCN(Cc2ccsc2)C1
3      CC(NC(=O)CSCCN)c1ccccc1
4         CCC(CS)CN(C)c1ccccc1
...                        ...
8995  CC(O)CCNC(=O)CNCc1ccccc1
8996     N#Cc1cccc(-c2cnsc2)n1
8997    Cc1ccnc(NC(CN)C(C)C)c1
8998          CC(=O)CCc1cncnc1
8999       COCCC(=O)Nc1cnccc1C

[9000 rows x 1 columns]


In [0]:
Finaltestdata=pd.read_csv('/content/drive/My Drive/A5Q3DB/f_test.csv')
print(Finaltestdata)

              SMILES sequence  Binding Affinity
0         CNC(=O)N(C)c1ncccn1               NaN
1      O=C(Cc1ccccc1)c1ccccn1               NaN
2                NC1C2CCSCC12               NaN
3      CCC(C)C(N)(CC)c1ccccc1               NaN
4      N#Cc1cccc(C2OCCC2=O)n1               NaN
...                       ...               ...
2495    CC(N)C(C)Nc1ncccc1C#N               NaN
2496     CCCCOc1ccccc1C(=O)OC               NaN
2497            NN1CC2CCCC1C2               NaN
2498  CN1CCC(C(=O)O)c2ccccc21               NaN
2499        NCC(=O)Nc1ncncc1N               NaN

[2500 rows x 2 columns]


In [0]:
print(testdata)

               SMILES sequence
0         Cc1ccc(C2CNCCN2C)cc1
1             CCOC(CO)c1ccccc1
2            CC(=O)Nc1cnn(C)n1
3             CCC(C)NCc1ncccn1
4              CC(C)=C1CC(N)C1
...                        ...
2495    N#Cc1ncccc1N1CCC(=O)C1
2496       CNCCSCc1cncc(C#N)c1
2497   NC1=NC(=O)C(=CC(=O)O)S1
2498  C=C(CC)CC(C)C(O)c1ccccc1
2499         N#CCC(CN)c1cncnc1

[2500 rows x 1 columns]


## Getting MorganFingerprints of chemical molecule represented in vectors

In [0]:
# import tensorflow as tf
# import tensorflow.keras as keras
# from keras import Model
# from keras.layers import Activation, Dense, Dropout, Input
# from keras.utils import np_utils

def mol2arr(mol):
  arr = np.zeros((1,))
  fp = AllChem.GetMorganFingerprintAsBitVect(mol, 3)
  DataStructs.ConvertToNumpyArray(fp, arr)
  return arr
  
Mdata=pd.read_csv('/content/drive/My Drive/A5Q3DB/train.csv')
print(Mdata.columns)

Mdata['molecule'] = Mdata['SMILES sequence'].apply(lambda x:Chem.MolFromSmiles(x))
Mdata['molecule'] = Mdata['molecule'].apply(lambda x: Chem.AddHs(x))
molecules_list=Mdata['molecule']
Mdataset= np.array([mol2arr(mol) for mol in molecules_list])
print(Mdataset)

Index(['SMILES sequence', 'Binding Affinity'], dtype='object')
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


## Using Pretrained Model

In [0]:
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from gensim.models import word2vec
model = word2vec.Word2Vec.load('/content/drive/My Drive/model_300dim.pkl')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
def create_dataset(df):
  df['mol'] = df['SMILES sequence'].apply(lambda x: Chem.MolFromSmiles(x))
  df['sentence'] = df.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)
  df['mol2vec'] = [DfVec(x) for x in sentences2vec(df['sentence'], model, unseen='UNK')]
  dataset= np.array([x.vec for x in df['mol2vec']])
  return dataset

In [0]:
dataset=create_dataset(df)
testdataset=create_dataset(testdata)

In [0]:
Finaltest=create_dataset(Finaltestdata)

## MAE and MSE value using only pretrained model and training model as RidgeCV

In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size=.1, random_state=1)
def evaluation(model, X_test, y_test):
    prediction = model.predict(X_test)
    mae = mean_absolute_error(y_test, prediction)
    mse = mean_squared_error(y_test, prediction)
    return [mae,mse,prediction]
ridge = RidgeCV(cv=5)
ridge.fit(X_train, y_train)
mae,mse,predict_labels=evaluation(ridge, X_test, y_test)

In [0]:
print('MAE score:', mae)
print('MSE score:', mse)

In [0]:
from sklearn.metrics import r2_score
r2_score(y_test,predict_labels)

0.626898246059388

## MAE and MSE value using only pretrained model and training model as SVR

In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

def evaluation(model, X_test, y_test):
    prediction = model.predict(X_test)
    mae = mean_absolute_error(y_test, prediction)
    mse = mean_squared_error(y_test, prediction)
    return [mae,mse,prediction]

X_train, X_test, y_train, y_test = train_test_split(dataset, target, test_size=.1, random_state=2)
clf = SVR(kernel='rbf',C=5,epsilon=1)
clf.fit(X_train, y_train)
mae,mse,predict_labels=evaluation(clf, X_test, y_test)
print('MAE score:', mae)
print('MSE score:', mse)

MAE score: 1.6818546639282033
MSE score: 5.488813504046821


In [0]:
predictLabels=clf.predict(Finaltest)
print(predictLabels)

[-21.60094211 -19.69661307 -20.79468405 ... -20.02825511 -18.31507426
 -28.67657895]


## Creating Submission.csv File

In [0]:
def create_file(predictLabels,testdata):
  smiles=testdata['SMILES sequence'] 
  list_of_tuples = list(zip(smiles, predictLabels))  
  sub = pd.DataFrame(list_of_tuples, columns = ['SMILES sequence', 'Binding Affinity'])   
  print(sub)
  sub.to_csv('submission.csv')

In [0]:
create_file(predictLabels,testdata)

               SMILES sequence  Binding Affinity
0         Cc1ccc(C2CNCCN2C)cc1        -21.600942
1             CCOC(CO)c1ccccc1        -19.696613
2            CC(=O)Nc1cnn(C)n1        -20.794684
3             CCC(C)NCc1ncccn1        -16.424992
4              CC(C)=C1CC(N)C1        -19.046919
...                        ...               ...
2495    N#Cc1ncccc1N1CCC(=O)C1        -24.720667
2496       CNCCSCc1cncc(C#N)c1        -12.672911
2497   NC1=NC(=O)C(=CC(=O)O)S1        -20.028255
2498  C=C(CC)CC(C)C(O)c1ccccc1        -18.315074
2499         N#CCC(CN)c1cncnc1        -28.676579

[2500 rows x 2 columns]


# Adding more features in initial dataset

In [0]:
df=pd.read_csv('/content/drive/My Drive/A5Q3DB/train.csv')
testdata=pd.read_csv('/content/drive/My Drive/A5Q3DB/test.csv')

In [0]:
Finaltestdata=pd.read_csv('/content/drive/My Drive/A5Q3DB/f_test.csv')

In [0]:
def number_of_atoms(atom_list, df):
    for i in atom_list:
        df['num_of_{}_atoms'.format(i)] = df['molecule'].apply(lambda x: len(x.GetSubstructMatches(Chem.MolFromSmiles(i))))

def add_features(df):
  df['molecule'] = df['SMILES sequence'].apply(lambda x:Chem.MolFromSmiles(x))
  df['molecule'] = df['molecule'].apply(lambda x: Chem.AddHs(x))
  df['num_of_atoms'] = df['molecule'].apply(lambda x: x.GetNumAtoms())
  df['num_of_heavy_atoms'] = df['molecule'].apply(lambda x: x.GetNumHeavyAtoms())
  df['SSSR'] = df['molecule'].apply(lambda x:len(Chem.GetSymmSSSR(x)))
  # df['rings1']=df['molecule'].apply(lambda x: x.GetRingInfo().NumAtomRings(1))
  df['rings2']=df['molecule'].apply(lambda x: x.GetRingInfo().NumAtomRings(2))
  df['in_rings']=df['molecule'].apply(lambda x: x.GetAtomWithIdx(1).IsInRing())
  df['ringsize3']=df['molecule'].apply(lambda x: x.GetAtomWithIdx(2).IsInRingSize(3))
  df['isAromatic']=df['molecule'].apply(lambda x: x.GetBondWithIdx(1).GetIsAromatic())
  df['tpsa'] = df['molecule'].apply(lambda x: Descriptors.TPSA(x))
  df['mol_w'] = df['molecule'].apply(lambda x: Descriptors.ExactMolWt(x))
  df['num_valence_electrons'] = df['molecule'].apply(lambda x: Descriptors.NumValenceElectrons(x))
  # df['num_heteroatoms'] = df['molecule'].apply(lambda x: Descriptors.NumHeteroatoms(x))
  # df['num_rotable_bonds'] = df['molecule'].apply(lambda x: Descriptors.NumRotatableBonds(x))
  # df['num_amide_bonds'] = df['molecule'].apply(lambda x: Descriptors.NumAmideBonds(x))
  # df['num_SpiroAtoms'] = df['molecule'].apply(lambda x: Descriptors.NumSpiroAtoms(x))
  df['num_Fraction'] = df['molecule'].apply(lambda x: Descriptors.FractionCSP3(x))
  # # df['geometry'] = df['molecule'].apply(lambda x: Descriptors.Autocorr2D(x))
  # df['num_BridgeheadAtoms'] = df['molecule'].apply(lambda x: Descriptors.NumBridgeheadAtoms(x))
  # df['num_rotable bonds'] = df['molecule'].apply(lambda x: Descriptors.FractionCSP3(x))
  number_of_atoms(['C','O', 'N','S', 'Cl'], df)
  return df

In [0]:
# #EXTRA FEATURES THAT ARE OPTIONAL
# fdefName = os.path.join(RDConfig.RDDataDir,'BaseFeatures.fdef')
# factory = ChemicalFeatures.BuildFeatureFactory(fdefName)
# df['feats'] = df['molecule'].apply(lambda x:len(factory.GetFeaturesForMol(x)))
# df['featsfamily'] = df['molecule'].apply(lambda x:factory.GetFeaturesForMol(x)[0].GetFamily())
# df['featstype'] = df['molecule'].apply(lambda x:factory.GetFeaturesForMol(x)[0].GetType())
# df['feats1family'] = df['molecule'].apply(lambda x:factory.GetFeaturesForMol(x)[1].GetFamily())

In [0]:
print(df.shape)
print(testdata.shape)

(9000, 2)
(2500, 2)


In [0]:
df=add_features(df)
testdata=add_features(testdata)

In [0]:
testData=add_features(Finaltestdata)

In [0]:
train_df = df.drop(columns=['SMILES sequence', 'molecule', 'Binding Affinity'])
test_df = testdata.drop(columns=['SMILES sequence', 'molecule', 'Binding Affinity'])

In [0]:
final_df=testData.drop(columns=['SMILES sequence', 'molecule', 'Binding Affinity'])

In [0]:
print(train_df.shape)
print(test_df.shape)

(9000, 16)
(2500, 16)


## Merging Pretrained Model Dataset and features dataset

In [0]:
train_mdf = pd.DataFrame(dataset)
train_mdf = pd.concat((train_mdf, train_df), axis=1)
test_mdf = pd.DataFrame(testdataset)
test_mdf = pd.concat((test_mdf, test_df), axis=1)

In [0]:
final_mdf= pd.DataFrame(Finaltest)
final_mdf = pd.concat((final_mdf, final_df), axis=1)

In [0]:
# new_df=pd.get_dummies(new_df, prefix=['featsfamily', 'feats1family', 'featstype'])

In [0]:
print(train_mdf)
print(test_mdf)

             0         1  ...  num_of_S_atoms  num_of_Cl_atoms
0    -0.357490 -1.206480  ...               0                0
1     1.329632 -1.765945  ...               0                0
2     1.047152 -3.901737  ...               1                0
3    -0.917391 -1.154516  ...               1                0
4     0.685771 -0.099936  ...               1                0
...        ...       ...  ...             ...              ...
8995 -0.454594 -1.671645  ...               0                0
8996  0.666540 -0.541227  ...               1                0
8997  0.455289 -0.767791  ...               0                0
8998  0.867964 -1.368630  ...               0                0
8999 -0.062538 -2.053792  ...               0                0

[9000 rows x 316 columns]
             0         1  ...  num_of_S_atoms  num_of_Cl_atoms
0     2.726033 -2.308697  ...               0                0
1     0.281298 -0.838246  ...               0                0
2     0.560942 -2.171900  ..

In [0]:
print(final_mdf)

## Different Training Models applied on Merged Dataset

### RidgeCV

In [0]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_mdf, target, test_size=.1, random_state=1)
def evaluation(model, X_test, y_test):
    prediction = model.predict(X_test)
    mae = mean_absolute_error(y_test, prediction)
    mse = mean_squared_error(y_test, prediction)
    return [mae,mse,prediction]
ridge = RidgeCV(cv=5)
ridge.fit(X_train, y_train)
mae,mse,predict_labels=evaluation(ridge, X_test, y_test)

In [0]:
print('MAE score:', mae)
print('MSE score:', mse)

MAE score: 1.8464971248863296
MSE score: 6.004769388674274


In [0]:
from sklearn.metrics import r2_score
r2_score(y_test,predict_labels)

0.6036696032690876

### SVR(Support Vector Regression)

In [0]:
from sklearn.svm import SVR
X_train, X_test, y_train, y_test = train_test_split(train_mdf, target, test_size=.1, random_state=2)
clf = SVR(kernel='linear',C=70)
clf.fit(X_train, y_train)
mae,mse,predict_labels=evaluation(clf, X_test, y_test)
print('MAE score:', mae)
print('MSE score:', mse)

### LSTM

In [0]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.layers import LSTM
from sklearn.model_selection import train_test_split

In [0]:
X_train=np.array(X_train)
X_test=np.array(X_test)
y_train=np.array(y_train)
y_test=np.array(y_test)
print(type(X_train))

<class 'numpy.ndarray'>


In [0]:
trainX= np.reshape(X_train,(X_train.shape[0], 1, X_train.shape[1]))
print(trainX.shape)
test_mdf=np.array(test_mdf)
test_data= np.reshape(test_mdf,(test_mdf.shape[0], 1, test_mdf.shape[1]))
print(test_data.shape)

(8100, 1, 316)
(2500, 1, 316)


In [0]:
final_mdf=np.array(final_mdf)
final_test_data= np.reshape(final_mdf,(final_mdf.shape[0], 1, final_mdf.shape[1]))
print(final_test_data.shape)

(2500, 1, 316)


In [0]:
from keras.layers import Conv1D, MaxPooling1D
from keras.layers import Dense, Dropout, Flatten

In [0]:
model = Sequential()

model.add(Conv1D(filters=8, kernel_size=3, padding='same', activation='relu'))

model.add(LSTM(200, input_shape=(1,316)))
model.add(Dense(64,activation='relu'))
# model.add(Dense(32,activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, y_train, epochs=100, batch_size=32, verbose=2)

Epoch 1/100
 - 3s - loss: 33.0612
Epoch 2/100
 - 2s - loss: 8.0999
Epoch 3/100
 - 2s - loss: 7.1076
Epoch 4/100
 - 2s - loss: 6.6825
Epoch 5/100
 - 2s - loss: 6.5333
Epoch 6/100
 - 2s - loss: 6.4220
Epoch 7/100
 - 2s - loss: 6.2564
Epoch 8/100
 - 2s - loss: 6.1186
Epoch 9/100
 - 2s - loss: 6.1123
Epoch 10/100
 - 2s - loss: 6.0379
Epoch 11/100
 - 2s - loss: 6.0606
Epoch 12/100
 - 2s - loss: 5.9187
Epoch 13/100
 - 2s - loss: 5.8151
Epoch 14/100
 - 2s - loss: 5.9437
Epoch 15/100
 - 2s - loss: 5.7572
Epoch 16/100
 - 2s - loss: 5.9869
Epoch 17/100
 - 2s - loss: 5.7274
Epoch 18/100
 - 2s - loss: 5.7586
Epoch 19/100
 - 2s - loss: 5.6753
Epoch 20/100
 - 2s - loss: 5.7174
Epoch 21/100
 - 2s - loss: 5.5919
Epoch 22/100
 - 2s - loss: 5.6035
Epoch 23/100
 - 2s - loss: 5.7145
Epoch 24/100
 - 2s - loss: 5.4607
Epoch 25/100
 - 2s - loss: 5.5654
Epoch 26/100
 - 2s - loss: 5.5634
Epoch 27/100
 - 2s - loss: 5.5946
Epoch 28/100
 - 2s - loss: 5.4467
Epoch 29/100
 - 2s - loss: 5.4860
Epoch 30/100
 - 2s - l

<keras.callbacks.callbacks.History at 0x7f0b579abe10>

In [0]:
testX= np.reshape(X_test,(X_test.shape[0], 1, X_test.shape[1]))
testPredict = model.predict(testX)
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(y_test, testPredict)
mse = mean_squared_error(y_test,testPredict)
print('MAE score:', mae)
print('MSE score:', mse)

MAE score: 1.7327198859754773
MSE score: 5.303172010510516


In [0]:
from sklearn.metrics import r2_score
r2_score(y_test,testPredict)

0.6497057786267144

In [0]:
predict_labels = model.predict(test_data)


In [0]:
final_predict_labels = model.predict(final_test_data)

In [0]:
print(final_predict_labels)

[[-21.732592]
 [-19.39044 ]
 [-20.75262 ]
 ...
 [-20.16827 ]
 [-18.137794]
 [-25.845291]]


In [0]:
final_predicted=[]
for i in range(0,len(final_predict_labels)):
  final_predicted.append(final_predict_labels[i][0])
create_file(final_predicted,Finaltestdata)

              SMILES sequence  Binding Affinity
0         CNC(=O)N(C)c1ncccn1        -21.732592
1      O=C(Cc1ccccc1)c1ccccn1        -19.390440
2                NC1C2CCSCC12        -20.752621
3      CCC(C)C(N)(CC)c1ccccc1        -17.209328
4      N#Cc1cccc(C2OCCC2=O)n1        -20.115131
...                       ...               ...
2495    CC(N)C(C)Nc1ncccc1C#N        -26.213448
2496     CCCCOc1ccccc1C(=O)OC        -13.094802
2497            NN1CC2CCCC1C2        -20.168270
2498  CN1CCC(C(=O)O)c2ccccc21        -18.137794
2499        NCC(=O)Nc1ncncc1N        -25.845291

[2500 rows x 2 columns]
