## Import Packages

In [4]:
import rdkit
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier  
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import VarianceThreshold
from sklearn import metrics 

## Read files and create a dataframe with SMILES

In [6]:
# Load the two CSV files of tested molecules.
df_smiles1 = pd.read_csv('tested_molecules-1.csv')
df_smiles2 = pd.read_csv('tested_molecules_v2.csv')
# Stack both tables into one frame; ignore_index=True renumbers the rows from 0.
df_smiles = pd.concat([df_smiles1, df_smiles2], ignore_index=True)
# Builds a molecule column from the 'SMILES' column directly on df_smiles
# (it appears as 'ROMol' in the printed frame and is read by the fingerprint cell).
PandasTools.AddMoleculeColumnToFrame(df_smiles, smilesCol='SMILES')
# Plain-text dump of the combined frame.
print(df_smiles)

                                                 SMILES  ALDH1_inhibition   
0     COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccc...                 1  \
1                O=C(CSc1nc2cccnc2n1Cc1ccccc1)NCc1ccco1                 1   
2     Cc1cccc2cc(C[NH+](CC3CCCO3)C(c3nnnn3Cc3ccco3)C...                 1   
3                     CCN(CC)c1ccc2c(Cl)c(Br)c(=O)oc2c1                 1   
4     CS(=O)(=O)N1CCc2cc(-c3csc(NC(=O)Cc4cccs4)n3)ccc21                 1   
...                                                 ...               ...   
1995                    C/C(=N\NC(=S)Nc1ccc(F)cc1)C1CC1                 1   
1996                     COC(=O)c1c(NC(C)=O)sc2c1CCCCC2                 1   
1997                            O=C(CCl)NC1CCCc2ccccc21                 1   
1998    COc1ccc(-n2c(SCC(=O)N3CCCCC3C)nnc2-c2cccnc2)cc1                 1   
1999  COc1ccc(NC(=O)C2CCC(N3C(=O)C4C5C=CC(C5)C4C3=O)...                 1   

                                                  ROMol  
0     <rdkit.Chem

## Compute Morgan fingerprint bits for all SMILES

In [11]:
# Morgan fingerprint settings: neighbourhood radius and bit-vector length.
radius = 3
nBits = 1024

# One bit-vector fingerprint per molecule in the 'ROMol' column.
ECFP6 = [
    AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    for mol in df_smiles['ROMol']
]
# Column labels Bit_0 ... Bit_{nBits-1}.
ecfp6_name = [f'Bit_{bit_idx}' for bit_idx in range(nBits)]
# Expand every bit vector into a plain Python list of its bits.
ecfp6_bits = list(map(list, ECFP6))
# One row per molecule (indexed by its SMILES string), one column per bit.
df_morgan = pd.DataFrame(
    ecfp6_bits,
    index=df_smiles['SMILES'],
    columns=ecfp6_name,
)
df_morgan.head(1)

Unnamed: 0_level_0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_1014,Bit_1015,Bit_1016,Bit_1017,Bit_1018,Bit_1019,Bit_1020,Bit_1021,Bit_1022,Bit_1023
SMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
COc1ccccc1CC(NC(C)=O)C(=O)NC1CCN(c2nnnn2-c2ccccc2)CC1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0


## Scale data & perform PCA

In [19]:
# Min-max scale every fingerprint column of df_morgan.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down — confirm this is intended.
scaler = MinMaxScaler()
scaled_array = scaler.fit_transform(df_morgan)
# Wrapping the bare array in a new DataFrame drops the SMILES index and the
# Bit_* column names; rows and columns are labelled with integers from here on.
df_scaled = pd.DataFrame(scaled_array)
# Bare expression: displays the scaled frame as the cell output.
df_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [20]:
# Pairwise correlation between the scaled fingerprint columns.
# The result is only displayed here; it is not assigned to a variable.
df_scaled.corr()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,1.000000,-0.037444,-0.002617,0.016343,-0.042859,0.026509,0.028490,-0.001656,-0.001654,-0.026419,...,-0.017860,0.014729,0.008177,-0.032006,0.055361,-0.034900,0.003438,-0.029348,0.019650,-0.022285
1,-0.037444,1.000000,-0.006518,-0.024305,-0.014069,0.021502,-0.009000,0.009055,-0.027780,0.031265,...,0.042473,0.025331,-0.023956,-0.018009,-0.020171,0.007966,-0.016849,-0.009688,-0.002210,-0.003961
2,-0.002617,-0.006518,1.000000,-0.053376,0.470184,0.050739,0.044628,0.045747,-0.033758,-0.004986,...,-0.004456,0.022497,-0.013771,0.030509,0.048416,0.209578,-0.017633,0.009677,0.072198,-0.006464
3,0.016343,-0.024305,-0.053376,1.000000,0.002802,-0.020080,0.012632,0.057700,-0.021082,-0.017792,...,-0.030357,-0.000647,0.040828,-0.036839,-0.016834,0.075197,0.097520,0.048923,0.004175,-0.009718
4,-0.042859,-0.014069,0.470184,0.002802,1.000000,0.024156,0.038703,-0.035701,0.007442,0.031214,...,0.031385,0.078083,0.006893,0.034507,-0.000283,0.328169,0.004863,0.013643,0.047443,-0.030011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,-0.034900,0.007966,0.209578,0.075197,0.328169,0.203986,0.046842,0.049196,-0.044042,0.056816,...,0.019260,-0.013766,0.005081,0.081942,-0.013766,1.000000,0.014599,-0.000534,0.022547,0.018949
1020,0.003438,-0.016849,-0.017633,0.097520,0.004863,-0.029313,-0.025413,0.008364,0.038808,0.001537,...,-0.040713,0.002792,0.045506,-0.001542,-0.014234,0.014599,1.000000,-0.005146,0.044208,-0.007235
1021,-0.029348,-0.009688,0.009677,0.048923,0.013643,0.014105,-0.024103,-0.027168,-0.024025,0.039983,...,-0.024636,0.006126,0.050159,-0.007086,0.024014,-0.000534,-0.005146,1.000000,-0.008266,0.015800
1022,0.019650,-0.002210,0.072198,0.004175,0.047443,-0.016389,-0.019724,0.001269,0.017602,-0.003523,...,0.019094,0.062233,0.040153,0.024528,-0.002638,0.022547,0.044208,-0.008266,1.000000,0.054030


## Split data for testing

In [25]:
# Feature matrix: the scaled fingerprint frame as an independent NumPy array.
features = df_scaled.to_numpy(copy=True)
# Column labels of the feature matrix, in column order.
feature_names = df_scaled.columns.tolist()
# Target vector: the ALDH1_inhibition column of the molecule frame.
target = df_smiles['ALDH1_inhibition'].to_numpy(copy=True)

### Training & test set

In [26]:
# Hold out 20 % of the rows as a test set; the fixed seed keeps the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=42,
)

# Sanity check: report the dimensions of each of the four resulting arrays.
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (1600, 1024)
Training Labels Shape: (1600,)
Testing Features Shape: (400, 1024)
Testing Labels Shape: (400,)


## Predictor models

### Random Forest Regressor Model

In [27]:
# Instantiate the regression forest with 100 decision trees (n_estimators = 100)
# and a fixed seed so repeated runs build the same forest.
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
# Train the model on the training data (trailing ';' suppresses the cell output)
rf.fit(train_features, train_labels);

Predictions

In [30]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Absolute difference between each prediction and its test label.
# NOTE(review): `errors` is only referenced by the disabled print below —
# confirm whether it is still needed.
errors = abs(predictions - test_labels)

# Bare expression: only comments follow it, so it is still the value the cell displays.
predictions
# Disabled: print the mean absolute error (MAE).
# NOTE(review): the 'degrees.' suffix looks left over from another example —
# confirm the intended unit before re-enabling.
#print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

array([0.65, 0.22, 0.  , 0.09, 0.  , 0.02, 0.55, 0.56, 0.33, 0.05, 0.54,
       0.02, 0.05, 0.72, 0.39, 0.61, 0.04, 0.12, 0.01, 0.67, 0.48, 0.04,
       0.7 , 0.  , 0.67, 0.43, 0.01, 0.52, 0.51, 0.36, 0.19, 0.37, 0.48,
       0.58, 0.  , 0.44, 0.  , 0.03, 0.  , 0.52, 0.39, 0.51, 0.39, 0.31,
       0.  , 0.18, 0.46, 0.71, 0.73, 0.44, 0.03, 0.4 , 0.  , 0.64, 0.44,
       0.7 , 0.  , 0.1 , 0.  , 0.16, 0.01, 0.44, 0.  , 0.  , 0.64, 0.37,
       0.58, 0.  , 0.45, 0.03, 0.01, 0.42, 0.01, 0.44, 0.36, 0.46, 0.  ,
       0.57, 0.29, 0.84, 0.26, 0.54, 0.05, 0.  , 0.02, 0.01, 0.79, 0.07,
       0.47, 0.09, 0.  , 0.96, 0.57, 0.54, 0.53, 0.74, 0.16, 0.01, 0.  ,
       0.52, 0.37, 0.02, 0.36, 0.01, 0.37, 0.44, 0.49, 0.23, 0.59, 0.01,
       0.61, 0.01, 0.  , 0.6 , 0.36, 0.39, 0.01, 0.14, 0.42, 0.01, 0.7 ,
       0.01, 0.03, 0.34, 0.5 , 0.  , 0.16, 0.52, 0.01, 0.53, 0.4 , 0.49,
       0.02, 0.51, 0.55, 0.  , 0.39, 0.66, 0.  , 0.07, 0.04, 0.28, 0.02,
       0.49, 0.  , 0.34, 0.47, 0.31, 0.38, 0.82, 0.

Test whether the x highest predictions are accurate

In [41]:
# Number of top-ranked test compounds to inspect.
# Alternatives noted by the author: test_labels.sum(), or 100 for the assignment.
to_be_predicted = 120

# Pair every prediction with its true label and keep the rows with the
# highest predicted values.
d = {'predictions': predictions, 'labels': test_labels}
best_scores = pd.DataFrame(d).nlargest(to_be_predicted, columns='predictions')

# Fraction of the selected rows whose true label is 1.
# nlargest returns at most as many rows as the frame holds, so divide by the
# number of rows actually selected: dividing by the requested count would
# understate the score whenever to_be_predicted exceeds the test-set size.
accuracy = sum(best_scores['labels']) / len(best_scores)
print("Accuracy:", accuracy)

Accuracy: 0.6083333333333333


In [34]:
# TODO: first convert the predictions to binary values and use those to
# determine the accuracy of the model.
# NOTE(review): everything below is disabled; y_pred_SVM1 is only assigned in
# the later SVM cell, so these lines could not run at this position.
#default_acc = metrics.balanced_accuracy_score(test_labels, predictions)
#optimized_acc = metrics.balanced_accuracy_score(test_labels, y_pred_SVM1)
#print(default_acc)
#print(optimized_acc)

### Random Forest Classifier Model

In [42]:
# Random-forest classifier with default hyperparameters.
# A fixed seed (the same 42 used for the split and the regressor) makes the
# bootstrap sampling and feature selection repeatable, so the reported scores
# can be reproduced on a fresh kernel.
clf = RandomForestClassifier(random_state=42)
# Train on the training split (trailing ';' suppresses the cell output).
clf.fit(train_features, train_labels);

Predictions

In [43]:
# Predicted class label for every test compound.
preds = clf.predict(test_features)
print(preds)

# Score with balanced accuracy and label the printed value as such, so it is
# not mistaken for plain accuracy.
default_acc = metrics.balanced_accuracy_score(test_labels, preds)
print("Balanced accuracy:", default_acc)

[1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0
 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0
 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 1 0
 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0]
Accuracy: 0.7125


## Support vector machine

Make SVM model without optimizing the hyperparameters

In [46]:
# Baseline support vector classifier: default hyperparameters, fitted on the
# training split in the same statement that creates it.
SVC_model_default = SVC().fit(train_features, train_labels)
# Class predictions of the baseline model for the held-out test split.
y_pred_SVC_default = SVC_model_default.predict(test_features)

Define the parameter values used in GridSearch

In [47]:
# Base estimator whose hyperparameters are tuned by the grid search.
estimator_SVM = SVC(gamma = 'auto')
# Candidate values per hyperparameter; each tuple lists discrete values to try.
parameters_SVM = {
    'C':(0.1,1, 10, 100), # author's note of an alternative: (0.1, 15.0, 0.1)
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    # NOTE(review): this is three separate values (0.0, 10.0 and 1.0), not a
    # start/stop/step range — confirm that is what was intended.
    'coef0': (0.0, 10.0, 1.0),
    'shrinking': (True, False)}   

Find the optimal parameter values 

In [48]:
# Configure (but do not yet run) a grid search over parameters_SVM for estimator_SVM.
grid_search_SVM = GridSearchCV(
    estimator=estimator_SVM,
    param_grid=parameters_SVM,
    # Candidates are ranked by balanced accuracy, the same metric used on the test set.
    scoring = 'balanced_accuracy',
    n_jobs = -1,
    # 5 cross-validation folds per candidate.
    cv = 5
)

Fit the model

In [49]:
# Run the grid search on the training split.
# SVM_1 is whatever fit() returns — presumably the fitted grid_search_SVM
# object itself; verify against the scikit-learn documentation.
SVM_1 = grid_search_SVM.fit(train_features, train_labels)
# Predictions of the tuned search object for the held-out test split.
y_pred_SVM1 = SVM_1.predict(test_features)

Evaluate the performance

In [50]:
# Balanced accuracy on the test split for the default SVC and for the grid-searched SVC.
# Note: default_acc is reused here and overwrites the value stored by the
# random-forest classifier cell.
default_acc = metrics.balanced_accuracy_score(test_labels, y_pred_SVC_default)
optimized_acc = metrics.balanced_accuracy_score(test_labels, y_pred_SVM1)
# Printed without labels: first the default model, then the tuned model.
print(default_acc)
print(optimized_acc)

0.763095238095238
0.7720238095238094


## K-nearest Neighbor

In [51]:
# k-nearest-neighbours classifier using the 5 closest training samples.
KN_clf=KNeighborsClassifier(n_neighbors=5)
# Fit on the training split; as the last expression of the cell, the value
# returned by fit() is what the cell displays.
KN_clf.fit(train_features, train_labels)

Predictions

In [52]:
# Predicted class label for every test compound.
KN_preds = KN_clf.predict(test_features)
print(KN_preds)
# Score of the classifier on the data it was trained on versus the held-out
# split; a large gap between the two hints at over-fitting.
print("accuracy of the training set", KN_clf.score(train_features, train_labels))
print("accuracy of the test set", KN_clf.score(test_features, test_labels))

# Balanced accuracy is a different metric from the scores above, so the printed
# label names it explicitly instead of calling it "default accuracy".
default_acc = metrics.balanced_accuracy_score(test_labels, KN_preds)
print("Balanced accuracy of the test set", default_acc)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
accuracy of the training set 0.793125
accuracy of the test set 0.7375
Default accuracy of the test set 0.5744047619047619
