In [1]:
import numpy as np
import pandas as pd
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from rdkit import Chem
from rdkit.Chem import AllChem

Download the data file for the lipophilicity dataset.

Lipophilicity.csv

Split the dataset into train and test sets. --Done
Generate Morgan fingerprints and MACCS keys for each of the SMILES in the dataset. --Done
Train models for each using the MLPRegressor class from sklearn --Done

As NN models benefit from feature and target scaling, apply a scaler to the targets (features will be [0,1] already so scaling those will be unnecessary). --Done

Evaluate performance using RMSE on the unscaled targets for both models and compare the results. --Done

In [8]:
dataset = "Lipophilicity.csv"
print(os.getenv("CONDA_DEFAULT_ENV"))

True
HW5


In [105]:
data = pd.read_csv(dataset)
data.drop('CMPD_CHEMBLID', axis=1, inplace=True)
data.head()

Unnamed: 0,exp,smiles
0,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


In [106]:
#train and test split
X = data.drop('exp', axis=1)
y = data['exp']
features, features_test, targets, targets_test = train_test_split(X, y, test_size=0.2, random_state=42)
#features.head()
targets_test.head()


1743    3.00
2196    1.69
1728    0.78
3337    1.29
298     3.07
Name: exp, dtype: float64

In [107]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()


#targets scaling
targets_scaled = scaler.fit_transform(targets.values.reshape(-1, 1))
targets_test_scaled = scaler.transform(targets_test.values.reshape(-1, 1))


In [108]:
X_test = features_test
y_test = targets_test_scaled
X_train = features
y_train = targets_scaled

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(3360, 1)
(3360, 1)
(840, 1)
(840, 1)


In [109]:
#morgan fingerprints
X_test = features_test
y_test = targets_test_scaled
X_train = features
y_train = targets_scaled

def morgan_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)

train = X_train['smiles'].apply(morgan_fingerprint)
test = X_test['smiles'].apply(morgan_fingerprint)

train_list = train.apply(lambda x: list(x))
test_list = test.apply(lambda x: list(x))

X_train = pd.DataFrame(train_list.tolist())
X_test = pd.DataFrame(test_list.tolist())

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

y_test = y_test.reshape(-1, 1)
y_train = y_train.reshape(-1, 1)

X_train

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [110]:
#morgan fingerprint model
model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42)
model.fit(X_train, y_train.ravel())
morgan_score = model.score(X_test, y_test)
print(f'Morgan score: {morgan_score}')

#RMSE unscaled targets
y_pred = model.predict(X_test)
morgan_rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Morgan RMSE: {morgan_rmse}')


Morgan score: 0.54084254538373
Morgan RMSE: 0.6866939063204012




In [112]:
#MACCS keys
X_test = features_test
y_test = targets_test_scaled
X_train = features
y_train = targets_scaled

def maccs_keys(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return [int(x) for x in AllChem.GetMACCSKeysFingerprint(mol)]

X_train['MACCS'] = X_train['smiles'].apply(maccs_keys)
X_test['MACCS'] = X_test['smiles'].apply(maccs_keys)

X_train = pd.concat([X_train.drop('MACCS', axis=1), X_train['MACCS'].apply(pd.Series)], axis=1)
X_test = pd.concat([X_test.drop('MACCS', axis=1), X_test['MACCS'].apply(pd.Series)], axis=1)

X_train = X_train.drop(['smiles'], axis=1)
X_test = X_test.drop(['smiles'], axis=1)

X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,157,158,159,160,161,162,163,164,165,166
2417,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
3827,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
239,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
3633,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,0,1,0
1543,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,0,1,0


In [113]:
#MACCS keys model
model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000, random_state=42)
model.fit(X_train, y_train.ravel())
maccs_score = model.score(X_test, y_test)
print(f'MACCS score: {maccs_score}')

#RMSE unscaled targets
y_pred = model.predict(X_test)
maccs_rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'MACCS RMSE: {maccs_rmse}')

MACCS score: 0.4111431907912324
MACCS RMSE: 0.7776554578192888




The morgan fingerprint model was slightly better with an RMSE of 0.6866939063204012 while the MACCS model had an RMSE of 0.7776554578192888.