# Model validation with external data
In this notebook I evaluate the performance of the model on an external, curated dataset obtained from the publication [Predicting Aqueous Solubility of Organic Molecules Using Deep Learning Models with Varied Molecular Representations](https://pubs.acs.org/doi/full/10.1021/acsomega.2c00642).

In [1]:
# import libraries and set path to files and folders
import pandas as pd
import numpy as np
import sys
from sklearn.metrics import r2_score, mean_absolute_error

sys.path.append('../src')
from smiles_processing import standardise_smiles
from inchikey_processing import standardise_inchikey

import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

# Input/output locations, relative to this notebook's folder.
# External curated dataset used for the validation
validation_dataset = '../data/validation datasets/dataset.csv'
# Training-set file of the model (read below to check for data leakage)
training_datasets = '../data/soltranet/data/training_data/final_model/ind_dyn8_s0_train.predictions'
# Cleaned validation set written by this notebook and passed to the model as input
validation_input = '../data/validation.csv'
# Model output that is read back for the evaluation
predictions = '../data/validation_output.csv'

In [2]:
# Read the external validation set and keep only the columns this analysis uses:
# the raw SMILES, the measured water solubility and the molar mass.
needed_columns = ['SMILES', 'Experimental Solubility in Water', 'Molar Mass']
val = pd.read_csv(validation_dataset)[needed_columns]
val

Unnamed: 0,SMILES,Experimental Solubility in Water,Molar Mass
0,C(F)(F)(F)F,18.800000,88.0000
1,C,22.000000,16.0400
2,C(C(F)(F)F)(F)(F)F,7.780000,138.0100
3,C(F)(F)F,4090.000000,70.0100
4,C(#N)C(F)(F)F,0.104000,95.0200
...,...,...,...
11691,CC(=O)N[C@@H](CC1=CC(=C(C(=C1)Br)O)Br)C(=O)O,2500.046006,381.0202
11692,O=C1/C=C\C=C/[C@@H](OCC[C@H]([C@@H](C(=O)OC[C@...,910.986317,532.6220
11693,CN1/C(=C(\NC2=CC=CC=N2)/O)/C(=O)C3=CC=CC=C3S1(...,23.000000,331.3500
11694,CC1=C(C=CC(=C1)N(CCO)CCO)N=NC2=C(C=C(C=C2)S(=O...,0.014200,411.9025


In [3]:
# Calculate the log solubility (logS) from the experimental solubility of the molecules.
# The solubility is scaled by 1e-3 and divided by the molar mass before taking log10.
# NOTE(review): the 1e-3 factor presumably converts mg/L to g/L so that the ratio is in
# mol/L - confirm the units of 'Experimental Solubility in Water' against the source dataset.
molar_solubility = 1e-3 * val['Experimental Solubility in Water'] / val['Molar Mass']
val['logS'] = np.round(np.log10(molar_solubility), 3)
val

Unnamed: 0,SMILES,Experimental Solubility in Water,Molar Mass,logS
0,C(F)(F)(F)F,18.800000,88.0000,-3.670
1,C,22.000000,16.0400,-2.863
2,C(C(F)(F)F)(F)(F)F,7.780000,138.0100,-4.249
3,C(F)(F)F,4090.000000,70.0100,-1.233
4,C(#N)C(F)(F)F,0.104000,95.0200,-5.961
...,...,...,...,...
11691,CC(=O)N[C@@H](CC1=CC(=C(C(=C1)Br)O)Br)C(=O)O,2500.046006,381.0202,-2.183
11692,O=C1/C=C\C=C/[C@@H](OCC[C@H]([C@@H](C(=O)OC[C@...,910.986317,532.6220,-2.767
11693,CN1/C(=C(\NC2=CC=CC=N2)/O)/C(=O)C3=CC=CC=C3S1(...,23.000000,331.3500,-4.159
11694,CC1=C(C=CC(=C1)N(CCO)CCO)N=NC2=C(C=C(C=C2)S(=O...,0.014200,411.9025,-7.463


In [4]:
# Count missing values per column and fully duplicated rows, then report both
null_counts = val.isna().sum()
duplicate_rows = val.duplicated().sum()
print(f"Null values: \n {null_counts}, \n Duplicate rows: {duplicate_rows}")


Null values: 
 SMILES                              0
Experimental Solubility in Water    0
Molar Mass                          0
logS                                0
dtype: int64, 
 Duplicate rows: 0


In [5]:
# Read the training dataset of the model; the file holds one molecule per row
# with 'smile', 'true' and 'pred' columns.
training_df = pd.read_csv(filepath_or_buffer=training_datasets)
training_df

Unnamed: 0,smile,true,pred
0,[Br-].CCCCCCCCCCCCCCCCCC[N+](C)(C)C,-3.616127,-2.780894
1,O=C1Nc2cccc3cccc1c23,-3.254767,-2.948608
2,Clc1ccc(C=O)cc1,-2.177078,-1.612529
3,[Zn++].CC(c1ccccc1)c2cc(C(C)c3ccccc3)c(O)c(c2)...,-3.924409,-6.276278
4,C1OC1CN(CC2CO2)c3ccc(Cc4ccc(cc4)N(CC5CO5)CC6CO...,-4.662065,-2.360610
...,...,...,...
9977,C(c1ccc(cc1)NCCCC)(=O)OCCN(C)C,-3.010000,-1.315528
9978,OC1=C(C(C2=C(O)[C@@](C(C(C(N)=O)=C(O)[C@H]3N(C...,-2.930000,-1.802101
9979,c1(cc(ccc1C(C)C)C)O,-2.190000,-1.869856
9980,COc1ccc(CCN(C)CCCC(C#N)(C(C)C)c2ccc(OC)c(OC)c2...,-3.980000,-3.701252


## Data cleaning

In [6]:
# Standardise the SMILES of both dataframes and store the result in a new
# 'standard_smiles' column next to the original strings.
# The validation frame keeps its raw SMILES in 'SMILES', the training frame in 'smile'.
for frame, smiles_column in ((val, 'SMILES'), (training_df, 'smile')):
    frame['standard_smiles'] = standardise_smiles(frame[smiles_column].tolist())

[19:58:47] Can't kekulize mol.  Unkekulized atoms: 3 10 11 14 15 18
[19:58:50] Can't kekulize mol.  Unkekulized atoms: 3 7
[19:58:55] Can't kekulize mol.  Unkekulized atoms: 3 7
[19:58:56] Can't kekulize mol.  Unkekulized atoms: 3 9
[19:58:56] Can't kekulize mol.  Unkekulized atoms: 3 9
[19:58:59] Can't kekulize mol.  Unkekulized atoms: 0 1 5 6 7 9
[19:59:02] Can't kekulize mol.  Unkekulized atoms: 0 2 4 5 6 8
[19:59:02] Can't kekulize mol.  Unkekulized atoms: 0 2 4 6 7 9


In [7]:
# Report the null counts of both datasets, drop every row that contains a null
# (this removes the molecules without a standardised SMILES), then report again.
val_nulls_before = val.isnull().sum()
training_nulls_before = training_df.isnull().sum()
print(f'Null values in the val dataset before dropping:\n{val_nulls_before}')
print(f'Null values in the training dataset before dropping:\n{training_nulls_before}')

# Remove the incomplete rows from both frames
val = val.dropna(axis=0, how='any')
training_df = training_df.dropna(axis=0, how='any')

val_nulls_after = val.isnull().sum()
training_nulls_after = training_df.isnull().sum()
print(f'Null values in the val dataset after dropping:\n{val_nulls_after}')
print(f'Null values in the training dataset after dropping:\n{training_nulls_after}')




Null values in the val dataset before dropping:
SMILES                                0
Experimental Solubility in Water      0
Molar Mass                            0
logS                                  0
standard_smiles                     175
dtype: int64
Null values in the training dataset before dropping:
smile                0
true                 0
pred                 0
standard_smiles    825
dtype: int64
Null values in the val dataset after dropping:
SMILES                              0
Experimental Solubility in Water    0
Molar Mass                          0
logS                                0
standard_smiles                     0
dtype: int64
Null values in the training dataset after dropping:
smile              0
true               0
pred               0
standard_smiles    0
dtype: int64


In [8]:
# check for null values in standardised SMILES in the training dataset (already done in the previous cell, which reports and drops the nulls of both datasets)


In [9]:
# Derive a standardised InChIKey for every validation molecule from its standardised SMILES
val['standard_inchikeys'] = standardise_inchikey(val['standard_smiles'].tolist())
val

Unnamed: 0,SMILES,Experimental Solubility in Water,Molar Mass,logS,standard_smiles,standard_inchikeys
0,C(F)(F)(F)F,18.800000,88.0000,-3.670,FC(F)(F)F,TXEYQDLBPFQVAA-UHFFFAOYSA-N
1,C,22.000000,16.0400,-2.863,C,VNWKTOKETHGBQD-UHFFFAOYSA-N
2,C(C(F)(F)F)(F)(F)F,7.780000,138.0100,-4.249,FC(F)(F)C(F)(F)F,WMIYKQLTONQJES-UHFFFAOYSA-N
3,C(F)(F)F,4090.000000,70.0100,-1.233,FC(F)F,XPDWGBQVDMORPB-UHFFFAOYSA-N
4,C(#N)C(F)(F)F,0.104000,95.0200,-5.961,N#CC(F)(F)F,SFFUEHODRAXXIA-UHFFFAOYSA-N
...,...,...,...,...,...,...
11691,CC(=O)N[C@@H](CC1=CC(=C(C(=C1)Br)O)Br)C(=O)O,2500.046006,381.0202,-2.183,CC(=O)N[C@@H](Cc1cc(Br)c(O)c(Br)c1)C(=O)O,FNGNLZLLBCYMOG-VIFPVBQESA-N
11692,O=C1/C=C\C=C/[C@@H](OCC[C@H]([C@@H](C(=O)OC[C@...,910.986317,532.6220,-2.767,CC1=C[C@H]2O[C@@H]3C[C@H]4OC(=O)/C=C\C=C/[C@H]...,NSFWWJIQIKBZMJ-CWFPLRNTSA-N
11693,CN1/C(=C(\NC2=CC=CC=N2)/O)/C(=O)C3=CC=CC=C3S1(...,23.000000,331.3500,-4.159,CN1C(C(=O)Nc2ccccn2)C(=O)c2ccccc2S1(=O)=O,MNSCVRXKDINXEY-UHFFFAOYSA-N
11694,CC1=C(C=CC(=C1)N(CCO)CCO)N=NC2=C(C=C(C=C2)S(=O...,0.014200,411.9025,-7.463,Cc1cc(N(CCO)CCO)ccc1N=Nc1ccc(S(C)(=O)=O)cc1Cl,KLMFZVQIWHSLOE-UHFFFAOYSA-N


## Check for data leaks

In [10]:
# Look for molecules that occur in both the training and the validation set,
# using the standardised SMILES as the molecule identifier.
train_ids = set(training_df['standard_smiles'])
valid_ids = set(val['standard_smiles'])

# Identifiers shared by both sets
overlap = train_ids & valid_ids

if not overlap:
    print("No data leakage detected. Validation set is clean.")
else:
    print("Data leakage detected! Overlapping records found.")
    print("Number of overlapping IDs:", len(overlap))

Data leakage detected! Overlapping records found.
Number of overlapping IDs: 5589


In [11]:
# Drop every validation molecule whose standardised SMILES also occurs in the
# training set, so that no training molecule remains in the evaluation set.
# The filter and the index reset are chained: `validation` becomes a fresh frame with
# a clean 0..n-1 index instead of a filtered slice of `val` that is modified in place.
# The positional index matters later, when the predictions are joined column-wise.
validation = val[~val['standard_smiles'].isin(overlap)].reset_index(drop=True)
validation


Unnamed: 0,SMILES,Experimental Solubility in Water,Molar Mass,logS,standard_smiles,standard_inchikeys
0,C(#N)C(F)(F)F,0.104000,95.0200,-5.961,N#CC(F)(F)F,SFFUEHODRAXXIA-UHFFFAOYSA-N
1,C(C(C(F)(F)F)(C(F)(F)F)F)(C(C(F)(F)F)(C(F)(F)F...,0.001220,488.0670,-8.602,FC(F)(F)C(F)(F)C(F)(C(F)(C(F)(F)F)C(F)(F)F)C(F...,GEXSLRJJTOFEMA-UHFFFAOYSA-N
2,C(C(C(C(F)(F)F)(F)F)(F)F)(C(F)(F)F)(C(F)(F)F)F,0.169000,338.0440,-6.301,FC(F)(F)C(F)(F)C(F)(F)C(F)(C(F)(F)F)C(F)(F)F,ROVMKEZVKFJNBD-UHFFFAOYSA-N
3,C(C(C(F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(F)F,1.150000,288.0360,-5.399,FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,NJCBUSHGCBERSK-UHFFFAOYSA-N
4,C1(C(C(C(C1(F)F)(F)F)(F)F)(F)F)(C(F)(F)F)F,0.010000,300.0468,-7.477,FC(F)(F)C1(F)C(F)(F)C(F)(F)C(F)(F)C1(F)F,BCNXQFASJTYKDJ-UHFFFAOYSA-N
...,...,...,...,...,...,...
5926,C1=CC(=C(C=C1N)S(=O)(=O)O)O,670.000000,189.1900,-2.451,Nc1ccc(O)c(S(=O)(=O)O)c1,SILINKWDNDDXTL-UHFFFAOYSA-N
5927,CC1=CC(=O)N=C(N1)S,533.241385,142.1752,-2.426,Cc1cc(=O)[nH]c(S)n1,HWGBHCRJGXAGEU-UHFFFAOYSA-N
5928,O=C1/C=C\C=C/[C@@H](OCC[C@H]([C@@H](C(=O)OC[C@...,910.986317,532.6220,-2.767,CC1=C[C@H]2O[C@@H]3C[C@H]4OC(=O)/C=C\C=C/[C@H]...,NSFWWJIQIKBZMJ-CWFPLRNTSA-N
5929,CC1=C(C=CC(=C1)N(CCO)CCO)N=NC2=C(C=C(C=C2)S(=O...,0.014200,411.9025,-7.463,Cc1cc(N(CCO)CCO)ccc1N=Nc1ccc(S(C)(=O)=O)cc1Cl,KLMFZVQIWHSLOE-UHFFFAOYSA-N


In [12]:
# Write the filtered validation dataframe to CSV (without the index column);
# this file is the input of the model run described below.
validation.to_csv(path_or_buf=validation_input, index=False)

## Run predictions from model

- First, run `ersilia -v fetch eos6oli` on the terminal to fetch the model.

- Run `ersilia serve eos6oli` to serve the model.

- Run `ersilia -v api run -i data/validation.csv -o data/validation_output.csv` to make predictions and save the output to `data/validation_output.csv`.

## Visualizing the output

In [13]:
# Load the model output written by the prediction run
predictions_df = pd.read_csv(filepath_or_buffer=predictions)
predictions_df

Unnamed: 0,key,input,solubility
0,SFFUEHODRAXXIA-UHFFFAOYSA-N,C(#N)C(F)(F)F,-0.024
1,GEXSLRJJTOFEMA-UHFFFAOYSA-N,C(C(C(F)(F)F)(C(F)(F)F)F)(C(C(F)(F)F)(C(F)(F)F...,-7.716
2,ROVMKEZVKFJNBD-UHFFFAOYSA-N,C(C(C(C(F)(F)F)(F)F)(F)F)(C(F)(F)F)(C(F)(F)F)F,-5.710
3,NJCBUSHGCBERSK-UHFFFAOYSA-N,C(C(C(F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(F)F,-4.785
4,BCNXQFASJTYKDJ-UHFFFAOYSA-N,C1(C(C(C(C1(F)F)(F)F)(F)F)(F)F)(C(F)(F)F)F,-5.347
...,...,...,...
5926,SILINKWDNDDXTL-UHFFFAOYSA-N,C1=CC(=C(C=C1N)S(=O)(=O)O)O,-1.232
5927,HWGBHCRJGXAGEU-UHFFFAOYSA-N,CC1=CC(=O)N=C(N1)S,-1.803
5928,NSFWWJIQIKBZMJ-CWFPLRNTSA-N,O=C1/C=C\C=C/[C@@H](OCC[C@H]([C@@H](C(=O)OC[C@...,-3.916
5929,KLMFZVQIWHSLOE-UHFFFAOYSA-N,CC1=C(C=CC(=C1)N(CCO)CCO)N=NC2=C(C=C(C=C2)S(=O...,-3.707


In [14]:
# Merge the validation data with the model predictions for the evaluation.
# The two frames are joined by row position, so first make sure the predictions
# file really describes the same molecules in the same order as `validation`.
if len(predictions_df) != len(validation):
    raise ValueError(
        f"Row count mismatch: {len(validation)} validation rows vs "
        f"{len(predictions_df)} prediction rows; re-run the model on the current validation.csv."
    )
if not (predictions_df['input'].to_numpy() == validation['SMILES'].to_numpy()).all():
    raise ValueError(
        "Prediction inputs do not line up with the validation SMILES; "
        "the predictions file is stale or was produced in a different row order."
    )

# Concatenate the two DataFrames along the columns axis; both indexes are reset
# so that rows pair up one-to-one by position.
merged_df = pd.concat(
    [validation.reset_index(drop=True), predictions_df[['solubility']].reset_index(drop=True)],
    axis=1,
)
merged_df


Unnamed: 0,SMILES,Experimental Solubility in Water,Molar Mass,logS,standard_smiles,standard_inchikeys,solubility
0,C(#N)C(F)(F)F,0.104000,95.0200,-5.961,N#CC(F)(F)F,SFFUEHODRAXXIA-UHFFFAOYSA-N,-0.024
1,C(C(C(F)(F)F)(C(F)(F)F)F)(C(C(F)(F)F)(C(F)(F)F...,0.001220,488.0670,-8.602,FC(F)(F)C(F)(F)C(F)(C(F)(C(F)(F)F)C(F)(F)F)C(F...,GEXSLRJJTOFEMA-UHFFFAOYSA-N,-7.716
2,C(C(C(C(F)(F)F)(F)F)(F)F)(C(F)(F)F)(C(F)(F)F)F,0.169000,338.0440,-6.301,FC(F)(F)C(F)(F)C(F)(F)C(F)(C(F)(F)F)C(F)(F)F,ROVMKEZVKFJNBD-UHFFFAOYSA-N,-5.710
3,C(C(C(F)(F)F)(F)F)(C(C(F)(F)F)(F)F)(F)F,1.150000,288.0360,-5.399,FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,NJCBUSHGCBERSK-UHFFFAOYSA-N,-4.785
4,C1(C(C(C(C1(F)F)(F)F)(F)F)(F)F)(C(F)(F)F)F,0.010000,300.0468,-7.477,FC(F)(F)C1(F)C(F)(F)C(F)(F)C(F)(F)C1(F)F,BCNXQFASJTYKDJ-UHFFFAOYSA-N,-5.347
...,...,...,...,...,...,...,...
5926,C1=CC(=C(C=C1N)S(=O)(=O)O)O,670.000000,189.1900,-2.451,Nc1ccc(O)c(S(=O)(=O)O)c1,SILINKWDNDDXTL-UHFFFAOYSA-N,-1.232
5927,CC1=CC(=O)N=C(N1)S,533.241385,142.1752,-2.426,Cc1cc(=O)[nH]c(S)n1,HWGBHCRJGXAGEU-UHFFFAOYSA-N,-1.803
5928,O=C1/C=C\C=C/[C@@H](OCC[C@H]([C@@H](C(=O)OC[C@...,910.986317,532.6220,-2.767,CC1=C[C@H]2O[C@@H]3C[C@H]4OC(=O)/C=C\C=C/[C@H]...,NSFWWJIQIKBZMJ-CWFPLRNTSA-N,-3.916
5929,CC1=C(C=CC(=C1)N(CCO)CCO)N=NC2=C(C=C(C=C2)S(=O...,0.014200,411.9025,-7.463,Cc1cc(N(CCO)CCO)ccc1N=Nc1ccc(S(C)(=O)=O)cc1Cl,KLMFZVQIWHSLOE-UHFFFAOYSA-N,-3.707


In [15]:
# Sanity-check the two columns used for the metrics: report whether either one
# contains missing (NaN) or infinite values.
measured = merged_df['logS']
predicted = merged_df['solubility']

# Missing values
print("NaN values in 'logS':", measured.isna().any())
print("NaN values in 'solubility':", predicted.isna().any())

# Infinite values
print("Infinity values in 'logS':", np.isinf(measured).any())
print("Infinity values in 'solubility':", np.isinf(predicted).any())

NaN values in 'logS': False
NaN values in 'solubility': False
Infinity values in 'logS': False
Infinity values in 'solubility': False


In [16]:


# Compare the model predictions ('solubility') against the measured values ('logS')
y_true = merged_df['logS']
y_pred = merged_df['solubility']

# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_true, y_pred)

# Coefficient of determination (R2 score)
r2 = r2_score(y_true, y_pred)

print(f'R2 Score: {r2}\nMean Absolute Error (MAE): {mae}')


R2 Score: 0.5399414291892406
Mean Absolute Error (MAE): 1.0274911489765737


## Conclusion
- The R2 score of 0.54 means that the model's predictions explain approximately 54% of the variance in the measured (experimental) logS values.
- The MAE of approximately 1.03 indicates that, on average, the predicted logS values deviate from the measured values by about 1.03 log units, i.e. roughly one order of magnitude in solubility.

- These metrics, the MAE in particular, suggest that there is still room to improve the model's performance on molecules outside its training set.