In [1]:
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools, AllChem as Chem
import pickle

pd.set_option('display.float_format', lambda x: '%.3f' % x)  # Display floats without scientific notation
PandasTools.RenderImagesInAllDataFrames()  # Render mol images

pka_low = 1.7
pka_high = 13.0

In [2]:
def duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Find duplicates based on the ISO smiles average the pKa values, then drop the duplicates
    """
    assert 'ISO_SMI' in df.columns, "Column 'ISO_SMI' does not exist in the DataFrame."
    # not sure if the assert is necessary?
    
    df_duplicates = df[df.duplicated(subset='ISO_SMI', keep=False)].sort_values('ISO_SMI')

    if df_duplicates.shape[0] > 0:
        print("Duplicates found: ", df_duplicates.shape)

        mean_pKa = df_duplicates.groupby('ISO_SMI')['pKa'].mean()
        
        # the outliers don't seem to be a problem; add code anyway just in case
        std_pKa = df_duplicates.groupby('ISO_SMI')['pKa'].std()
        threshold = mean_pKa + 2 * std_pKa
        threshold = threshold.reindex(df_duplicates['pKa'].index)
        outliers = df_duplicates[df_duplicates['pKa'] > threshold]
        print("outliers (empty df if none, optional usage so far only)\n", outliers)
        print("----------------------------------------")

        df_duplicates['pKa'] = mean_pKa
        df_duplicates.drop_duplicates(subset='ISO_SMI', inplace=True)
        
        print("bedore drop duplicates", df.shape)
        df = df[~df.index.isin(df_duplicates.index)]
        df = df.reindex()
        print("after drop duplicates", df.shape)
    else:
        print("No duplicates found. DF shape remains at ", df.shape)

    return df

In [3]:
sdf_path = './datasets_AM/avlilumove_cleaned.sdf'
df_avli = PandasTools.LoadSDF(sdf_path)
sdf_path = './datasets_AM/chembl_cleaned.sdf'
df_chembl = PandasTools.LoadSDF(sdf_path)
sdf_path = './datasets_AM/datawarrior_cleaned.sdf'
df_dw = PandasTools.LoadSDF(sdf_path)
sdf_path = './datasets_AM/novartis_cleaned.sdf'
df_novar = PandasTools.LoadSDF(sdf_path)


In [4]:
print("AVLiLumove:", df_avli.shape)
print("Header:", df_avli.columns)
print("Index Name:", df_avli.index.name)
print("----------------------------------------")
print("df_dw:", df_dw.shape)
print("Header:", df_dw.columns)
print("Index Name:", df_dw.index.name)
print("----------------------------------------")
print("df_novar:", df_novar.shape)
print("Header:", df_novar.columns)
print("Index Name:", df_novar.index.name)
print("----------------------------------------")
print("df_chembl:", df_chembl.shape)
print("Header:", df_chembl.columns)
print("Index Name:", df_chembl.index.name)
print("----------------------------------------")


AVLiLumove: (606, 6)
Header: Index(['SMILES', 'pKa', 'database', 'temp', 'ID', 'ROMol'], dtype='object')
Index Name: None
----------------------------------------
df_dw: (7303, 14)
Header: Index(['pKa', 'temp', 'method', 'type', 'group', 'basicOrAcidic', 'Row-ID',
       'Neighbor Similarity OrgFunctions 96%', 'Neighbor Count', 'Neighbor',
       'Neighbor Analysis X', 'Neighbor Analysis Y', 'ID', 'ROMol'],
      dtype='object')
Index Name: None
----------------------------------------
df_novar: (280, 7)
Header: Index(['pKa', 'marvin_pKa', 'marvin_atom', 'marvin_pKa_type', 'ISO_SMI', 'ID',
       'ROMol'],
      dtype='object')
Index Name: None
----------------------------------------
df_chembl: (7639, 4)
Header: Index(['pKa', 'temp', 'ID', 'ROMol'], dtype='object')
Index Name: None
----------------------------------------


## Novartis dataset

In [5]:
df_novar.drop(['marvin_atom', 'marvin_pKa_type', 'ID'], axis=1, inplace=True)
df_novar['Source'] = "Novartis"
df_novar['pKa'] = df_novar['pKa'].astype(float)
df_novar = df_novar[(df_novar['pKa'] >= pka_low) & (df_novar['pKa'] <= pka_high)]
df_novar['marvin_pKa'] = df_novar['marvin_pKa'].astype(float)
df_novar['ISO_SMI'] = df_novar['ISO_SMI'].astype(str)
df_novar = duplicates(df_novar)
df_novar.describe()


No duplicates found. DF shape remains at  (280, 5)


Unnamed: 0,pKa,marvin_pKa
count,280.0,280.0
mean,6.25,6.244
std,2.307,2.38
min,2.1,2.13
25%,4.2,4.057
50%,5.8,5.975
75%,8.35,8.322
max,11.7,11.82


## Avli dataset

In [6]:
df_avli['pKa'] = df_avli['pKa'].astype(float)
df_avli = df_avli[(df_avli['pKa'] >= pka_low) & (df_avli['pKa'] <= pka_high)]
df_avli.drop(['temp', 'ID', 'SMILES'], axis=1, inplace=True)
df_avli['ISO_SMI'] = df_avli['ROMol'].apply(Chem.MolToSmiles, isomericSmiles=True)
df_avli.rename(columns={'database': 'Source'}, inplace=True)
df_avli = duplicates(df_avli)
df_avli.describe()


Duplicates found:  (183, 4)
outliers (empty df if none, optional usage so far only)
 Empty DataFrame
Columns: [pKa, Source, ROMol, ISO_SMI]
Index: []
----------------------------------------
bedore drop duplicates (605, 4)
after drop duplicates (521, 4)


Unnamed: 0,pKa
count,521.0
mean,7.292
std,2.56
min,1.77
25%,4.7
50%,8.15
75%,9.38
max,11.4


## Chembl dataset

In [7]:
df_chembl['pKa'] = df_chembl['pKa'].astype(float)
df_chembl = df_chembl[(df_chembl['pKa'] >= pka_low) & (df_chembl['pKa'] <= pka_high)]
df_chembl.drop(['temp', 'ID'], axis=1, inplace=True)
df_chembl['ISO_SMI'] = df_chembl['ROMol'].apply(Chem.MolToSmiles, isomericSmiles=True)
df_chembl['Source'] = "Chembl"
df_chembl = duplicates(df_chembl)
df_chembl.describe()


Duplicates found:  (2447, 4)
outliers (empty df if none, optional usage so far only)
 Empty DataFrame
Columns: [pKa, ROMol, ISO_SMI, Source]
Index: []
----------------------------------------
bedore drop duplicates (7412, 4)
after drop duplicates (6547, 4)


Unnamed: 0,pKa
count,6547.0
mean,7.145
std,2.267
min,1.7
25%,5.5
50%,7.43
75%,8.9
max,13.0


## Datawarrior dataset

In [8]:
df_dw['pKa'] = df_dw['pKa'].astype(float)
df_dw = df_dw[(df_dw['pKa'] >= pka_low) & (df_dw['pKa'] <= pka_high)]
df_dw = df_dw.loc[:, ['pKa', 'ROMol']]
df_dw['ISO_SMI'] = df_dw['ROMol'].apply(Chem.MolToSmiles, isomericSmiles=True)
df_dw['Source'] = "DataWarrior"
df_dw = duplicates(df_dw)
df_dw.describe()

Duplicates found:  (659, 4)
outliers (empty df if none, optional usage so far only)
 Empty DataFrame
Columns: [pKa, ROMol, ISO_SMI, Source]
Index: []
----------------------------------------
bedore drop duplicates (6793, 4)
after drop duplicates (6465, 4)


Unnamed: 0,pKa
count,6465.0
mean,6.848
std,2.829
min,1.7
25%,4.35
50%,6.9
75%,9.28
max,13.0


## Combine all the dataframes and check again for duplicates

In [9]:
combined_df = pd.concat([df_avli, df_chembl, df_dw, df_novar], ignore_index=True)
combined_df['marvin_pKa'].fillna(np.nan, inplace=True)
print("Combined DF:", combined_df.shape)
print("Combine DF:", combined_df.describe())
print(combined_df.head(5))

Combined DF: (13813, 5)
Combine DF:             pKa  marvin_pKa
count 13813.000     280.000
mean      6.993       6.244
std       2.563       2.380
min       1.700       2.130
25%       4.790       4.057
50%       7.300       5.975
75%       9.100       8.322
max      13.000      11.820
    pKa  Source                                             ROMol  \
0 9.300  Vertex  <rdkit.Chem.rdchem.Mol object at 0x7f27bb8529d0>   
1 5.500  Vertex  <rdkit.Chem.rdchem.Mol object at 0x7f27bb852ab0>   
2 8.600  Vertex  <rdkit.Chem.rdchem.Mol object at 0x7f27bb852c70>   
3 9.400  Vertex  <rdkit.Chem.rdchem.Mol object at 0x7f27bb852e30>   
4 6.050  Vertex  <rdkit.Chem.rdchem.Mol object at 0x7f27bb852f10>   

                                             ISO_SMI  marvin_pKa  
0  COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc2c(=O)c(C(...         NaN  
1     CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn(CCF)c3c2F)CC1         NaN  
2  C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...         NaN  
3  COc1c(N2CCNC(C)C2)c(F)cc2c

In [10]:
final_df = duplicates(combined_df)

Duplicates found:  (1769, 5)
outliers (empty df if none, optional usage so far only)
 Empty DataFrame
Columns: [pKa, Source, ROMol, ISO_SMI, marvin_pKa]
Index: []
----------------------------------------
bedore drop duplicates (13813, 5)
after drop duplicates (13219, 5)


In [11]:
# randomize the order of the rows for later ML purposes
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

## write to different formats

In [12]:
PandasTools.WriteSDF(final_df, './datasets_AM/all_data_cleaned.sdf', molColName='ROMol', properties=list(final_df.columns))
final_df.to_csv('./datasets_AM/all_data_cleaned.csv', sep='\t', index=False, header=True)
pickle_path = './datasets_AM/all_data_cleaned.pkl'
with open(pickle_path, 'wb') as file:
    pickle.dump(final_df, file)