## Imports

In [2]:
from collections import defaultdict as ddict, OrderedDict as odict
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator


# Notebook-wide display configuration.
# Floats are shown in fixed-point form with three decimals instead of scientific notation.
pd.options.display.float_format = lambda value: '%.3f' % value
# Ask RDKit's PandasTools to render mol images in every DataFrame shown below.
PandasTools.RenderImagesInAllDataFrames()



---
## Loading Datasets

In [8]:
sdf_path = './datasets/combined_training_datasets_unique_no_oe.sdf'

# Build the training table in one chain:
#   1. read the SDF into a DataFrame,
#   2. cast 'pKa' / 'marvin_pKa' to float and 'marvin_atom' to int,
#   3. index the rows by 'ID' (verify_integrity=True raises on duplicate IDs),
#   4. discard the 'ROMol' column so that only tabular data remains.
all_df = (
    PandasTools.LoadSDF(sdf_path)
    .astype({'pKa': float, 'marvin_atom': int, 'marvin_pKa': float}, copy=False)
    .set_index('ID', verify_integrity=True)
    .drop('ROMol', axis=1)
)

# Persist the table; note that the '.csv' file is tab-separated.  The 'ID' index
# and the header row are both written.
all_df.to_csv(
    './datasets/combined_training_datasets_unique.csv',
    sep='\t',
    index=True,
    header=True,
)


In [6]:
def _load_testset(sdf_file):
    """Read a test-set SDF into a DataFrame indexed by 'ID', without the 'ROMol' column.

    ``verify_integrity=True`` makes ``set_index`` raise if the file contains
    duplicate IDs.

    A 'SMILES' column could be derived from 'ROMol' (via ``Chem.MolToSmiles``)
    before that column is dropped; this step is currently disabled.
    """
    indexed = PandasTools.LoadSDF(sdf_file).set_index('ID', verify_integrity=True)
    return indexed.drop(columns='ROMol')


novartis_testset = _load_testset('./datasets/novartis_cleaned_mono_unique_notraindata.sdf')
avlilumove_testset = _load_testset('./datasets/AvLiLuMoVe_cleaned_mono_unique_notraindata.sdf')

Unnamed: 0_level_0,pKa,marvin_pKa,marvin_atom,marvin_pKa_type,ISO_SMI
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
353,7.42,8.6,10,basic,C#CC[NH+](C)[C@H](C)Cc1ccc(F)cc1
181,9.5,9.86,9,basic,CC(=O)CC1CCCC[NH2+]1


In [5]:
# Export both test sets as tab-separated files (header row included).
# The frames are indexed by 'ID' (set when they are loaded), so the index has to be
# written out: with index=False the molecule identifiers are silently dropped from
# the files, whereas the training-set export keeps them (index=True).
# NOTE(review): any existing reader of these files will now see a leading 'ID'
# column -- confirm that downstream code loads them by column name / index_col.
avlilumove_testset.to_csv('./datasets/avlilumove_testset.csv', sep='\t', index=True, header=True)
novartis_testset.to_csv('./datasets/novartis_testset.csv', sep='\t', index=True, header=True)
