# Computational Drug Discovery [Part 3] Descriptor Calculation and Dataset Preparation

In **part 3**, we will be calculating molecular descriptors that are essentially quantitative description of the compounds in the dataset. Finally, we will be preparing this dataset for subsequent model building in part 4.

# Install padelpy


In [9]:
! pip install padelpy



# Prepare Fingerprint *XML*

## Download fingerprint XML files

In [10]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
! unzip fingerprints_xml.zip

--2025-06-21 06:19:35--  https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip [following]
--2025-06-21 06:19:35--  https://raw.githubusercontent.com/dataprofessor/padel/main/fingerprints_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘fingerprints_xml.zip.1’


2025-06-21 06:19:35 (94.2 MB/s) - ‘fingerprints_xml.zip.1’ saved [10871/10871]

Archive:  fingerprints_xml.zip
replace AtomPairs2DFingerprintCount.xml? [y]es, [n]o, [A]ll, [N]one, [

## List and sort fingerprint XML files

In [11]:
import glob
xml_files = glob.glob("*.xml")
xml_files.sort()
xml_files

['AtomPairs2DFingerprintCount.xml',
 'AtomPairs2DFingerprinter.xml',
 'EStateFingerprinter.xml',
 'ExtendedFingerprinter.xml',
 'Fingerprinter.xml',
 'GraphOnlyFingerprinter.xml',
 'KlekotaRothFingerprintCount.xml',
 'KlekotaRothFingerprinter.xml',
 'MACCSFingerprinter.xml',
 'PubchemFingerprinter.xml',
 'SubstructureFingerprintCount.xml',
 'SubstructureFingerprinter.xml']

In [12]:
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphOnly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'Pubchem',
 'SubstructureCount',
 'Substructure']

## Create a dictionary

In [13]:
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphOnly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'Pubchem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

# Loading Bioactivity data

In [14]:
! unzip results.zip

Archive:  results.zip
  inflating: mannwhitneyu_LogP.csv   
  inflating: plot_MW_vs_LogP.pdf     
  inflating: mannwhitneyu_MW.csv     
  inflating: plot_NumHAcceptors.pdf  
  inflating: plot_pic50.pdf          
  inflating: mannwhitneyu_pIC50.csv  
  inflating: plot_MW.pdf             
  inflating: acetylcholinesterase_bioactivity_data_2class_pIC50.csv  
  inflating: mannwhitneyu_NumHAcceptors.csv  
  inflating: acetylcholinesterase_03_bioactivity_data_curated.csv  
  inflating: plot_NumHDonors.pdf     
  inflating: mannwhitneyu_NumHDonors.csv  
  inflating: plot_LogP.pdf           
  inflating: plot_bioactivity_class.pdf  


In [15]:
import pandas as pd

In [16]:
df = pd.read_csv('acetylcholinesterase_bioactivity_data_2class_pIC50.csv')
df

Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,active,312.325,2.8032,0.0,6.0,6.124939
1,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,active,312.325,2.8032,0.0,6.0,7.000000
2,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,inactive,376.913,4.5546,0.0,5.0,4.301030
3,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,active,376.913,4.5546,0.0,5.0,6.522879
4,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,active,426.851,5.3574,0.0,5.0,6.096910
...,...,...,...,...,...,...,...,...
14343,,,,401.474,2.9571,3.0,5.0,
14344,,,,387.447,2.6149,4.0,5.0,
14345,,,,387.447,2.6149,4.0,5.0,
14346,,,,388.431,3.0419,3.0,5.0,


## creating smi file for padeldescriptor

In [17]:
selection = ['canonical_smiles', 'molecule_chembl_id']
df2 = df[selection]
df2.to_csv('molecule.smi', sep='\t', index=False, header=False)
df2

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,CHEMBL133897
1,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,CHEMBL336398
2,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,CHEMBL131588
3,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,CHEMBL130628
4,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,CHEMBL130478
...,...,...
14343,,
14344,,
14345,,
14346,,


# Calculate fingerprint descriptors

## Calculate PaDEL Descriptors

There are 12 fingerprint types in PaDEL. To calculate all 12, make sure to make adjustments to the ***descriptortype*** input argument to any of the ones in the ***fp*** dictionary variable as shown above, e.g.*SubstructureFingerprintCount.xml*

In [18]:
fp

{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml',
 'EState': 'EStateFingerprinter.xml',
 'CDKextended': 'ExtendedFingerprinter.xml',
 'CDK': 'Fingerprinter.xml',
 'CDKgraphOnly': 'GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'KlekotaRothFingerprinter.xml',
 'MACCS': 'MACCSFingerprinter.xml',
 'Pubchem': 'PubchemFingerprinter.xml',
 'SubstructureCount': 'SubstructureFingerprintCount.xml',
 'Substructure': 'SubstructureFingerprinter.xml'}

In [19]:
fp['Pubchem']

'PubchemFingerprinter.xml'

In [21]:
from padelpy import padeldescriptor

fingerprint = 'Pubchem'

fingerprint_output_file = ''.join([fingerprint, '.csv']) #Pubchem.csv
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='molecule.smi',
                d_file=fingerprint_output_file, #'Pubchem.csv'
                descriptortypes=fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)

## Display calculated fingerprints

In [22]:
descriptors =  pd.read_csv(fingerprint_output_file)
descriptors

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL133897,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL336398,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL131588,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL130628,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL130478,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,CHEMBL95,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1587,CHEMBL54126,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1588,CHEMBL556939,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1589,CHEMBL659,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Preparing the X and Y Data Matrices

## X data matrix

X matrix only contain Pubchem fingerprint

In [23]:
df3 = pd.read_csv(fingerprint_output_file)
df3

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL133897,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL336398,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL131588,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL130628,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL130478,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,CHEMBL95,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1587,CHEMBL54126,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1588,CHEMBL556939,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1589,CHEMBL659,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Y variable

Y variable only contain pIC50 values

### Adding pIC50 value
firstly, we merge to  dataset from acetylcholinesterase_bioactivity_data_2class_pIC50.csv and Pubchem and then drop every column execpt pIC50 values

In [25]:
import pandas as pd

# Step 1: Load the original data with pIC50
original_df = pd.read_csv("acetylcholinesterase_bioactivity_data_2class_pIC50.csv")  # this must have molecule_chembl_id and pIC50
# Example columns: ['canonical_smiles', 'molecule_chembl_id', 'pIC50']

# Step 2: Load PaDEL output (descriptors/fingerprints)
padel_df = pd.read_csv("Pubchem.csv")  # contains 'Name' and descriptors

# Step 3: Align by molecule_chembl_id (Name column in PaDEL = molecule_chembl_id)
merged_df = pd.merge(padel_df, original_df, left_on='Name', right_on='molecule_chembl_id')

# Step 4: Create X and Y
X = merged_df.drop(columns=['Name', 'canonical_smiles', 'molecule_chembl_id', 'pIC50'])  # Only descriptors
Y = merged_df['pIC50']

# Optional: Check dimensions
print("X shape:", X.shape)
print("Y shape:", Y.shape)

merged_df

X shape: (11096, 886)
Y shape: (11096,)


Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP879,PubchemFP880,molecule_chembl_id,canonical_smiles,bioactivity_class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL133897,1,1,1,0,0,0,0,0,0,...,0,0,CHEMBL133897,CCOc1nn(-c2cccc(OCc3ccccc3)c2)c(=O)o1,active,312.325,2.8032,0.0,6.0,6.124939
1,CHEMBL336398,1,1,1,0,0,0,0,0,0,...,0,0,CHEMBL336398,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC1CC1,active,312.325,2.8032,0.0,6.0,7.000000
2,CHEMBL131588,1,1,0,0,0,0,0,0,0,...,0,0,CHEMBL131588,CN(C(=O)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F)c1ccccc1,inactive,376.913,4.5546,0.0,5.0,4.301030
3,CHEMBL130628,1,1,1,0,0,0,0,0,0,...,0,0,CHEMBL130628,O=C(N1CCCCC1)n1nc(-c2ccc(Cl)cc2)nc1SCC(F)(F)F,active,376.913,4.5546,0.0,5.0,6.522879
4,CHEMBL130478,1,1,0,0,0,0,0,0,0,...,0,0,CHEMBL130478,CSc1nc(-c2ccc(OC(F)(F)F)cc2)nn1C(=O)N(C)C,active,426.851,5.3574,0.0,5.0,6.096910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11091,CHEMBL659,1,1,1,0,0,0,0,0,0,...,0,0,CHEMBL659,COc1ccc2c3c1O[C@H]1C[C@@H](O)C=C[C@@]31CCN(C)C2,,389.499,3.8744,0.0,5.0,5.889410
11092,CHEMBL659,1,1,1,0,0,0,0,0,0,...,0,0,CHEMBL659,COc1ccc2c3c1O[C@H]1C[C@@H](O)C=C[C@@]31CCN(C)C2,,319.404,4.4459,0.0,3.0,5.698970
11093,CHEMBL659,1,1,1,0,0,0,0,0,0,...,0,0,CHEMBL659,COc1ccc2c3c1O[C@H]1C[C@@H](O)C=C[C@@]31CCN(C)C2,,393.571,6.2538,0.0,3.0,6.226214
11094,CHEMBL659,1,1,1,0,0,0,0,0,0,...,0,0,CHEMBL659,COc1ccc2c3c1O[C@H]1C[C@@H](O)C=C[C@@]31CCN(C)C2,,418.537,3.8684,0.0,5.0,5.920819


In [26]:
merged_df.to_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)

# Final Matrix of X and Y variables

In [27]:
selection_2 = ['molecule_chembl_id', 'canonical_smiles', 'bioactivity_class', 'MW', 'LogP', 'NumHDonors', 'NumHAcceptors', 'Name']

colume_to_drop = selection_2

df_dropped = merged_df.drop(columns=colume_to_drop)

df_dropped

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.124939
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.000000
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,4.301030
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.522879
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.096910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11091,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.889410
11092,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.698970
11093,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.226214
11094,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,5.920819


In [28]:
df_dropped.to_csv('acetylcholinesterase_calculated_and_prepared_data_for_model_building.csv')