# In [5]:
import pandas as pd
import wget
import zipfile
import glob
from padelpy import padeldescriptor
import os

# Prepare fingerprint XML
# Download the fingerprint XML archive only when it is not already present in
# the current working directory, so re-running the script skips the download.
if not os.path.exists('fingerprints_xml.zip'):
    url = 'https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip'
    wget.download(url)

# Unpack the archive into the current working directory.
with zipfile.ZipFile("fingerprints_xml.zip", "r") as zip_ref:
    zip_ref.extractall()

# List and sort fingerprint XML files
# NOTE: this picks up every *.xml file in the working directory, not only the
# ones that came out of the archive.
xml_files = sorted(glob.glob("*.xml"))
xml_files

# Short names for the fingerprint types, paired by position with the sorted
# XML file names below.
# NOTE(review): the order here presumably mirrors the alphabetical order of the
# XML file names shipped in the archive - confirm if the archive changes.
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

# Create a dictionary mapping fingerprint name -> descriptor XML file.
# zip() stops at the shorter input without complaint, so a missing or extra
# *.xml file in the working directory would drop or mis-pair entries silently.
# Fail loudly instead.
if len(xml_files) != len(FP_list):
    raise ValueError(
        'Expected %d fingerprint XML files but found %d: %r'
        % (len(FP_list), len(xml_files), xml_files)
    )
fp = dict(zip(FP_list, xml_files))
fp

# Load the HCV_NS5B_Curated.csv table directly from its GitHub URL.
df = pd.read_csv(
    'https://raw.githubusercontent.com/dataprofessor/data/master/HCV_NS5B_Curated.csv'
)
df.head()

# Prepare data subset as input to PaDEL: the SMILES column first, then the
# compound ID column, written tab-separated with no header row and no index.
df2 = df[['CANONICAL_SMILES', 'CMPD_CHEMBLID']]
df2.to_csv('molecule.smi', header=False, index=False, sep='\t')
df2

# Calculate descriptors
# There are 12 fingerprint types in PaDEL. Each one is selected by passing the
# matching XML file from the `fp` dictionary (e.g.
# SubstructureFingerprintCount.xml) as the `descriptortypes` argument.
fp

# Run PaDEL once per fingerprint type. Every run reads molecule.smi and writes
# its result to '<fingerprint name>.csv' in the current working directory.
# All runs share the same settings; only the output file and the descriptor
# XML file differ.
for fingerprint in (
    'AtomPairs2D',
    'AtomPairs2DCount',
    'CDK',
    'CDKextended',
    'CDKgraphonly',
    'EState',
    'KlekotaRoth',
    'KlekotaRothCount',
    'MACCS',
    'PubChem',
    'Substructure',
    'SubstructureCount',
):
    fingerprint_output_file = fingerprint + '.csv'  # e.g. 'AtomPairs2D.csv'
    fingerprint_descriptortypes = fp[fingerprint]

    padeldescriptor(mol_dir='molecule.smi',
                    d_file=fingerprint_output_file,
                    descriptortypes=fingerprint_descriptortypes,
                    detectaromaticity=True,
                    standardizenitro=True,
                    standardizetautomers=True,
                    threads=2,
                    removesalt=True,
                    log=True,
                    fingerprints=True)

# Notebook output captured from the original run:
# 100% [..............................................................................] 10871 / 10871
#
# Collecting padelpy
#   Using cached padelpy-0.1.10-py2.py3-none-any.whl (20.9 MB)
# Installing collected packages: padelpy
# Successfully installed padelpy-0.1.10
