# Computational drug discovery: part 3
## Descriptor Calculation and Dataset Preparation

In [34]:
# Download PaDEL-Descriptor
# http://www.yapcwsoft.com/dd/padeldescriptor/
! wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip
! wget https://raw.githubusercontent.com/AnVales/Drug-discovery-part-1/main/padel.sh

--2021-09-03 11:24:17--  https://github.com/AnVales/Drug-discovery/blob/main/padel.zip
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘padel.zip’

padel.zip               [ <=>                ] 126.78K  --.-KB/s    in 0.1s    

2021-09-03 11:24:17 (1.10 MB/s) - ‘padel.zip’ saved [129819]

--2021-09-03 11:24:17--  https://raw.githubusercontent.com/AnVales/Drug-discovery/main/padel.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231 [text/plain]
Saving to: ‘padel.sh’


2021-09-03 11:24:18 (9.39 MB/s) - ‘padel.sh’ saved [231/231]



In [41]:
# Open zip
! unzip padel.zip

Archive:  padel.zip
replace __MACOSX/._PaDEL-Descriptor? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [51]:
# Load bioactivity data: bioactivity_data_3class_pIC50.csv
# It contains the pIC50 values that we will be using for building a regression model
! wget -O genital_herpes_04_bioactivity_data_3class_pIC50.csv https://raw.githubusercontent.com/AnVales/Drug-discovery/main/genital_herpes_04_bioactivity_data_3class_pIC50.csv 

--2021-09-03 11:30:20--  https://raw.githubusercontent.com/AnVales/Drug-discovery/main/genital_herpes_04_bioactivity_data_3class_pIC50.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34554 (34K) [text/plain]
Saving to: ‘genital_herpes_04_bioactivity_data_3class_pIC50.csv’


2021-09-03 11:30:20 (8.22 MB/s) - ‘genital_herpes_04_bioactivity_data_3class_pIC50.csv’ saved [34554/34554]



In [43]:
# Import library
import pandas as pd

In [44]:
# Read bioactivity data
bioactivaty_data = pd.read_csv('genital_herpes_04_bioactivity_data_3class_pIC50.csv')
bioactivaty_data

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,bioactivity,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL122071,CC(=O)O[C@@H]1CC(=O)N1C(=O)NC(C)C,inactive,214.221,0.22590,1.0,4.0,4.000000
1,1,CHEMBL124107,CCCCNC(=O)N1C(=O)C[C@H]1OC(C)=O,inactive,228.248,0.61760,1.0,4.0,4.167491
2,2,CHEMBL120853,C[C@@H]1C(=O)N(C(=O)NCc2ccccc2)[C@@H]1Oc1ccc(C...,inactive,354.362,2.47780,2.0,4.0,4.000000
3,3,CHEMBL122296,CC(=O)O[C@@H]1[C@@H](C)C(=O)N1C(=O)NCc1ccccc1,intermediate,276.292,1.26370,1.0,4.0,5.187087
4,4,CHEMBL333734,CC(=O)O[C@@H]1[C@H](C)C(=O)N1C(=O)NCc1ccccc1,inactive,276.292,1.26370,1.0,4.0,4.568636
...,...,...,...,...,...,...,...,...,...
230,230,CHEMBL106921,Cc1csc2nc(Cc3cccc(NC(=O)c4ccccc4)c3)oc(=O)c12,intermediate,376.437,4.40102,1.0,5.0,5.823909
231,231,CHEMBL107963,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NN)cc4)c3)oc(=O)c12,intermediate,406.467,3.68662,3.0,7.0,5.619789
232,232,CHEMBL106641,CC(=O)c1cccc(NO)c1,inactive,151.165,1.69030,2.0,3.0,4.958607
233,233,CHEMBL108677,Cc1csc2nc(Cc3cccc(NC(=O)c4ccc(NO)cc4)c3)oc(=O)c12,active,407.451,4.20212,3.0,7.0,7.214670


In [45]:
# chembl_id & canonical smiles to smi
selection = ['canonical_smiles','molecule_chembl_id']
bioactivaty_data_selection = bioactivaty_data[selection]
bioactivaty_data_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)

In [46]:
# check that the smi is fine
! cat molecule.smi | head -5

CC(=O)O[C@@H]1CC(=O)N1C(=O)NC(C)C	CHEMBL122071
CCCCNC(=O)N1C(=O)C[C@H]1OC(C)=O	CHEMBL124107
C[C@@H]1C(=O)N(C(=O)NCc2ccccc2)[C@@H]1Oc1ccc(C(=O)O)cc1	CHEMBL120853
CC(=O)O[C@@H]1[C@@H](C)C(=O)N1C(=O)NCc1ccccc1	CHEMBL122296
CC(=O)O[C@@H]1[C@H](C)C(=O)N1C(=O)NCc1ccccc1	CHEMBL333734


In [47]:
# How many molecules?
! cat molecule.smi | wc -l

235


In [48]:
# Calculate fingerprint descriptors
# Calculate PaDEL descriptors
# Options of 1gb memory, it eliminates salts and other small organic acids, we are cleaning the chemical estructure so there is no impurity
# we want the molecular fingerprint, and it is Pubchem fingerprint
# Output descriptors into descriptors_output.csv
! cat padel.sh

java -Xms1G -Xmx1G -Djava.awt.headless=true -jar ./PaDEL-Descriptor/PaDEL-Descriptor.jar -removesalt -standardizenitro -fingerprints -descriptortypes ./PaDEL-Descriptor/PubchemFingerprinter.xml -dir ./ -file descriptors_output.csv


In [49]:
# Obtain descriptors
! bash padel.sh

Processing CHEMBL122071 in molecule.smi (1/235). 
Processing CHEMBL124107 in molecule.smi (2/235). 
Processing CHEMBL120853 in molecule.smi (3/235). Average speed: 1.64 s/mol.
Processing CHEMBL122296 in molecule.smi (4/235). Average speed: 0.84 s/mol.
Processing CHEMBL333734 in molecule.smi (5/235). Average speed: 0.74 s/mol.
Processing CHEMBL333117 in molecule.smi (7/235). Average speed: 0.61 s/mol.
Processing CHEMBL74483 in molecule.smi (6/235). Average speed: 0.70 s/mol.
Processing CHEMBL334176 in molecule.smi (8/235). Average speed: 0.61 s/mol.
Processing CHEMBL122983 in molecule.smi (9/235). Average speed: 0.59 s/mol.
Processing CHEMBL122731 in molecule.smi (10/235). Average speed: 0.70 s/mol.
Processing CHEMBL123589 in molecule.smi (11/235). Average speed: 0.57 s/mol.
Processing CHEMBL331459 in molecule.smi (12/235). Average speed: 0.56 s/mol.
Processing CHEMBL123457 in molecule.smi (13/235). Average speed: 0.52 s/mol.
Processing CHEMBL123455 in molecule.smi (14/235). Average spe

In [53]:
# Check
! ls -l

total 25660
-rw-r--r-- 1 root root   428879 Sep  3 11:29 descriptors_output.csv
-rw-r--r-- 1 root root    34554 Sep  3 11:30 genital_herpes_04_bioactivity_data_3class_pIC50.csv
drwxr-xr-x 3 root root     4096 Sep  3 11:27 __MACOSX
-rw-r--r-- 1 root root    17656 Sep  3 11:27 molecule.smi
drwxrwxr-x 4 root root     4096 May 30  2020 PaDEL-Descriptor
-rw-r--r-- 1 root root      231 Sep  3 11:26 padel.sh
-rw-r--r-- 1 root root 25768637 Sep  3 11:26 padel.zip
drwxr-xr-x 1 root root     4096 Sep  1 19:26 sample_data


In [56]:
# Preparing the X and Y Data Matrices
# X data matrix

# Lets take a look of the generated file
descriptors_output_x = pd.read_csv('descriptors_output.csv')
descriptors_output_x

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,PubchemFP10,PubchemFP11,PubchemFP12,PubchemFP13,PubchemFP14,PubchemFP15,PubchemFP16,PubchemFP17,PubchemFP18,PubchemFP19,PubchemFP20,PubchemFP21,PubchemFP22,PubchemFP23,PubchemFP24,PubchemFP25,PubchemFP26,PubchemFP27,PubchemFP28,PubchemFP29,PubchemFP30,PubchemFP31,PubchemFP32,PubchemFP33,PubchemFP34,PubchemFP35,PubchemFP36,PubchemFP37,PubchemFP38,...,PubchemFP841,PubchemFP842,PubchemFP843,PubchemFP844,PubchemFP845,PubchemFP846,PubchemFP847,PubchemFP848,PubchemFP849,PubchemFP850,PubchemFP851,PubchemFP852,PubchemFP853,PubchemFP854,PubchemFP855,PubchemFP856,PubchemFP857,PubchemFP858,PubchemFP859,PubchemFP860,PubchemFP861,PubchemFP862,PubchemFP863,PubchemFP864,PubchemFP865,PubchemFP866,PubchemFP867,PubchemFP868,PubchemFP869,PubchemFP870,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL122071,1,1,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,CHEMBL124107,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,CHEMBL122296,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,CHEMBL333734,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,CHEMBL120853,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,CHEMBL106921,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
231,CHEMBL106641,1,1,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
232,CHEMBL107963,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
233,CHEMBL542448,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [57]:
# We only want the fingerprints, not the name
descriptors_output_x = descriptors_output_x.drop(columns=['Name'])
descriptors_output_x

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,PubchemFP10,PubchemFP11,PubchemFP12,PubchemFP13,PubchemFP14,PubchemFP15,PubchemFP16,PubchemFP17,PubchemFP18,PubchemFP19,PubchemFP20,PubchemFP21,PubchemFP22,PubchemFP23,PubchemFP24,PubchemFP25,PubchemFP26,PubchemFP27,PubchemFP28,PubchemFP29,PubchemFP30,PubchemFP31,PubchemFP32,PubchemFP33,PubchemFP34,PubchemFP35,PubchemFP36,PubchemFP37,PubchemFP38,PubchemFP39,...,PubchemFP841,PubchemFP842,PubchemFP843,PubchemFP844,PubchemFP845,PubchemFP846,PubchemFP847,PubchemFP848,PubchemFP849,PubchemFP850,PubchemFP851,PubchemFP852,PubchemFP853,PubchemFP854,PubchemFP855,PubchemFP856,PubchemFP857,PubchemFP858,PubchemFP859,PubchemFP860,PubchemFP861,PubchemFP862,PubchemFP863,PubchemFP864,PubchemFP865,PubchemFP866,PubchemFP867,PubchemFP868,PubchemFP869,PubchemFP870,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
231,1,1,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
232,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [59]:
# Y variable
# Convert IC50 to pIC50
descriptors_output_y = bioactivaty_data['pIC50']
descriptors_output_y

0      4.000000
1      4.167491
2      4.000000
3      5.187087
4      4.568636
         ...   
230    5.823909
231    5.619789
232    4.958607
233    7.214670
234    4.481486
Name: pIC50, Length: 235, dtype: float64

In [60]:
# Merge both
descriptors = pd.concat([descriptors_output_x,descriptors_output_y], axis=1)
descriptors

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,PubchemFP10,PubchemFP11,PubchemFP12,PubchemFP13,PubchemFP14,PubchemFP15,PubchemFP16,PubchemFP17,PubchemFP18,PubchemFP19,PubchemFP20,PubchemFP21,PubchemFP22,PubchemFP23,PubchemFP24,PubchemFP25,PubchemFP26,PubchemFP27,PubchemFP28,PubchemFP29,PubchemFP30,PubchemFP31,PubchemFP32,PubchemFP33,PubchemFP34,PubchemFP35,PubchemFP36,PubchemFP37,PubchemFP38,PubchemFP39,...,PubchemFP842,PubchemFP843,PubchemFP844,PubchemFP845,PubchemFP846,PubchemFP847,PubchemFP848,PubchemFP849,PubchemFP850,PubchemFP851,PubchemFP852,PubchemFP853,PubchemFP854,PubchemFP855,PubchemFP856,PubchemFP857,PubchemFP858,PubchemFP859,PubchemFP860,PubchemFP861,PubchemFP862,PubchemFP863,PubchemFP864,PubchemFP865,PubchemFP866,PubchemFP867,PubchemFP868,PubchemFP869,PubchemFP870,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.000000
1,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.167491
2,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.000000
3,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.187087
4,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.568636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.823909
231,1,1,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5.619789
232,1,1,1,0,0,0,0,0,0,1,1,1,1,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.958607
233,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7.214670


In [61]:
# Save the data with a csv file
descriptors.to_csv('genital_herpes_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)