### **This notebook generates molecular descriptors for a classification task. To use it for a regression task, adjust the code cells as indicated by their comments (for example, replace `mutual_info_classif` with `mutual_info_regression` in the feature-selection step).**

In [None]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [None]:
!pip install pubchempy

Collecting pubchempy
  Downloading PubChemPy-1.0.4.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pubchempy
  Building wheel for pubchempy (setup.py) ... [?25l[?25hdone
  Created wheel for pubchempy: filename=PubChemPy-1.0.4-py3-none-any.whl size=13820 sha256=7266402143a76d7056c635852da08371d54fa2737be87e295c1afcd8bb389ea4
  Stored in directory: /root/.cache/pip/wheels/90/7c/45/18a0671e3c3316966ef7ed9ad2b3f3300a7e41d3421a44e799
Successfully built pubchempy
Installing collected packages: pubchempy
Successfully installed pubchempy-1.0.4


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

from pubchempy import *

from rdkit import Chem
from rdkit.Chem import Crippen, Descriptors, MACCSkeys, GraphDescriptors, Lipinski, QED, rdMolDescriptors, Fragments, FragmentMatcher

from tqdm import tqdm

In [None]:
import sklearn
from sklearn.feature_selection import SelectKBest, mutual_info_classif, mutual_info_regression

In [None]:
# Input table with one molecule per row; write the name of your file here.
dataset_path = 'classification_dataset_without_descriptors.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,SMILES,Activity,Agglomeration,SMILES_uncharge,FORMAL_CHARGE_unch
0,BrC(Br)Br,1,False,BrC(Br)Br,0
1,C#CC(C)(O)CC,1,False,C#CC(C)(O)CC,0
2,C#CC(O)(/C=C/Cl)CC,1,False,C#CC(O)(/C=C/Cl)CC,0
3,C#CC(OC(N)=O)c1ccccc1,1,False,C#CC(OC(N)=O)C1=CC=CC=C1,0
4,C#CC1(OCC(O)CN2CCN(c3ccc(F)cc3)CC2)CCCCC1,1,False,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,0


In [None]:
# Only the uncharged structure and the target label are needed from here on.
kept_columns = ['SMILES_uncharge', 'Activity']
df = df[kept_columns]


In [None]:
# The descriptor cells read the structure from a column named 'SMILES'.
df = df.rename({'SMILES_uncharge': 'SMILES'}, axis='columns')
df.head()

Unnamed: 0,SMILES,Activity
0,BrC(Br)Br,1
1,C#CC(C)(O)CC,1
2,C#CC(O)(/C=C/Cl)CC,1
3,C#CC(OC(N)=O)C1=CC=CC=C1,1
4,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,1


### QED and Crippen modules

In [None]:
# QED-derived and Crippen descriptors, computed in one pass over the SMILES column.
# Whole columns are collected first and attached afterwards, instead of writing
# single cells with df.at while iterating over the same frame.
qed_crippen = {name: [] for name in
               ('MW', '#HBA', '#HBD', 'PSA', '#ROTB', '#ALERTS', 'MlogP', '#MR')}

for smi in df['SMILES']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles returns None for a string it cannot parse; report which one.
        raise ValueError(f'RDKit could not parse SMILES: {smi!r}')

    qed_vector = QED.properties(mol)
    qed_crippen['MW'].append(round(qed_vector[0], 2))
    qed_crippen['#HBA'].append(qed_vector[2])
    qed_crippen['#HBD'].append(qed_vector[3])
    qed_crippen['PSA'].append(qed_vector[4])
    qed_crippen['#ROTB'].append(qed_vector[5])
    qed_crippen['#ALERTS'].append(qed_vector[7])

    qed_crippen['MlogP'].append(round(Crippen.MolLogP(mol), 2))
    qed_crippen['#MR'].append(round(Crippen.MolMR(mol), 2))

for name, values in qed_crippen.items():
    # Stored as float so that counts and real-valued descriptors share one dtype.
    df[name] = np.asarray(values, dtype=float)

df.head()

Unnamed: 0,SMILES,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR
0,BrC(Br)Br,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57
1,C#CC(C)(O)CC,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75
2,C#CC(O)(/C=C/Cl)CC,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07
3,C#CC(OC(N)=O)C1=CC=CC=C1,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68
4,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66


### Lipinski module

In [None]:
# Lipinski counts: each output column is produced by one RDKit function.
lipinski_funcs = {
    '#HeavyAtoms': Lipinski.HeavyAtomCount,
    '#NHOH': Lipinski.NHOHCount,
    '#NO': Lipinski.NOCount,
    '#AromaticCarbocycles': Lipinski.NumAromaticCarbocycles,
    '#AromaticHeterocycles': Lipinski.NumAromaticHeterocycles,
    '#Heteroatoms': Lipinski.NumHeteroatoms,
}

# Parse each SMILES once and reuse the molecule for all six descriptors.
mols = [Chem.MolFromSmiles(smi) for smi in df['SMILES']]
unparsed = [smi for smi, mol in zip(df['SMILES'], mols) if mol is None]
if unparsed:
    # MolFromSmiles returns None for a string it cannot parse; report the first few.
    raise ValueError(f'RDKit could not parse {len(unparsed)} SMILES, e.g. {unparsed[:5]}')

for name, func in lipinski_funcs.items():
    # Whole-column assignment replaces per-cell df.at writes; counts are stored as float.
    df[name] = np.array([func(mol) for mol in mols], dtype=float)

df.head()

Unnamed: 0,SMILES,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,#HeavyAtoms,#NHOH,#NO,#AromaticCarbocycles,#AromaticHeterocycles,#Heteroatoms
0,BrC(Br)Br,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57,4.0,0.0,0.0,0.0,0.0,3.0
1,C#CC(C)(O)CC,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75,7.0,1.0,1.0,0.0,0.0,1.0
2,C#CC(O)(/C=C/Cl)CC,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07,9.0,1.0,1.0,0.0,0.0,2.0
3,C#CC(OC(N)=O)C1=CC=CC=C1,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68,13.0,2.0,3.0,1.0,0.0,3.0
4,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66,26.0,1.0,4.0,1.0,0.0,5.0


### Descriptors module

In [None]:
# Descriptors module: fingerprint densities, heavy-atom weight, partial charges
# and valence-electron count. Rounding is applied to the same columns as before.
descriptor_funcs = {
    'Morgan2': lambda m: round(Descriptors.FpDensityMorgan2(m), 2),
    'Morgan3': lambda m: round(Descriptors.FpDensityMorgan3(m), 2),
    'HeavyAtomMW': lambda m: round(Descriptors.HeavyAtomMolWt(m), 2),
    'MaxPartialCharge': Descriptors.MaxPartialCharge,
    'MinPartialCharge': Descriptors.MinPartialCharge,
    '#ValenceElectrons': Descriptors.NumValenceElectrons,
}

# Parse each SMILES once and reuse the molecule for all six descriptors.
mols = [Chem.MolFromSmiles(smi) for smi in df['SMILES']]
unparsed = [smi for smi, mol in zip(df['SMILES'], mols) if mol is None]
if unparsed:
    # MolFromSmiles returns None for a string it cannot parse; report the first few.
    raise ValueError(f'RDKit could not parse {len(unparsed)} SMILES, e.g. {unparsed[:5]}')

for name, func in descriptor_funcs.items():
    # Whole-column assignment replaces per-cell df.at writes inside iterrows().
    df[name] = np.array([func(mol) for mol in mols], dtype=float)

df.head()

Unnamed: 0,SMILES,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,#NO,#AromaticCarbocycles,#AromaticHeterocycles,#Heteroatoms,Morgan2,Morgan3,HeavyAtomMW,MaxPartialCharge,MinPartialCharge,#ValenceElectrons
0,BrC(Br)Br,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57,...,0.0,0.0,0.0,3.0,1.0,1.0,251.72,0.124221,-0.063717,26.0
1,C#CC(C)(O)CC,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75,...,1.0,0.0,0.0,1.0,2.29,2.29,88.06,0.121725,-0.377933,40.0
2,C#CC(O)(/C=C/Cl)CC,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07,...,1.0,0.0,0.0,2.0,2.44,2.56,135.53,0.144219,-0.37398,50.0
3,C#CC(OC(N)=O)C1=CC=CC=C1,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68,...,3.0,1.0,0.0,3.0,2.08,2.54,166.11,0.405593,-0.428173,66.0
4,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66,...,4.0,1.0,0.0,5.0,1.77,2.38,331.24,0.128054,-0.389382,142.0


### GraphDescriptors module

In [None]:
# GraphDescriptors module: BertzCT and Kappa1, both rounded to two decimals.
graph_funcs = {
    'BertzCT': lambda m: round(GraphDescriptors.BertzCT(m), 2),
    'Kappa1': lambda m: round(GraphDescriptors.Kappa1(m), 2),
}

# Parse each SMILES once and reuse the molecule for both descriptors.
mols = [Chem.MolFromSmiles(smi) for smi in df['SMILES']]
unparsed = [smi for smi, mol in zip(df['SMILES'], mols) if mol is None]
if unparsed:
    # MolFromSmiles returns None for a string it cannot parse; report the first few.
    raise ValueError(f'RDKit could not parse {len(unparsed)} SMILES, e.g. {unparsed[:5]}')

for name, func in graph_funcs.items():
    # Whole-column assignment replaces per-cell df.at writes inside iterrows().
    df[name] = np.array([func(mol) for mol in mols], dtype=float)
df.head()

Unnamed: 0,SMILES,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,#AromaticHeterocycles,#Heteroatoms,Morgan2,Morgan3,HeavyAtomMW,MaxPartialCharge,MinPartialCharge,#ValenceElectrons,BertzCT,Kappa1
0,BrC(Br)Br,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57,...,0.0,3.0,1.0,1.0,251.72,0.124221,-0.063717,26.0,8.0,5.44
1,C#CC(C)(O)CC,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75,...,0.0,1.0,2.29,2.29,88.06,0.121725,-0.377933,40.0,86.84,6.52
2,C#CC(O)(/C=C/Cl)CC,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07,...,0.0,2.0,2.44,2.56,135.53,0.144219,-0.37398,50.0,145.49,8.55
3,C#CC(OC(N)=O)C1=CC=CC=C1,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68,...,0.0,3.0,2.08,2.54,166.11,0.405593,-0.428173,66.0,326.53,9.14
4,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66,...,0.0,5.0,1.77,2.38,331.24,0.128054,-0.389382,142.0,599.43,19.16


In [None]:
# Look at the last rows as well, to check the descriptor columns are filled to the end of the table.
df.tail()

Unnamed: 0,SMILES,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,#AromaticHeterocycles,#Heteroatoms,Morgan2,Morgan3,HeavyAtomMW,MaxPartialCharge,MinPartialCharge,#ValenceElectrons,BertzCT,Kappa1
6743,C1=CC=C(CN(CC2=NCCN2)C2=CC=CC=C2)C=C1,0,265.36,2.0,1.0,27.63,5.0,0.0,2.69,84.24,...,0.0,3.0,1.6,2.25,246.21,0.116098,-0.370486,102.0,563.3,12.73
6744,CCOCCN1C(N2CCCN(C)CC2)=NC2=CC=CC=C21,1,302.42,4.0,0.0,33.53,5.0,1.0,2.21,90.55,...,1.0,5.0,2.0,2.86,276.21,0.20617,-0.379883,120.0,609.72,15.31
6745,CN1CCC(=C2C3=CC=CC=C3CC(=O)C3=C2C=CS3)CC1,1,309.43,2.0,0.0,20.31,0.0,0.0,4.01,91.55,...,1.0,3.0,1.82,2.64,290.28,0.177295,-0.305731,112.0,761.25,13.92
6746,CC1=C(C2=CC=NC=C2)C=C(C#N)C(=O)N1,0,211.22,3.0,1.0,69.54,1.0,0.0,1.62,59.75,...,2.0,4.0,2.06,2.69,202.15,0.265591,-0.324752,78.0,608.4,10.12
6747,NC1=CC(C2=CC=NC=C2)=CNC1=O,0,187.2,3.0,2.0,71.77,1.0,0.0,1.02,54.71,...,2.0,4.0,1.93,2.57,178.13,0.270598,-0.39425,70.0,490.22,8.5


### PubChem

In [None]:
def generate_pubchem(data):
    """Fetch PubChem descriptors and fingerprint bits for each SMILES string.

    For every SMILES in ``data`` the first CID returned by PubChem is looked
    up and the corresponding compound record is downloaded. A successful
    lookup contributes one feature row: five descriptors (atom stereo count,
    bond stereo count, complexity, covalent unit count, isotope atom count)
    followed by the individual characters of the ``cactvs_fingerprint``
    string converted to integers.

    Parameters
    ----------
    data : iterable of str
        SMILES strings, in the order of the rows they belong to.

    Returns
    -------
    tuple
        ``(features, rows_to_del)``. ``features`` is a NumPy array with one
        row per successfully processed SMILES. ``rows_to_del`` lists the
        zero-based positions (within ``data``) of the SMILES that produced
        no feature row, so the caller can drop the matching table rows.
    """
    rows_to_del = []
    pubchem_features = []
    for idx, smi in enumerate(tqdm(list(data))):
        try:
            cids = get_cids(smi, 'smiles')
            # No CID at all, or the placeholder CID 0, means there is no record to fetch.
            if not cids or cids[0] == 0:
                rows_to_del.append(idx)
                continue

            features_dict = Compound.from_cid(cids[0]).to_dict()
            descriptors = [features_dict['atom_stereo_count'], features_dict['bond_stereo_count'],
                           features_dict['complexity'], features_dict['covalent_unit_count'],
                           features_dict['isotope_atom_count']
                          ]
            fingerprints = [int(fp) for fp in list(features_dict['cactvs_fingerprint'])]
            pubchem_features.append(descriptors + fingerprints)
        except Exception:
            # Best effort: any lookup, network or parsing failure marks the row for
            # removal. `Exception` (rather than a bare `except`) lets KeyboardInterrupt
            # through, so this long-running loop can still be stopped by hand.
            rows_to_del.append(idx)
    return np.array(pubchem_features), rows_to_del

In [None]:
# Network-bound step: PubChem is queried for every SMILES (the recorded run
# needed about 56 minutes for 6748 molecules).
pubchem_features, rows_to_del = generate_pubchem(df['SMILES'])

100%|██████████| 6748/6748 [55:59<00:00,  2.01it/s]


In [None]:
# Remove every row that produced no PubChem features with a single drop call
# instead of rebuilding the frame once per row.
# rows_to_del holds zero-based positions; this assumes df still has its default
# 0..n-1 index here, so the positions equal the index labels.
df = df.drop(index=rows_to_del)

In [None]:
# Rows and columns remaining after the rows listed in rows_to_del were removed.
df.shape

(4892, 24)

In [None]:
# Renumber the rows 0..n-1: the cell that adds the PUBCHEM_* columns addresses
# rows by label using the row position inside pubchem_features.
df = df.reset_index(drop=True)

In [None]:
# After the reset, the last index label should equal the number of rows minus one.
df.tail()

Unnamed: 0,SMILES,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,#AromaticHeterocycles,#Heteroatoms,Morgan2,Morgan3,HeavyAtomMW,MaxPartialCharge,MinPartialCharge,#ValenceElectrons,BertzCT,Kappa1
4887,C1=CC=C(CN(CC2=NCCN2)C2=CC=CC=C2)C=C1,0,265.36,2.0,1.0,27.63,5.0,0.0,2.69,84.24,...,0.0,3.0,1.6,2.25,246.21,0.116098,-0.370486,102.0,563.3,12.73
4888,CCOCCN1C(N2CCCN(C)CC2)=NC2=CC=CC=C21,1,302.42,4.0,0.0,33.53,5.0,1.0,2.21,90.55,...,1.0,5.0,2.0,2.86,276.21,0.20617,-0.379883,120.0,609.72,15.31
4889,CN1CCC(=C2C3=CC=CC=C3CC(=O)C3=C2C=CS3)CC1,1,309.43,2.0,0.0,20.31,0.0,0.0,4.01,91.55,...,1.0,3.0,1.82,2.64,290.28,0.177295,-0.305731,112.0,761.25,13.92
4890,CC1=C(C2=CC=NC=C2)C=C(C#N)C(=O)N1,0,211.22,3.0,1.0,69.54,1.0,0.0,1.62,59.75,...,2.0,4.0,2.06,2.69,202.15,0.265591,-0.324752,78.0,608.4,10.12
4891,NC1=CC(C2=CC=NC=C2)=CNC1=O,0,187.2,3.0,2.0,71.77,1.0,0.0,1.02,54.71,...,2.0,4.0,1.93,2.57,178.13,0.270598,-0.39425,70.0,490.22,8.5


In [None]:
# Independent copy of the table as it stands before the PUBCHEM_* columns are added.
rdkit_df = df.copy()

In [None]:
# The row count must equal the row count of df; the columns are the 5 descriptors
# followed by the fingerprint bits.
pubchem_features.shape

(4892, 886)

In [None]:
# Attach the PubChem features as columns PUBCHEM_1 ... PUBCHEM_n in one step.
# Building the block as its own frame and concatenating once replaces
# n_rows * n_columns individual df.at writes that also inserted the new
# columns one at a time.
if len(pubchem_features) != len(df):
    raise ValueError(
        f'pubchem_features has {len(pubchem_features)} rows but df has {len(df)}; '
        'drop the rows listed in rows_to_del before attaching the features'
    )

pubchem_cols = [f'PUBCHEM_{j + 1}' for j in range(pubchem_features.shape[1])]
# Rows are matched by position: row k of pubchem_features belongs to row k of df.
pubchem_df = pd.DataFrame(pubchem_features, index=df.index, columns=pubchem_cols).astype(float)
# Dropping already-present PUBCHEM_* columns first keeps this cell safe to re-run.
df = pd.concat([df.drop(columns=pubchem_cols, errors='ignore'), pubchem_df], axis=1)

  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubchem_features[number][j]
  df.at[number, f'PUBCHEM_{j+1}'] = pubc

In [None]:
# Preview the table now that the PUBCHEM_* columns are present.
df.head()

Unnamed: 0,SMILES,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,...,PUBCHEM_877,PUBCHEM_878,PUBCHEM_879,PUBCHEM_880,PUBCHEM_881,PUBCHEM_882,PUBCHEM_883,PUBCHEM_884,PUBCHEM_885,PUBCHEM_886
0,BrC(Br)Br,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C#CC(C)(O)CC,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C#CC(O)(/C=C/Cl)CC,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C#CC(OC(N)=O)C1=CC=CC=C1,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C#CC1(OCC(O)CN2CCN(C3=CC=C(F)C=C3)CC2)CCCCC1,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# The structure string is not a numeric feature, so it is removed before feature selection.
df = df.drop(columns=['SMILES'])

In [None]:
# Last rows of the table after the SMILES column was removed.
df.tail()

Unnamed: 0,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,#HeavyAtoms,...,PUBCHEM_877,PUBCHEM_878,PUBCHEM_879,PUBCHEM_880,PUBCHEM_881,PUBCHEM_882,PUBCHEM_883,PUBCHEM_884,PUBCHEM_885,PUBCHEM_886
4887,0,265.36,2.0,1.0,27.63,5.0,0.0,2.69,84.24,20.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4888,1,302.42,4.0,0.0,33.53,5.0,1.0,2.21,90.55,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4889,1,309.43,2.0,0.0,20.31,0.0,0.0,4.01,91.55,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4890,0,211.22,3.0,1.0,69.54,1.0,0.0,1.62,59.75,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4891,0,187.2,3.0,2.0,71.77,1.0,0.0,1.02,54.71,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# First rows of the table after the SMILES column was removed; 'Activity' is now the first column.
df.head()

Unnamed: 0,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,#HeavyAtoms,...,PUBCHEM_877,PUBCHEM_878,PUBCHEM_879,PUBCHEM_880,PUBCHEM_881,PUBCHEM_882,PUBCHEM_883,PUBCHEM_884,PUBCHEM_885,PUBCHEM_886
0,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature selection

In [None]:
def select_features(X_train, y_train, k=682, score_func=None):
    """Keep the ``k`` features that score highest against the target.

    If you use this script for the regression task, pass
    ``score_func=mutual_info_regression`` instead of relying on the default
    ``mutual_info_classif``.

    Parameters
    ----------
    X_train : array-like
        Feature matrix handed to ``SelectKBest.fit``.
    y_train : array-like
        Target values handed to ``SelectKBest.fit``.
    k : int, default 682
        Number of features to keep. The default equals about 75% of the total
        number of features produced by this notebook; adjust it when the
        feature count differs.
    score_func : callable, optional
        Scoring function given to ``SelectKBest``. ``None`` (the default)
        selects ``mutual_info_classif``. Mutual-information scores involve
        random noise, so pass a seeded callable, e.g.
        ``functools.partial(mutual_info_classif, random_state=0)``, when the
        selection has to be reproducible.

    Returns
    -------
    tuple
        ``(X_train_fs, fs)``: the reduced feature matrix and the fitted
        selector (use ``fs.get_support()`` to see which features were kept).
    """
    if score_func is None:
        # Resolved at call time so the classification default stays unchanged.
        score_func = mutual_info_classif
    fs = SelectKBest(score_func=score_func, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    return X_train_fs, fs

In [None]:
# Column 0 holds the target; every remaining column is a feature.
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

In [None]:
# Fit the selector on the whole table; the fitted `fs` is used in the next cell
# to find out which columns were kept.
X_fs, fs = select_features(X, y)

In [None]:
# get_support() returns positions within X, and X was built from df.iloc[:, 1:],
# so each position is shifted by one to address the same column in df.
# Column 0 (the target) is listed explicitly so that it is always kept.
cols_idxs = fs.get_support(indices=True)
df = df.iloc[:, np.concatenate(([0], cols_idxs + 1))]

In [None]:
# Preview the table restricted to the selected columns.
df.head()

Unnamed: 0,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,#HeavyAtoms,...,PUBCHEM_876,PUBCHEM_877,PUBCHEM_878,PUBCHEM_879,PUBCHEM_880,PUBCHEM_881,PUBCHEM_882,PUBCHEM_883,PUBCHEM_884,PUBCHEM_885
0,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# delete columns with the same values in all rows
has_variation = df.nunique() > 1
df = df.loc[:, has_variation]

In [None]:
# Summary of the final table: number of rows, number of columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4892 entries, 0 to 4891
Columns: 528 entries, Activity to PUBCHEM_881
dtypes: float64(527), int64(1)
memory usage: 19.7 MB


In [None]:
# Preview of the final table after constant columns were removed.
df.head()

Unnamed: 0,Activity,MW,#HBA,#HBD,PSA,#ROTB,#ALERTS,MlogP,#MR,#HeavyAtoms,...,PUBCHEM_848,PUBCHEM_851,PUBCHEM_853,PUBCHEM_866,PUBCHEM_867,PUBCHEM_868,PUBCHEM_869,PUBCHEM_872,PUBCHEM_874,PUBCHEM_881
0,1,252.73,0.0,0.0,0.0,0.0,1.0,2.45,30.57,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,98.14,1.0,1.0,20.23,1.0,1.0,0.78,29.75,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,144.6,1.0,1.0,20.23,2.0,1.0,1.51,39.07,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,175.19,2.0,1.0,52.32,2.0,1.0,1.46,48.68,13.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,360.47,4.0,1.0,35.94,6.0,1.0,2.66,101.66,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write the final table to disk; index=False leaves the row index out of the file.
df.to_csv('pubchem_classification_actual.csv', index=False)