In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [10]:
pubchem_df= pd.read_csv('pubchem_data.csv')

In [12]:
pubchem_df.head()

pubchem_df.columns



pubchem_df.describe()

Unnamed: 0,cid,mw,polararea,complexity,xlogp,heavycnt,hbonddonor,hbondacc,rotbonds,exactmass,...,definedatomstereocnt,undefinedatomstereocnt,totalbondstereocnt,definedbondstereocnt,undefinedbondstereocnt,pclidcnt,gpidcnt,gpfamilycnt,annothitcnt,cidcdate
count,11509.0,11509.0,11509.0,11509.0,8806.0,11509.0,11509.0,11509.0,11509.0,11509.0,...,11509.0,11509.0,11509.0,11509.0,11509.0,11509.0,11509.0,11509.0,11509.0,11509.0
mean,37174750.0,538.457175,160.320057,886.103363,1.828401,36.810583,4.304805,8.79642,8.449474,537.999315,...,3.51299,1.502824,0.420627,0.312277,0.10835,3049.43062,9005.932,3146.308,5.872274,20089170.0
std,49525110.0,600.573729,256.885691,1447.804471,4.150668,41.453113,8.304964,11.317633,17.27932,600.364355,...,6.106369,4.105038,1.19874,1.040945,0.589702,14962.859368,49369.02,27704.89,5.782783,51010.57
min,119.0,2.016,0.0,0.0,-70.2,0.0,0.0,0.0,0.0,2.01565,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20040920.0
25%,447270.0,295.37,58.9,340.0,0.3,20.0,1.0,4.0,2.0,295.096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,20050710.0
50%,11012620.0,398.5,96.2,590.0,2.4,28.0,2.0,6.0,5.0,398.126,...,1.0,0.0,0.0,0.0,0.0,1.0,5.0,3.0,4.0,20061030.0
75%,56843330.0,567.8,178.0,904.0,4.0,38.0,4.0,10.0,9.0,567.313,...,5.0,1.0,0.0,0.0,0.0,682.0,1020.0,334.0,10.0,20120320.0
max,172418900.0,7595.0,3260.0,20600.0,22.1,484.0,188.0,215.0,234.0,7591.76,...,153.0,153.0,14.0,14.0,9.0,549746.0,2452859.0,1610091.0,20.0,20241110.0


In [14]:
pubchem_df.isnull().sum()

cid                          0
cmpdname                     0
cmpdsynonym               2352
mw                           0
mf                           0
polararea                    0
complexity                   0
xlogp                     2703
heavycnt                     0
hbonddonor                   0
hbondacc                     0
rotbonds                     0
inchi                        0
isosmiles                    0
canonicalsmiles              0
inchikey                     0
iupacname                  139
exactmass                    0
monoisotopicmass             0
charge                       0
covalentunitcnt              0
isotopeatomcnt               0
totalatomstereocnt           0
definedatomstereocnt         0
undefinedatomstereocnt       0
totalbondstereocnt           0
definedbondstereocnt         0
undefinedbondstereocnt       0
pclidcnt                     0
gpidcnt                      0
gpfamilycnt                  0
meshheadings              9004
annothit

In [24]:
# Drop columns that are unlikely to be useful for the ML model
columns_to_drop = ['cmpdsynonym', 'iupacname', 'meshheadings', 'annotation', 'annothits', 'aids']
pubchem_df_cleaned = pubchem_df.drop(columns=columns_to_drop)

In [26]:
# Impute missing values for `xlogp` with the median (since it's a numerical feature)
pubchem_df_cleaned['xlogp'] = pubchem_df_cleaned['xlogp'].fillna(pubchem_df_cleaned['xlogp'].median())

In [32]:
# Check the cleaned dataset for missing values
print(pubchem_df_cleaned.isnull().sum())

print(pubchem_df_cleaned.dtypes)


cid                       0
cmpdname                  0
mw                        0
mf                        0
polararea                 0
complexity                0
xlogp                     0
heavycnt                  0
hbonddonor                0
hbondacc                  0
rotbonds                  0
inchi                     0
isosmiles                 0
canonicalsmiles           0
inchikey                  0
exactmass                 0
monoisotopicmass          0
charge                    0
covalentunitcnt           0
isotopeatomcnt            0
totalatomstereocnt        0
definedatomstereocnt      0
undefinedatomstereocnt    0
totalbondstereocnt        0
definedbondstereocnt      0
undefinedbondstereocnt    0
pclidcnt                  0
gpidcnt                   0
gpfamilycnt               0
annothitcnt               0
cidcdate                  0
sidsrcname                0
depcatg                   0
dtype: int64
cid                         int64
cmpdname                   ob

In [36]:
float_columns = ['mw', 'polararea', 'complexity', 'xlogp', 'exactmass', 'monoisotopicmass']
# Round and convert each column to int64
for col in float_columns:
    pubchem_df_cleaned[col] = pubchem_df_cleaned[col].round().astype('int64')

print(pubchem_df_cleaned[float_columns].dtypes)



mw                  int64
polararea           int64
complexity          int64
xlogp               int64
exactmass           int64
monoisotopicmass    int64
dtype: object


In [21]:
if mol is None:
    print("Invalid molecule")
else:
    # Calculate various RDKit descriptors
    mol_weight = Descriptors.MolWt(mol)
    tpsa = Descriptors.TPSA(mol)
    xlogp = Crippen.MolLogP(mol)
    hb_donors = Lipinski.HBondDonorCount(mol)
    hb_acceptors = Lipinski.HBondAcceptorCount(mol)
    rotatable_bonds = Descriptors.NumRotatableBonds(mol)
    heavy_atoms = Descriptors.HeavyAtomCount(mol)
    num_aromatic_rings = Descriptors.NumAromaticRings(mol)
    print(f'Molecular Weight: {mol_weight}')
    print(f'Topological Polar Surface Area (TPSA): {tpsa}')
    print(f'LogP (XLogP): {xlogp}')
    print(f'HBond Donors: {hb_donors}')
    print(f'HBond Acceptors: {hb_acceptors}')
    print(f'Rotatable Bonds: {rotatable_bonds}')
    print(f'Heavy Atom Count: {heavy_atoms}')
    print(f'Number of Aromatic Rings: {num_aromatic_rings}')


NameError: name 'Descriptors' is not defined